In [None]:
pip install gpytorch
#data is here /shared/share_mala/yuanzhe/adaptive_sampling/pipeline_datasets



In [None]:
import os

import gpytorch
import numpy as np
import torch
from sklearn.model_selection import train_test_split

In [None]:
# Hyperparameters of the Gaussian process used to generate the data
mean_constant = 0.0  # Constant mean of the GP
length_scale = 1.0   # Length scale of the RBF kernel
noise_std = 0.1      # Standard deviation of the noise

# Seed PyTorch's global RNG so the random inputs drawn below (and any later
# torch sampling in the same top-to-bottom run) are reproducible.
SEED = 0
torch.manual_seed(SEED)

# Training inputs: standard-normal draws, one row per sample
num_samples = 2000
input_dim = 1
train_x = torch.randn((num_samples, input_dim))
# All-zero placeholder for the training targets
train_y = torch.zeros(num_samples)

In [None]:
class CustomizableGPModel(gpytorch.models.ExactGP):
    """Exact GP with a constant mean and a scaled RBF kernel.

    The hyperparameters are assigned directly from the constructor arguments
    rather than left at the library defaults.

    Args:
        train_x: Training inputs passed through to ``ExactGP``.
        train_y: Training targets passed through to ``ExactGP``.
        likelihood: Likelihood object; its ``noise_covar.noise`` is overwritten
            with ``noise_std**2``.
        mean_constant: Value assigned to the constant mean.
        length_scale: Value assigned to the RBF kernel's lengthscale.
        noise_std: Noise standard deviation; it is squared before being stored
            as the likelihood noise.
    """

    def __init__(self, train_x, train_y, likelihood, mean_constant, length_scale, noise_std):
        super().__init__(train_x, train_y, likelihood)

        # Constant mean pinned to the requested value
        constant_mean = gpytorch.means.ConstantMean()
        constant_mean.constant = mean_constant
        self.mean_module = constant_mean

        # RBF kernel with the requested lengthscale, wrapped in an output scale
        rbf_kernel = gpytorch.kernels.RBFKernel()
        rbf_kernel.lengthscale = length_scale
        self.covar_module = gpytorch.kernels.ScaleKernel(rbf_kernel)

        # Store the likelihood and set its noise from the standard deviation
        self.likelihood = likelihood
        likelihood.noise_covar.noise = noise_std ** 2

    def forward(self, x):
        """Return the multivariate normal built from the mean and covariance at ``x``."""
        mean_x = self.mean_module(x)
        covar_x = self.covar_module(x)
        return gpytorch.distributions.MultivariateNormal(mean_x, covar_x)



In [None]:
# Gaussian likelihood that is handed to the GP model
likelihood = gpytorch.likelihoods.GaussianLikelihood()

# Instantiate the GP with the hyperparameters chosen for this dataset
model = CustomizableGPModel(
    train_x,
    train_y,
    likelihood,
    mean_constant=mean_constant,
    length_scale=length_scale,
    noise_std=noise_std,
)


In [None]:
# Draw synthetic training targets from the GP prior (with likelihood noise).
model.eval()
likelihood.eval()
# In eval mode an ExactGP that holds training data returns the posterior
# conditioned on that data -- here the all-zero placeholder targets -- so the
# prior has to be requested explicitly via prior_mode.
with torch.no_grad(), gpytorch.settings.prior_mode(True):
    prior_dist = likelihood(model(train_x))
    train_y = prior_dist.sample()  # Synthetic training targets



In [None]:
# Convert the input and target tensors to NumPy arrays
train_x_array, train_y_array = (tensor.numpy() for tensor in (train_x, train_y))

In [None]:
# Keep 25% of the samples as the initial training set; test_data/test_labels
# receive the other 75%. random_state pins the shuffle so the split is
# reproducible across runs.
train_init_data, test_data, train_init_labels, test_labels = train_test_split(
    train_x_array,
    train_y_array,
    test_size=0.75,
    random_state=0,
)


In [None]:
# Split the held-out portion into a pool set (about two thirds) and a final
# test set (about one third). random_state pins the shuffle so the split is
# reproducible across runs.
pool_data, test_final_data, pool_labels, test_final_labels = train_test_split(
    test_data,
    test_labels,
    test_size=0.33333333,
    random_state=0,
)

In [None]:
import pandas as pd

In [None]:
# One generated name per feature column: Column0, Column1, ...
num_features = train_init_data.shape[1]
column_names = list(map(lambda i: f'Column{i}', range(num_features)))


In [None]:
# Wrap each feature split in a DataFrame that uses the shared column names
df_train_init_data, df_pool_data, df_test_final_data = (
    pd.DataFrame(split_features, columns=column_names)
    for split_features in (train_init_data, pool_data, test_final_data)
)

In [None]:
# Reshape each label array into a single column of shape (n, 1)
train_init_labels_2d = train_init_labels.reshape(-1, 1)
pool_labels_2d = pool_labels.reshape(-1, 1)
test_final_labels_2d = test_final_labels.reshape(-1, 1)

# `columns` must be list-like: passing the bare string 'EVENT_LABEL' makes the
# DataFrame constructor raise a TypeError, so the name is wrapped in a list.
label_columns = ['EVENT_LABEL']
df_train_init_labels_2d = pd.DataFrame(train_init_labels_2d, columns=label_columns)
df_pool_labels_2d = pd.DataFrame(pool_labels_2d, columns=label_columns)
df_test_final_labels_2d = pd.DataFrame(test_final_labels_2d, columns=label_columns)

In [None]:
# Attach the label column to the right of the features for each split
df_appended_train_init_data = pd.concat(
    [df_train_init_data, df_train_init_labels_2d],
    axis='columns',
)
df_appended_pool_data = pd.concat(
    [df_pool_data, df_pool_labels_2d],
    axis='columns',
)
df_appended_test_final_data = pd.concat(
    [df_test_final_data, df_test_final_labels_2d],
    axis='columns',
)

In [None]:
from google.colab import drive

# Mount Google Drive at the standard Colab mount point
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

In [None]:
# Root folder for the generated datasets on the mounted Drive
directory = '/content/drive/MyDrive/pipeline_datasets/'

# Every split is written to one sub-folder per input dimension; create it if
# it does not exist yet so to_csv does not fail on a fresh Drive.
output_dir = os.path.join(directory, 'input_dim_' + str(input_dim))
os.makedirs(output_dir, exist_ok=True)

# Shared filename suffix that encodes the GP settings and the sample count
file_suffix = (
    'mean_' + str(mean_constant)
    + 'ln_' + str(length_scale)
    + 'sig_' + str(noise_std)
    + 'no.' + str(num_samples)
    + '.csv'
)

# Each split gets its own filename prefix. The final-test split was previously
# written under the 'train_init_data_' prefix, which overwrote the training file.
for file_prefix, split_frame in (
    ('train_init_data_', df_appended_train_init_data),
    ('pool_data_', df_appended_pool_data),
    ('test_final_data_', df_appended_test_final_data),
):
    split_frame.to_csv(os.path.join(output_dir, file_prefix + file_suffix), index=False)
