In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

# Define the model class
class LinearRegressor(nn.Module):
    """Plain linear model: one affine layer with no activation."""

    def __init__(self, input_size, output_size):
        """Create the single ``nn.Linear`` layer of the given in/out widths."""
        super().__init__()
        self.linear = nn.Linear(in_features=input_size, out_features=output_size)

    def forward(self, x):
        """Return the raw output of the linear layer for ``x``."""
        prediction = self.linear(x)
        return prediction

class LogisticRegression(torch.nn.Module):
    """Linear layer whose output is passed through an element-wise sigmoid."""

    def __init__(self, input_size, output_size):
        """Create the single linear layer of the given in/out widths."""
        super().__init__()
        self.linear = torch.nn.Linear(in_features=input_size, out_features=output_size)

    def forward(self, x):
        """Apply the linear layer to ``x`` and return the sigmoid of the result."""
        logits = self.linear(x)
        return torch.sigmoid(logits)

In [17]:
# Example Data (Replace with your actual data)
# X_train: tensor of shape [n_samples, n_features]
# y_train: tensor of shape [n_samples, 1]
# Data generated from https://github.com/dakshmittal30/Adaptive_sampling/blob/7cf3996c786ce33db90fcb7aef8584054169557c/src/notebooks/Selection_bias.ipynb


# Below is the biased training data.

directory = '/shared/share_mala/yuanzhe/adaptive_sampling/pipeline_datasets/'
train_csv_name = directory + '/biased_new/input_dim_1_train_init_data_mean_0.0ln_1.0sig_0.1no.2000_random_prop_score_selected_2_16.0__.csv'
test_csv_name = directory + 'input_dim_1_test_final_data_mean_0.0ln_1.0sig_0.1no.2000.csv'
pool_csv_name = directory + 'input_dim_1_pool_data_mean_0.0ln_1.0sig_0.1no.2000.csv'

# file_list and name_list are parallel: entry i of each describes the same split.
file_list = [train_csv_name, test_csv_name, pool_csv_name]
name_list = ['train','test','pool']

df = pd.read_csv(train_csv_name)
# Double brackets select a one-column DataFrame, so both arrays stay 2-D.
X_train = np.array(df[['Column0']])
y_train = np.array(df[['EVENT_LABEL']])
#y_train = y_train >0 #convert regression into classifier

# Convert data to PyTorch tensors if they aren't already
X_train = torch.tensor(X_train, dtype=torch.float32)
y_train = torch.tensor(y_train, dtype=torch.float32)

# Hyperparameters
epochs = 1000  # Number of training iterations
learning_rate = 0.01

# Seed before building the model so the initial weights are reproducible.
torch.manual_seed(123)

# Model, Loss and Optimizer
model = LogisticRegression(input_size=X_train.shape[1], output_size=1)
criterion = nn.MSELoss()
optimizer = optim.SGD(model.parameters(), lr=learning_rate)

# Training loop: every step uses the whole training set (no mini-batching).
for epoch in range(epochs):
    # Forward pass: Compute predicted y by passing X to the model
    y_pred = model(X_train)

    #print test/ pool loss
    #print X/Y train/ test/ pool
    # Compute and print loss
    loss = criterion(y_pred, y_train)
    if epoch % 10 == 0:  # Print every 10th epoch
        print(f'Epoch {epoch}, Loss: {loss.item()}')

    # Zero gradients, perform a backward pass, and update the weights.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

#torch.save(model, directory + 'predictor.pkl')

# Export to TorchScript and save next to the datasets. The original code wrote
# to `url + ...`, but `url` is not assigned in any live cell, so a fresh
# Restart & Run All raised NameError here after training had finished.
# `directory` holds the same path the commented-out `url` assignment used.
model_scripted = torch.jit.script(model) # Export to TorchScript
model_scripted.save(directory + 'predictor_0214_regression.pt') # Save

# NOTE(review): presumably TorchScript is used instead of pickling the whole
# model because of the unpickling problem described in the link below - confirm.
#https://stackoverflow.com/questions/55488795/unpickling-saved-pytorch-model-throws-attributeerror-cant-get-attribute-net

Epoch 0, Loss: 0.3715963661670685
Epoch 10, Loss: 0.3563101291656494
Epoch 20, Loss: 0.34123674035072327
Epoch 30, Loss: 0.3264453709125519
Epoch 40, Loss: 0.3120012581348419
Epoch 50, Loss: 0.297963410615921
Epoch 60, Loss: 0.2843833565711975
Epoch 70, Loss: 0.27130401134490967
Epoch 80, Loss: 0.25875863432884216
Epoch 90, Loss: 0.24677112698554993
Epoch 100, Loss: 0.23535597324371338
Epoch 110, Loss: 0.22451893985271454
Epoch 120, Loss: 0.21425795555114746
Epoch 130, Loss: 0.20456427335739136
Epoch 140, Loss: 0.19542361795902252
Epoch 150, Loss: 0.18681728839874268
Epoch 160, Loss: 0.17872336506843567
Epoch 170, Loss: 0.17111767828464508
Epoch 180, Loss: 0.16397465765476227
Epoch 190, Loss: 0.15726816654205322
Epoch 200, Loss: 0.15097184479236603
Epoch 210, Loss: 0.14505991339683533
Epoch 220, Loss: 0.1395072489976883
Epoch 230, Loss: 0.1342896968126297
Epoch 240, Loss: 0.1293843686580658
Epoch 250, Loss: 0.12476955354213715
Epoch 260, Loss: 0.12042485922574997
Epoch 270, Loss: 0.116

In [18]:
##TBD

# Pick the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

def l2_loss(x_test, y_test, model, device):
    """Mean squared error between ``model(x_test)`` and ``y_test``.

    Args:
        x_test: Inputs, passed to ``model`` unchanged.
        y_test: Targets, subtracted from the predictions.
        model: Callable that maps ``x_test`` to predictions.
        device: Accepted for the caller's convenience; not used here.

    Returns:
        A tensor holding the mean of the squared prediction errors.
    """
    residual = model(x_test) - y_test
    return residual.square().mean()


def compute_l2_loss(csv_file):
    """Evaluate the notebook-level ``model`` on one CSV file and return its L2 loss.

    Relies on the module-level names ``model``, ``device`` and ``l2_loss``.

    Args:
        csv_file: Path of a CSV file that has a ``Column0`` feature column and
            an ``EVENT_LABEL`` target column.

    Returns:
        The mean squared error of ``model`` on that file, as a Python float.
    """
    df = pd.read_csv(csv_file)
    # Double brackets select one-column DataFrames, so both tensors are 2-D.
    X_test = torch.tensor(np.array(df[['Column0']]), dtype=torch.float32)
    y_test = torch.tensor(np.array(df[['EVENT_LABEL']]), dtype=torch.float32)

    # Evaluation only: no gradients are needed, so skip building the autograd
    # graph. The returned number is the same as with tracking enabled.
    with torch.no_grad():
        return float(l2_loss(X_test, y_test, model, device))
    
# Report the L2 loss of the trained model on each split, in list order.
for split_name, csv_path in zip(name_list, file_list):
    print('l2_loss', split_name, compute_l2_loss(csv_path))

l2_loss train 0.0307949036359787
l2_loss test 0.07647431641817093
l2_loss pool 0.08316905051469803


In [18]:
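# Convert the regression labels to binary classification labels (EVENT_LABEL > 0) and write 'classifier_*' copies of the CSVs; one-off preprocessing, no need to run again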
# url = '/shared/share_mala/yuanzhe/adaptive_sampling/pipeline_datasets/'
# train_csv_name = 'input_dim_1_train_init_data_mean_0.0ln_1.0sig_0.1no.2000.csv'
# test_csv_name = 'input_dim_1_test_final_data_mean_0.0ln_1.0sig_0.1no.2000.csv'
# pool_csv_name = 'input_dim_1_pool_data_mean_0.0ln_1.0sig_0.1no.2000.csv'
# file_list  = [train_csv_name, test_csv_name, pool_csv_name]
# #convert y into 0/1

# for f in file_list:
#     df = pd.read_csv(url + f)
#     df['EVENT_LABEL'] = df['EVENT_LABEL'] > 0
#     df.to_csv(url+'classifier_'+ f, index = False)
# #/user/ym2865/Adaptive Sampling/src

In [7]:
# for i,f in enumerate(file_list):
#     df = pd.read_csv(f)
#     plt.title(name_list[i]+'_distributions')
#     sns.histplot(df, x="Column0", hue="EVENT_LABEL", element="poly")

#     plt.show()
    
