In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import numpy as np
# Define the model class
class LinearRegressor(nn.Module):
    """Plain linear model: one affine layer with no activation."""

    def __init__(self, input_size, output_size):
        """Build the single ``nn.Linear`` layer.

        Args:
            input_size: Number of features per input sample.
            output_size: Number of values produced per sample.
        """
        super().__init__()
        self.linear = nn.Linear(in_features=input_size, out_features=output_size)

    def forward(self, x):
        """Return the raw (unactivated) output of the linear layer for ``x``."""
        prediction = self.linear(x)
        return prediction

class LogisticRegression(torch.nn.Module):
    """Logistic-regression model: a linear layer followed by a sigmoid."""

    def __init__(self, input_size, output_size):
        """Create the linear layer that produces the logits.

        Args:
            input_size: Number of features per input sample.
            output_size: Number of sigmoid outputs per sample.
        """
        super().__init__()
        self.linear = torch.nn.Linear(in_features=input_size, out_features=output_size)

    def forward(self, x):
        """Return sigmoid-squashed outputs (values in (0, 1)) for ``x``."""
        logits = self.linear(x)
        return torch.sigmoid(logits)

In [16]:
# Train the logistic-regression predictor and export it as TorchScript.
#
# X_train: tensor of shape [n_samples, n_features]
# y_train: tensor of shape [n_samples, 1]
# Data generated from https://github.com/dakshmittal30/Adaptive_sampling/blob/7cf3996c786ce33db90fcb7aef8584054169557c/src/notebooks/Selection_bias.ipynb

url = '/shared/share_mala/yuanzhe/adaptive_sampling/pipeline_datasets/'
# Earlier choice of training file, superseded by the biased set below:
#   csv_file = 'input_dim_1_train_init_data_mean_0.0ln_1.0sig_0.1no.2000.csv'
# Biased training data (the file actually used for training).
csv_file =  '/biased_new/classifier_input_dim_1_train_init_data_mean_0.0ln_1.0sig_0.1no.2000_random_prop_score_selected_2_16.0__.csv'
df = pd.read_csv(url + csv_file)

# Double brackets keep both arrays 2-D, i.e. one column per selected field.
X_train = np.array(df[['Column0']])
y_train = np.array(df[['EVENT_LABEL']])
y_train = y_train > 0  # convert the regression target into a binary class label

# BCELoss needs float targets in [0, 1], so the boolean labels become 0.0 / 1.0.
X_train = torch.tensor(X_train, dtype=torch.float32)
y_train = torch.tensor(y_train, dtype=torch.float32)

# Hyperparameters
epochs = 200  # Number of full-batch gradient steps
learning_rate = 0.01

# Fix the RNG so the random weight initialisation (and therefore the logged
# losses and the exported model) is the same on every Restart & Run All.
torch.manual_seed(0)

# Model, loss and optimizer. The model already applies a sigmoid, so the
# matching criterion is BCELoss on probabilities.
model = LogisticRegression(input_size=X_train.shape[1], output_size=1)
criterion = nn.BCELoss()
optimizer = optim.SGD(model.parameters(), lr=learning_rate)

# Training loop (full batch: every step uses all of X_train).
for epoch in range(epochs):
    # Forward pass: compute predicted probabilities for the whole training set
    y_pred = model(X_train)

    # Compute the loss and report it every 10th epoch
    loss = criterion(y_pred, y_train)
    if epoch % 10 == 0:
        print(f'Epoch {epoch}, Loss: {loss.item()}')

    # Zero gradients, perform a backward pass, and update the weights.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

#torch.save(model, url + 'predictor.pkl')

# Export as TorchScript so the model can be loaded without this notebook's
# class definition being importable (see the link below).
model_scripted = torch.jit.script(model)
model_scripted.save(url + 'predictor_0206.pt')

#https://stackoverflow.com/questions/55488795/unpickling-saved-pytorch-model-throws-attributeerror-cant-get-attribute-net

Epoch 0, Loss: 1.0371397733688354
Epoch 10, Loss: 1.005934476852417
Epoch 20, Loss: 0.9767131805419922
Epoch 30, Loss: 0.9494389295578003
Epoch 40, Loss: 0.9240630269050598
Epoch 50, Loss: 0.900526762008667
Epoch 60, Loss: 0.8787612318992615
Epoch 70, Loss: 0.858690619468689
Epoch 80, Loss: 0.8402321338653564
Epoch 90, Loss: 0.8232988715171814
Epoch 100, Loss: 0.8078004121780396
Epoch 110, Loss: 0.793645441532135
Epoch 120, Loss: 0.7807420492172241
Epoch 130, Loss: 0.7689998149871826
Epoch 140, Loss: 0.7583305835723877
Epoch 150, Loss: 0.7486493587493896
Epoch 160, Loss: 0.7398746013641357
Epoch 170, Loss: 0.7319294810295105
Epoch 180, Loss: 0.7247414588928223
Epoch 190, Loss: 0.7182427644729614


In [19]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

def Recall_True(x_test, y_test, model, device):
    """Compute the recall (true-positive rate) of a binary classifier.

    A sample is predicted positive when the model output is >= 0.5.

    Args:
        x_test: Input features, passed to ``model`` unchanged.
        y_test: Ground-truth labels, 1/True for positives and 0/False for
            negatives, one label per model output. May be shaped (n,) or
            (n, 1); it is flattened before use.
        model: Callable returning one positive-class probability per sample.
        device: Not used; kept so existing call sites remain valid. No
            tensor is moved between devices here.

    Returns:
        Scalar tensor ``true_positives / actual_positives``. The value is
        nan when ``y_test`` contains no positive label (0 / 0).
    """
    # Evaluation only: no need to record an autograd graph for the forward pass.
    with torch.no_grad():
        probabilities = model(x_test)

    # NOTE: if the model ever emits one score per class instead of a single
    # probability, replace this threshold with an argmax over the class axis.
    # Flatten both sides so an (n, 1) prediction column and an (n,) label
    # vector are compared element-wise rather than broadcast to (n, n).
    predicted_positive = (probabilities >= 0.5).reshape(-1)
    labels = y_test.reshape(-1)

    true_positives = torch.sum(torch.mul(labels, predicted_positive))
    actual_positives = torch.sum(labels)
    return true_positives / actual_positives

# Load the held-out test split and convert it to float32 tensors.
test_csv_name = url + 'classifier_input_dim_1_test_final_data_mean_0.0ln_1.0sig_0.1no.2000.csv'
df_test = pd.read_csv(test_csv_name)

# Double brackets keep a 2-D layout with a single column per selected field.
X_test = torch.tensor(np.array(df_test[['Column0']]), dtype=torch.float32)
y_test = torch.tensor(np.array(df_test[['EVENT_LABEL']]), dtype=torch.float32)

# Report recall on the training split and on the test split, in that order.
for split_name, features, labels in (
    ('training', X_train, y_train),
    ('test', X_test, y_test),
):
    print(f'Recall on {split_name} data', Recall_True(features, labels, model, device))

Recall on training data tensor(0.9780)
Recall on test data tensor(0.4907)


In [18]:
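# One-off conversion of the regression CSVs into binary-classification CSVs (EVENT_LABEL > 0, saved with a 'classifier_' prefix). Already run; kept for provenance only, no need to run again.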
# url = '/shared/share_mala/yuanzhe/adaptive_sampling/pipeline_datasets/'
# train_csv_name = 'input_dim_1_train_init_data_mean_0.0ln_1.0sig_0.1no.2000.csv'
# test_csv_name = 'input_dim_1_test_final_data_mean_0.0ln_1.0sig_0.1no.2000.csv'
# pool_csv_name = 'input_dim_1_pool_data_mean_0.0ln_1.0sig_0.1no.2000.csv'
# file_list  = [train_csv_name, test_csv_name, pool_csv_name]
# #convert y into 0/1

# for f in file_list:
#     df = pd.read_csv(url + f)
#     df['EVENT_LABEL'] = df['EVENT_LABEL'] > 0
#     df.to_csv(url+'classifier_'+ f, index = False)
# #/user/ym2865/Adaptive Sampling/src