In [327]:
import os
import pandas as pd
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F

from torch.utils.data import DataLoader, Dataset

from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, precision_score, recall_score, roc_auc_score
from sklearn.metrics import RocCurveDisplay

import util_unsupervised as util_u

import matplotlib.pyplot as plt
import seaborn

In [328]:
torch.manual_seed(764)

<torch._C.Generator at 0x150214757250>

In [329]:
class Net256(nn.Module):
    def __init__(self, n_classes):
        super(Net256, self).__init__()

        # input layer takes the first 256 principal components 
        self.lin_in = nn.Linear(256, 512)
        nn.init.xavier_uniform_(self.lin_in.weight)
        self.bn1 = nn.BatchNorm1d(512)
        self.relu1 = nn.ReLU()

        # 1. hidden layer
        self.hidden1 = nn.Linear(512, 128)
        nn.init.xavier_uniform_(self.hidden1.weight)
        self.bn2 = nn.BatchNorm1d(128)
        self.relu2 = nn.ReLU()

        # 2. hidden layer
        self.hidden2 = nn.Linear(128, 32)
        nn.init.xavier_uniform_(self.hidden2.weight)
        self.bn3 = nn.BatchNorm1d(32)
        self.relu3 = nn.ReLU()

        # 3. hidden layer
        self.hidden3 = nn.Linear(128, 32)
        nn.init.xavier_uniform_(self.hidden3.weight)
        self.bn4 = nn.BatchNorm1d(32)
        self.relu4 = nn.ReLU()

        # 4. hidden layer
        self.hidden4 = nn.Linear(64, 32)
        nn.init.xavier_uniform_(self.hidden4.weight)
        self.bn5 = nn.BatchNorm1d(32)
        self.relu5 = nn.ReLU()
        
        # output layer
        self.lin_out = nn.Linear(32, n_classes)

    def forward(self, x):
        # input layer
        x = self.relu1(self.bn1(self.lin_in(x)))

        # first hidden layer
        x = self.relu2(self.bn2(self.hidden1(x)))

        # second hidden layer
        x = self.relu3(self.bn3(self.hidden2(x)))

        # x = self.relu4(self.bn4(self.hidden3(x)))

        # x = self.relu5(self.bn5(self.hidden4(x)))
        
        # output layer. We don't add a softmax layer because we weill use
        # cross entropy loss as a loss funtion
        out = self.lin_out(x)

        return out

In [330]:
class EmbedVec256Dataset(Dataset):
    def __init__(self, df_dataset, label, data_prefix, label_dict):
        self.df_data = df_dataset
        self.data_label = label
        self.data_prefix = data_prefix
        self.data_cols = [c for c in self.df_data.columns if c.startswith(self.data_prefix)]
        self.label_dict = label_dict
        
    def __len__(self):
        return self.df_data.shape[0]
        
    def __getitem__(self, idx):
        moa_label = self.df_data.loc[idx, self.data_label]
        moa_tensor = torch.tensor(self.label_dict[moa_label])
        label = F.one_hot(moa_tensor, num_classes=len(self.label_dict)).type(torch.FloatTensor)
        
        embed_vector = self.df_data.loc[idx, self.data_cols]
        embed_tensor = torch.from_numpy(embed_vector.to_numpy().astype(np.float32))
        
        return embed_tensor, label

In [331]:
data_dir = "~/siads696/data"

random_state = 764
n_classes = 13 # number of MoA

learning_rate = 1e-3
n_epochs = 25
batch_size = 32
chunk_print = 10

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cpu


In [332]:
df_data = pd.read_parquet(os.path.join(data_dir, "well_grouped_256.parquet"))
print(f"Embedding vector dataset shape: {df_data.shape}")

print(f"Embedding vectors MoA assigned: {df_data[~df_data["Metadata_MoA"].isnull()].shape[0]}")

Embedding vector dataset shape: (3300, 261)
Embedding vectors MoA assigned: 1540


In [333]:
df_train_test = pd.read_csv("~/SIADS696_Milestone2/data/compound_moas_trainVtest.csv")
df_train = df_train_test[~df_train_test["in_testset"]]
df_test = df_train_test[df_train_test["in_testset"]]
print(f"Train/test set shape: {df_train_test.shape}")
print(f"Training set shape: {df_train.shape}")
print(f"Test set shape: {df_test.shape}")
print(f"MoA in training set:\n {df_train["MoA"].unique().tolist()}")
print(f"MoA in test set:\n {df_test["MoA"].unique().tolist()}")
print(f"Compounds in training set:\n {df_train["Compound"].unique().tolist()}")
print(f"Compounds in test set:\n {df_test["Compound"].unique().tolist()}")

Train/test set shape: (39, 4)
Training set shape: (29, 4)
Test set shape: (10, 4)
MoA in training set:
 ['Actin disruptors', 'Aurora kinase inhibitors', 'Cholesterol-lowering', 'DMSO', 'DNA damage', 'DNA replication', 'Eg5 inhibitors', 'Epithelial', 'Kinase inhibitors', 'Microtubule destabilizers', 'Microtubule stabilizers', 'Protein degradation', 'Protein synthesis']
MoA in test set:
 ['Actin disruptors', 'Aurora kinase inhibitors', 'DNA damage', 'DNA replication', 'Epithelial', 'Kinase inhibitors', 'Microtubule destabilizers', 'Microtubule stabilizers', 'Protein degradation', 'Protein synthesis']
Compounds in training set:
 ['cytochalasin B', 'cytochalasin D', 'AZ-A', 'AZ258', 'mevinolin/lovastatin', 'simvastatin', 'DMSO', 'chlorambucil', 'cisplatin', 'etoposide', 'camptothecin', 'floxuridine', 'methotrexate', 'AZ-C', 'AZ138', 'AZ-J', 'AZ-U', 'PD-169316', 'alsterpaullone', 'colchicine', 'demecolcine', 'nocodazole', 'docetaxel', 'epothilone B', 'ALLN', 'MG-132', 'lactacystin', 'anisom

In [334]:
df_data_train = df_data.merge(df_train, left_on="Metadata_Compound", right_on="Compound", how="inner")
print(f"Training data shape: {df_data_train.shape}")

df_data_test = df_data.merge(df_test, left_on="Metadata_Compound", right_on="Compound", how="inner")
print(f"Test data shape: {df_data_test.shape}")

print(f"Total embedding vectors in training/test set: {df_data_train.shape[0]+df_data_test.shape[0]}")

Training data shape: (986, 265)
Test data shape: (554, 265)
Total embedding vectors in training/test set: 1540


In [335]:
moa_list = df_data[~df_data["Metadata_MoA"].isnull()].loc[:, "Metadata_MoA"].unique().tolist()
moa_dict = {moa: idx for moa, idx in zip(moa_list, range(len(moa_list)))}
print(f"MoA label dictionary:\n{moa_dict}")

MoA label dictionary:
{'Protein degradation': 0, 'Aurora kinase inhibitors': 1, 'Eg5 inhibitors': 2, 'Epithelial': 3, 'DMSO': 4, 'Kinase inhibitors': 5, 'Protein synthesis': 6, 'DNA replication': 7, 'DNA damage': 8, 'Microtubule destabilizers': 9, 'Actin disruptors': 10, 'Microtubule stabilizers': 11, 'Cholesterol-lowering': 12}


In [336]:
train_dataset = EmbedVec256Dataset(df_data_train, "Metadata_MoA", "PC", moa_dict)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

test_dataset = EmbedVec256Dataset(df_data_test, "Metadata_MoA", "PC", moa_dict)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [337]:
# train_features, train_labels = next(iter(train_dataloader))
# print(f"Embedding vectors batch shape: {train_features.size()}")
# print(f"Labels batch shape: {train_labels.size()}")
# embed_vec = train_features[0]
# label = train_labels[0]
# print(f"Label: {label}")

In [338]:
model = Net256(n_classes)
# print(model)

In [339]:
# using cross entropy loss for multiclass classification
loss_func = nn.CrossEntropyLoss()

# use Adam as optimizer for this NN
optimizer = torch.optim.Adam(model.parameters(), foreach=True, lr=learning_rate)

In [340]:
# emb_vec = torch.rand(32, 256)
# result = model(emb_vec)
# print(result)

In [341]:
print("Begin Training")

model.train()
for epoch in range(n_epochs):
    running_loss = 0.0
    size = len(train_dataloader.dataset)
    for batch, (X, y) in enumerate(train_dataloader):
        
        output = model(X)
        optimizer.zero_grad()
        loss = loss_func(output, y)
        loss.backward()
        optimizer.step()
    
        running_loss += loss.item()
        
        if (batch+1) % chunk_print == 0:
          print(f"Epoch [{epoch+1}/{n_epochs}], Step [{batch+1}/{size//batch_size}], Loss: {running_loss/chunk_print}")
          running_loss = 0.0


print("Stop Training")

Begin Training
Epoch [1/25], Step [10/30], Loss: 2.2822383642196655
Epoch [1/25], Step [20/30], Loss: 1.9757553577423095
Epoch [1/25], Step [30/30], Loss: 1.7114137530326843
Epoch [2/25], Step [10/30], Loss: 1.4941835403442383
Epoch [2/25], Step [20/30], Loss: 1.3351672649383546
Epoch [2/25], Step [30/30], Loss: 1.2560648679733277
Epoch [3/25], Step [10/30], Loss: 1.07661771774292
Epoch [3/25], Step [20/30], Loss: 0.9832516789436341
Epoch [3/25], Step [30/30], Loss: 0.9189135432243347
Epoch [4/25], Step [10/30], Loss: 0.779290622472763
Epoch [4/25], Step [20/30], Loss: 0.7072518289089202
Epoch [4/25], Step [30/30], Loss: 0.655372929573059
Epoch [5/25], Step [10/30], Loss: 0.5691891193389893
Epoch [5/25], Step [20/30], Loss: 0.5067409306764603
Epoch [5/25], Step [30/30], Loss: 0.49022342562675475
Epoch [6/25], Step [10/30], Loss: 0.39623338282108306
Epoch [6/25], Step [20/30], Loss: 0.37012653350830077
Epoch [6/25], Step [30/30], Loss: 0.32341012060642244
Epoch [7/25], Step [10/30], Los

In [342]:
model.eval()
size = len(test_dataloader.dataset)
num_batches = len(test_dataloader)
test_loss, correct = 0, 0

# Evaluating the model with torch.no_grad() ensures that no gradients are computed during test mode
# also serves to reduce unnecessary gradient computations and memory usage for tensors with requires_grad=True
with torch.no_grad():
    for X, y in test_dataloader:
        pred = model(X)
        test_loss += loss_func(pred, y).item()
        correct += (pred.argmax(1) == y.argmax(1)).type(torch.float).sum().item()

test_loss /= num_batches
correct /= size
print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")

Test Error: 
 Accuracy: 80.1%, Avg loss: 0.847818 

