In [169]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [170]:
import os
import pandas as pd
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F

from torch.utils.data import DataLoader, Dataset, SubsetRandomSampler

from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, precision_score, recall_score, roc_auc_score, confusion_matrix
from sklearn.metrics import RocCurveDisplay
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold

import ms2_model
from ms2_model import Net256
from ms2_dataset import EmbedVec256Dataset

import matplotlib.pyplot as plt
import seaborn

In [171]:
torch.manual_seed(764)

<torch._C.Generator at 0x152b452bb750>

In [172]:
def balance_train_data(df, random_state):
  df_grp = df.groupby(["Metadata_MoA"])["Metadata_Compound"].count().reset_index(name="count")
  mean_count = int(df_grp.drop(df_grp[df_grp["Metadata_MoA"] == "DMSO"].index)["count"].mean().round())

  df_dmso = df[df["Metadata_MoA"] == "DMSO"].sample(n=mean_count, random_state=random_state)
  df_other = df.drop(df[df["Metadata_MoA"] == "DMSO"].index)

  df_all = pd.concat([df_other, df_dmso], axis=0)
  return df_all.reset_index(drop=True)

In [173]:
data_dir = "~/siads696/data"

random_state = 764
cv_splits = 5

learning_rate = 0.02
n_epochs = 14
batch_size = 64
chunk_print = 10

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# device = "cpu"
print(f"Using device: {device}")
# if device == "cuda:0":
#     torch.backends.cudnn.deterministic = True

Using device: cuda:0


In [174]:
df_data = pd.read_parquet(os.path.join(data_dir, "bbbc021_image_embed_batchcorr_256.parquet"))
# df_data = pd.read_parquet(os.path.join(data_dir, "well_grouped_256.parquet"))

print(f"Embedding vector dataset shape: {df_data.shape}")
print(f"Embedding vectors MoA assigned: {df_data[~df_data['Metadata_MoA'].isnull()].shape[0]}")

Embedding vector dataset shape: (13200, 265)
Embedding vectors MoA assigned: 6160


In [175]:
df_train_test = pd.read_csv(os.path.join(data_dir, "compound_moas_trainVtest.csv"))
df_train = df_train_test[~df_train_test["in_testset"]]
df_test = df_train_test[df_train_test["in_testset"]]

In [176]:
print(f"Train/test set shape: {df_train_test.shape}")
print(f"Training set shape: {df_train.shape}")
print(f"Test set shape: {df_test.shape}")
print(f"MoA in training set:\n {df_train['MoA'].unique().tolist()}")
print(f"MoA in test set:\n {df_test['MoA'].unique().tolist()}")
print(f"Compounds in training set:\n {df_train['Compound'].unique().tolist()}")
print(f"Compounds in test set:\n {df_test['Compound'].unique().tolist()}")

Train/test set shape: (39, 4)
Training set shape: (29, 4)
Test set shape: (10, 4)
MoA in training set:
 ['Actin disruptors', 'Aurora kinase inhibitors', 'Cholesterol-lowering', 'DMSO', 'DNA damage', 'DNA replication', 'Eg5 inhibitors', 'Epithelial', 'Kinase inhibitors', 'Microtubule destabilizers', 'Microtubule stabilizers', 'Protein degradation', 'Protein synthesis']
MoA in test set:
 ['Actin disruptors', 'Aurora kinase inhibitors', 'DNA damage', 'DNA replication', 'Epithelial', 'Kinase inhibitors', 'Microtubule destabilizers', 'Microtubule stabilizers', 'Protein degradation', 'Protein synthesis']
Compounds in training set:
 ['cytochalasin B', 'cytochalasin D', 'AZ-A', 'AZ258', 'mevinolin/lovastatin', 'simvastatin', 'DMSO', 'chlorambucil', 'cisplatin', 'etoposide', 'camptothecin', 'floxuridine', 'methotrexate', 'AZ-C', 'AZ138', 'AZ-J', 'AZ-U', 'PD-169316', 'alsterpaullone', 'colchicine', 'demecolcine', 'nocodazole', 'docetaxel', 'epothilone B', 'ALLN', 'MG-132', 'lactacystin', 'anisom

In [177]:
data_cols = [c for c in df_data.columns if c.startswith("Z")]

df_data_train = df_data.merge(df_train, left_on="Metadata_Compound", right_on="Compound", how="inner")

df_data_train = balance_train_data(df_data_train, random_state)
data_matrix = df_data_train[data_cols]

df_data_test = df_data.merge(df_test, left_on="Metadata_Compound", right_on="Compound", how="inner")
data_matrix = df_data_test[data_cols]

In [178]:
print(f"Training data shape: {df_data_train.shape}")
print(f"Training data balanced shape: {df_data_train.shape}")
print(f"Test data shape: {df_data_test.shape}")
print(f"Total embedding vectors in training/test set: {df_data_train.shape[0]+df_data_test.shape[0]}")

Training data shape: (2843, 269)
Training data balanced shape: (2843, 269)
Test data shape: (2216, 269)
Total embedding vectors in training/test set: 5059


In [179]:
moa_list = df_data[~df_data["Metadata_MoA"].isnull()].loc[:, "Metadata_MoA"].unique().tolist()
moa_dict = {moa: idx for moa, idx in zip(moa_list, range(len(moa_list)))}
n_classes = len(moa_dict.keys())

In [180]:
print(f"MoA label dictionary:\n{moa_dict}")
print(f"Number of classes: {n_classes}")

MoA label dictionary:
{'Protein degradation': 0, 'Kinase inhibitors': 1, 'Protein synthesis': 2, 'DNA replication': 3, 'DNA damage': 4, 'Microtubule destabilizers': 5, 'Actin disruptors': 6, 'Microtubule stabilizers': 7, 'Cholesterol-lowering': 8, 'Epithelial': 9, 'Eg5 inhibitors': 10, 'Aurora kinase inhibitors': 11, 'DMSO': 12}
Number of classes: 13


In [181]:
train_dataset = EmbedVec256Dataset(df_data_train, "Metadata_MoA", "PC", moa_dict)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

test_dataset = EmbedVec256Dataset(df_data_test, "Metadata_MoA", "PC", moa_dict)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [182]:
model = Net256(n_classes)

In [183]:
# using cross entropy loss for multiclass classification
loss_func = nn.CrossEntropyLoss()

# use Adam as optimizer for this NN
optimizer = torch.optim.Adam(model.parameters(), foreach=True, lr=learning_rate)

In [184]:
# print("Begin Training")

# for epoch in range(n_epochs):
#     running_loss = 0.0
#     loss, accuracy = ms2_model.train_model(model, optimizer, loss_func, train_dataloader)
#     print(f"Epoch {epoch+1}  Loss: {loss:>5f}  Accuracy: {accuracy:>5f}")
# print("Stop Training")

In [185]:
# y_label = df_data_test["Metadata_MoA"].map(moa_dict).tolist()

# avg_loss, yhat_label = ms2_model.test_model(model, loss_func, test_dataloader)

# print(avg_loss)
# print(accuracy_score(y_label, yhat_label))
# print(f1_score(y_label, yhat_label, average="weighted"))
# print(precision_score(y_label, yhat_label, average=None))
# print(recall_score(y_label, yhat_label, zero_division=np.nan, average=None))

In [186]:
kfold = KFold(n_splits=cv_splits, shuffle=True, random_state=random_state)

for fold ,(train_idx, valid_idx) in enumerate(kfold.split(np.arange(len(train_dataset)))):
    train_sampler = SubsetRandomSampler(train_idx)
    valid_sampler = SubsetRandomSampler(valid_idx)
    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, sampler=train_sampler)
    valid_dataloader = DataLoader(train_dataset, batch_size=batch_size, sampler=valid_sampler)

    model = Net256(n_classes).to(device)
    optimizer = torch.optim.Adam(model.parameters(), foreach=True, lr=learning_rate)
    loss_func = nn.CrossEntropyLoss()

    print(f"Fold {fold+1}")
    for epoch in range(n_epochs):
        train_loss, train_acc = ms2_model.train_model(model, optimizer, loss_func, train_dataloader, device)
        valid_loss, valid_acc = ms2_model.valid_model(model, loss_func, valid_dataloader, device)
        tl = train_loss / len(train_dataloader.sampler)
        ta = train_acc / len(train_dataloader.sampler)
        vl = valid_loss / len(valid_dataloader.sampler)
        va = valid_acc / len(valid_dataloader.sampler)
        print(f"Epoch {epoch+1}: Train loss: {tl:>5f}  Train accuracy: {ta:>5f}  Valid loss: {vl:>5f}  Valid accuracy: {va:>5f}")
torch.save(model, "../models/kc_nn_Net256.pt")

Fold 1
Epoch 1: Train loss: 0.982877  Train accuracy: 0.695690  Valid loss: 0.762471  Valid accuracy: 0.787346
Epoch 2: Train loss: 0.337028  Train accuracy: 0.886544  Valid loss: 0.540558  Valid accuracy: 0.831283
Epoch 3: Train loss: 0.217787  Train accuracy: 0.928320  Valid loss: 0.594312  Valid accuracy: 0.815466
Epoch 4: Train loss: 0.169937  Train accuracy: 0.943712  Valid loss: 0.619682  Valid accuracy: 0.836555
Epoch 5: Train loss: 0.107398  Train accuracy: 0.967018  Valid loss: 0.590191  Valid accuracy: 0.841828
Epoch 6: Train loss: 0.093686  Train accuracy: 0.968777  Valid loss: 0.653703  Valid accuracy: 0.855888
Epoch 7: Train loss: 0.131639  Train accuracy: 0.963940  Valid loss: 0.579206  Valid accuracy: 0.852373
Epoch 8: Train loss: 0.085720  Train accuracy: 0.969657  Valid loss: 0.753462  Valid accuracy: 0.833040
Epoch 9: Train loss: 0.054200  Train accuracy: 0.982410  Valid loss: 0.688708  Valid accuracy: 0.848858
Epoch 10: Train loss: 0.060707  Train accuracy: 0.983289 

In [187]:
trained_model = torch.load("../models/kc_nn_Net256.pt").to(device)

y_label = df_data_test["Metadata_MoA"].map(moa_dict).tolist()
test_loss, yhat_label = ms2_model.test_model(trained_model, loss_func, test_dataloader, device)

print(test_loss / len(test_dataloader.dataset))
print(accuracy_score(y_label, yhat_label))

2.2941183206164664
0.6773465703971119
