In [1025]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# re-imported before each cell executes (e.g. the ms2_model / ms2_dataset
# modules imported below can be edited without restarting the kernel).
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [1136]:
import os
import pandas as pd
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F

from torch.utils.data import DataLoader, Dataset

from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, precision_score, recall_score, roc_auc_score
from sklearn.metrics import RocCurveDisplay
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold

# import util_unsupervised as util_u
import ms2_model
from ms2_model import Net256
from ms2_dataset import EmbedVec256Dataset

import matplotlib.pyplot as plt
import seaborn

In [1137]:
# Fix the seed of PyTorch's default random generator so torch-driven randomness is repeatable between runs.
torch.manual_seed(seed=764)

<torch._C.Generator at 0x1512ec633250>

In [1138]:
def balance_train_data(df, random_state):
  """Downsample the DMSO rows to the average size of the other MoA classes.

  The class size is the number of rows with a non-null "Metadata_Compound"
  per "Metadata_MoA" value. The mean of those sizes over every class except
  "DMSO" (rounded to the nearest integer) is the number of DMSO rows kept.

  Args:
    df: DataFrame with at least "Metadata_MoA" and "Metadata_Compound" columns.
    random_state: Seed forwarded to DataFrame.sample for the DMSO draw.

  Returns:
    A new DataFrame with all non-DMSO rows first, followed by the sampled
    DMSO rows, re-indexed from 0.
  """
  class_counts = df.groupby("Metadata_MoA")["Metadata_Compound"].count()
  other_counts = class_counts.drop("DMSO", errors="ignore")

  is_dmso = df["Metadata_MoA"] == "DMSO"
  df_dmso = df[is_dmso]
  df_other = df[~is_dmso]

  if other_counts.empty:
    # Nothing to balance against (the mean would be NaN): keep every DMSO row.
    n_dmso = len(df_dmso)
  else:
    target = int(round(float(other_counts.mean())))
    # Sampling without replacement cannot return more rows than exist, so
    # clamp to the available DMSO rows instead of raising ValueError.
    n_dmso = min(target, len(df_dmso))

  df_dmso = df_dmso.sample(n=n_dmso, random_state=random_state)

  df_all = pd.concat([df_other, df_dmso], axis=0)
  return df_all.reset_index(drop=True)

In [1139]:
# Root folder for all input files. "~" is expanded here so the path also works
# with consumers that do not expand it themselves (os.path.exists, open, ...).
data_dir = os.path.expanduser("~/siads696/data")

random_state = 764  # seed passed to balance_train_data for the DMSO sampling
n_classes = 13 # number of MoA

# Training hyperparameters.
learning_rate = 0.01
n_epochs = 12
batch_size = 16
chunk_print = 10

# Prefer a GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cpu


In [1140]:
# Alternative input file, kept for reference:
# df_data = pd.read_parquet(os.path.join(data_dir, "bbbc021_image_embed_batchcorr_256.parquet"))
embedding_path = os.path.join(data_dir, "well_grouped_256.parquet")
df_data = pd.read_parquet(embedding_path)
print(f"Embedding vector dataset shape: {df_data.shape}")

# Count the rows whose MoA label is present (non-null).
n_moa_assigned = int((~df_data["Metadata_MoA"].isnull()).sum())
print(f"Embedding vectors MoA assigned: {n_moa_assigned}")

Embedding vector dataset shape: (3300, 261)
Embedding vectors MoA assigned: 1540


In [1141]:
# Split table with one row per compound; the "in_testset" column decides
# whether a row belongs to the test set or the training set.
split_csv = os.path.join(data_dir, "compound_moas_trainVtest.csv")
df_train_test = pd.read_csv(split_csv)

in_test = df_train_test["in_testset"]
df_train, df_test = df_train_test[~in_test], df_train_test[in_test]

# Shapes first, then the distinct MoAs and compounds on each side of the split.
print(f"Train/test set shape: {df_train_test.shape}")
print(f"Training set shape: {df_train.shape}")
print(f"Test set shape: {df_test.shape}")
print(f"MoA in training set:\n {df_train['MoA'].unique().tolist()}")
print(f"MoA in test set:\n {df_test['MoA'].unique().tolist()}")
print(f"Compounds in training set:\n {df_train['Compound'].unique().tolist()}")
print(f"Compounds in test set:\n {df_test['Compound'].unique().tolist()}")

Train/test set shape: (39, 4)
Training set shape: (29, 4)
Test set shape: (10, 4)
MoA in training set:
 ['Actin disruptors', 'Aurora kinase inhibitors', 'Cholesterol-lowering', 'DMSO', 'DNA damage', 'DNA replication', 'Eg5 inhibitors', 'Epithelial', 'Kinase inhibitors', 'Microtubule destabilizers', 'Microtubule stabilizers', 'Protein degradation', 'Protein synthesis']
MoA in test set:
 ['Actin disruptors', 'Aurora kinase inhibitors', 'DNA damage', 'DNA replication', 'Epithelial', 'Kinase inhibitors', 'Microtubule destabilizers', 'Microtubule stabilizers', 'Protein degradation', 'Protein synthesis']
Compounds in training set:
 ['cytochalasin B', 'cytochalasin D', 'AZ-A', 'AZ258', 'mevinolin/lovastatin', 'simvastatin', 'DMSO', 'chlorambucil', 'cisplatin', 'etoposide', 'camptothecin', 'floxuridine', 'methotrexate', 'AZ-C', 'AZ138', 'AZ-J', 'AZ-U', 'PD-169316', 'alsterpaullone', 'colchicine', 'demecolcine', 'nocodazole', 'docetaxel', 'epothilone B', 'ALLN', 'MG-132', 'lactacystin', 'anisom

In [1142]:
# Feature columns are picked by their "Z" name prefix.
# NOTE(review): the dataset cell passes the prefix "PC" to EmbedVec256Dataset —
# confirm which prefix is the intended one.
data_cols = [c for c in df_data.columns if c.startswith("Z")]

# Training rows: embeddings whose compound is listed in the training split,
# with DMSO downsampled by balance_train_data.
df_data_train = df_data.merge(df_train, left_on="Metadata_Compound", right_on="Compound", how="inner")
print(f"Training data shape: {df_data_train.shape}")

df_data_train = balance_train_data(df_data_train, random_state)
data_matrix_train = df_data_train[data_cols]
print(f"Training data balanced shape: {df_data_train.shape}")

# Test rows: embeddings whose compound is listed in the test split (not rebalanced).
df_data_test = df_data.merge(df_test, left_on="Metadata_Compound", right_on="Compound", how="inner")
data_matrix_test = df_data_test[data_cols]
# The train matrix used to be stored under `data_matrix` and then immediately
# overwritten by the test matrix; both now have their own name, and
# `data_matrix` keeps the value it previously ended up with.
data_matrix = data_matrix_test
print(f"Test data shape: {df_data_test.shape}")

print(f"Total embedding vectors in training/test set: {df_data_train.shape[0]+df_data_test.shape[0]}")

Training data shape: (986, 265)
Training data balanced shape: (711, 265)
Test data shape: (554, 265)
Total embedding vectors in training/test set: 1265


In [1143]:
# Distinct non-null MoA names, then a name -> integer class index lookup
# numbered in list order.
moa_column = df_data["Metadata_MoA"]
moa_list = moa_column[~moa_column.isnull()].unique().tolist()
moa_dict = {moa: idx for idx, moa in enumerate(moa_list)}
print(f"MoA label dictionary:\n{moa_dict}")

MoA label dictionary:
{'Protein degradation': 0, 'Aurora kinase inhibitors': 1, 'Eg5 inhibitors': 2, 'Epithelial': 3, 'DMSO': 4, 'Kinase inhibitors': 5, 'Protein synthesis': 6, 'DNA replication': 7, 'DNA damage': 8, 'Microtubule destabilizers': 9, 'Actin disruptors': 10, 'Microtubule stabilizers': 11, 'Cholesterol-lowering': 12}


In [1144]:
# Training batches are reshuffled on every pass over the loader.
train_dataset = EmbedVec256Dataset(df_data_train, "Metadata_MoA", "PC", moa_dict)
train_dataloader = DataLoader(
    dataset=train_dataset,
    shuffle=True,
    batch_size=batch_size,
)

# Test batches keep the dataset's own order.
test_dataset = EmbedVec256Dataset(df_data_test, "Metadata_MoA", "PC", moa_dict)
test_dataloader = DataLoader(
    dataset=test_dataset,
    shuffle=False,
    batch_size=batch_size,
)

In [1145]:
# Instantiate the project's Net256 network; n_classes is its only constructor
# argument (presumably the number of output classes — confirm in ms2_model).
# NOTE(review): `device` is chosen in the config cell but the model is never
# moved to it in this notebook — confirm whether ms2_model handles placement.
model = Net256(n_classes)
# print(model)

In [1146]:
# Multiclass classification objective: cross-entropy loss.
loss_func = nn.CrossEntropyLoss()

# Adam optimises all model parameters at the configured learning rate;
# foreach=True is passed through to the optimizer explicitly.
optimizer = torch.optim.Adam(
    params=model.parameters(),
    lr=learning_rate,
    foreach=True,
)

In [1147]:
print("Begin Training")

# Run ms2_model.train_model once per epoch over the training loader. The
# previously initialised `running_loss` variable was never read and has been
# dropped; the per-batch "Loss: ..." lines in the output presumably come from
# train_model itself.
for epoch in range(n_epochs):
    print(f"Epoch: {epoch}")
    ms2_model.train_model(model, optimizer, loss_func, train_dataloader)

print("Stop Training")

Begin Training
Epoch: 0
Loss: 2.586987  [   16/  711]
Loss: 1.724248  [  176/  711]
Loss: 1.847078  [  336/  711]
Loss: 1.067451  [  496/  711]
Loss: 0.702649  [  656/  711]
Epoch: 1
Loss: 1.087100  [   16/  711]
Loss: 1.028741  [  176/  711]
Loss: 0.616889  [  336/  711]
Loss: 0.643309  [  496/  711]
Loss: 0.463142  [  656/  711]
Epoch: 2
Loss: 0.230883  [   16/  711]
Loss: 0.959985  [  176/  711]
Loss: 0.806252  [  336/  711]
Loss: 0.471388  [  496/  711]
Loss: 0.368740  [  656/  711]
Epoch: 3
Loss: 0.475975  [   16/  711]
Loss: 0.454128  [  176/  711]
Loss: 0.263172  [  336/  711]
Loss: 0.303269  [  496/  711]
Loss: 0.253100  [  656/  711]
Epoch: 4
Loss: 0.242493  [   16/  711]
Loss: 0.285210  [  176/  711]
Loss: 0.344818  [  336/  711]
Loss: 0.089949  [  496/  711]
Loss: 0.393543  [  656/  711]
Epoch: 5
Loss: 0.107519  [   16/  711]
Loss: 0.059222  [  176/  711]
Loss: 0.121427  [  336/  711]
Loss: 0.138285  [  496/  711]
Loss: 0.445281  [  656/  711]
Epoch: 6
Loss: 0.175777  [   16

In [1149]:
# Ground-truth class indices for the test rows, built from the same label
# column and mapping that were handed to EmbedVec256Dataset. No visible cell
# defined `y_label`, so this cell failed with NameError on a fresh kernel.
# NOTE(review): assumes the dataset yields rows in df_data_test order (the test
# loader uses shuffle=False) — confirm against ms2_dataset.
y_label = df_data_test["Metadata_MoA"].map(moa_dict).to_numpy()

avg_loss, yhat_label = ms2_model.test_model(model, loss_func, test_dataloader)
# Guard against the labels and predictions getting out of step.
assert len(yhat_label) == len(y_label), "prediction/label count mismatch"

# Average test loss, then overall accuracy, weighted F1, and per-class
# precision / recall (recall is NaN for classes absent from the test labels).
print(avg_loss)
print(accuracy_score(y_label, yhat_label))
print(f1_score(y_label, yhat_label, average="weighted"))
print(precision_score(y_label, yhat_label, average=None))
print(recall_score(y_label, yhat_label, zero_division=np.nan, average=None))

1.0575453713384506
0.8285198555956679
0.8610751759323593
[0.85       1.         0.         0.38461538 0.         0.9
 0.76923077 0.80952381 0.88888889 0.84615385 0.34285714 0.98176292
 0.        ]
[0.70833333 0.75              nan 0.3125            nan 0.5625
 0.83333333 0.70833333 0.66666667 0.91666667 0.5        0.91242938
        nan]
