In [31]:
import json
import tensorflow as tf
from tensorflow.keras import Input
import numpy as np
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Input, Dense, Concatenate, Dropout, BatchNormalization, LSTM
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import LearningRateScheduler, EarlyStopping
from featurizers import TextCNNFeaturizer

# Configure TensorFlow GPU usage: request on-demand memory allocation on
# EVERY visible GPU. TensorFlow requires the memory-growth setting to be the
# same across GPUs, so enabling it on the first device only fails on
# multi-GPU machines.
physical_devices = tf.config.list_physical_devices('GPU')
for gpu in physical_devices:
    try:
        tf.config.experimental.set_memory_growth(gpu, True)
    except RuntimeError as err:
        # Raised when the runtime is already initialised, e.g. when this cell
        # is re-run in a live kernel; report it instead of aborting the cell.
        print(f"Could not enable memory growth on {gpu.name}: {err}")
print("Num GPUs Available: ", len(physical_devices))

Num GPUs Available:  1


#### Datasets

In [32]:
# Drug table: the second CSV column becomes the index and the leftover
# 'Unnamed: 0' column is discarded.
drugs = pd.read_csv('../data/drugs.csv', index_col=1).drop(columns=['Unnamed: 0'])

# Cell-line table, indexed by its first CSV column.
cell_lines = pd.read_csv('../data/gene_expression.csv', index_col=0)

# Drug / cell-line combinations, indexed by the first CSV column.
combinations = pd.read_csv('../data/combinations.csv', index_col=0)

In [33]:
# Restrict the cell-line table to the columns named in the pickled gene
# selection (presumably 2128 genes, judging by the file name).
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open('../data/2128_genes.pkl', 'rb') as gene_file:
    top_genes = pickle.load(gene_file)

selected_columns = cell_lines.columns.intersection(top_genes)
filtered_cell_lines = cell_lines[selected_columns]

In [34]:
# Display the combinations table (columns: Drug_ID, Cell Line_ID, Y).
combinations

Unnamed: 0,Drug_ID,Cell Line_ID,Y
0,Camptothecin,HCC1954,-0.251083
1,Camptothecin,HCC1143,1.343315
2,Camptothecin,HCC1187,1.736985
3,Camptothecin,HCC1395,-2.309078
4,Camptothecin,HCC1599,-3.106684
...,...,...,...
92698,JQ1,EFM-192A,3.576583
92699,JQ1,HCC1428,1.402466
92700,JQ1,HDQ-P1,2.762460
92701,JQ1,JIMT-1,3.442930


### Merge data

In [35]:
# Length of the longest string in the 'Drug' column (0 if the column is empty).
max_drug = max((len(drug_string) for drug_string in drugs['Drug']), default=0)
print(max_drug)

173


In [36]:
# DEFAULT_CHAR_DICT_STR = {'#': 1, '(': 2, ')': 3, '+': 4, '-': 5, '/': 6, '1': 7, '2': 8, '3': 9,
#                              '4': 10, '5': 11, '6': 12, '7': 13, '8': 14, '=': 15, 'C': 16, 'F': 17,
#                              'H': 18, 'I': 19, 'N': 20, 'O': 21, 'P': 22, 'S': 23, '[': 24, '\\': 25,
#                              ']': 26, '_': 27, 'c': 28, 'Cl': 29, 'Br': 30, 'n': 31, 'o': 32, 's': 33,
#                              '.': 34, 'Pt': 35, '@':36}



# featurizer = TextCNNFeaturizer(char_dict=DEFAULT_CHAR_DICT_STR, seq_length=180)
# featurizer.featurize_molecule('CC1=C(SC2=C1C(=N[C@H](C3=NN=C(N32)C)')

In [37]:
# Attach the drug columns to every combination row (matched on 'Drug_ID'),
# then attach the filtered cell-line columns (matched on 'Cell Line_ID'
# against the cell-line index). Both joins are inner joins.
combinations_with_drugs = combinations.merge(drugs, on='Drug_ID')
final_df = combinations_with_drugs.merge(
    filtered_cell_lines,
    left_on='Cell Line_ID',
    right_index=True,
    how='inner',
)

# Label each row as "<Drug_ID>_<Cell Line_ID>".
final_df.index = final_df['Drug_ID'] + '_' + final_df['Cell Line_ID']
final_df

Unnamed: 0,Drug_ID,Cell Line_ID,Y,Drug,FUCA2,GCLC,STPG1,CYP26B1,NDUFAB1,ABCB5,...,ATP6V1E2,ZNF345,ATXN7L3B,PRKDC,PBOV1,OR13A1,ZNF253,MRPL46,OR1D5,MYH4
Camptothecin_HCC1954,Camptothecin,HCC1954,-0.251083,CC[C@@]1(C2=C(COC1=O)C(=O)N3CC4=CC5=CC=CC=C5N=...,7.553067,9.280913,3.372573,3.777488,10.985991,2.902166,...,5.120570,4.638312,5.598942,5.947629,3.032667,3.229141,4.110379,8.629097,3.228033,2.632448
Vinblastine_HCC1954,Vinblastine,HCC1954,-3.058914,CC[C@@]1(CC2C[C@@](C3=C(CCN(C2)C1)C4=CC=CC=C4N...,7.553067,9.280913,3.372573,3.777488,10.985991,2.902166,...,5.120570,4.638312,5.598942,5.947629,3.032667,3.229141,4.110379,8.629097,3.228033,2.632448
Cisplatin_HCC1954,Cisplatin,HCC1954,5.005908,N.N.[Cl-].[Cl-].[Pt+2],7.553067,9.280913,3.372573,3.777488,10.985991,2.902166,...,5.120570,4.638312,5.598942,5.947629,3.032667,3.229141,4.110379,8.629097,3.228033,2.632448
Cytarabine_HCC1954,Cytarabine,HCC1954,3.947056,C1=CN(C(=O)N=C1N)[C@H]2[C@H]([C@@H]([C@H](O2)C...,7.553067,9.280913,3.372573,3.777488,10.985991,2.902166,...,5.120570,4.638312,5.598942,5.947629,3.032667,3.229141,4.110379,8.629097,3.228033,2.632448
Docetaxel_HCC1954,Docetaxel,HCC1954,-4.177968,CC1=C2[C@H](C(=O)[C@@]3([C@H](C[C@@H]4[C@]([C@...,7.553067,9.280913,3.372573,3.777488,10.985991,2.902166,...,5.120570,4.638312,5.598942,5.947629,3.032667,3.229141,4.110379,8.629097,3.228033,2.632448
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
MK-2206_HCC202,MK-2206,HCC202,1.585069,C1CC(C1)(C2=CC=C(C=C2)C3=C(C=C4C(=N3)C=CN5C4=N...,7.069704,6.405211,3.458080,3.007654,11.139008,3.040279,...,5.509141,4.717170,6.264891,5.572891,2.870909,3.351663,4.704318,8.333207,3.714531,2.647746
Palbociclib_HCC202,Palbociclib,HCC202,4.867165,CC1=C(C(=O)N(C2=NC(=NC=C12)NC3=NC=C(C=C3)N4CCN...,7.069704,6.405211,3.458080,3.007654,11.139008,3.040279,...,5.509141,4.717170,6.264891,5.572891,2.870909,3.351663,4.704318,8.333207,3.714531,2.647746
Pictilisib_HCC202,Pictilisib,HCC202,0.583059,CS(=O)(=O)N1CCN(CC1)CC2=CC3=C(S2)C(=NC(=N3)C4=...,7.069704,6.405211,3.458080,3.007654,11.139008,3.040279,...,5.509141,4.717170,6.264891,5.572891,2.870909,3.351663,4.704318,8.333207,3.714531,2.647746
5-Fluorouracil_HCC202,5-Fluorouracil,HCC202,5.998151,C1=C(C(=O)NC(=O)N1)F,7.069704,6.405211,3.458080,3.007654,11.139008,3.040279,...,5.509141,4.717170,6.264891,5.572891,2.870909,3.351663,4.704318,8.333207,3.714531,2.647746


### Dataset splitting ###

In [38]:
# Target: the 'Y' column. Features: everything except the identifiers and the target.
# NOTE(review): `input` shadows the Python builtin; the name is kept because
# the following cell reads it.
output = final_df['Y']
input = final_df.drop(columns=['Drug_ID', 'Cell Line_ID', 'Y'])

In [39]:
# 60 / 20 / 20 split: hold out 40% of the rows, then cut that hold-out in half
# to obtain the validation and test sets. `train_test_split` is already
# imported in the notebook's first cell, so it is not re-imported here.
# The hold-out gets its own names so that x_val / y_val only ever refer to the
# final validation set.
x_train, x_holdout, y_train, y_holdout = train_test_split(
    input, output, test_size=0.4, random_state=42
)
x_val, x_test, y_val, y_test = train_test_split(
    x_holdout, y_holdout, test_size=0.5, random_state=42
)

In [40]:
def _split_drug_and_cell_line(features):
    """Return the 'Drug' column and all columns positioned after it.

    Parameters
    ----------
    features : DataFrame containing a 'Drug' column.

    Returns
    -------
    tuple of (Series, DataFrame): the 'Drug' column, and every column located
    to the right of 'Drug' in ``features``.
    """
    first_cell_line_pos = features.columns.get_loc('Drug') + 1
    return features['Drug'], features.iloc[:, first_cell_line_pos:]


x_train_drugs, x_train_cell_lines = _split_drug_and_cell_line(x_train)
x_val_drugs, x_val_cell_lines = _split_drug_and_cell_line(x_val)
x_test_drugs, x_test_cell_lines = _split_drug_and_cell_line(x_test)

In [41]:
# Token -> integer id table handed to the featurizer.
DEFAULT_CHAR_DICT_STR = {
    '#': 1, '(': 2, ')': 3, '+': 4, '-': 5, '/': 6,
    '1': 7, '2': 8, '3': 9, '4': 10, '5': 11, '6': 12, '7': 13, '8': 14,
    '=': 15, 'C': 16, 'F': 17, 'H': 18, 'I': 19, 'N': 20, 'O': 21, 'P': 22,
    'S': 23, '[': 24, '\\': 25, ']': 26, '_': 27, 'c': 28, 'Cl': 29, 'Br': 30,
    'n': 31, 'o': 32, 's': 33, '.': 34, 'Pt': 35, '@': 36,
    'B': 37, 'r': 38, 'l': 39, 'a': 40, 'i': 41, '9': 42,
}

featurizer = TextCNNFeaturizer(char_dict=DEFAULT_CHAR_DICT_STR, seq_length=180)

# Run the featurizer on every drug string of each split.
encode_drug = featurizer.featurize_molecule
x_train_drugs = x_train_drugs.apply(encode_drug)
x_val_drugs = x_val_drugs.apply(encode_drug)
x_test_drugs = x_test_drugs.apply(encode_drug)

In [42]:
def _tokens_to_frame(encoded):
    """Spread a Series of featurized drugs into one column per element.

    Columns are named 'token_<original column label>'.
    NOTE(review): the returned frame gets a fresh default index, so the
    original row labels of ``encoded`` are not carried over.
    """
    frame = pd.DataFrame(encoded.tolist())
    frame.columns = [f'token_{label}' for label in frame.columns]
    return frame


x_train_drugs = _tokens_to_frame(x_train_drugs)
x_val_drugs = _tokens_to_frame(x_val_drugs)
x_test_drugs = _tokens_to_frame(x_test_drugs)


### Drug CNN model

In [46]:
from models import DrugCNNModel


# Build the model from the number of cell-line columns and the number of drug
# token columns, train it, score it on the validation split and predict on
# the test split.
# NOTE(review): the recorded run of this cell fails inside a Conv1D layer whose
# input has a zero-length sequence axis (shape (None, 0, 75)); confirm the
# argument order and shapes DrugCNNModel expects.
model = DrugCNNModel(x_train_cell_lines.shape[1], x_train_drugs.shape[1], optimizer='SGD')
print(model.summary())
model.train(x_train_cell_lines, x_train_drugs, y_train, epochs=10, batch_size=16)

# These metrics are computed on the validation split, so they are labelled as
# validation (the labels previously said "Test").
val_loss, val_mae = model.evaluate(x_val_cell_lines, x_val_drugs, y_val)
print(f'Validation Loss: {val_loss}, Validation MAE: {val_mae}')

predictions = model.predict(x_test_cell_lines, x_test_drugs)
print(predictions)

ValueError: Exception encountered when calling layer "conv1d_4" (type Conv1D).

Negative dimension size caused by subtracting 1 from 0 for '{{node conv1d_4/Conv1D}} = Conv2D[T=DT_FLOAT, data_format="NHWC", dilations=[1, 1, 1, 1], explicit_paddings=[], padding="VALID", strides=[1, 1, 1, 1], use_cudnn_on_gpu=true](conv1d_4/Conv1D/ExpandDims, conv1d_4/Conv1D/ExpandDims_1)' with input shapes: [?,1,0,75], [1,1,75,100].

Call arguments received by layer "conv1d_4" (type Conv1D):
  • inputs=tf.Tensor(shape=(None, 0, 75), dtype=float32)