In [1]:
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import keras
from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from tensorflow.python.keras.models import Sequential
from tensorflow.python.keras.layers import Dense, Dropout
from keras.optimizers import adam
import sklearn.preprocessing as sk
import random
from tensorflow.python.keras import backend as K
from sklearn.preprocessing import Imputer
import tensorflow as tf
from keras.callbacks import EarlyStopping, ModelCheckpoint

Using TensorFlow backend.


In [4]:
# Create the TensorFlow session with GPU option `allow_growth` switched on
# and register it as the session used by the Keras backend.
config = tf.ConfigProto(gpu_options=tf.GPUOptions(allow_growth=True))
sess = tf.Session(config=config)
K.set_session(sess)

In [2]:
# Expression matrices: each TSV is read with its first column as the row index
# and then transposed, so the file's row labels become the columns. When a
# column label occurs more than once only its first occurrence is kept.
#
# NOTE: this cell used to call `drop_duplicates(keep='last')` on each frame,
# but the returned copy was discarded, so the call never altered the data.
# It is omitted here; the label-based de-duplication below is unchanged.
GDSCE = pd.read_csv("../GDSCandCCLE/GDSC.tsv", sep="\t", index_col=0, decimal=".").T
GDSCE = GDSCE.loc[:, ~GDSCE.columns.duplicated()]
GDSCE.index = GDSCE.index.astype('int64')

CCLEE = pd.read_csv("../GDSCandCCLE/CCLE.tsv", sep="\t", index_col=0, decimal=".").T
CCLEE = CCLEE.loc[:, ~CCLEE.columns.duplicated()]

# Keep only the columns present in both datasets AND in the index of the
# landmark-gene file.
ls = GDSCE.columns.intersection(CCLEE.columns)
land_ls = pd.read_csv("../GDSCandCCLE/landmark_genes.tsv",
                      sep="\t", index_col=0, decimal=".")
ls4 = ls.intersection(land_ls.index)
GDSCEv2 = GDSCE.loc[:, ls4]
CCLEEv2 = CCLEE.loc[:, ls4]

def _mean_ic50_table(path):
    """Build a cell-line x drug table of mean IC50 values from a response TSV.

    The first line of the file is skipped as a header. In every other line,
    field 1 is used as the cell-line key, field 4 as the drug key and the
    fifth field from the end as the IC50 value. Lines whose IC50 field is
    empty contribute nothing to the average.

    Every cell line is paired with every drug seen anywhere in the file; a
    pair without any IC50 value gets the string 'NaN' (kept as a string so the
    table and the CSV written from it match what this cell always produced).

    Parameters
    ----------
    path : str
        Location of the tab-separated response file.

    Returns
    -------
    pandas.DataFrame
        One row per cell line, one column per drug.
    """
    # `with` guarantees the handle is closed; the file is read once instead of
    # being re-opened for a second pass.
    with open(path) as handle:
        # NOTE(review): rstrip() also strips trailing tabs, so a line ending in
        # empty fields shifts the [-5] lookup. Kept exactly as before; confirm
        # against the file layout before changing it.
        rows = [raw.rstrip().split('\t') for raw in handle.readlines()[1:]]

    cellline_keys = {row[1] for row in rows}
    drug_keys = {row[4] for row in rows}

    # [running sum, number of values] for every (cell line, drug) pair.
    totals = {cl: {d: [0, 0] for d in drug_keys} for cl in cellline_keys}
    for row in rows:
        if row[-5] != '':
            acc = totals[row[1]][row[4]]
            acc[0] += float(row[-5])
            acc[1] += 1

    means = {
        cl: {d: (total / count if count > 0 else 'NaN')
             for d, (total, count) in per_drug.items()}
        for cl, per_drug in totals.items()
    }
    return pd.DataFrame.from_dict(means).transpose()


CCLER = _mean_ic50_table('../GDSCandCCLE/CCLE_CTRPv2.shared_with_GDSC.PharmacoDB.responses.tsv')
CCLER.to_csv('../GDSCandCCLE/CCLER.csv', sep=',', decimal=".")

GDSCR = _mean_ic50_table('../GDSCandCCLE/GDSC.shared_with_CCLE_CTRPv2.PharmacoDB.responses.tsv')
GDSCR.to_csv('../GDSCandCCLE/GDSCR.csv', sep=',', decimal=".")

# Give the GDSC response table an int64 row index before intersecting it with
# the expression table's index.
GDSCR.index = GDSCR.index.astype('int64')

# Row labels that have both an expression profile and a response profile.
ls2 = GDSCEv2.index.intersection(GDSCR.index)
ls3 = CCLEEv2.index.intersection(CCLER.index)

# Same rows, same order, in the expression and response tables of each dataset.
GDSCEv3, GDSCRv2 = GDSCEv2.loc[ls2], GDSCR.loc[ls2]
CCLEEv3, CCLERv2 = CCLEEv2.loc[ls3], CCLER.loc[ls3]

# Mask matrices 1 and 2: read with the first column as row index, then keep
# only the first occurrence of every duplicated row label and column label.
#
# NOTE: Mask1 used to be followed by `Mask1.drop_duplicates(keep='last')`
# whose result was discarded, so it had no effect (and Mask2 never had the
# call). It is omitted so both masks are visibly treated the same way.
Mask1 = pd.read_csv("../GDSCandCCLE/Masks/M1New.csv",
                    sep=",", index_col=0, decimal=".")
Mask1 = Mask1.loc[~Mask1.index.duplicated(), :]
Mask1 = Mask1.loc[:, ~Mask1.columns.duplicated()]

Mask2 = pd.read_csv("../GDSCandCCLE/Masks/M2New.csv",
                    sep=",", index_col=0, decimal=".")
Mask2 = Mask2.loc[~Mask2.index.duplicated(), :]
Mask2 = Mask2.loc[:, ~Mask2.columns.duplicated()]

# Restrict the expression columns and the rows of Mask1 to the labels they
# have in common, so the two line up one-to-one.
lsC = GDSCEv3.columns.intersection(Mask1.index)
GDSCEv4 = GDSCEv3.loc[:, lsC]
CCLEEv4 = CCLEEv3.loc[:, lsC]
Mask1 = Mask1.loc[lsC, :]

# Response columns to model, and the column labels used to slice Mask3.
drugs = [
    'ATRA', 'Doxorubicin', 'Tamoxifen', 'Gefitinib', 'BIBW2992', 'masitinib',
    '17-AAG', 'GDC-0941', 'MK-2206', 'AZD6244', 'PLX4720',
]
drugIDs = [
    '15367', '28748', '41774', '49668', '61390', '63450',
    '64153', '65326', '67271', '90227', '90295',
]
GDSCRv3 = GDSCRv2[drugs]
CCLERv3 = CCLERv2[drugs]

# Column-wise mean imputation of missing responses; each dataset gets its own
# imputer fitted on its own values.
_gdsc_response_values = GDSCRv3[drugs].values
imputer1 = Imputer(missing_values='NaN', strategy='mean', axis=0).fit(_gdsc_response_values)
GDSCRv4 = imputer1.transform(_gdsc_response_values)

_ccle_response_values = CCLERv3[drugs].values
imputer2 = Imputer(missing_values='NaN', strategy='mean', axis=0).fit(_ccle_response_values)
CCLERv4 = imputer2.transform(_ccle_response_values)

# Chain the masks together: Mask1's columns must match Mask2's rows, and
# Mask2's columns must match Mask3's rows.
lsP = Mask1.columns.intersection(Mask2.index)
Mask1, Mask2 = Mask1.loc[:, lsP], Mask2.loc[lsP]

Mask3 = pd.read_csv("../GDSCandCCLE/Masks/M3New.csv",
                    sep=",", index_col=0, decimal=".")
lsD = Mask2.columns.intersection(Mask3.index)
Mask2 = Mask2.loc[:, lsD]
# Mask3 keeps only the columns listed in drugIDs.
Mask3 = Mask3.loc[lsD, drugIDs]

# Element-wise complement (1 - value) of each mask as a plain array.
Mask1v2, Mask2v2, Mask3v2 = (1 - mask.values for mask in (Mask1, Mask2, Mask3))





In [3]:
# Hyperparameters
mbs, epoch = 64, 150          # batch_size and epochs passed to model.fit
hdm1, hdm2 = 256, 64          # units of the first / second Dense layer
rate1, rate2 = 0.03, 0.05     # rate of the first / second Dropout layer


In [4]:
# Train GDSC, test CCLE
X_train, y_train = GDSCEv4.values, GDSCRv4
X_test, y_test = CCLEEv4.values, CCLERv4

In [None]:
# Train CCLE, test GDSC
X_train = CCLEEv4.values
X_test = GDSCEv4.values
y_train = CCLERv4
# Fixed: this line used to assign `y_train` a second time, which paired CCLE
# features with GDSC labels and left `y_test` untouched.
y_test = GDSCRv4

In [5]:
def create_model():
    """Build and compile the feed-forward regression network.

    Reads module-level state: `X_train` (its column count is the input width),
    `hdm1`/`hdm2` (hidden layer sizes), `rate1`/`rate2` (dropout rates) and
    `drugs` (one linear output per entry). Prints the model summary as a side
    effect.

    Returns the compiled model (MSE loss, 'adam' optimizer, MSE and MAE metrics).
    """
    n_features = X_train.shape[1]
    n_outputs = len(drugs)

    layers = [
        Dense(hdm1, input_dim=n_features, kernel_initializer='normal', activation='relu'),
        Dropout(rate1),
        Dense(hdm2, activation='relu'),
        Dropout(rate2),
        Dense(n_outputs, activation='linear'),
    ]

    model = Sequential()
    for layer in layers:
        model.add(layer)

    model.summary()
    model.compile(loss='mse', optimizer='adam', metrics=['mse', 'mae'])
    return model

In [6]:
def train_model(X_train, y_train, X_test, y_test, model_file):
    """Train a fresh model from `create_model()` and return it.

    Parameters
    ----------
    X_train, y_train : array-like
        Training features and targets, passed to `model.fit` as given.
    X_test, y_test : array-like
        Held-out features and targets, used as validation data so that
        `val_loss` exists for the callbacks below.
    model_file : str
        Path where the checkpoint callback writes the model whenever
        `val_loss` improves.

    Returns
    -------
    The fitted model. It holds the weights of the last epoch that ran; the
    best-`val_loss` weights are the ones saved to `model_file`.
    """
    model = create_model()

    # Both callbacks watch `val_loss`: stop after 20 epochs without
    # improvement, and keep only the best model on disk.
    early_stopping = EarlyStopping(monitor='val_loss', patience=20, verbose=1, mode='auto')
    mc = ModelCheckpoint(filepath=model_file, monitor='val_loss', mode='auto', verbose=1, save_best_only=True)

    # NOTE(review): this function used to fit a StandardScaler on X_train and
    # transform both inputs, but the scaled arrays were never used; the model
    # was always fitted on the raw features. The dead code is removed and the
    # raw-feature behaviour kept, because callers evaluate the returned model
    # on raw inputs. If standardisation is wanted, apply it in the caller so
    # training and evaluation see the same transform.

    # Fixed: without validation data `val_loss` is never computed, so early
    # stopping and checkpointing silently did nothing.
    # NOTE(review): the held-out set therefore also drives model selection;
    # use a separate validation split if an unbiased test score is required.
    model.fit(X_train, y_train,
              validation_data=(X_test, y_test),
              epochs=epoch, batch_size=mbs, verbose=1,
              callbacks=[early_stopping, mc])
    return model

In [None]:
# Fixed: keep the returned model; it was discarded before, leaving `model`
# undefined for the evaluate calls below.
model = train_model(X_train, y_train, X_test, y_test, model_file='best_model_1.h5')

# The model is compiled with metrics, so evaluate() returns [loss, mse, mae];
# the loss is MSE, so the first entry is the value to report.
test_score = model.evaluate(X_test, y_test, verbose=2)[0]
print('Test MSE / RMSE = %.3f/%.3f' % (test_score, np.sqrt(test_score)))

train_score = model.evaluate(X_train, y_train, verbose=2)[0]
print('Train MSE / RMSE = %.3f / %.3f' % (train_score, np.sqrt(train_score)))

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 256)               155648    
_________________________________________________________________
dropout (Dropout)            (None, 256)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 64)                16448     
_________________________________________________________________
dropout_1 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 11)                715       
Total params: 172,811
Trainable params: 172,811
Non-trainable params: 0
_________________________________________________________________
Epoch 1/150


In [None]:
# 10-fold cross-validation on GDSC: train on nine folds, score on the tenth,
# and save each fold's best checkpoint to its own file.
X = GDSCEv4.values
Y = GDSCRv4
for fold, (train_index, test_index) in enumerate(KFold(10).split(X), start=1):
    # Kept as module-level X_train/X_test because create_model() reads the
    # global X_train for its input width.
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = Y[train_index], Y[test_index]

    # Fixed: `fold` is an int, so it must be formatted into the file name
    # (str + int raised TypeError); the returned model is now kept.
    model = train_model(X_train, y_train, X_test, y_test,
                        model_file='best_model_CV%d.h5' % fold)

    # evaluate() returns [loss, mse, mae]; report the loss (MSE).
    test_score = model.evaluate(X_test, y_test, verbose=2)[0]
    print('Test MSE / RMSE = %.3f/%.3f' % (test_score, np.sqrt(test_score)))