In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn import svm
from sklearn.model_selection import train_test_split, LeaveOneOut, KFold,cross_validate, cross_val_score, StratifiedKFold
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_selection import SequentialFeatureSelector as SFS
from sklearn.feature_selection import RFECV
from sklearn.ensemble import RandomForestClassifier
#from yellowbrick.classifier import ClassificationReport
import warnings
warnings.filterwarnings("ignore")

In [2]:
#merging all the required datasets - metabolomics dataset, drug response dataset and the metadata of the cell lines

data = pd.read_excel("D:\\GaTech\\Fall Sem\\BIOL 8901\\Metabolomic Project\\metabolomic_data.xlsx", sheet_name="1-clean data")
data.rename(columns = {'Unnamed: 0': 'ID'}, inplace = True)
last_column = data.iloc[:,-1].name
sample = pd.read_csv("D:\\GaTech\\Fall Sem\\BIOL 8901\\sample_info.csv")
sample.rename(columns = {'CCLE_Name':'ID'}, inplace=True)
merged_data = data.merge(sample, on='ID')
drug = pd.read_csv('D:\\GaTech\\Fall Sem\\BIOL 8901\\sanger-dose-response.csv')
drug.rename(columns={'ARXSPAN_ID':'DepMap_ID'}, inplace = True)
working_data = merged_data.merge(drug, on='DepMap_ID')

In [3]:
#since our data has a lot of NaN's in them, we can fill them using 0 (for now, just for a workaround)

working_data.fillna(0, inplace=True)

In [4]:
X_ = working_data.loc[working_data['DRUG_NAME'] == 'DOXORUBICIN']

In [5]:
'''
reset the index as once we take a subset of the main working_dataset, the indexes will get mixed up.
thus the indices need to be reset before we start working on the model
'''

X_.reset_index(inplace=True)

In [6]:
X_[['DATASET', 'DepMap_ID', 'DRUG_ID', 'MIN_CONC', 'MAX_CONC', 'IC50_PUBLISHED']]

Unnamed: 0,DATASET,DepMap_ID,DRUG_ID,MIN_CONC,MAX_CONC,IC50_PUBLISHED
0,GDSC1,ACH-000698,133,0.004,1.024,1.510152
1,GDSC1,ACH-000431,133,0.004,1.024,0.128724
2,GDSC1,ACH-000522,133,0.004,1.024,0.094252
3,GDSC1,ACH-000613,133,0.004,1.024,0.030426
4,GDSC1,ACH-000614,133,0.004,1.024,0.175453
...,...,...,...,...,...,...
567,GDSC1,ACH-001208,133,0.004,1.024,0.112305
568,GDSC1,ACH-000655,133,0.004,1.024,0.091809
569,GDSC1,ACH-000273,133,0.004,1.024,0.117823
570,GDSC1,ACH-000504,133,0.004,1.024,0.126749


In [7]:
X_['DRUG_ID'].value_counts()

133    572
Name: DRUG_ID, dtype: int64

In [8]:
X_.shape

(572, 271)

In [9]:
X_.reset_index(inplace=True)

In [10]:
#drop the metadata

X_ = X_.select_dtypes('float64')
X_.shape

(572, 244)

In [11]:
#scale the dataframe, at a quick glance the metabolite profiles are not scaled thus scaling of the dataframe is necessary

X_min = X_.min()
X_max = X_.max()
X_range = (X_max-X_min)
X_scaled = (X_-X_min)/(X_range)
X_scaled.head(5)

Unnamed: 0,2-aminoadipate,3-phosphoglycerate,alpha-glycerophosphate,4-pyridoxate,aconitate,adenine,adipate,alpha-ketoglutarate,AMP,citrate,...,IC50_PUBLISHED,AUC_PUBLISHED,upper_limit,ec50,slope,lower_limit,auc,log2.ic50,mse,R2
0,0.3953,0.621262,0.546764,0.459783,0.417356,0.380599,0.619022,0.438061,0.444715,0.587155,...,0.052857,0.892232,0.02806,1.411021e-98,0.615304,0.996073,0.904841,0.996527,0.016769,0.927928
1,0.319421,0.37158,0.441413,0.392502,0.48412,0.383373,0.554267,0.507683,0.381484,0.558086,...,0.004352,0.579193,0.02977,2.161504e-99,0.875207,0.993714,0.691903,0.584964,0.003499,0.997547
2,0.321303,0.624052,0.669008,0.302954,0.412029,0.422752,0.564138,0.194364,0.822279,0.502986,...,0.003142,0.526554,0.029984,2.406236e-99,0.933393,0.992122,0.63597,0.579668,0.012447,0.992595
3,0.436124,0.642319,0.613914,0.242486,0.736116,0.445114,0.580127,0.611088,0.441606,0.767874,...,0.000901,0.321863,0.032396,4.751694999999999e-100,0.923329,0.993096,0.476595,0.301089,0.125172,0.92216
4,0.384736,0.646478,0.609643,0.399926,0.660183,0.346604,0.333732,0.680628,0.474345,0.766004,...,0.005993,0.607641,0.025612,1.731564e-96,0.966079,0.818252,0.707985,0.844233,0.049663,0.921669


In [12]:
#calculate the mean and standard deviation of the IC50_PUBLISHED

drug_mean = X_scaled['IC50_PUBLISHED'].mean()
drug_std = X_scaled['IC50_PUBLISHED'].std()
print(drug_mean, drug_std)

0.03304287941883692 0.09733582721333729


In [13]:
#create labels

ic, labels = X_scaled['IC50_PUBLISHED'], []
for i in range(len(ic)):
    #if the IC50_PUBLISHED value is greater than the mean, add responsive label to the label list
    if ic[i] > drug_mean:
        labels.append('R')
    #if the IC50_PUBLISHED value is less than the mean, add non responsive label to the label list
    elif ic[i] < drug_mean:
        labels.append('NR')
        
y = pd.Series(labels)

In [15]:
X = X_scaled.iloc[:, :225]

In [16]:
#calculate the correlation matrix of the metabolite dataframe
#choose the upper triangle of the correlation matrix
#create a list of features where the correlation value is >0.90
#this list contains the highly correlated features, which will be removed from the dataset

corr_matrix = X.corr().abs()
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))
to_drop = [column for column in upper.columns if any(upper[column]>0.90)]

In [17]:
corr_matrix

Unnamed: 0,2-aminoadipate,3-phosphoglycerate,alpha-glycerophosphate,4-pyridoxate,aconitate,adenine,adipate,alpha-ketoglutarate,AMP,citrate,...,C56:8 TAG,C56:7 TAG,C56:6 TAG,C56:5 TAG,C56:4 TAG,C56:3 TAG,C56:2 TAG,C58:8 TAG,C58:7 TAG,C58:6 TAG
2-aminoadipate,1.000000,0.040415,0.032334,0.021677,0.162651,0.018065,0.074494,0.151697,0.172648,0.080821,...,0.018588,0.031351,0.040824,0.032557,0.076637,0.105981,0.151599,0.032077,0.047289,0.052389
3-phosphoglycerate,0.040415,1.000000,0.015637,0.227895,0.401851,0.054780,0.251326,0.461036,0.099643,0.385661,...,0.293714,0.380924,0.420941,0.391307,0.284565,0.084361,0.065165,0.400590,0.410387,0.406289
alpha-glycerophosphate,0.032334,0.015637,1.000000,0.037762,0.205940,0.171872,0.078321,0.111415,0.470598,0.188274,...,0.129330,0.128191,0.086406,0.135503,0.161686,0.160643,0.207592,0.105009,0.113995,0.117226
4-pyridoxate,0.021677,0.227895,0.037762,1.000000,0.292986,0.069943,0.289360,0.185326,0.096051,0.241743,...,0.125465,0.216565,0.225618,0.211889,0.143459,0.017074,0.018829,0.174368,0.170625,0.222983
aconitate,0.162651,0.401851,0.205940,0.292986,1.000000,0.109214,0.262718,0.585930,0.070909,0.864515,...,0.388210,0.412712,0.491336,0.442445,0.377051,0.198253,0.212230,0.427662,0.422364,0.446753
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
C56:3 TAG,0.105981,0.084361,0.160643,0.017074,0.198253,0.050403,0.031965,0.227834,0.242514,0.208033,...,0.240933,0.497920,0.495078,0.601046,0.812623,1.000000,0.801472,0.531575,0.575995,0.568897
C56:2 TAG,0.151599,0.065165,0.207592,0.018829,0.212230,0.070096,0.119124,0.203307,0.301147,0.206993,...,0.212713,0.422739,0.414229,0.451374,0.639338,0.801472,1.000000,0.406685,0.440657,0.445861
C58:8 TAG,0.032077,0.400590,0.105009,0.174368,0.427662,0.073237,0.345921,0.373912,0.168924,0.425320,...,0.606166,0.900099,0.897357,0.869339,0.757935,0.531575,0.406685,1.000000,0.929063,0.907306
C58:7 TAG,0.047289,0.410387,0.113995,0.170625,0.422364,0.067833,0.328740,0.379006,0.171716,0.424464,...,0.546242,0.836085,0.878459,0.869847,0.778519,0.575995,0.440657,0.929063,1.000000,0.890838


In [18]:
#drop the list of highly correlated features computed above

X.drop(to_drop, axis=1, inplace=True)

In [19]:
X.shape, y.shape

((572, 203), (572,))

In [20]:
#run the RFECV model with estimator being Random Forest and StratifiedKFold cross validation with 5 folds.

rfecv = RFECV(estimator = RandomForestClassifier(random_state=101), step=1, cv=StratifiedKFold(5), scoring='accuracy')

In [21]:
#fit the X,y to the RFECV model

rfecv.fit(X, y)

RFECV(cv=StratifiedKFold(n_splits=5, random_state=None, shuffle=False),
      estimator=RandomForestClassifier(random_state=101), scoring='accuracy')

In [22]:
#choose the features that are selected by the RFECV model

selected_features = rfecv.get_support(1)

In [23]:
#select a subset dataframe that contains only the "optimal" metabolic features returned from the RFECV model

X3 = X[X.columns[selected_features]]
X3.shape

(572, 35)

In [24]:
#using classification_report metrics, run a prediction model using StratifiedKFold cross_validation with k=5 folds
#model being used as the classifier is Random Forest

kf3 = StratifiedKFold(n_splits = 5, shuffle=False)
model=RandomForestClassifier()
i=1
dfs = []
for train_index, test_index in kf3.split(X3, y):
    #select train and test datasets from X and y
    X_train, X_test = X3.iloc[train_index], X3.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    #train the model
    model.fit(X_train, y_train)
    #predict the test dataset
    predicted = model.predict(X_test)
    #print the classification score report
    report = classification_report(y_test, predicted, output_dict = True)
    df = pd.DataFrame(report).transpose()
    dfs.append(df)
    i+=1
results_df = pd.concat(dfs)

In [25]:
results_df

Unnamed: 0,precision,recall,f1-score,support
NR,0.826087,1.0,0.904762,95.0
R,0.0,0.0,0.0,20.0
accuracy,0.826087,0.826087,0.826087,0.826087
macro avg,0.413043,0.5,0.452381,115.0
weighted avg,0.68242,0.826087,0.747412,115.0
NR,0.823009,0.978947,0.894231,95.0
R,0.0,0.0,0.0,20.0
accuracy,0.808696,0.808696,0.808696,0.808696
macro avg,0.411504,0.489474,0.447115,115.0
weighted avg,0.679877,0.808696,0.738712,115.0


In [26]:
results_df.to_csv("CISPLATIN_Results.tsv", sep="\t")