# Import required packages

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn import svm
from sklearn.model_selection import train_test_split, LeaveOneOut, KFold,cross_validate, cross_val_score, StratifiedKFold
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_selection import SequentialFeatureSelector as SFS
from sklearn.feature_selection import RFECV
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier
#from yellowbrick.classifier import ClassificationReport
import warnings
from sklearn.ensemble import VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
warnings.filterwarnings("ignore")

# Data preprocessing stage - merging and cleaning of data

### Data Collection and Preprocessing

In [2]:
#load the three input tables (metabolomics, cell-line metadata, drug response) and join them into one frame

#metabolomics measurements: the unnamed first column is renamed to 'ID' so it can serve as the join key
data = pd.read_excel(
    "D:\\GaTech\\Fall Sem\\BIOL 8901\\Metabolomic Project\\metabolomic_data.xlsx",
    sheet_name="1-clean data",
).rename(columns={'Unnamed: 0': 'ID'})
last_column = data.columns[-1]

#cell-line metadata: 'CCLE_Name' is renamed to 'ID' to match the metabolomics table
sample = pd.read_csv(
    "D:\\GaTech\\Fall Sem\\BIOL 8901\\sample_info.csv"
).rename(columns={'CCLE_Name': 'ID'})
merged_data = pd.merge(data, sample, on='ID')

#drug response: 'ARXSPAN_ID' is renamed to 'DepMap_ID' to match the metadata table
drug = pd.read_csv(
    'D:\\GaTech\\Fall Sem\\BIOL 8901\\sanger-dose-response.csv'
).rename(columns={'ARXSPAN_ID': 'DepMap_ID'})
working_data = pd.merge(merged_data, drug, on='DepMap_ID')

In [3]:
#the merged table contains many NaNs; as a temporary workaround every one of them is replaced with 0
#NOTE(review): this zero-fills every column of the merged frame, not only the metabolite columns - confirm that is intended

working_data = working_data.fillna(0)

In [4]:
#keep only the dose-response rows measured for cisplatin
X1 = working_data[working_data['DRUG_NAME'].eq('CISPLATIN')]

In [5]:
#a subset of working_data keeps the row labels of the parent frame, so they are no longer consecutive;
#renumber the rows before modelling (the previous labels are kept as an extra column)

X1 = X1.reset_index()

### Remove GDSC1 data (per GDSC instructions) and keep GDSC2 data only

In [6]:
#a cell line that occurs once is kept as it is; a cell line that occurs more than once keeps only its GDSC2 rows
appears_once = ~X1.duplicated(['DepMap_ID'], keep=False)
from_gdsc2 = X1['DATASET'] == 'GDSC2'
X_ = X1[appears_once | from_gdsc2]

In [7]:
#(rows, columns) left after the GDSC2 de-duplication
X_.shape

(618, 271)

In [8]:
#renumber the rows of the filtered frame; the previous row labels are kept as an extra column
X_ = X_.reset_index()

In [9]:
#same rows as before; one more column because reset_index() stored the old row labels
X_.shape

(618, 272)

In [10]:
#drop the metadata by keeping only the float64 columns
#(IC50_PUBLISHED is read from X_ in the following cells, so it has to be among the columns kept here)

X_ = X_.select_dtypes(include='float64')
X_.shape

(618, 244)

In [11]:
#mean and standard deviation of IC50_PUBLISHED over the remaining cell lines
#(drug_mean is the threshold used for labelling below)

ic50_values = X_['IC50_PUBLISHED']
drug_mean = ic50_values.mean()
drug_std = ic50_values.std()
print(drug_mean, drug_std)

300.4605618460755 1083.7364113261324


### Label Creation

In [12]:
#create labels
#
#IC50_PUBLISHED strictly greater than the mean -> 'R'; every other value -> 'NR'.
#Exactly one label is produced per row, so y always has the same length and row order as X_.
#(The previous loop appended nothing when a value was exactly equal to the mean, which would
# have left y shorter than, and misaligned with, the feature matrix.)
#The comparison is positional, so it no longer depends on X_ carrying a 0..n-1 index.
#NOTE(review): a higher IC50 normally means a cell line is LESS sensitive to the drug - confirm
#whether 'R' is meant to stand for "responsive" or "resistant".

ic = X_['IC50_PUBLISHED']
labels = np.where(ic.to_numpy() > drug_mean, 'R', 'NR').tolist()

y = pd.Series(labels)

In [13]:
#the first 225 float columns are used as model features
#NOTE(review): presumably these are exactly the metabolite measurements - confirm the count against the input sheet
N_FEATURE_COLUMNS = 225
X__ = X_.iloc[:, :N_FEATURE_COLUMNS]
X__.head(1)

Unnamed: 0,2-aminoadipate,3-phosphoglycerate,alpha-glycerophosphate,4-pyridoxate,aconitate,adenine,adipate,alpha-ketoglutarate,AMP,citrate,...,C56:8 TAG,C56:7 TAG,C56:6 TAG,C56:5 TAG,C56:4 TAG,C56:3 TAG,C56:2 TAG,C58:8 TAG,C58:7 TAG,C58:6 TAG
0,6.112727,6.034198,5.896896,6.000532,5.513618,5.868529,5.977177,5.693074,5.923737,5.641242,...,6.070239,6.133433,6.091089,6.257711,6.372732,6.202511,5.939576,6.309821,6.115974,5.999436


### Normalization using StandardScaler

In [20]:
#standardise every feature column, then wrap the result back into a DataFrame
#that keeps the row labels and column names of X__
#NOTE(review): the scaler is fitted on all rows, i.e. before the cross-validation splits made later on
scaler = StandardScaler()
scaled_values = scaler.fit_transform(X__, y)
X = pd.DataFrame(scaled_values, columns=X__.columns, index=X__.index)
X.head(1)

Unnamed: 0,2-aminoadipate,3-phosphoglycerate,alpha-glycerophosphate,4-pyridoxate,aconitate,adenine,adipate,alpha-ketoglutarate,AMP,citrate,...,C56:8 TAG,C56:7 TAG,C56:6 TAG,C56:5 TAG,C56:4 TAG,C56:3 TAG,C56:2 TAG,C58:8 TAG,C58:7 TAG,C58:6 TAG
0,0.517234,0.453558,-0.104905,0.204157,-1.126902,-0.094262,0.509193,-0.677204,-0.16369,-0.672165,...,0.573658,0.477419,0.413383,0.793657,1.114385,0.800389,-0.043086,0.747652,0.432813,0.254874


### Dimensionality reduction by removing highly correlated features

In [21]:
#calculate the absolute correlation matrix of the metabolite dataframe
#keep only the strict upper triangle, so that every pair of features is looked at once
#collect every column whose correlation with an earlier column is > 0.90
#this list contains the highly correlated features, which will be removed from the dataset

corr_matrix = X.corr().abs()
#builtin bool is used for the mask: the np.bool alias was deprecated in NumPy 1.20 and removed in 1.24
upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
upper = corr_matrix.where(upper_mask)
to_drop = [column for column in upper.columns if (upper[column] > 0.90).any()]

In [22]:
#number of features flagged as highly correlated
len(to_drop)

21

In [23]:
#remove the highly correlated features collected in to_drop

X = X.drop(columns=to_drop)

In [24]:
#sanity check: the feature matrix and the label vector must have the same number of rows
X.shape, y.shape

((618, 204), (618,))

## Data Processing

### Feature Selection using Recursive Feature Elimination with Cross-Validation (RFECV)

In [25]:
#set up RFECV: a seeded Random Forest ranks the features, one feature is removed per step,
#and each candidate subset is scored by accuracy with 5-fold StratifiedKFold cross validation

rfecv = RFECV(
    estimator=RandomForestClassifier(random_state=101),
    step=1,
    cv=StratifiedKFold(5),
    scoring='accuracy',
)

In [26]:
#fit the RFECV model on the full feature matrix X and the labels y
#NOTE(review): the selection sees every row, including rows that later serve as test folds in the
#model-evaluation cells, so the scores reported there are likely optimistic - consider selecting inside each fold

rfecv.fit(X, y)

RFECV(cv=StratifiedKFold(n_splits=5, random_state=None, shuffle=False),
      estimator=RandomForestClassifier(random_state=101), scoring='accuracy')

In [27]:
#integer positions (not a boolean mask) of the features kept by the RFECV model

selected_features = rfecv.get_support(indices=True)

In [28]:
#build the reduced feature matrix that holds only the features returned by the RFECV model

selected_columns = X.columns[selected_features]
X3 = X.loc[:, selected_columns]
X3.shape

(618, 23)

### Random Forest 

### Model training and Classification

In [29]:
#evaluate a Random Forest on the RFECV-selected features with StratifiedKFold cross validation (k=5 folds)
#one classification_report table is built per fold; the tables are stacked, in fold order, into results_df

kf3 = StratifiedKFold(n_splits = 5, shuffle=False)
#fixed seed (the same value the RFECV estimator uses) so that re-running the cell reproduces the scores
model=RandomForestClassifier(random_state=101)
dfs = []
for train_index, test_index in kf3.split(X3, y):
    #select train and test datasets from X and y
    X_train, X_test = X3.iloc[train_index], X3.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    #train the model on this fold's training rows only
    model.fit(X_train, y_train)
    #predict the test dataset
    predicted = model.predict(X_test)
    #turn the per-class metrics into a table with one row per class / average
    report = classification_report(y_test, predicted, output_dict = True)
    df = pd.DataFrame(report).transpose()
    dfs.append(df)
#row labels (NR, R, accuracy, ...) repeat once per fold in the stacked table
results_df = pd.concat(dfs)

In [30]:
#per-fold Random Forest reports, stacked in fold order
results_df

Unnamed: 0,precision,recall,f1-score,support
NR,0.830645,1.0,0.907489,103.0
R,0.0,0.0,0.0,21.0
accuracy,0.830645,0.830645,0.830645,0.830645
macro avg,0.415323,0.5,0.453744,124.0
weighted avg,0.689971,0.830645,0.753801,124.0
NR,0.85,0.990291,0.914798,103.0
R,0.75,0.142857,0.24,21.0
accuracy,0.846774,0.846774,0.846774,0.846774
macro avg,0.8,0.566574,0.577399,124.0
weighted avg,0.833065,0.846774,0.800518,124.0


In [31]:
#results_df.to_csv("CISPLATIN_Results_RF.tsv", sep="\t")

### SVM

### Model training and classification

In [47]:
#evaluate a linear Support Vector Machine (C=100) on the RFECV-selected features
#with StratifiedKFold cross validation (k=5 folds)
#one classification_report table is built per fold; the tables are stacked, in fold order, into results_df_svm

kf5 = StratifiedKFold(n_splits = 5, shuffle=False)
model2=svm.SVC(kernel='linear', C=100)
dfs_svm = []
for train_index, test_index in kf5.split(X3, y):
    #select train and test datasets from X and y
    X_train, X_test = X3.iloc[train_index], X3.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    #train the model on this fold's training rows only
    model2.fit(X_train, y_train)
    #predict the test dataset
    predicted2 = model2.predict(X_test)
    #turn the per-class metrics into a table with one row per class / average
    report = classification_report(y_test, predicted2, output_dict = True)
    df = pd.DataFrame(report).transpose()
    dfs_svm.append(df)
#row labels (NR, R, accuracy, ...) repeat once per fold in the stacked table
results_df_svm = pd.concat(dfs_svm)

In [48]:
#per-fold SVM reports, stacked in fold order
results_df_svm

Unnamed: 0,precision,recall,f1-score,support
NR,0.830645,1.0,0.907489,103.0
R,0.0,0.0,0.0,21.0
accuracy,0.830645,0.830645,0.830645,0.830645
macro avg,0.415323,0.5,0.453744,124.0
weighted avg,0.689971,0.830645,0.753801,124.0
NR,0.830645,1.0,0.907489,103.0
R,0.0,0.0,0.0,21.0
accuracy,0.830645,0.830645,0.830645,0.830645
macro avg,0.415323,0.5,0.453744,124.0
weighted avg,0.689971,0.830645,0.753801,124.0


### AdaBoost

### Model training and classification

In [41]:
#evaluate AdaBoost (500 estimators, learning rate 0.8) on the RFECV-selected features
#with StratifiedKFold cross validation (k=5 folds)
#one classification_report table is built per fold; the tables are stacked, in fold order, into results_df_adb

kf5 = StratifiedKFold(n_splits = 5, shuffle=False)
#fixed seed (the same value the RFECV estimator uses) so that re-running the cell reproduces the scores
model3=AdaBoostClassifier(n_estimators=500, learning_rate=0.8, random_state=101)
dfs_adb = []
for train_index, test_index in kf5.split(X3, y):
    #select train and test datasets from X and y
    X_train, X_test = X3.iloc[train_index], X3.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    #train the model on this fold's training rows only
    model3.fit(X_train, y_train)
    #predict the test dataset
    predicted3 = model3.predict(X_test)
    #turn the per-class metrics into a table with one row per class / average
    report = classification_report(y_test, predicted3, output_dict = True)
    df = pd.DataFrame(report).transpose()
    dfs_adb.append(df)
#row labels (NR, R, accuracy, ...) repeat once per fold in the stacked table
results_df_adb = pd.concat(dfs_adb)

In [42]:
#per-fold AdaBoost reports, stacked in fold order
results_df_adb

Unnamed: 0,precision,recall,f1-score,support
NR,0.855856,0.92233,0.88785,103.0
R,0.384615,0.238095,0.294118,21.0
accuracy,0.806452,0.806452,0.806452,0.806452
macro avg,0.620236,0.580213,0.590984,124.0
weighted avg,0.776049,0.806452,0.787299,124.0
NR,0.858491,0.883495,0.870813,103.0
R,0.333333,0.285714,0.307692,21.0
accuracy,0.782258,0.782258,0.782258,0.782258
macro avg,0.595912,0.584605,0.589253,124.0
weighted avg,0.769553,0.782258,0.775446,124.0


## Voting Classifier

### Model training and Classification

In [43]:
#evaluate a hard-voting ensemble (decision tree + linear SVC + 3-nearest-neighbours) on the
#RFECV-selected features with StratifiedKFold cross validation (k=5 folds)
#one classification_report table is built per fold; the tables are stacked, in fold order, into results_df_vc
#
#The unused AdaBoostClassifier that used to be assigned to model3 in this cell was removed: it was never
#part of the ensemble and only overwrote the model3 fitted in the AdaBoost section.

#members of the ensemble; the stochastic ones get a fixed seed so the scores are reproducible
estimator = []
estimator.append(('DTC', DecisionTreeClassifier(random_state=101)))
estimator.append(('SVC', svm.SVC(kernel='linear', gamma = 'auto', probability = True, random_state=101)))
estimator.append(('KNN', KNeighborsClassifier(n_neighbors=3)))
kf5 = StratifiedKFold(n_splits = 5, shuffle=False)
dfs_vc = []
for train_index, test_index in kf5.split(X3,y):
    #select train and test datasets from X and y
    X_train, X_test = X3.iloc[train_index], X3.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    #build and train a fresh majority-vote ensemble on this fold's training rows only
    vc = VotingClassifier(estimators = estimator, voting='hard').fit(X_train, y_train)
    #predict the test dataset
    predicted3 = vc.predict(X_test)
    #turn the per-class metrics into a table with one row per class / average
    report = classification_report(y_test, predicted3, output_dict = True)
    df = pd.DataFrame(report).transpose()
    dfs_vc.append(df)
#row labels (NR, R, accuracy, ...) repeat once per fold in the stacked table
results_df_vc = pd.concat(dfs_vc)

In [44]:
#per-fold voting-classifier reports, stacked in fold order
results_df_vc

Unnamed: 0,precision,recall,f1-score,support
NR,0.829268,0.990291,0.902655,103.0
R,0.0,0.0,0.0,21.0
accuracy,0.822581,0.822581,0.822581,0.822581
macro avg,0.414634,0.495146,0.451327,124.0
weighted avg,0.688828,0.822581,0.749786,124.0
NR,0.844262,1.0,0.915556,103.0
R,1.0,0.095238,0.173913,21.0
accuracy,0.846774,0.846774,0.846774,0.846774
macro avg,0.922131,0.547619,0.544734,124.0
weighted avg,0.870637,0.846774,0.789955,124.0
