# Import required packages

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn import svm
from sklearn.model_selection import train_test_split, LeaveOneOut, KFold,cross_validate, cross_val_score, StratifiedKFold
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_selection import SequentialFeatureSelector as SFS
from sklearn.feature_selection import RFECV
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier
#from yellowbrick.classifier import ClassificationReport
import warnings
from sklearn.ensemble import VotingClassifier
from sklearn.tree import DecisionTreeClassifier
warnings.filterwarnings("ignore")

# Data preprocessing stage - merging and cleaning of data

### Data Collection and Preprocessing

In [2]:
# Build the working table by joining three sources:
#   metabolomics profiles -> cell-line metadata (on 'ID') -> drug dose-response (on 'DepMap_ID').
# Both merges use pandas' default inner join, so only rows present on both sides survive.

data = pd.read_excel(
    "D:\\GaTech\\Fall Sem\\BIOL 8901\\Metabolomic Project\\metabolomic_data.xlsx",
    sheet_name="1-clean data",
).rename(columns={'Unnamed: 0': 'ID'})
last_column = data.columns[-1]

# The metadata table names the cell line 'CCLE_Name'; align it with the metabolomics 'ID' key.
sample = pd.read_csv(
    "D:\\GaTech\\Fall Sem\\BIOL 8901\\sample_info.csv"
).rename(columns={'CCLE_Name': 'ID'})
merged_data = pd.merge(data, sample, on='ID')

# The drug-response table names the DepMap identifier 'ARXSPAN_ID'; align it before joining.
drug = pd.read_csv(
    'D:\\GaTech\\Fall Sem\\BIOL 8901\\sanger-dose-response.csv'
).rename(columns={'ARXSPAN_ID': 'DepMap_ID'})
working_data = pd.merge(merged_data, drug, on='DepMap_ID')

In [3]:
# Temporary workaround: the merged table contains many NaNs, so every missing value
# (in every column) is replaced by 0 for now.

working_data = working_data.fillna(0)

In [4]:
# Keep only the dose-response rows whose DRUG_NAME is CISPLATIN; the rest of the notebook works on this subset.
X1 = working_data.loc[working_data['DRUG_NAME'] == 'CISPLATIN']

In [5]:
# The cisplatin subset keeps the row labels of the full working table, so they are no
# longer contiguous. Renumber the rows before modelling. The old labels are kept as a
# new 'index' column because drop=True is not requested.

X1 = X1.reset_index()

In [6]:
# A cell line (DepMap_ID) may appear more than once. Keep a row when its cell line is
# unique, or when the row comes from the GDSC2 dataset.
appears_once = ~X1.duplicated(['DepMap_ID'], keep=False)
from_gdsc2 = X1['DATASET'].eq('GDSC2')
X_ = X1[appears_once | from_gdsc2]

In [7]:
X_.shape

(618, 271)

In [8]:
# Renumber the rows of the de-duplicated frame. drop=True is not passed, so the previous
# index is inserted as an additional column (recorded shape goes from 271 to 272 columns).
X_.reset_index(inplace=True)

In [9]:
X_.shape

(618, 272)

In [10]:
#drop the metadata
# Only float64 columns are kept, so every non-float column is removed here.
# NOTE(review): float-typed columns that are not metabolites also survive this filter
# (IC50_PUBLISHED is still read from the frame afterwards) — confirm the remaining set is intended.

X_ = X_.select_dtypes('float64')
X_.shape

(618, 244)

### Normalization

In [11]:
# Min-max scale every column to [0, 1]: the metabolite profiles are on different scales.
# A constant column has max == min, which would make the division 0/0 = NaN. Such a
# column carries no information, so its range is replaced by 1 and it scales to 0.

X_min = X_.min()
X_max = X_.max()
X_range = (X_max - X_min).replace(0, 1)
X_scaled = (X_ - X_min) / X_range

In [12]:
# Mean and standard deviation of the scaled IC50_PUBLISHED column.
# The mean is the threshold used when the class labels are created.

ic50_scaled = X_scaled['IC50_PUBLISHED']
drug_mean, drug_std = ic50_scaled.mean(), ic50_scaled.std()
print(drug_mean, drug_std)

0.019814174656500174 0.07151777667799389


### Label Creation

In [13]:
# Create the class labels by thresholding the scaled IC50 at its mean:
#   'R'  -> IC50_PUBLISHED strictly greater than the mean
#   'NR' -> IC50_PUBLISHED less than or equal to the mean (NaN also lands here)
# The comparison is vectorised so that exactly one label is produced per row. An
# if/elif pair over '>' and '<' emits nothing when a value equals the mean (or is NaN),
# which shortens the label list and shifts every later label against the rows of X.
# NOTE(review): a higher IC50 normally means the cell line is LESS sensitive to the drug,
# so labelling "above the mean" as 'R' = responsive looks inverted — confirm whether 'R'
# is meant as "responsive" or "resistant".

ic = X_scaled['IC50_PUBLISHED']
labels = np.where(ic > drug_mean, 'R', 'NR').tolist()

y = pd.Series(labels)
assert len(y) == len(X_scaled), "expected exactly one label per sample"

In [14]:
# Feature matrix: the first 225 columns of the scaled frame, selected by position.
# NOTE(review): 225 is a hard-coded count — presumably the number of metabolite columns.
# Confirm it still matches the input file and that no response column falls inside the slice.
X = X_scaled.iloc[:, :225]
X.head(1)

Unnamed: 0,2-aminoadipate,3-phosphoglycerate,alpha-glycerophosphate,4-pyridoxate,aconitate,adenine,adipate,alpha-ketoglutarate,AMP,citrate,...,C56:8 TAG,C56:7 TAG,C56:6 TAG,C56:5 TAG,C56:4 TAG,C56:3 TAG,C56:2 TAG,C58:8 TAG,C58:7 TAG,C58:6 TAG
0,0.3953,0.621262,0.510802,0.459783,0.417356,0.380599,0.587679,0.438061,0.405262,0.491669,...,0.683987,0.527663,0.679875,0.60164,0.609317,0.650514,0.463536,0.569028,0.51641,0.512918


### Dimension reduction using Correlation

In [15]:
# Find highly correlated metabolite features that can be removed:
#   1. absolute pairwise correlation matrix of the feature frame
#   2. keep only the strict upper triangle (k=1), so each pair is seen once and the
#      diagonal (self-correlation = 1) is ignored
#   3. flag every column that correlates > 0.90 with some column positioned before it
# The mask uses the builtin `bool`: the `np.bool` alias was removed from NumPy and raises
# AttributeError on current releases.

corr_matrix = X.corr().abs()
upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
upper = corr_matrix.where(upper_mask)
to_drop = [column for column in upper.columns if (upper[column] > 0.90).any()]

In [16]:
len(to_drop)

21

In [17]:
# Remove the highly correlated features listed in `to_drop`.
# X was created as a positional slice of X_scaled, so it is rebound to a new frame
# instead of being mutated in place. errors="ignore" lets the cell be re-run after the
# columns are already gone without raising KeyError.

X = X.drop(columns=to_drop, errors="ignore")

In [18]:
X.shape, y.shape

((618, 204), (618,))

## Data Processing

### Random Forest Algorithm

In [19]:
#run the RFECV model with estimator being Random Forest and StratifiedKFold cross validation with 5 folds.
# step=1 eliminates one feature per round; each candidate feature count is scored by accuracy.
# The forest is seeded (random_state=101) and the folds are not shuffled.

rfecv = RFECV(estimator = RandomForestClassifier(random_state=101), step=1, cv=StratifiedKFold(5), scoring='accuracy')

In [20]:
#fit the X,y to the RFECV model
# The selector is fitted on every row of X / y, before any train/test split is made.
# NOTE(review): the chosen features have therefore seen the rows that later serve as
# test folds, so downstream cross-validated scores are presumably optimistic — confirm.

rfecv.fit(X, y)

RFECV(cv=StratifiedKFold(n_splits=5, random_state=None, shuffle=False),
      estimator=RandomForestClassifier(random_state=101), scoring='accuracy')

In [21]:
# Integer positions of the columns RFECV retained
# (indices=True returns positions rather than a boolean mask).

selected_features = rfecv.get_support(indices=True)

In [22]:
#select a subset dataframe that contains only the "optimal" metabolic features returned from the RFECV model
# `selected_features` holds column positions; they are mapped back to column names before subsetting X.

X3 = X[X.columns[selected_features]]
X3.shape

(618, 23)

In [23]:
# Evaluate a Random Forest on the RFECV-selected features with stratified 5-fold CV.
# One classification_report is produced per fold; the per-fold tables are stacked into
# `results_df` in fold order (5 rows per fold: NR, R, accuracy, macro avg, weighted avg).
# The forest is seeded with the same value as the RFECV estimator so the reported
# numbers can be reproduced on a fresh run.
# NOTE(review): X3's columns were selected using all rows, including each fold's test
# rows, so these scores are presumably optimistic — confirm whether that is acceptable.

kf3 = StratifiedKFold(n_splits=5, shuffle=False)
model = RandomForestClassifier(random_state=101)
dfs = []
for train_index, test_index in kf3.split(X3, y):
    # split features and labels for this fold
    X_train, X_test = X3.iloc[train_index], X3.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    # fit() re-trains from scratch, so reusing one estimator object across folds is safe
    model.fit(X_train, y_train)
    predicted = model.predict(X_test)
    # collect the per-class precision/recall/f1/support table for this fold
    report = classification_report(y_test, predicted, output_dict=True)
    df = pd.DataFrame(report).transpose()
    dfs.append(df)
results_df = pd.concat(dfs)

In [24]:
results_df

Unnamed: 0,precision,recall,f1-score,support
NR,0.836066,0.990291,0.906667,103.0
R,0.5,0.047619,0.086957,21.0
accuracy,0.830645,0.830645,0.830645,0.830645
macro avg,0.668033,0.518955,0.496812,124.0
weighted avg,0.779151,0.830645,0.767845,124.0
NR,0.829268,0.990291,0.902655,103.0
R,0.0,0.0,0.0,21.0
accuracy,0.822581,0.822581,0.822581,0.822581
macro avg,0.414634,0.495146,0.451327,124.0
weighted avg,0.688828,0.822581,0.749786,124.0


In [25]:
# Write the stacked per-fold Random Forest reports to a tab-separated file (path is relative to the working directory).
results_df.to_csv("CISPLATIN_Results_RF.tsv", sep="\t")

### SVM

### Feature Selection

In [26]:
#run the RFECV model with estimator being Random Forest and StratifiedKFold cross validation with 5 folds.
# The features for the SVM are ranked by a Random Forest, not by the SVM itself.
# NOTE(review): estimator, seed, CV and scoring are identical to `rfecv` in the Random Forest
# section, so this presumably reproduces the same selection — consider reusing that result.

rfecv_svm = RFECV(estimator = RandomForestClassifier(random_state=101), step=1, cv=StratifiedKFold(5), scoring='accuracy')

In [27]:
#fit the X,y to the RFECV model
# Fitted on every row of X / y, before any train/test split is made.

rfecv_svm.fit(X,y)

RFECV(cv=StratifiedKFold(n_splits=5, random_state=None, shuffle=False),
      estimator=RandomForestClassifier(random_state=101), scoring='accuracy')

In [28]:
# Integer positions of the columns RFECV retained
# (indices=True returns positions rather than a boolean mask).

sf_svm = rfecv_svm.get_support(indices=True)

In [29]:
#select a subset dataframe that contains only the "optimal" metabolic features returned from the RFECV model
# `sf_svm` holds column positions; they are mapped back to column names before subsetting X.

x_svm = X[X.columns[sf_svm]]
x_svm.shape

(618, 23)

### Model training and classification

In [30]:
# Stratified 5-fold evaluation of a linear-kernel SVM (C=1) on the selected features.
# Each fold contributes one classification_report table; the tables are stacked in
# fold order into `results_df_svm`.

kf5 = StratifiedKFold(n_splits=5, shuffle=False)
model2 = svm.SVC(kernel='linear', C=1)
dfs_svm = []
i = 1
for train_index, test_index in kf5.split(x_svm, y):
    # training portion of this fold
    X_train, y_train = x_svm.iloc[train_index], y.iloc[train_index]
    # held-out portion of this fold
    X_test, y_test = x_svm.iloc[test_index], y.iloc[test_index]
    # fit() returns the estimator itself, so training and prediction can be chained
    predicted2 = model2.fit(X_train, y_train).predict(X_test)
    # per-class metrics for this fold as a table (one row per class / average)
    report = classification_report(y_test, predicted2, output_dict=True)
    df = pd.DataFrame(report).T
    dfs_svm.append(df)
    i += 1
results_df_svm = pd.concat(dfs_svm)

In [31]:
results_df_svm

Unnamed: 0,precision,recall,f1-score,support
NR,0.830645,1.0,0.907489,103.0
R,0.0,0.0,0.0,21.0
accuracy,0.830645,0.830645,0.830645,0.830645
macro avg,0.415323,0.5,0.453744,124.0
weighted avg,0.689971,0.830645,0.753801,124.0
NR,0.830645,1.0,0.907489,103.0
R,0.0,0.0,0.0,21.0
accuracy,0.830645,0.830645,0.830645,0.830645
macro avg,0.415323,0.5,0.453744,124.0
weighted avg,0.689971,0.830645,0.753801,124.0


### AdaBoost

### Feature Selection

In [32]:
#run the RFECV model with estimator being Random Forest and StratifiedKFold cross validation with 5 folds.
# The features for AdaBoost are ranked by a Random Forest, not by AdaBoost itself.
# NOTE(review): estimator, seed, CV and scoring are identical to `rfecv` in the Random Forest
# section, so this presumably reproduces the same selection — consider reusing that result.

rfecv_adb = RFECV(estimator = RandomForestClassifier(random_state=101), step=1, cv=StratifiedKFold(5), scoring='accuracy')

In [33]:
#fit the X,y to the RFECV model
# Fitted on every row of X / y, before any train/test split is made.

rfecv_adb.fit(X,y)

RFECV(cv=StratifiedKFold(n_splits=5, random_state=None, shuffle=False),
      estimator=RandomForestClassifier(random_state=101), scoring='accuracy')

In [34]:
# Integer positions of the columns RFECV retained
# (indices=True returns positions rather than a boolean mask).

sf_adb = rfecv_adb.get_support(indices=True)

In [35]:
#select a subset dataframe that contains only the "optimal" metabolic features returned from the RFECV model
# `sf_adb` holds column positions; they are mapped back to column names before subsetting X.

x_adb = X[X.columns[sf_adb]]
x_adb.shape

(618, 23)

### Model training and classification

In [36]:
# Evaluate AdaBoost (300 estimators, learning rate 1.0) on the selected features with
# stratified 5-fold CV. One classification_report is produced per fold and the tables
# are stacked in fold order into `results_df_adb`.
# The loop iterates the splitter defined in this cell (kf6), so the cell no longer
# depends on `kf5` left behind by the SVM section. The classifier is seeded so the
# reported numbers can be reproduced on a fresh run.

kf6 = StratifiedKFold(n_splits=5, shuffle=False)
model3 = AdaBoostClassifier(n_estimators=300, learning_rate=1.0, random_state=101)
dfs_adb = []
for train_index, test_index in kf6.split(x_adb, y):
    # split features and labels for this fold
    X_train, X_test = x_adb.iloc[train_index], x_adb.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    # fit() re-trains from scratch, so reusing one estimator object across folds is safe
    model3.fit(X_train, y_train)
    predicted3 = model3.predict(X_test)
    # collect the per-class precision/recall/f1/support table for this fold
    report = classification_report(y_test, predicted3, output_dict=True)
    df = pd.DataFrame(report).transpose()
    dfs_adb.append(df)
results_df_adb = pd.concat(dfs_adb)

In [37]:
results_df_adb

Unnamed: 0,precision,recall,f1-score,support
NR,0.862385,0.912621,0.886792,103.0
R,0.4,0.285714,0.333333,21.0
accuracy,0.806452,0.806452,0.806452,0.806452
macro avg,0.631193,0.599168,0.610063,124.0
weighted avg,0.784078,0.806452,0.793061,124.0
NR,0.87037,0.912621,0.890995,103.0
R,0.4375,0.333333,0.378378,21.0
accuracy,0.814516,0.814516,0.814516,0.814516
macro avg,0.653935,0.622977,0.634687,124.0
weighted avg,0.797062,0.814516,0.804181,124.0


### Voting Classifier

In [42]:
# Hard-voting ensemble of a decision tree, a linear SVM and a 3-nearest-neighbour
# classifier, evaluated with stratified 5-fold CV. One classification_report is
# produced per fold and the tables are stacked in fold order into `results_df_vc`.
# - This cell uses its own names (kf_vc, predicted_vc) and does not rebuild `model3`,
#   so the fitted AdaBoost model and its predictions from the AdaBoost section survive.
# - The decision tree is seeded so the vote is reproducible on a fresh run.
# NOTE(review): the ensemble is trained on all columns of X, not on an RFECV-selected
# subset as the other models are — confirm this is intended.
# NOTE(review): probability=True is not needed for voting='hard'; it is kept in case a
# switch to soft voting is planned.

estimator = [
    ('DTC', DecisionTreeClassifier(random_state=101)),
    ('SVC', svm.SVC(kernel='linear', C=1, probability=True)),
    ('KNN', KNeighborsClassifier(n_neighbors=3)),
]
kf_vc = StratifiedKFold(n_splits=5, shuffle=False)
dfs_vc = []
for train_index, test_index in kf_vc.split(X, y):
    # split features and labels for this fold
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    # a fresh ensemble per fold; VotingClassifier fits clones of the listed estimators
    vc = VotingClassifier(estimators=estimator, voting='hard').fit(X_train, y_train)
    predicted_vc = vc.predict(X_test)
    # collect the per-class precision/recall/f1/support table for this fold
    report = classification_report(y_test, predicted_vc, output_dict=True)
    df = pd.DataFrame(report).transpose()
    dfs_vc.append(df)
results_df_vc = pd.concat(dfs_vc)

In [43]:
results_df_vc

Unnamed: 0,precision,recall,f1-score,support
NR,0.826446,0.970874,0.892857,103.0
R,0.0,0.0,0.0,21.0
accuracy,0.806452,0.806452,0.806452,0.806452
macro avg,0.413223,0.485437,0.446429,124.0
weighted avg,0.686484,0.806452,0.741647,124.0
NR,0.827586,0.932039,0.876712,103.0
R,0.125,0.047619,0.068966,21.0
accuracy,0.782258,0.782258,0.782258,0.782258
macro avg,0.476293,0.489829,0.472839,124.0
weighted avg,0.7086,0.782258,0.739916,124.0
