In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn import svm
from sklearn.model_selection import train_test_split, LeaveOneOut, KFold,cross_validate, cross_val_score, StratifiedKFold
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_selection import SequentialFeatureSelector as SFS
from sklearn.feature_selection import RFECV
from sklearn.ensemble import RandomForestClassifier
import warnings
warnings.filterwarnings("ignore")

In [2]:
#merging all the required datasets - metabolomics dataset, drug response dataset and the metadata of the cell lines

data = pd.read_excel("D:\\GaTech\\Fall Sem\\BIOL 8901\\Metabolomic Project\\metabolomic_data.xlsx", sheet_name="1-clean data")
data.rename(columns = {'Unnamed: 0': 'ID'}, inplace = True)
last_column = data.iloc[:,-1].name
sample = pd.read_csv("D:\\GaTech\\Fall Sem\\BIOL 8901\\sample_info.csv")
sample.rename(columns = {'CCLE_Name':'ID'}, inplace=True)
merged_data = data.merge(sample, on='ID')
drug = pd.read_csv('D:\\GaTech\\Fall Sem\\BIOL 8901\\sanger-dose-response.csv')
drug.rename(columns={'ARXSPAN_ID':'DepMap_ID'}, inplace = True)
working_data = merged_data.merge(drug, on='DepMap_ID')

In [3]:
#since our data has a lot of NaN's in them, we can fill them using 0 (for now, just for a workaround)

working_data.fillna(0, inplace=True)

In [4]:
X1 = working_data.loc[working_data['DRUG_NAME'] == 'DOCETAXEL']

In [5]:
X1.shape

(1596, 270)

In [6]:
'''
reset the index as once we take a subset of the main working_dataset, the indexes will get mixed up.
thus the indices need to be reset before we start working on the model
'''

X1.reset_index(inplace=True)

In [7]:
X1['DRUG_ID'].value_counts()

1007    1133
1819     463
Name: DRUG_ID, dtype: int64

In [8]:
X_ = X1[~X1.duplicated(['DepMap_ID'], keep=False) | X1['DATASET'].eq('GDSC2')]

In [9]:
X_['DRUG_ID'].value_counts()

1007    600
1819    463
Name: DRUG_ID, dtype: int64

In [10]:
X1007 = X_.loc[X_['DRUG_ID']==1007]
X1819 = X_.loc[X_['DRUG_ID']==1819]

In [11]:
X1007.shape, X1819.shape

((600, 271), (463, 271))

In [12]:
X1007.reset_index(inplace=True)
X1819.reset_index(inplace=True)

In [13]:
X1007 = X1007.select_dtypes('float64')
X1819 = X1819.select_dtypes('float64')

In [14]:
X1007.reset_index(inplace=True)
X1819.reset_index(inplace=True)

In [15]:
X1007.drop(columns = ['index'], inplace=True)
X1819.drop(columns = ['index'], inplace=True)

In [16]:
#now we are going to normalize the dataframe
#normalization means bringing each value in a column in the range of [-1,1]
#calculate the minimum value of each column
#calculate the maximum value of each column
#calculate the range by subtracting the max and min of each column
#find the scaled value for each value in a column

#normalize the X1007 dataframe

X1007_min = X1007.min()
X1007_max = X1007.max()
X1007_range = X1007_max - X1007_min
X1007_scaled = (X1007 - X1007_min)/(X1007_range)

#normalize the X1819 dataframe

X1819_min = X1819.min()
X1819_max = X1819.max()
X1819_range = X1819_max - X1819_min
X1819_scaled = (X1819 - X1819_min)/(X1819_range)

In [17]:
X1007_scaled.shape, X1819.shape

((600, 244), (463, 244))

In [18]:
#calculate the mean of the IC50_published datarfame
#this will allow us to create labels as IC50 values are the drug response values

drug_mean_1007 = X1007_scaled['IC50_PUBLISHED'].mean()
print(drug_mean_1007)

0.013483973777142928


In [19]:
ic_1007, labels_1007 = X1007_scaled['IC50_PUBLISHED'], []
for i in range(len(ic_1007)):
    if ic_1007[i] > drug_mean_1007:
        labels_1007.append("R")
    else:
        labels_1007.append("NR")
        
y_1007 = pd.Series(labels_1007)

In [20]:
X1007_scaled.shape, y_1007.shape

((600, 244), (600,))

In [21]:
#select the metabolites ONLY, drop the drug response values

X_1007 = X1007_scaled.iloc[:, :225]
X_1007.shape

(600, 225)

In [22]:
#calculate the correlation matrix of the metabolite dataframe
#choose the upper triangle of the correlation matrix
#create a list of features where the correlation value is >0.90
#this list contains the highly correlated features, which will be removed from the dataset


corr_matrix_1007 = X_1007.corr().abs()
upper_1007 = corr_matrix_1007.where(np.triu(np.ones(corr_matrix_1007.shape), k=1).astype(np.bool))
to_drop_1007 = [column for column in upper_1007.columns if any(upper_1007[column]>0.90)]

In [23]:
len(to_drop_1007)

22

In [24]:
X_1007.drop(to_drop_1007, axis=1, inplace=True)

In [25]:
X_1007.shape, y_1007.shape

((600, 203), (600,))

In [26]:
#run the RFECV model with estimator being Random Forest and StratifiedKFold cross validation with 5 folds.

rfecv = RFECV(estimator = RandomForestClassifier(random_state=101), step=1, cv=StratifiedKFold(5), scoring='accuracy')

In [27]:
#fit the X,y to the RFECV model

rfecv.fit(X_1007, y_1007)

RFECV(cv=StratifiedKFold(n_splits=5, random_state=None, shuffle=False),
      estimator=RandomForestClassifier(random_state=101), scoring='accuracy')

In [28]:
selected_features = rfecv.get_support(1)
len(selected_features)

48

In [29]:
D1 = X_1007[X_1007.columns[selected_features]]
D1.shape

(600, 48)

In [30]:
#using classification_report metrics, run a prediction model using StratifiedKFold cross_validation with k=5 folds
#model being used as the classifier is Random Forest

kf1 = StratifiedKFold(n_splits = 5, shuffle = False)
model = RandomForestClassifier()
i = 1
dfs = []
for train_index, test_index in kf1.split(D1, y_1007):
    X_train, X_test = D1.iloc[train_index], D1.iloc[test_index]
    y_train, y_test = y_1007.iloc[train_index], y_1007.iloc[test_index]
    model.fit(X_train, y_train)
    predicted = model.predict(X_test)
    report = classification_report(y_test, predicted, output_dict = True)
    df = pd.DataFrame(report).transpose()
    dfs.append(df)
    i+=1
results_df = pd.concat(dfs)
results_df.to_csv("DOCETAXEL_1007_results.tsv", sep="\t")

In [31]:
results_df

Unnamed: 0,precision,recall,f1-score,support
NR,0.841667,1.0,0.914027,101.0
R,0.0,0.0,0.0,19.0
accuracy,0.841667,0.841667,0.841667,0.841667
macro avg,0.420833,0.5,0.457014,120.0
weighted avg,0.708403,0.841667,0.769306,120.0
NR,0.848739,0.990196,0.914027,102.0
R,0.0,0.0,0.0,18.0
accuracy,0.841667,0.841667,0.841667,0.841667
macro avg,0.42437,0.495098,0.457014,120.0
weighted avg,0.721429,0.841667,0.776923,120.0


### Work off of X1819 drug

In [36]:
#calculate the mean of the IC50_published datarfame
#this will allow us to create labels as IC50 values are the drug response values

drug_mean_1819 = X1819_scaled['IC50_PUBLISHED'].mean()
print(drug_mean_1819)

0.019578099111581242


In [38]:
ic_1819, labels_1819 = X1819_scaled['IC50_PUBLISHED'], []
for i in range(len(ic_1819)):
    if ic_1819[i] > drug_mean_1819:
        labels_1819.append('R')
    else:
        labels_1819.append('NR')

y_1819 = pd.Series(labels_1819)

In [40]:
#select the metabolites ONLY, drop the drug response values

X_1819 = X1819_scaled.iloc[:, :225]
X_1819.shape

(463, 225)

In [42]:
#calculate the correlation matrix of the metabolite dataframe
#choose the upper triangle of the correlation matrix
#create a list of features where the correlation value is >0.90
#this list contains the highly correlated features, which will be removed from the dataset


corr_matrix_1819 = X_1819.corr().abs()
upper_1819 = corr_matrix_1819.where(np.triu(np.ones(corr_matrix_1819.shape), k=1).astype(np.bool))
to_drop_1819 = [column for column in upper_1819.columns if any(upper_1819[column]>0.90)]

In [43]:
X_1819.drop(to_drop_1819, axis=1, inplace=True)

In [44]:
X_1819.shape, y_1819.shape

((463, 203), (463,))

In [45]:
#run the RFECV model with estimator being Random Forest and StratifiedKFold cross validation with 5 folds.

rfecv2 = RFECV(estimator = RandomForestClassifier(random_state=101), step=1, cv=StratifiedKFold(5), scoring='accuracy')

In [46]:
#fit the X,y to the RFECV model

rfecv2.fit(X_1819, y_1819)

RFECV(cv=StratifiedKFold(n_splits=5, random_state=None, shuffle=False),
      estimator=RandomForestClassifier(random_state=101), scoring='accuracy')

In [47]:
selected_features2 = rfecv2.get_support(1)
len(selected_features2)

28

In [48]:
D2 = X_1819[X_1819.columns[selected_features2]]
D2.shape

(463, 28)

In [49]:
#using classification_report metrics, run a prediction model using StratifiedKFold cross_validation with k=5 folds
#model being used as the classifier is Random Forest

kf2 = StratifiedKFold(n_splits = 5, shuffle = False)
model2 = RandomForestClassifier()
i = 1
dfs2 = []
for train_index, test_index in kf2.split(D2, y_1819):
    X_train, X_test = D2.iloc[train_index], D2.iloc[test_index]
    y_train, y_test = y_1819.iloc[train_index], y_1819.iloc[test_index]
    model2.fit(X_train, y_train)
    predicted2 = model2.predict(X_test)
    report2 = classification_report(y_test, predicted2, output_dict = True)
    df = pd.DataFrame(report2).transpose()
    dfs2.append(df)
    i+=1
results_df_2 = pd.concat(dfs)
results_df_2.to_csv("DOCETAXEL_1819_results.tsv", sep="\t")

In [50]:
results_df_2

Unnamed: 0,precision,recall,f1-score,support
NR,0.841667,1.0,0.914027,101.0
R,0.0,0.0,0.0,19.0
accuracy,0.841667,0.841667,0.841667,0.841667
macro avg,0.420833,0.5,0.457014,120.0
weighted avg,0.708403,0.841667,0.769306,120.0
NR,0.848739,0.990196,0.914027,102.0
R,0.0,0.0,0.0,18.0
accuracy,0.841667,0.841667,0.841667,0.841667
macro avg,0.42437,0.495098,0.457014,120.0
weighted avg,0.721429,0.841667,0.776923,120.0
