In [2]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn import svm
from sklearn.model_selection import train_test_split, LeaveOneOut, KFold,cross_validate, cross_val_score, StratifiedKFold
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_selection import SequentialFeatureSelector as SFS
from sklearn.feature_selection import RFECV
from sklearn.ensemble import RandomForestClassifier

In [3]:
#merging all the required datasets - metabolomics dataset, drug response dataset and the metadata of the cell lines

data = pd.read_excel("D:\\GaTech\\Fall Sem\\BIOL 8901\\Metabolomic Project\\metabolomic_data.xlsx", sheet_name="1-clean data")
data.rename(columns = {'Unnamed: 0': 'ID'}, inplace = True)
last_column = data.iloc[:,-1].name
sample = pd.read_csv("D:\\GaTech\\Fall Sem\\BIOL 8901\\sample_info.csv")
sample.rename(columns = {'CCLE_Name':'ID'}, inplace=True)
merged_data = data.merge(sample, on='ID')
drug = pd.read_csv('D:\\GaTech\\Fall Sem\\BIOL 8901\\sanger-dose-response.csv')
drug.rename(columns={'ARXSPAN_ID':'DepMap_ID'}, inplace = True)
working_data = merged_data.merge(drug, on='DepMap_ID')

  warn(msg)


In [4]:
#create a dataframe that contains the counts of all the cell lines treated by a drug

drug_df = working_data.loc[:,['DepMap_ID', 'DRUG_NAME']].drop_duplicates().DRUG_NAME.value_counts()\
        .to_frame().reset_index().rename(columns={'index':'Drug_Name','DRUG_NAME':'Counts'})

In [5]:
drug_df

Unnamed: 0,Drug_Name,Counts
0,AVAGACESTAT,627
1,SB 505124,626
2,CHIR-99021,626
3,JQ1,625
4,AZD6482,625
...,...,...
392,SALUBRINAL,223
393,CYCLOPAMINE,222
394,ERK5-IN-1,222
395,ROSCOVITINE,222


In [6]:
#if there are any NaN's in the working dataset then replace that with 0

working_data.fillna(0, inplace=True)

In [7]:
#from the drug dataframe, select a subset dataframe of the top drug with the maximum number of cell lines

X1 = working_data.loc[working_data['DRUG_NAME'] == 'AVAGACESTAT']

In [8]:
X1.reset_index(inplace=True)

In [9]:
#drop the metadata

X1 = X1.select_dtypes('float64')

In [10]:
#scale the dataframe, at a quick glance the metabolite profiles are not scaled thus scaling of the dataframe is necessary

X_min = X1.min()
X_max = X1.max()
X_range = (X_max-X_min)
X_scaled = (X1-X_min)/(X_range)
X_scaled.head(5)

Unnamed: 0,2-aminoadipate,3-phosphoglycerate,alpha-glycerophosphate,4-pyridoxate,aconitate,adenine,adipate,alpha-ketoglutarate,AMP,citrate,...,IC50_PUBLISHED,AUC_PUBLISHED,upper_limit,ec50,slope,lower_limit,auc,log2.ic50,mse,R2
0,0.3953,0.621262,0.546764,0.459783,0.417356,0.380599,0.587679,0.438061,0.405262,0.587155,...,0.225842,0.970965,8e-06,4.798431e-226,0.786835,1.0,0.998739,0.210298,5e-06,0.998616
1,0.3953,0.621262,0.546764,0.459783,0.417356,0.380599,0.587679,0.438061,0.405262,0.587155,...,0.012788,0.816429,7e-06,3.0021060000000002e-227,0.932907,1.0,0.875311,0.210298,0.043689,0.512986
2,0.21583,0.454296,0.314096,0.491937,0.544737,0.363184,0.521658,0.473832,0.353421,0.699079,...,0.823701,0.964819,0.0,0.0,1.0,1.0,0.0,0.210298,0.0,0.206276
3,0.21583,0.454296,0.314096,0.491937,0.544737,0.363184,0.521658,0.473832,0.353421,0.699079,...,0.037639,0.840097,3.5e-05,1.020485e-241,0.981598,1.0,0.875887,0.210298,0.061438,0.206246
4,0.319421,0.37158,0.441413,0.392502,0.48412,0.383373,0.526204,0.507683,0.347641,0.558086,...,0.192989,0.980164,0.0,0.0,1.0,1.0,0.0,0.210298,0.0,0.206276


In [11]:
#calculate the mean of the IC50_PUBLISHED

drug_mean = X_scaled['IC50_PUBLISHED'].mean()

In [12]:
#create labels

ic, labels = X_scaled['IC50_PUBLISHED'], []
for i in range(len(ic)):
    #if the IC50_PUBLISHED value is greater than the mean, add responsive label to the label list
    if ic[i] > drug_mean:
        labels.append('R')
    #if the IC50_PUBLISHED value is less than the mean, add non responsive label to the label list
    elif ic[i] < drug_mean:
        labels.append('NR')
        
y = pd.Series(labels)

In [13]:
#remove unnecessary features, keeping just metabolite values in the final dataset

X = X_scaled.loc[:, :last_column]
X.shape, X1.shape

((1196, 225), (1196, 244))

In [14]:
#calculate the correlation matrix of the metabolite dataframe
#choose the upper triangle of the correlation matrix
#create a list of features where the correlation value is >0.90
#this list contains the highly correlated features, which will be removed from the dataset

corr_matrix = X.corr().abs()
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))
to_drop = [column for column in upper.columns if any(upper[column]>0.90)]

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))


In [15]:
corr_matrix

Unnamed: 0,2-aminoadipate,3-phosphoglycerate,alpha-glycerophosphate,4-pyridoxate,aconitate,adenine,adipate,alpha-ketoglutarate,AMP,citrate,...,C56:8 TAG,C56:7 TAG,C56:6 TAG,C56:5 TAG,C56:4 TAG,C56:3 TAG,C56:2 TAG,C58:8 TAG,C58:7 TAG,C58:6 TAG
2-aminoadipate,1.000000,0.038103,0.035480,0.027591,0.155489,0.018590,0.079931,0.149538,0.181314,0.075790,...,0.011597,0.029590,0.037614,0.032006,0.086984,0.113767,0.156126,0.031493,0.043190,0.051325
3-phosphoglycerate,0.038103,1.000000,0.009784,0.239749,0.403206,0.053397,0.241627,0.450780,0.138597,0.390334,...,0.302914,0.376897,0.416079,0.386933,0.278997,0.075583,0.064423,0.393599,0.404581,0.400276
alpha-glycerophosphate,0.035480,0.009784,1.000000,0.035248,0.186814,0.185430,0.076065,0.109466,0.481274,0.163502,...,0.126219,0.119268,0.075781,0.122072,0.156377,0.150552,0.204848,0.094785,0.095798,0.108696
4-pyridoxate,0.027591,0.239749,0.035248,1.000000,0.291602,0.062968,0.271884,0.181371,0.092723,0.234425,...,0.117592,0.210811,0.218029,0.198484,0.135079,0.011834,0.015629,0.162202,0.163908,0.214932
aconitate,0.155489,0.403206,0.186814,0.291602,1.000000,0.111368,0.252302,0.579400,0.084656,0.862259,...,0.383008,0.404966,0.483882,0.431955,0.375343,0.202428,0.223949,0.416177,0.414778,0.439305
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
C56:3 TAG,0.113767,0.075583,0.150552,0.011834,0.202428,0.062429,0.037976,0.225169,0.253747,0.202757,...,0.236529,0.487469,0.491457,0.598766,0.812033,1.000000,0.802306,0.523638,0.563950,0.562990
C56:2 TAG,0.156126,0.064423,0.204848,0.015629,0.223949,0.082367,0.137088,0.201324,0.324251,0.210500,...,0.203434,0.408570,0.402573,0.444476,0.639248,0.802306,1.000000,0.392014,0.420688,0.436988
C58:8 TAG,0.031493,0.393599,0.094785,0.162202,0.416177,0.066976,0.343298,0.359004,0.139444,0.412511,...,0.604492,0.899457,0.897015,0.867473,0.749032,0.523638,0.392014,1.000000,0.928354,0.904750
C58:7 TAG,0.043190,0.404581,0.095798,0.163908,0.414778,0.058003,0.326759,0.365021,0.146176,0.414322,...,0.545471,0.832878,0.877844,0.865709,0.767776,0.563950,0.420688,0.928354,1.000000,0.886665


In [16]:
len(to_drop)

21

In [17]:
#drop the list of highly correlated features computed above

X.drop(to_drop, axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [18]:
X.shape, y.shape

((1196, 204), (1196,))

In [19]:
#run the RFECV model with estimator being Random Forest and StratifiedKFold cross validation with 5 folds.

rfecv = RFECV(estimator = RandomForestClassifier(random_state=101), step=1, cv=StratifiedKFold(5), scoring='accuracy', min_features_to_select=3)

In [20]:
#fit the X,y to the RFECV model

rfecv.fit(X, y)

RFECV(cv=StratifiedKFold(n_splits=5, random_state=None, shuffle=False),
      estimator=RandomForestClassifier(random_state=101),
      min_features_to_select=3, scoring='accuracy')

In [21]:
#choose the features that are selected by the RFECV model

selected_features = rfecv.get_support(1)

In [22]:
#select a subset dataframe that contains only the "optimal" metabolic features returned from the RFECV model

X3 = X[X.columns[selected_features]]

In [23]:
#using classification_report metrics, run a prediction model using StratifiedKFold cross_validation with k=5 folds
#model being used as the classifier is Random Forest

kf3 = StratifiedKFold(n_splits = 5, shuffle=False)
model=RandomForestClassifier()
i=1
for train_index, test_index in kf3.split(X3, y):
    #select train and test datasets from X and y
    X_train, X_test = X3.iloc[train_index], X3.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    #train the model
    model.fit(X_train, y_train)
    #predict the test dataset
    predicted = model.predict(X_test)
    #print the classification score report
    print(f"{classification_report(y_test, predicted)}\n")
    i+=1

              precision    recall  f1-score   support

          NR       0.61      0.86      0.71       148
           R       0.32      0.11      0.16        92

    accuracy                           0.57       240
   macro avg       0.47      0.48      0.44       240
weighted avg       0.50      0.57      0.50       240


              precision    recall  f1-score   support

          NR       0.58      0.76      0.66       148
           R       0.22      0.11      0.15        91

    accuracy                           0.51       239
   macro avg       0.40      0.44      0.40       239
weighted avg       0.45      0.51      0.47       239


              precision    recall  f1-score   support

          NR       0.56      0.73      0.63       148
           R       0.11      0.05      0.07        91

    accuracy                           0.47       239
   macro avg       0.33      0.39      0.35       239
weighted avg       0.39      0.47      0.42       239


              pr

In [23]:
X_ = X.head(5)
y_ = y.head(5)

In [24]:
sfs = SFS(model, cv = StratifiedKFold(2), direction='forward')