In [1]:
import os 
import pandas as pd
import numpy as np

In [2]:
# Raw string literal: "\M" and "\m" are not valid escape sequences, so the plain
# literal triggers an invalid-escape warning on recent Python versions. The
# resulting path value is unchanged.
# NOTE(review): machine-specific absolute path; consider making it configurable.
main_dir = r"E:\MGMT patch detection\mgmt-patch-model"
os.chdir(main_dir)

Selecting patients for whom both MGMT status and structural features are available

In [3]:
# Feature table; the 'Unnamed: 0' column is discarded right after loading.
df_t1_t2 = pd.read_csv("./Extracted feat combined/t1_t2_flair_t1gd.csv").drop(columns=['Unnamed: 0'])

# Clinical table; its 'ID' column is renamed so it can serve as the merge key.
df_mgmt = pd.read_csv("UPENN-GBM_clinical_info_v1.0.csv").rename(columns={'ID': 'SubjectID'})

# Right join keeps every clinical row; any row left with a missing value
# (features or MGMT) is then removed.
df_t1_t2 = (
    df_t1_t2
    .merge(df_mgmt[['MGMT', 'SubjectID']], on='SubjectID', how='right')
    .dropna(axis=0)
)
print(f"df_t1_t2.shape after combining mgmt status{df_t1_t2.shape}")

# Keep only rows labelled Methylated or Unmethylated, Methylated rows first.
df_2 = df_t1_t2[df_t1_t2['MGMT'] == 'Methylated']
df_1 = df_t1_t2[df_t1_t2['MGMT'] == 'Unmethylated']
df_t1_t2 = pd.concat([df_2, df_1])
print(f"df_t1_t2.shape after selecting only available mgmt{df_t1_t2.shape}")

df_t1_t2.shape after combining mgmt status(598, 1730)
df_t1_t2.shape after selecting only available mgmt(256, 1730)


In [4]:
# Class balance: number of Methylated vs. Unmethylated rows
df_t1_t2['MGMT'].value_counts()

Unmethylated    147
Methylated      109
Name: MGMT, dtype: int64

Encoding the label: Methylated → 1, Unmethylated → 0

In [5]:
# Encode the label as an integer: Methylated -> 1, Unmethylated -> 0.
# Already-encoded values (1/0) map to themselves, so re-running this cell is a
# no-op. The previous lambda turned every label into 0 on a second run because
# 1 != 'Methylated'. Any other value raises KeyError instead of silently
# becoming 0.
mgmt_codes = {'Methylated': 1, 'Unmethylated': 0, 1: 1, 0: 0}
df_t1_t2['MGMT'] = df_t1_t2['MGMT'].map(lambda label: mgmt_codes[label]).astype('int64')

### **Data Preprocessing**

In [6]:
### Reducing features using Variance Threshold
from sklearn.feature_selection import VarianceThreshold

# Candidate features: every column except the label and the subject identifier.
feature_frame = df_t1_t2.drop(columns=['MGMT', 'SubjectID'])

# VarianceThreshold with its default settings; the fitted selector stays in `vt`.
vt = VarianceThreshold()
df_t1_t2_vt = vt.fit_transform(feature_frame)
df_t1_t2_vt.shape

(256, 1644)

### Training the features

#### Train Test Split

In [7]:
from sklearn.model_selection import train_test_split

# 80/20 hold-out split with a fixed seed, stratified on the MGMT label.
train_x, test_x, train_y, test_y = train_test_split(
    df_t1_t2_vt,
    df_t1_t2['MGMT'],
    test_size=0.20,
    random_state=42,
    stratify=df_t1_t2['MGMT'],
)
print(f"train size - {train_x.shape}")
print(f"test size - {test_x.shape}")

train size - (204, 1644)
test size - (52, 1644)


#### Feature selection using SVM

In [28]:
# function for rfe
from sklearn.feature_selection import RFE
def rfe_feature_selection(algo, X, y, n_features_to_select=None, step=1):
    """Run recursive feature elimination and return the selected column indices.

    Parameters
    ----------
    algo : estimator
        Unfitted estimator passed to ``RFE`` as ``estimator``.
    X : array-like
        Training features passed to ``RFE.fit``.
    y : array-like
        Training labels passed to ``RFE.fit``.
    n_features_to_select : int, float or None, default None
        Forwarded to ``RFE``. ``None`` keeps RFE's own default, which is the
        behaviour this helper had before the parameter existed.
    step : int or float, default 1
        Forwarded to ``RFE``; 1 is RFE's own default.

    Returns
    -------
    numpy.ndarray
        Integer indices of the columns of ``X`` that RFE retained.
    """
    rfe = RFE(estimator=algo, n_features_to_select=n_features_to_select, step=step)
    rfe.fit(X, y)
    return rfe.get_support(indices=True)

Scaling features

In [29]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split only; the fitted object is kept in
# `x_scaled` so the same transformation can be applied to other data.
x_scaled = StandardScaler()
x_scaled.fit(train_x)
train_x_scaled = x_scaled.transform(train_x)

Get the indices of the selected features

In [30]:
from sklearn.svm import SVC

# Run RFE with a linear-kernel SVC on the scaled training data and keep the
# indices of the features it retains.
linear_svc = SVC(kernel='linear')
svc_feat_index = rfe_feature_selection(linear_svc, train_x_scaled, train_y)
svc_feat_index.shape

(822,)

Try without cross-validation: fit on the training split and predict on the held-out test set

In [67]:
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import RFE
def test_score(model):
    """Select features with RFE, refit `model` on them and score it on the test split.

    Reads the notebook-level variables `train_x_scaled`, `train_y`, `x_scaled`,
    `test_x` and `test_y`.

    Parameters
    ----------
    model : estimator
        Used both as the RFE estimator and as the final classifier. It is
        fitted in place on the selected training columns.

    Returns
    -------
    float
        Accuracy of `model` on the test split.
    """
    # The previous call passed five positional arguments to the three-parameter
    # helper `rfe_feature_selection` and then called `.get_support` on the index
    # array that helper returns, so it failed on a fresh kernel. RFE is used
    # directly here instead.
    # NOTE(review): 1000 and 10 are assumed to mean n_features_to_select and
    # step (the next two RFE options after the estimator); confirm.
    selector = RFE(estimator=model, n_features_to_select=1000, step=10)
    selector.fit(train_x_scaled, train_y)
    sel_feat_idx = selector.get_support(indices=True)

    model.fit(np.take(train_x_scaled, sel_feat_idx, axis=1), train_y)

    # Apply the scaler fitted on the training split, then keep the same columns.
    test_x_scaled = x_scaled.transform(test_x)
    test_x_scaled_sel = np.take(test_x_scaled, sel_feat_idx, axis=1)

    y_pred = model.predict(test_x_scaled_sel)
    # Leaves 0/1 label predictions unchanged; kept so that a model returning
    # scores is still thresholded at 0.5.
    y_pred = np.where(y_pred > 0.5, 1, 0)
    return accuracy_score(test_y, y_pred)



In [74]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
import xgboost as xgb

# Candidate classifiers compared on the single train/test split.
models_list = [
    LogisticRegression(max_iter=1000),
    SVC(kernel='linear'),
    DecisionTreeClassifier(),
    xgb.XGBClassifier(objective="binary:logistic", random_state=42),
]

# Print one "<estimator name> - <test accuracy>" line per model; the name is
# the part of the estimator's string form before the first "(".
for model in models_list:
    name = str(model).split('(', 1)[0]
    score = str(test_score(model))
    print(' - '.join([name, score]))

LogisticRegression - 0.5961538461538461
SVC - 0.6153846153846154
DecisionTreeClassifier - 0.5961538461538461
XGBClassifier - 0.5576923076923077


SVC achieves the highest accuracy on the test data for this single split

### Cross Validation

In [13]:
import numpy as np

# `sel_feat_idx` is never assigned at notebook level (it exists only as a local
# inside `test_score`), so this cell raised NameError on a fresh kernel.
# NOTE(review): assumed to be the SVC-based RFE selection `svc_feat_index`; confirm.
sel_feat_idx = svc_feat_index
train_x_scaled_sel = np.take(train_x_scaled, sel_feat_idx, axis=1)

# The cross-validation cell reads `test_x_scaled_sel`, which was likewise only
# a local of `test_score`. Build it with the scaler fitted on the training
# split and the same selected columns.
test_x_scaled_sel = np.take(x_scaled.transform(test_x), sel_feat_idx, axis=1)

In [14]:
### List of Models
from sklearn.linear_model import LogisticRegression
# The second import was truncated ("from sklearn."), which is a SyntaxError;
# SVC is the only other estimator this cell uses.
from sklearn.svm import SVC

# Models evaluated in the cross-validation loop.
models_list = [LogisticRegression(), SVC(kernel='linear')]

In [56]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold,cross_val_score

model_names = ['Logistic Regression', 'SVM']   # column labels, same order as models_list
acc_list = ['Validation Accuracy', 'Test Accuracy']
cv = 10
kfd = StratifiedKFold(n_splits=cv)
print("Cross Validation on "+str(cv)+"Folds")

# Plain array of labels so the fold indices from `split` can be used positionally.
y_all = np.asarray(train_y)

# Collect the rows first and build the frame once. The previous version started
# from an all-NaN placeholder frame, concatenated onto it in every fold, and
# hid the placeholder rows with `df_model_acc[2:]`.
fold_rows = []
fold_index = []
for fold, (train_idx, val_idx) in enumerate(kfd.split(train_x_scaled_sel, train_y), start=1):
    x_train = np.take(train_x_scaled_sel, train_idx, axis=0)
    x_val = np.take(train_x_scaled_sel, val_idx, axis=0)
    y_train = np.take(y_all, train_idx, axis=0)
    y_val = np.take(y_all, val_idx, axis=0)

    val_accuracies = []
    test_accuracies = []
    for model in models_list:
        # Each model is refitted in place on this fold's training part, then
        # scored on the fold's validation part and on the held-out test split.
        fitted = model.fit(x_train, y_train)
        val_accuracies.append(accuracy_score(y_val, fitted.predict(x_val)))
        test_accuracies.append(accuracy_score(test_y, fitted.predict(test_x_scaled_sel)))

    # Two rows per fold: (fold, 'Validation Accuracy') and (fold, 'Test Accuracy').
    fold_rows.extend([val_accuracies, test_accuracies])
    fold_index.extend((fold, acc_name) for acc_name in acc_list)

# `df_model_acc` no longer contains placeholder rows, so it is shown whole.
df_model_acc = pd.DataFrame(fold_rows,
                            columns=model_names,
                            index=pd.MultiIndex.from_tuples(fold_index))
df_model_acc
    

Cross Validation on 10Folds


Unnamed: 0,Unnamed: 1,Logistic Regression,SVM
1,Validation Accuracy,0.666667,0.619048
1,Test Accuracy,0.519231,0.576923
2,Validation Accuracy,0.52381,0.52381
2,Test Accuracy,0.557692,0.538462
3,Validation Accuracy,0.52381,0.571429
3,Test Accuracy,0.596154,0.615385
4,Validation Accuracy,0.428571,0.47619
4,Test Accuracy,0.576923,0.557692
5,Validation Accuracy,0.5,0.7
5,Test Accuracy,0.615385,0.596154
