In [1]:
import os 
import pandas as pd
import numpy as np

In [2]:
main_dir = "E:\MGMT patch detection\mgmt-patch-model"
os.chdir(main_dir)

Selecting patients with MGMT status available with struc features

In [3]:
df_t1_t2 = pd.read_csv("./Extracted feat combined/t1_t2_flair_t1gd.csv")
df_t1_t2.drop(['Unnamed: 0'], axis=1,inplace=True)
df_mgmt = pd.read_csv("UPENN-GBM_clinical_info_v1.0.csv")
df_mgmt.rename(columns={'ID':'SubjectID'}, inplace=True)

df_t1_t2 = df_t1_t2.merge(df_mgmt[['MGMT','SubjectID']], on='SubjectID', how='right')
df_t1_t2 = df_t1_t2.dropna(axis=0)
print("df_t1_t2.shape after combining mgmt status" + str(df_t1_t2.shape))

df_1 = df_t1_t2[df_t1_t2['MGMT'] == 'Unmethylated']
df_2 = df_t1_t2[df_t1_t2['MGMT'] == 'Methylated']
df_t1_t2 = pd.concat([df_2,df_1])
print("df_t1_t2.shape after selecting only available mgmt" + str(df_t1_t2.shape))

df_t1_t2.shape after combining mgmt status(598, 1730)
df_t1_t2.shape after selecting only available mgmt(256, 1730)


In [4]:
# Getting number of methylated and unmethylated
df_t1_t2.MGMT.value_counts()

Unmethylated    147
Methylated      109
Name: MGMT, dtype: int64

Converting : Methylated as 1 and Unmethylated as 0 

In [5]:
df_t1_t2['MGMT']=df_t1_t2['MGMT'].apply(lambda a :1 if a=='Methylated' else 0)

### **Data Preprocessing**

In [6]:
### Reducing features using Variance Threshold
from sklearn.feature_selection import VarianceThreshold
vt=VarianceThreshold()
vt.fit(df_t1_t2.drop(['MGMT','SubjectID'],axis=1))
df_t1_t2_vt=vt.transform(df_t1_t2.drop(['MGMT','SubjectID'],axis=1))
df_t1_t2_vt.shape

(256, 1644)

### Training the features

#### Train Test Split

In [7]:
from sklearn.model_selection import train_test_split
train_x, test_x, train_y, test_y = train_test_split(df_t1_t2_vt, 
                                                    df_t1_t2.MGMT, test_size=0.20, 
                                                    random_state=42, 
                                                    stratify=df_t1_t2['MGMT'])
print("train size - " + str(train_x.shape))
print("test size - " + str(test_x.shape))

train size - (204, 1644)
test size - (52, 1644)


#### Feature selection using SVM

In [8]:
# function for rfe 
from sklearn.feature_selection import RFE
def rfe_feature_selection(algo,X,y,n_feature_selection,step):
  rfe=RFE(algo,n_features_to_select=n_feature_selection,step=step)
  return rfe.fit(X,y)

Scaling features

In [9]:
from sklearn.preprocessing import StandardScaler
x_scaled = StandardScaler().fit(train_x)
train_x_scaled = x_scaled.transform(train_x)

get selected feature idx

In [10]:
from sklearn.svm import SVC
svc_model = rfe_feature_selection(SVC(kernel='linear'), train_x_scaled, train_y, 1000, 10)
sel_feat_idx=svc_model.get_support(indices=True)

try with no split, predict test

In [11]:
svc=SVC(kernel='linear')
svc.fit(np.take(train_x_scaled,sel_feat_idx,axis=1),train_y)

test_x_scaled = x_scaled.transform(test_x)
test_x_scaled_sel=np.take(test_x_scaled,sel_feat_idx,axis=1)

from sklearn.metrics import accuracy_score
y_pred=svc.predict(test_x_scaled_sel)
y_pred=np.where(y_pred>0.5,1,0)
accuracy_score(test_y,y_pred)


0.6153846153846154

### Cross Validation

In [12]:
import numpy as np
train_x_scaled_sel = np.take(train_x_scaled,sel_feat_idx, axis=1)

In [13]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold,cross_val_score
k=10
kfd=StratifiedKFold(n_splits=k)
print("Cross Validation on "+str(k)+"Folds")
for train_idx,val_idx in kfd.split(train_x_scaled_sel,train_y):
    x_train = np.take(train_x_scaled_sel,train_idx,axis=0)
    x_val = np.take(train_x_scaled_sel,val_idx,axis=0)
    y_train = np.take(train_y,train_idx,axis=0)
    y_val = np.take(train_y,val_idx,axis=0)

    history = SVC(kernel='linear').fit(x_train,y_train)
    y_pred = history.predict(x_val)
    acc = accuracy_score(y_val,y_pred)
    print("validation acc - " + str(acc))

    y_pred_test=history.predict(test_x_scaled_sel)
    # y_pred_test=np.where(y_pred_test>0.5,1,0)
    test_acc = accuracy_score(test_y,y_pred_test)
    print("Test acc - " + str(test_acc))

Cross Validation on 10Folds
validation acc - 0.6190476190476191
Test acc - 0.5769230769230769
validation acc - 0.5238095238095238
Test acc - 0.5384615384615384
validation acc - 0.5714285714285714
Test acc - 0.6153846153846154
validation acc - 0.47619047619047616
Test acc - 0.5576923076923077
validation acc - 0.7
Test acc - 0.5961538461538461
validation acc - 0.5
Test acc - 0.5576923076923077
validation acc - 0.5
Test acc - 0.5769230769230769
validation acc - 0.5
Test acc - 0.5384615384615384
validation acc - 0.6
Test acc - 0.6730769230769231
validation acc - 0.7
Test acc - 0.5961538461538461
