In [240]:
import os 
import pandas as pd
import numpy as np

In [241]:
# Project root; the relative data paths used by later cells are resolved against it.
# Raw string literal: "\M" and "\m" are not valid escape sequences, so a plain
# literal triggers a SyntaxWarning on modern Python. The path value is unchanged.
main_dir = r"E:\MGMT patch detection\mgmt-patch-model"
os.chdir(main_dir)

Selecting patients that have both an MGMT status and structural features available

In [242]:
# Feature table: read it and discard the 'Unnamed: 0' column
# (presumably a row index saved with the CSV -- TODO confirm).
df_t1_t2 = pd.read_csv("./Extracted feat combined/t1_t2_flair_t1gd.csv").drop(columns=['Unnamed: 0'])

# Clinical table: expose its 'ID' column as 'SubjectID' so both tables share a join key.
df_mgmt = pd.read_csv("UPENN-GBM_clinical_info_v1.0.csv").rename(columns={'ID': 'SubjectID'})

# The right join keeps every clinical row; dropping rows that contain any NaN
# afterwards removes the rows that are missing a feature value or an MGMT entry.
df_t1_t2 = (
    df_t1_t2
    .merge(df_mgmt[['MGMT', 'SubjectID']], on='SubjectID', how='right')
    .dropna(axis=0)
)
print("df_t1_t2.shape after combining mgmt status" + str(df_t1_t2.shape))

# Keep only rows labelled 'Methylated' or 'Unmethylated'; methylated rows are stacked first.
df_2 = df_t1_t2[df_t1_t2['MGMT'] == 'Methylated']
df_1 = df_t1_t2[df_t1_t2['MGMT'] == 'Unmethylated']
df_t1_t2 = pd.concat([df_2, df_1])
print("df_t1_t2.shape after selecting only available mgmt" + str(df_t1_t2.shape))

df_t1_t2.shape after combining mgmt status(598, 1730)
df_t1_t2.shape after selecting only available mgmt(256, 1730)


In [243]:
# Class balance: number of rows per MGMT label.
df_t1_t2['MGMT'].value_counts()

Unmethylated    147
Methylated      109
Name: MGMT, dtype: int64

Encoding the label: Methylated as 1 and Unmethylated as 0

In [244]:
# Encode the label as an integer: Methylated -> 1, anything else -> 0.
# An existing 1 is also mapped to 1 so that re-running this cell is a no-op;
# comparing against 'Methylated' alone would turn every already-encoded label into 0.
df_t1_t2['MGMT'] = df_t1_t2['MGMT'].apply(lambda a: 1 if a in ('Methylated', 1) else 0)

### **Data Preprocessing**

In [245]:
### Reducing features using Variance Threshold
from sklearn.feature_selection import VarianceThreshold
# Remove low-variance columns (threshold 0.1). The label and the subject
# identifier are excluded from the feature matrix first.
# NOTE(review): the selector is fitted on all rows, i.e. before the train/test
# split performed later in the notebook.
feature_frame = df_t1_t2.drop(columns=['MGMT', 'SubjectID'])
vt = VarianceThreshold(threshold=0.1)
df_t1_t2_vt = vt.fit_transform(feature_frame)
df_t1_t2_vt.shape

(256, 1103)

### Training the features

#### Train Test Split

In [246]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing. The split is stratified on the MGMT
# label and uses a fixed seed so it is repeatable.
mgmt_target = df_t1_t2['MGMT']
train_x, test_x, train_y, test_y = train_test_split(
    df_t1_t2_vt,
    mgmt_target,
    test_size=0.20,
    random_state=42,
    stratify=mgmt_target,
)
print(f"train size - {train_x.shape}")
print(f"test size - {test_x.shape}")

train size - (204, 1103)
test size - (52, 1103)


#### Feature selection techniques

In [247]:
# function for rfe 
from sklearn.feature_selection import RFE,RFECV
def rfe_feature_selection(algo, X, y, cv=10, min_features_to_select=1000, step=1):
    """Select features with cross-validated recursive feature elimination.

    Fits an ``RFECV`` selector around ``algo`` and returns the indices of the
    columns it keeps. The defaults reproduce the previously hard-coded
    settings (10-fold CV, at least 1000 features kept, one feature removed
    per iteration).

    Parameters
    ----------
    algo : estimator
        Estimator passed to ``RFECV``. Per the scikit-learn documentation it
        must expose feature importances after fitting (``coef_`` or
        ``feature_importances_``).
    X : array-like
        Feature matrix the selector is fitted on.
    y : array-like
        Target values.
    cv : int or cross-validation splitter, default=10
        Cross-validation strategy forwarded to ``RFECV``.
    min_features_to_select : int, default=1000
        Lower bound on the number of features kept.
    step : int or float, default=1
        Number (or fraction) of features removed per elimination round.

    Returns
    -------
    numpy.ndarray
        Integer indices of the selected columns of ``X``.
    """
    selector = RFECV(
        estimator=algo,
        step=step,
        cv=cv,
        min_features_to_select=min_features_to_select,
    )
    selector.fit(X, y)
    return selector.get_support(indices=True)

In [248]:
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier
# Feature selection driven by random-forest importances.
def rf(x, y, random_state=42):
    """Return the indices of the features kept by a random-forest-based selector.

    A ``RandomForestClassifier`` is wrapped in ``SelectFromModel`` and fitted
    on ``(x, y)``; the selector's default threshold decides which features
    are kept.

    Parameters
    ----------
    x : array-like
        Feature matrix.
    y : array-like
        Target values.
    random_state : int, RandomState instance or None, default=42
        Seed for the forest. A fixed seed makes the selected feature set
        reproducible between runs; pass ``None`` for the previous unseeded
        behaviour.

    Returns
    -------
    numpy.ndarray
        Integer indices of the selected columns of ``x``.
    """
    forest = RandomForestClassifier(random_state=random_state)
    select_feat = SelectFromModel(estimator=forest).fit(x, y)
    return select_feat.get_support(indices=True)

#### Features Scaling

In [249]:
from sklearn.preprocessing import StandardScaler,MinMaxScaler
# Standardise the features with statistics learned from the training split only,
# then apply that same fitted transformation to the held-out test split.
# Note: despite its name, `x_scaled` holds the fitted scaler, not scaled data.
x_scaled = StandardScaler()
train_x_scaled = x_scaled.fit_transform(train_x)
test_x_scaled = x_scaled.transform(test_x)

### **Cross Validation**

Calling RFECV

In [250]:
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
# Estimator instances; only the linear SVC is used within this cell.
svc = SVC(C=0.7, kernel='linear')
knn = KNeighborsClassifier(n_neighbors=7)
lr = LogisticRegression()

# Getting features using RFECV, fitted on the scaled training split.
rfecv_feat_idx = rfe_feature_selection(algo=svc, X=train_x_scaled, y=train_y)
rfecv_feat_idx.shape

(1000,)

In [251]:
# from sklearn.model_selection import StratifiedKFold
# from sklearn.svm import SVC
# k_folds=10
# cv_sel_idx_list=[]
# skfold=StratifiedKFold(n_splits=k_folds,random_state=42,shuffle=True)
# for train_idx,val_idx in skfold.split(train_x_scaled,train_y):
#     x_train = np.take(train_x_scaled,train_idx,axis=0) # This train is the train set in validation
#     x_val = np.take(train_x_scaled,val_idx,axis=0)
#     y_train = np.take(train_y,train_idx,axis=0)
#     y_val = np.take(train_y,val_idx,axis=0)
#     # cv_sel_idx=rfe_feature_selection(SVC(kernel='linear'),x_train,y_train)
#     cv_sel_idx=rf(x_train,y_train)
#     cv_sel_idx_list.append(cv_sel_idx)
    
    

In [252]:
# ### Concatenating the selected indexes from cv_sel_idx_list
# concat_sel_idx=cv_sel_idx_list[0]
# for i in list(range(1,10)):
#     concat_sel_idx=np.concatenate((concat_sel_idx,cv_sel_idx_list[i]))
    
# concat_sel_idx=np.unique(concat_sel_idx)
# concat_sel_idx.shape

In [253]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold
# Stratified, shuffled 10-fold cross-validation over the training split,
# restricted to the columns chosen by RFECV.
# NOTE(review): the column indices were selected on the whole training split
# before these folds are formed, and the test split is scored once per fold.
k_folds = 10
skfold = StratifiedKFold(n_splits=k_folds, shuffle=True, random_state=42)

sel_train_x = np.take(train_x_scaled, rfecv_feat_idx, axis=1)
sel_test_x = np.take(test_x_scaled, rfecv_feat_idx, axis=1)

metrics = []
for train_idx, val_idx in skfold.split(sel_train_x, train_y):
    # Positional row selection for this fold: fitting part and validation part.
    x_train, x_val = sel_train_x[train_idx], sel_train_x[val_idx]
    y_train, y_val = train_y.iloc[train_idx], train_y.iloc[val_idx]

    # A fresh polynomial-kernel SVC is trained for every fold.
    model_svc = SVC(kernel='poly', C=0.7)
    model_svc.fit(x_train, y_train)

    pred_val_y = model_svc.predict(x_val)
    pred_test_y = model_svc.predict(sel_test_x)

    # One row per fold: [validation accuracy, test accuracy].
    val_accuracy = accuracy_score(y_val, pred_val_y)
    test_accuracy = accuracy_score(test_y, pred_test_y)
    metrics.append([val_accuracy, test_accuracy])
    
    

In [254]:
# Per-fold results: each row is [validation accuracy, test accuracy].
metrics

[[0.5714285714285714, 0.5769230769230769],
 [0.6190476190476191, 0.5576923076923077],
 [0.5714285714285714, 0.5769230769230769],
 [0.5714285714285714, 0.5769230769230769],
 [0.6, 0.5769230769230769],
 [0.55, 0.5769230769230769],
 [0.6, 0.5769230769230769],
 [0.55, 0.5769230769230769],
 [0.6, 0.5769230769230769],
 [0.55, 0.5769230769230769]]

get selected feature idx

In [255]:
# Summarise the per-fold results column-wise:
# column 0 is validation accuracy, column 1 is test accuracy.
metrics = np.array(metrics)
avg_accuracy = metrics.mean(axis=0)
std_accuracy = metrics.std(axis=0)

# Only the test-set column (index 1) is reported.
test_mean, test_std = avg_accuracy[1], std_accuracy[1]
print("Accuracy : " + str(test_mean) + " +/- " + str(test_std))

Accuracy : 0.5749999999999998 +/- 0.0057692307692307505


In [256]:

# svc_feat_index = rfe_feature_selection(SVC(kernel='linear'), train_x_scaled, train_y)
# svc_feat_index.shape