In [63]:
import os 
import pandas as pd

In [64]:
main_dir = "E:\ML projects\MGMT"
os.chdir(main_dir)

Selecting patients with MGMT status available with struc features

In [76]:
df_t1_t2 = pd.read_csv("./Extracted feat combined/t1_t2_flair_t1gd.csv")
df_t1_t2.drop(['Unnamed: 0'], axis=1,inplace=True)
df_mgmt = pd.read_csv("UPENN-GBM_clinical_info_v1.0.csv")
df_mgmt.rename(columns={'ID':'SubjectID'}, inplace=True)

df_t1_t2 = df_t1_t2.merge(df_mgmt[['MGMT','SubjectID']], on='SubjectID', how='right')
df_t1_t2 = df_t1_t2.dropna(axis=0)
print("df_t1_t2.shape after combining mgmt status" + str(df_t1_t2.shape))

df_1 = df_t1_t2[df_t1_t2['MGMT'] == 'Unmethylated']
df_2 = df_t1_t2[df_t1_t2['MGMT'] == 'Methylated']
df_t1_t2 = pd.concat([df_2,df_1])
print("df_t1_t2.shape after selecting only available mgmt" + str(df_t1_t2.shape))

df_t1_t2.shape after combining mgmt status(598, 1730)
df_t1_t2.shape after selecting only available mgmt(256, 1730)


In [77]:
df_t1_t2.MGMT.value_counts()

Unmethylated    147
Methylated      109
Name: MGMT, dtype: int64

In [78]:
df_t1_t2['MGMT']=df_t1_t2['MGMT'].apply(lambda a :1 if a=='Methylated' else 0)
df_t1_t2['MGMT'].value_counts()

0    147
1    109
Name: MGMT, dtype: int64

#### test train split

In [168]:
from sklearn.model_selection import train_test_split
train_x, test_x, train_y, test_y = train_test_split(df_t1_t2.drop(['MGMT'], axis=1), df_t1_t2.MGMT, test_size=0.10, random_state=42, stratify=df_t1_t2['MGMT'])

In [169]:
print("train size - " + str(train_x.shape))
print("test size - " + str(test_x.shape))

train size - (230, 1729)
test size - (26, 1729)


#### Feature selection using SVM

In [171]:
# function for rfe 
from sklearn.feature_selection import RFE
def rfe_feature_selection(algo,X,y,n_feature_selection,step):
  rfe=RFE(algo,n_features_to_select=n_feature_selection,step=step)
  return rfe.fit(X,y)

Scaling features

In [172]:
from sklearn.preprocessing import StandardScaler
x_scaled = StandardScaler().fit(train_x.drop(['SubjectID'], axis=1))
train_x_scaled = x_scaled.transform(train_x.drop(['SubjectID'], axis=1))

get selected feature idx

In [173]:
from sklearn.svm import SVC
svc_model = rfe_feature_selection(SVC(kernel='linear'), train_x_scaled, train_y, 1000, 10)
sel_feat_idx=svc_model.get_support(indices=True)

try with no split, predict test

In [175]:
svc=SVC(kernel='linear')
svc.fit(np.take(train_x_scaled,sel_feat_idx,axis=1),train_y)

test_x_scaled = x_scaled.transform(test_x.drop(['SubjectID'], axis=1))
test_x_scaled_sel=np.take(test_x_scaled,sel_feat_idx,axis=1)

# from sklearn.metrics import accuracy_score
# y_pred=svc.predict(test_x_scaled_sel)
# y_pred=np.where(y_pred>0.5,1,0)
# accuracy_score(test_y,y_pred)

### Cross Validation

In [176]:
import numpy as np
train_x_scaled_sel = np.take(train_x_scaled,sel_feat_idx, axis=1)

In [177]:
from sklearn.model_selection import StratifiedKFold,cross_val_score

cross_val_score(SVC(kernel='linear'),
                train_x_scaled_sel ,
                train_y,cv=10,)
    

array([0.73913043, 0.43478261, 0.7826087 , 0.65217391, 0.56521739,
       0.69565217, 0.56521739, 0.56521739, 0.60869565, 0.73913043])

In [178]:
from sklearn.metrics import accuracy_score

kfd=StratifiedKFold(n_splits=10)
for train_idx,val_idx in kfd.split(train_x_scaled_sel,train_y):
    x_train = np.take(train_x_scaled_sel,train_idx,axis=0)
    x_val = np.take(train_x_scaled_sel,val_idx,axis=0)
    y_train = np.take(train_y,train_idx,axis=0)
    y_val = np.take(train_y,val_idx,axis=0)

    history = SVC(kernel='linear').fit(x_train,y_train)
    y_pred = history.predict(x_val)
    acc = accuracy_score(y_val,y_pred)
    print("validation acc - " + str(acc))

    y_pred_test=history.predict(test_x_scaled_sel)
    # y_pred_test=np.where(y_pred_test>0.5,1,0)
    test_acc = accuracy_score(test_y,y_pred_test)
    print("Test acc - " + str(test_acc))

validation acc - 0.7391304347826086
Test acc - 0.5
validation acc - 0.43478260869565216
Test acc - 0.5
validation acc - 0.782608695652174
Test acc - 0.46153846153846156
validation acc - 0.6521739130434783
Test acc - 0.5
validation acc - 0.5652173913043478
Test acc - 0.5384615384615384
validation acc - 0.6956521739130435
Test acc - 0.5
validation acc - 0.5652173913043478
Test acc - 0.5384615384615384
validation acc - 0.5652173913043478
Test acc - 0.5
validation acc - 0.6086956521739131
Test acc - 0.46153846153846156
validation acc - 0.7391304347826086
Test acc - 0.5384615384615384
