### **Import Libraries**

In [1]:
import os 
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

### **Loading Data**

In [2]:
# Loading t1_t2_flair_t1gd data
df_t1_t2 = pd.read_csv("../Extracted feat combined/t1_t2_flair_t1gd.csv")
df_t1_t2.drop(['Unnamed: 0'], axis=1,inplace=True)

# Loading Clinical Info
df_mgmt_clinical = pd.read_csv("../UPENN-GBM_clinical_info_v1.0.csv")
df_mgmt_clinical.rename(columns={'ID':'SubjectID'}, inplace=True)  # Renaming ID to SubjectID

# Merging the above two CSV's
df_t1_t2_mgmt = df_t1_t2.merge(df_mgmt_clinical[['Survival_from_surgery_days','SubjectID','MGMT']], on='SubjectID', how='right')
df_t1_t2_mgmt.rename(columns={'Survival_from_surgery_days':'Survival_days'}, inplace=True)
df_t1_t2_mgmt = df_t1_t2_mgmt.dropna(axis=0)
df_t1_t2_mgmt=df_t1_t2_mgmt.loc[df_t1_t2_mgmt['MGMT'].isin(['Methylated','Unmethylated'])]
df_t1_t2_mgmt=df_t1_t2_mgmt.loc[df_t1_t2_mgmt['Survival_days']!='Not Available']
print('Shape of dataset: '+str(df_t1_t2_mgmt.shape))

Shape of dataset: (209, 1731)


In [3]:
# Classifing the Survival Days into Short period and Long period
# More than 365 days are long period
# Less than or equals 365 are short period
df_t1_t2_mgmt['Survival_days']=df_t1_t2_mgmt['Survival_days'].apply(lambda x: 'long' if int(x)>365 else 'short')

# Converting long and short labels to 1 and 0 simultaneosly
df_t1_t2_mgmt['Survival_days']=df_t1_t2_mgmt['Survival_days'].apply(lambda x: 1 if x=='long' else 0)

# Methylated coded to 1 and Unmethylated coded to 0
df_t1_t2_mgmt['MGMT']=df_t1_t2_mgmt['MGMT'].apply(lambda a :1 if a=='Methylated' else 0)

# Value couts of MGMT
print(df_t1_t2_mgmt['MGMT'].value_counts())

# Value couts of Survival 
df_t1_t2_mgmt['Survival_days'].value_counts()

0    136
1     73
Name: MGMT, dtype: int64


1    109
0    100
Name: Survival_days, dtype: int64

## **Survival Prediction** 

In [4]:
survival_train_x, survival_test_x, survival_train_y, survival_test_y = train_test_split(df_t1_t2_mgmt.drop(['MGMT',
                                                                                                            'Survival_days',
                                                                                                            'SubjectID'],
                                                                                                           axis=1), 
                                                    df_t1_t2_mgmt['Survival_days'], test_size=0.24, 
                                                    random_state=100, 
                                                    stratify=df_t1_t2_mgmt['Survival_days'])
print("Survival train size - " + str(survival_train_x.shape))
print("Survival test size - " + str(survival_test_x.shape))

Survival train size - (158, 1728)
Survival test size - (51, 1728)


#### **Variance Threshold**

In [5]:
### Reducing features using Variance Threshold
threshold=0.4
low_var_col= [x for x in survival_train_x.columns if survival_train_x[x].var()<threshold]
survival_train_x.drop(low_var_col,axis=1,inplace=True)
survival_test_x.drop(low_var_col,axis=1,inplace=True)
print('Features after variance threshold: '+str(survival_train_x.shape[1]))

Features after variance threshold: 1020


#### **Correlation Coeficient**

In [6]:
def correlation(dataset, threshold):
    col_corr = set()  # Set of all the names of correlated columns
    corr_matrix = dataset.corr()
    for i in range(len(corr_matrix.columns)):
        for j in range(i):
            if abs(corr_matrix.iloc[i, j]) > threshold: # we are interested in absolute coeff value
                colname = corr_matrix.columns[i]  # getting the name of column
                col_corr.add(colname)
    return col_corr

correlated_idx=correlation(survival_train_x,0.85)
survival_train_x.drop(correlated_idx,axis=1,inplace=True)
survival_test_x.drop(correlated_idx,axis=1,inplace=True)
print("Shape of dataset after correlation : ",str(survival_train_x.shape))

Shape of dataset after correlation :  (158, 302)


#### **RFECV**

In [7]:
# Function for rfecv
from sklearn.feature_selection import RFE,RFECV
def rfe_feature_selection(algo,X,y,cv):
  rfe=RFECV(estimator=algo,cv=cv,min_features_to_select=20)
  rfe.fit(X,y)
  rfe_sel_index=rfe.get_support(indices=True)
  return rfe_sel_index

#### **Feature Scaling**

In [8]:
from sklearn.preprocessing import StandardScaler,MinMaxScaler
survival_x_scaled = StandardScaler().fit(survival_train_x)
survival_train_x_scaled = survival_x_scaled.transform(survival_train_x)
survival_test_x_scaled=survival_x_scaled.transform(survival_test_x)

In [24]:
import xgboost as xgb
import lightgbm as lgb
from boruta import BorutaPy
xgb_model=xgb.XGBClassifier()
survival_feat_sel_boruta=BorutaPy(xgb_model,n_estimators='auto',verbose=0,random_state=1)
survival_feat_sel_boruta.fit(survival_train_x_scaled,survival_train_y)
survival_boruta_rank=survival_feat_sel_boruta.ranking_
survival_boruta_rank

array([ 18, 247,   9,  94,  76,  37, 247, 156,  23,  94, 156,  76,  94,
        94, 247, 247,  41, 247, 247, 156,  60,  15, 247, 156, 247,  94,
       247, 116, 247,  22,   5,  43, 156, 247, 156, 247, 247, 247, 247,
         6, 156,  17, 156, 156, 247,  94, 156, 247, 247,  94, 156, 247,
       156,  12,  21, 247, 116, 247,  46,  38,  76, 156, 247, 156, 156,
       247,  94, 156, 247, 156,  94, 247, 247, 156,  94, 156, 247,  31,
        94, 247,  94,  94, 247,  94, 247,  20,   7, 156, 247,  39,  60,
       247, 247, 247,  48, 247,  50, 247,  72, 156, 116, 247,  60, 156,
       247, 156, 247,  94, 156,  49,  33, 156, 156, 247,  60, 116,  19,
       247, 247, 116,  60, 156, 247, 156,  94, 247,  94,  60,  94, 247,
        72, 156, 156, 247, 247, 156,  35, 156, 247, 247, 247, 247, 247,
       156, 247, 247, 156, 247,  27, 116,  60, 156, 156,  94, 247,  94,
        41, 247,  10, 156,  60,  94,  51,   3,  11, 247,  94,  30, 156,
       247, 156,  60, 156, 247, 247, 247,  26,  34,  94, 247, 24

In [25]:
survival_boruta_rank.shape

(302,)

In [26]:
survival_col_dict_rank=dict(zip(survival_train_x.columns,survival_boruta_rank))
survival_boruta_sel_col=[col for col in survival_col_dict_rank.keys() if survival_col_dict_rank[col]<=50]
survival_boruta_train_x=survival_train_x[survival_boruta_sel_col]
survival_boruta_test_x=survival_test_x[survival_boruta_sel_col]

survival_boruta_x_scaled = StandardScaler().fit(survival_boruta_train_x)
survival_boruta_train_x_scaled = survival_boruta_x_scaled.transform(survival_boruta_train_x)
survival_boruta_test_x_scaled=survival_boruta_x_scaled.transform(survival_boruta_test_x)

In [29]:
from sklearn.svm import SVC
svc=SVC(kernel='rbf',C=0.1)
svc.fit(survival_boruta_train_x_scaled,survival_train_y)
print('Score of train: ',str(svc.score(survival_boruta_train_x_scaled,survival_train_y)))
print('Score of test: ',str(svc.score(survival_boruta_test_x_scaled,survival_test_y)))

Score of train:  0.5189873417721519
Score of test:  0.5294117647058824


In [99]:
# svc.fit(survival_boruta_train_x_scaled)
svc.decision_function(survival_boruta_train_x_scaled)

array([0.48660161, 1.00041046, 0.87729053, 0.50059571, 0.55507973,
       0.68928742, 0.65594426, 0.64121932, 0.52768166, 0.6195324 ,
       0.63538503, 0.65790826, 0.32287326, 0.63578647, 0.41962756,
       1.05322517, 0.91046825, 0.49959235, 0.80008848, 0.64303589,
       0.4363451 , 0.99986746, 0.26148086, 0.62383259, 0.8072652 ,
       0.88184493, 0.7481859 , 0.51157145, 0.7170945 , 0.6685894 ,
       0.86959193, 0.69508045, 0.95693146, 0.87814485, 0.74142002,
       0.79471341, 0.54237857, 0.88530419, 0.90403506, 0.67095748,
       0.54725125, 0.81436357, 0.56246855, 0.84634583, 0.9997631 ,
       0.60510791, 0.67632566, 0.79051667, 0.14821989, 0.53572615,
       0.67268777, 0.49735288, 0.70993069, 0.63416589, 0.84381503,
       0.86732989, 0.80008232, 0.79527842, 0.95747018, 0.84138835,
       0.82708607, 0.78758899, 0.91959965, 0.23645749, 0.57068546,
       0.61307061, 0.71897761, 0.33525737, 0.62155529, 1.19931372,
       0.6998623 , 0.97735669, 0.34391859, 0.77898736, 0.51689

## **MGMT**

In [30]:
mgmt_train_x, mgmt_test_x, mgmt_train_y, mgmt_test_y = train_test_split(df_t1_t2_mgmt.drop(['Survival_days','SubjectID','MGMT'],axis=1), 
                                                    df_t1_t2_mgmt['MGMT'], test_size=0.24, 
                                                    random_state=100, 
                                                    stratify=df_t1_t2_mgmt['Survival_days'])
print("train size - " + str(mgmt_train_x.shape))
print("test size - " + str(mgmt_test_x.shape))

train size - (158, 1728)
test size - (51, 1728)


In [31]:
mgmt_train_y.value_counts()

0    101
1     57
Name: MGMT, dtype: int64

### **Variance Threshold**

In [32]:
### Reducing features using Variance Threshold
threshold=0.4
low_var_col= [x for x in mgmt_train_x.columns if mgmt_train_x[x].var()<threshold]
mgmt_train_x.drop(low_var_col,axis=1,inplace=True)
mgmt_test_x.drop(low_var_col,axis=1,inplace=True)

In [33]:
mgmt_correlated_idx=correlation(mgmt_train_x,0.85)
mgmt_train_x.drop(mgmt_correlated_idx,axis=1,inplace=True)
mgmt_test_x.drop(mgmt_correlated_idx,axis=1,inplace=True)
print("Shape of dataset: ",str(mgmt_train_x.shape))

Shape of dataset:  (158, 302)


In [34]:
mgmt_x_scaled = StandardScaler().fit(mgmt_train_x)
mgmt_train_x_scaled = mgmt_x_scaled.transform(mgmt_train_x)
mgmt_test_x_scaled=mgmt_x_scaled.transform(mgmt_test_x)

In [35]:
import xgboost as xgb
import lightgbm as lgb
from boruta import BorutaPy
xgb_model=xgb.XGBClassifier()
mgmt_feat_sel_boruta=BorutaPy(xgb_model,n_estimators='auto',verbose=0,random_state=1)
mgmt_feat_sel_boruta.fit(mgmt_train_x_scaled,survival_train_y)
mgmt_boruta_rank=mgmt_feat_sel_boruta.ranking_
mgmt_boruta_rank

array([ 18, 247,   9,  94,  76,  37, 247, 156,  23,  94, 156,  76,  94,
        94, 247, 247,  41, 247, 247, 156,  60,  15, 247, 156, 247,  94,
       247, 116, 247,  22,   5,  43, 156, 247, 156, 247, 247, 247, 247,
         6, 156,  17, 156, 156, 247,  94, 156, 247, 247,  94, 156, 247,
       156,  12,  21, 247, 116, 247,  46,  38,  76, 156, 247, 156, 156,
       247,  94, 156, 247, 156,  94, 247, 247, 156,  94, 156, 247,  31,
        94, 247,  94,  94, 247,  94, 247,  20,   7, 156, 247,  39,  60,
       247, 247, 247,  48, 247,  50, 247,  72, 156, 116, 247,  60, 156,
       247, 156, 247,  94, 156,  49,  33, 156, 156, 247,  60, 116,  19,
       247, 247, 116,  60, 156, 247, 156,  94, 247,  94,  60,  94, 247,
        72, 156, 156, 247, 247, 156,  35, 156, 247, 247, 247, 247, 247,
       156, 247, 247, 156, 247,  27, 116,  60, 156, 156,  94, 247,  94,
        41, 247,  10, 156,  60,  94,  51,   3,  11, 247,  94,  30, 156,
       247, 156,  60, 156, 247, 247, 247,  26,  34,  94, 247, 24

In [36]:
mgmt_col_dict_rank=dict(zip(mgmt_train_x.columns,mgmt_boruta_rank))
mgmt_boruta_sel_col=[col for col in mgmt_col_dict_rank.keys() if mgmt_col_dict_rank[col]<=50]
mgmt_boruta_train_x=mgmt_train_x[mgmt_boruta_sel_col]
mgmt_boruta_test_x=mgmt_test_x[mgmt_boruta_sel_col]

mgmt_boruta_x_scaled = StandardScaler().fit(mgmt_boruta_train_x)
mgmt_boruta_train_x_scaled = mgmt_boruta_x_scaled.transform(mgmt_boruta_train_x)
mgmt_boruta_test_x_scaled=mgmt_boruta_x_scaled.transform(mgmt_boruta_test_x)

In [37]:
from sklearn.svm import SVC
from sklearn.metrics import recall_score
svc=SVC(kernel='linear',class_weight='balanced',C=0.3)
svc.fit(mgmt_boruta_train_x_scaled,mgmt_train_y)
print('Score of train: ',str(svc.score(mgmt_boruta_train_x_scaled,mgmt_train_y)))
print('Score of test: ',str(svc.score(mgmt_boruta_test_x_scaled,mgmt_test_y)))
y_pred=svc.predict(mgmt_boruta_test_x_scaled)
print(y_pred)
print(recall_score(mgmt_test_y,y_pred))

Score of train:  0.6772151898734177
Score of test:  0.47058823529411764
[0 1 1 0 1 0 1 0 1 1 1 0 0 0 0 1 0 0 0 1 0 0 1 0 0 0 0 0 0 1 0 1 0 1 0 0 0
 1 1 1 0 0 0 1 0 0 0 1 0 0 1]
0.25


In [38]:
# svc.fit(survival_boruta_train_x_scaled)
svc.decision_function(survival_boruta_train_x_scaled)

array([-0.9998053 , -1.00025191, -0.99968989, -0.99997554,  0.33898593,
       -0.72394832,  0.43473534,  0.70159263, -0.99991371, -0.58734659,
       -3.82261193, -0.79675515,  1.00012836, -1.00018069,  0.23617847,
        1.31569388,  0.9352092 ,  0.46349869, -1.00024409,  1.00041924,
        0.05436979, -0.66849577,  0.64294918, -0.2436051 ,  0.29531761,
        0.15776365,  0.36863394, -0.57692801,  1.15839213, -0.28442649,
       -1.00027123, -0.99996029,  0.36652751, -0.64561707,  0.87479268,
       -1.25332263, -0.99999696,  0.75655735,  1.62153437, -0.28606411,
        1.66426793,  0.94345255,  1.03589615,  0.05308089,  1.21595107,
       -1.00029355,  0.99978689,  0.99957813, -1.00441922,  1.56583973,
        0.48398713,  0.20509903, -1.68086098,  0.25443039,  1.29794993,
       -0.99970212,  1.0990672 ,  0.44768133,  0.73494327,  0.23487928,
        0.99993838,  3.69762331, -0.99956355, -1.93661245, -0.92122687,
        0.18779863, -0.43641884, -0.03682458, -1.24716959, -0.99