## Imports 

In [134]:
import os 
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_val_score

## Read the dataset and the radiogenomic data

In [206]:
pyradiomics_dataset_path = os.path.join(os.getcwd(), 'dataset', 'dataset.csv')
pyradiomics_dataset = pd.read_csv(pyradiomics_dataset_path)
pyradiomics_dataset.head()

Unnamed: 0,Mask,Image,original_shape_VoxelVolume,original_shape_MeshVolume,original_shape_SurfaceArea,original_shape_SurfaceVolumeRatio,original_shape_Sphericity,original_shape_Maximum3DDiameter,original_shape_Maximum2DDiameterSlice,original_shape_Maximum2DDiameterColumn,...,lbp-3D-k_gldm_GrayLevelNonUniformity,lbp-3D-k_gldm_GrayLevelVariance,lbp-3D-k_gldm_HighGrayLevelEmphasis,lbp-3D-k_gldm_LargeDependenceEmphasis,lbp-3D-k_gldm_LargeDependenceHighGrayLevelEmphasis,lbp-3D-k_gldm_LargeDependenceLowGrayLevelEmphasis,lbp-3D-k_gldm_LowGrayLevelEmphasis,lbp-3D-k_gldm_SmallDependenceEmphasis,lbp-3D-k_gldm_SmallDependenceHighGrayLevelEmphasis,lbp-3D-k_gldm_SmallDependenceLowGrayLevelEmphasis
0,R01-127_roi.nii,R01-127.nii,6646.222923,6620.933051,3821.238084,0.577145,0.446231,42.351097,41.211769,38.654876,...,17568.232131,0.107344,1.366904,435.038264,447.886326,431.826248,0.908274,0.017466,0.060178,0.006788
1,R01-064_roi.nii,R01-064.nii,3366.088867,3297.932943,1870.232366,0.567092,0.572906,52.086588,14.92576,51.939907,...,814.020852,0.130997,1.465095,273.110607,297.173164,267.094968,0.883726,0.0194,0.065205,0.007949
2,R01-118_roi.nii,R01-118.nii,154492.126465,153863.610586,29776.235984,0.193524,0.466342,148.169042,75.007242,146.583008,...,27212.449782,0.174788,1.677255,310.474449,365.343731,296.757129,0.830686,0.015801,0.051458,0.006886
3,R01-044_roi.nii,R01-044.nii,707.652683,690.997678,1016.191502,1.470615,0.371957,32.898135,30.636614,26.940836,...,1678.344142,0.061102,1.19613,357.702929,361.049686,356.86624,0.950968,0.02096,0.064607,0.010049
4,R01-024_roi.nii,R01-024.nii,4905.527134,4877.243809,2747.608401,0.563353,0.506188,47.004263,27.938821,30.42705,...,8310.601509,0.14375,1.522119,362.528292,419.715021,348.23161,0.86947,0.011873,0.038674,0.005173


In [207]:
radiogenomics_labels_path = os.path.join(os.getcwd(), 'dataset', 'radiogenomics_labels.csv')
radiogenomics_labels = pd.read_csv(radiogenomics_labels_path)
radiogenomics_labels.tail()

Unnamed: 0,Case ID,Patient affiliation,Age at Histological Diagnosis,Weight (lbs),Gender,Ethnicity,Smoking status,Pack Years,Quit Smoking Year,%GG,...,Recurrence,Recurrence Location,Date of Recurrence,Date of Last Known Alive,Survival Status,Date of Death,Time to Death (days),CT Date,Days between CT and surgery,PET Date
206,R01-159,Stanford,75,184,Male,Caucasian,Former,55,1994.0,Not Assessed,...,no,,,7/13/1995,Alive,,,11/24/1994,14,11/16/1994
207,R01-160,VA,61,231.5,Male,Caucasian,Former,12,1993.0,Not Assessed,...,no,,,7/3/1999,Alive,,,8/12/1993,72,9/22/1993
208,R01-161,Stanford,52,Not Collected,Female,Caucasian,Former,7,,Not Assessed,...,no,,,4/2/1999,Alive,,,12/13/1995,8,9/26/1995
209,R01-162,Stanford,67,158,Male,Asian,Former,15,1966.0,Not Assessed,...,no,,,10/8/1997,Dead,10/8/1997,671.0,10/3/1995,65,11/14/1995
210,R01-163,VA,68,229,Male,Caucasian,Current,30,,Not Assessed,...,yes,distant,2/15/1996,1/11/1997,Dead,1/11/1997,462.0,8/17/1995,51,7/12/1995


## Data Preprocessing

In [208]:
pyradiomics_dataset['Case ID'] = None

for i, image in enumerate(pyradiomics_dataset['Image']):
    pyradiomics_dataset.loc[i, 'Case ID'] = image.split('.')[0]

In [209]:
dataset = pd.merge(pyradiomics_dataset, radiogenomics_labels[['Case ID', 'Survival Status']], left_on='Case ID', right_on='Case ID', how='left')

In [210]:
dataset.drop(['Mask', 'Image', 'Case ID'], axis=1, inplace=True)
dataset.dropna(inplace=True)

In [211]:
dataset.head()

Unnamed: 0,original_shape_VoxelVolume,original_shape_MeshVolume,original_shape_SurfaceArea,original_shape_SurfaceVolumeRatio,original_shape_Sphericity,original_shape_Maximum3DDiameter,original_shape_Maximum2DDiameterSlice,original_shape_Maximum2DDiameterColumn,original_shape_Maximum2DDiameterRow,original_shape_MajorAxisLength,...,lbp-3D-k_gldm_GrayLevelVariance,lbp-3D-k_gldm_HighGrayLevelEmphasis,lbp-3D-k_gldm_LargeDependenceEmphasis,lbp-3D-k_gldm_LargeDependenceHighGrayLevelEmphasis,lbp-3D-k_gldm_LargeDependenceLowGrayLevelEmphasis,lbp-3D-k_gldm_LowGrayLevelEmphasis,lbp-3D-k_gldm_SmallDependenceEmphasis,lbp-3D-k_gldm_SmallDependenceHighGrayLevelEmphasis,lbp-3D-k_gldm_SmallDependenceLowGrayLevelEmphasis,Survival Status
0,6646.222923,6620.933051,3821.238084,0.577145,0.446231,42.351097,41.211769,38.654876,42.075213,28.771746,...,0.107344,1.366904,435.038264,447.886326,431.826248,0.908274,0.017466,0.060178,0.006788,Alive
1,3366.088867,3297.932943,1870.232366,0.567092,0.572906,52.086588,14.92576,51.939907,50.219246,49.321299,...,0.130997,1.465095,273.110607,297.173164,267.094968,0.883726,0.0194,0.065205,0.007949,Alive
2,154492.126465,153863.610586,29776.235984,0.193524,0.466342,148.169042,75.007242,146.583008,146.015102,144.590257,...,0.174788,1.677255,310.474449,365.343731,296.757129,0.830686,0.015801,0.051458,0.006886,Alive
3,707.652683,690.997678,1016.191502,1.470615,0.371957,32.898135,30.636614,26.940836,21.01103,23.222462,...,0.061102,1.19613,357.702929,361.049686,356.86624,0.950968,0.02096,0.064607,0.010049,Alive
4,4905.527134,4877.243809,2747.608401,0.563353,0.506188,47.004263,27.938821,30.42705,34.059646,35.852307,...,0.14375,1.522119,362.528292,419.715021,348.23161,0.86947,0.011873,0.038674,0.005173,Dead


In [212]:
X = dataset.iloc[:, :-1]
y = dataset.iloc[:, -1]

In [213]:
y.replace({'Alive': 1, 'Dead': 0}, inplace=True)

In [214]:
X = MinMaxScaler().fit_transform(X.astype('float64'))
X.shape
X

array([[0.00427518, 0.00426526, 0.02317787, ..., 0.49312589, 0.51965444,
        0.3411778 ],
       [0.00206953, 0.00202958, 0.01033367, ..., 0.59914555, 0.59089235,
        0.48988226],
       [0.10369073, 0.10332855, 0.19404933, ..., 0.40182731, 0.39609946,
        0.35375796],
       ...,
       [0.34380234, 0.34347954, 0.36091841, ..., 0.11286225, 0.15153591,
        0.06239028],
       [0.01118434, 0.01107761, 0.03753623, ..., 0.74163153, 0.71603   ,
        0.62327651],
       [0.00267366, 0.00266141, 0.01840232, ..., 0.12419842, 0.13923837,
        0.12331212]])

In [215]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

## Model Building

### Decision Tree Classifier

In [145]:
from sklearn.tree import DecisionTreeClassifier

tree_clf = DecisionTreeClassifier()

In [146]:
scores = cross_val_score(tree_clf, X_train, y_train, scoring="accuracy", cv=10)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.62 (+/- 0.35)


## Random Forest Classifier

In [147]:
from sklearn.ensemble import RandomForestClassifier

random_forest_clf = RandomForestClassifier(n_estimators=100)

In [148]:
scores = cross_val_score(random_forest_clf, X_train, y_train, scoring="accuracy", cv=10)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.50 (+/- 0.28)


## XGBoost

In [149]:
from xgboost import XGBClassifier

xgb_clf = XGBClassifier()

In [150]:
scores = cross_val_score(xgb_clf, X_train, y_train, scoring="accuracy", cv=10)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.64 (+/- 0.38)


## GaussianNB Classifier

In [101]:
from sklearn.naive_bayes import GaussianNB

gnb_clf = GaussianNB()

In [102]:
scores = cross_val_score(gnb_clf, X_train, y_train, scoring="accuracy", cv=10)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.55 (+/- 0.24)


## SVM Classifier

In [106]:
from sklearn.svm import SVC

svc_clf = SVC(gamma='scale')

In [107]:
scores = cross_val_score(svc_clf, X_train, y_train, scoring="accuracy", cv=10)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.58 (+/- 0.25)


## AdaBoost Classifier

In [108]:
from sklearn.ensemble import AdaBoostClassifier

ada_clf = AdaBoostClassifier()

In [109]:
scores = cross_val_score(ada_clf, X_train, y_train, scoring="accuracy", cv=10)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.67 (+/- 0.35)


## KNeighbors Classifier

In [111]:
from sklearn.neighbors import KNeighborsClassifier

kn_clf = KNeighborsClassifier()

In [112]:
scores = cross_val_score(kn_clf, X_train, y_train, scoring="accuracy", cv=10)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.55 (+/- 0.36)


## Gaussian Process Classifier

In [202]:
from sklearn.gaussian_process import GaussianProcessClassifier

gaussian_clf = GaussianProcessClassifier()

In [216]:
scores = cross_val_score(gaussian_clf, X_train, y_train, scoring="accuracy", cv=10)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.58 (+/- 0.22)


## SelectKBest

In [217]:
from sklearn.feature_selection import SelectKBest, chi2
X_train.shape

(94, 1562)

In [229]:
X_new = SelectKBest(chi2, k=20).fit_transform(X_train, y_train)
X_new.shape

(94, 20)

In [230]:
final_clf = XGBClassifier()
scores = cross_val_score(final_clf, X_new, y_train, scoring="accuracy", cv=10)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.67 (+/- 0.26)
