### Process
- import dependencies
- load data
- data preprocessing
    - feature selection
    - pearson correlation
    - 
- test-train split
- do classification
    - bagging
    - boosting
    - stacking
    - traditional ml techniques
- tables comparing accuracies, precesion, recall, f1
- different graphs

In [1]:
import pandas as pd
import numpy as np
import sklearn
import matplotlib.pyplot as plt



In [2]:
clinical_data = pd.read_csv('data/recent-data.csv', index_col=False)

In [3]:
clinical_data['Case ID'].str.startswith('AMC').sum()

49

In [4]:
clinical_data.head(2)

Unnamed: 0,Case ID,Patient affiliation,Age at Histological Diagnosis,Weight (lbs),Gender,Ethnicity,Smoking status,Pack Years,Quit Smoking Year,%GG,...,Recurrence,Recurrence Location,Date of Recurrence,Date of Last Known Alive,Survival Status,Date of Death,Time to Death (days),CT Date,Days between CT and surgery,PET Date
0,AMC-001,Stanford,34,Not Collected,Male,Not Recorded In Database,Nonsmoker,,,Not Assessed,...,yes,distant,10/7/1994,1/7/1997,Dead,1/7/1997,872.0,8/10/1994,9,Not Collected
1,AMC-002,Stanford,33,Not Collected,Female,Not Recorded In Database,Nonsmoker,,,Not Assessed,...,no,,,3/20/1992,Alive,,,2/19/1992,3,Not Collected


## Data Pre-processing

In [5]:
clinical_data.columns

Index(['Case ID', 'Patient affiliation', 'Age at Histological Diagnosis',
       'Weight (lbs)', 'Gender', 'Ethnicity', 'Smoking status', 'Pack Years',
       'Quit Smoking Year', '%GG', 'Tumor Location (choice=RUL)',
       'Tumor Location (choice=RML)', 'Tumor Location (choice=RLL)',
       'Tumor Location (choice=LUL)', 'Tumor Location (choice=LLL)',
       'Tumor Location (choice=L Lingula)', 'Tumor Location (choice=Unknown)',
       'Histology ', 'Pathological T stage', 'Pathological N stage',
       'Pathological M stage', 'Histopathological Grade',
       'Lymphovascular invasion',
       'Pleural invasion (elastic, visceral, or parietal)',
       'EGFR mutation status', 'KRAS mutation status',
       'ALK translocation status', 'Adjuvant Treatment', 'Chemotherapy',
       'Radiation', 'Recurrence', 'Recurrence Location', 'Date of Recurrence',
       'Date of Last Known Alive', 'Survival Status', 'Date of Death',
       'Time to Death (days)', 'CT Date', 'Days between CT and sur

In [6]:
#remove caseid and patient affiliation
clinical_data.drop(columns=['Case ID', 'Patient affiliation'], inplace=True)


In [7]:
clinical_data.iloc[48]['Recurrence']

'Not collected'

In [8]:
clinical_data.drop(index=[48], inplace=True)

In [9]:
clinical_data.iloc[48]['Recurrence']

'no'

In [10]:
clinical_data.columns

Index(['Age at Histological Diagnosis', 'Weight (lbs)', 'Gender', 'Ethnicity',
       'Smoking status', 'Pack Years', 'Quit Smoking Year', '%GG',
       'Tumor Location (choice=RUL)', 'Tumor Location (choice=RML)',
       'Tumor Location (choice=RLL)', 'Tumor Location (choice=LUL)',
       'Tumor Location (choice=LLL)', 'Tumor Location (choice=L Lingula)',
       'Tumor Location (choice=Unknown)', 'Histology ', 'Pathological T stage',
       'Pathological N stage', 'Pathological M stage',
       'Histopathological Grade', 'Lymphovascular invasion',
       'Pleural invasion (elastic, visceral, or parietal)',
       'EGFR mutation status', 'KRAS mutation status',
       'ALK translocation status', 'Adjuvant Treatment', 'Chemotherapy',
       'Radiation', 'Recurrence', 'Recurrence Location', 'Date of Recurrence',
       'Date of Last Known Alive', 'Survival Status', 'Date of Death',
       'Time to Death (days)', 'CT Date', 'Days between CT and surgery',
       'PET Date'],
      dtype='o

In [11]:
clinical_data['Weight (lbs)'].describe

<bound method NDFrame.describe of 0      Not Collected
1      Not Collected
2      Not Collected
3      Not Collected
4      Not Collected
           ...      
206              184
207            231.5
208    Not Collected
209              158
210              229
Name: Weight (lbs), Length: 210, dtype: object>

In [12]:
#repacing not collected with nan and calculating medeian to replace nan
clinical_data['Weight (lbs)'].replace('Not Collected', 'NaN', inplace=True)
clinical_data['Weight (lbs)'].replace('NaN', np.nan, inplace=True)

In [13]:
#replace Not Collected with NaN
clinical_data.replace('Not Collected', 'NaN', inplace=True)
clinical_data.replace('Not collected', 'NaN', inplace=True)
clinical_data.replace('Not Recorded In Database', 'NaN', inplace=True)
clinical_data.replace('NaN', np.nan, inplace=True)

In [14]:
clinical_data['Weight (lbs)'].describe()

count         152
unique        101
top       200.655
freq            7
Name: Weight (lbs), dtype: object

In [15]:
median_value = clinical_data['Weight (lbs)'].median()

In [16]:
clinical_data['Weight (lbs)'].replace(np.nan, median_value, inplace=True)

In [17]:
clinical_data['Weight (lbs)'].describe()

count     210.0
unique    102.0
top       171.0
freq       58.0
Name: Weight (lbs), dtype: float64

In [18]:
clinical_data.columns

Index(['Age at Histological Diagnosis', 'Weight (lbs)', 'Gender', 'Ethnicity',
       'Smoking status', 'Pack Years', 'Quit Smoking Year', '%GG',
       'Tumor Location (choice=RUL)', 'Tumor Location (choice=RML)',
       'Tumor Location (choice=RLL)', 'Tumor Location (choice=LUL)',
       'Tumor Location (choice=LLL)', 'Tumor Location (choice=L Lingula)',
       'Tumor Location (choice=Unknown)', 'Histology ', 'Pathological T stage',
       'Pathological N stage', 'Pathological M stage',
       'Histopathological Grade', 'Lymphovascular invasion',
       'Pleural invasion (elastic, visceral, or parietal)',
       'EGFR mutation status', 'KRAS mutation status',
       'ALK translocation status', 'Adjuvant Treatment', 'Chemotherapy',
       'Radiation', 'Recurrence', 'Recurrence Location', 'Date of Recurrence',
       'Date of Last Known Alive', 'Survival Status', 'Date of Death',
       'Time to Death (days)', 'CT Date', 'Days between CT and surgery',
       'PET Date'],
      dtype='o

In [19]:
gender = pd.Categorical(clinical_data['Gender'])
#remove old gender column
clinical_data.drop(columns=['Gender'])

#create new gender column
clinical_data['Gender'] = gender

In [20]:
clinical_data.Gender

0        Male
1      Female
2      Female
3      Female
4        Male
        ...  
206      Male
207      Male
208    Female
209      Male
210      Male
Name: Gender, Length: 210, dtype: category
Categories (2, object): ['Female', 'Male']

In [21]:
clinical_data.columns

Index(['Age at Histological Diagnosis', 'Weight (lbs)', 'Gender', 'Ethnicity',
       'Smoking status', 'Pack Years', 'Quit Smoking Year', '%GG',
       'Tumor Location (choice=RUL)', 'Tumor Location (choice=RML)',
       'Tumor Location (choice=RLL)', 'Tumor Location (choice=LUL)',
       'Tumor Location (choice=LLL)', 'Tumor Location (choice=L Lingula)',
       'Tumor Location (choice=Unknown)', 'Histology ', 'Pathological T stage',
       'Pathological N stage', 'Pathological M stage',
       'Histopathological Grade', 'Lymphovascular invasion',
       'Pleural invasion (elastic, visceral, or parietal)',
       'EGFR mutation status', 'KRAS mutation status',
       'ALK translocation status', 'Adjuvant Treatment', 'Chemotherapy',
       'Radiation', 'Recurrence', 'Recurrence Location', 'Date of Recurrence',
       'Date of Last Known Alive', 'Survival Status', 'Date of Death',
       'Time to Death (days)', 'CT Date', 'Days between CT and surgery',
       'PET Date'],
      dtype='o

In [22]:
smoking_status = pd.Categorical(clinical_data['Smoking status'])
#remove old smoking_status column
clinical_data.drop(columns=['Smoking status'])

#create new smoking_status column
clinical_data['Smoking status'] = smoking_status

In [23]:
gg = pd.Categorical(clinical_data['%GG'])
#remove old gg column
clinical_data.drop(columns=['%GG'])

#create new gg column
clinical_data['%GG'] = gg

In [24]:
from numpy import NaN


clinical_data['Tumor Location'] = NaN

## Functions

In [25]:
def To_Categorical(column_name):
    column = pd.Categorical(clinical_data[column_name])
    #remove old smoking_status column
    clinical_data.drop(columns=[column_name])

    #create new smoking_status column
    clinical_data[column_name] = column

In [26]:
# To_Categorical('Tumor Location (choice=RUL)')
# To_Categorical('Tumor Location (choice=RML)')
# To_Categorical('Tumor Location (choice=RLL)')
# To_Categorical('Tumor Location (choice=LUL)')
# To_Categorical('Tumor Location (choice=LLL)')
# To_Categorical('Tumor Location (choice=Unknown)')
# To_Categorical('Tumor Location (choice=L Lingula)')

In [27]:
clinical_data.replace({'Tumor Location (choice=L Lingula)': {'Checked':1, 'Unchecked':0}}, inplace=True)
clinical_data.replace({'Tumor Location (choice=RUL)': {'Checked':1, 'Unchecked':0}}, inplace=True)
clinical_data.replace({'Tumor Location (choice=RML)': {'Checked':1, 'Unchecked':0}}, inplace=True)
clinical_data.replace({'Tumor Location (choice=RLL)': {'Checked':1, 'Unchecked':0}}, inplace=True)
clinical_data.replace({'Tumor Location (choice=LUL)': {'Checked':1, 'Unchecked':0}}, inplace=True)
clinical_data.replace({'Tumor Location (choice=LLL)': {'Checked':1, 'Unchecked':0}}, inplace=True)
clinical_data.replace({'Tumor Location (choice=Unknown)': {'Checked':1, 'Unchecked':0}}, inplace=True)

In [28]:
clinical_data.drop(columns=['Tumor Location'], inplace=True)

In [29]:
clinical_data['Histology'] = clinical_data['Histology ']
clinical_data.drop(columns=['Histology '], inplace=True)

In [30]:
To_Categorical('Histology')

In [31]:
To_Categorical('Pathological T stage')
To_Categorical('Pathological M stage')
To_Categorical('Pathological N stage')

In [32]:
To_Categorical('Histopathological Grade')

In [33]:
To_Categorical('Lymphovascular invasion')

In [34]:
# clinical_data['Pleural invasion (elastic, visceral, or parietal)'].value_counts()
To_Categorical('Pleural invasion (elastic, visceral, or parietal)')

In [35]:
# clinical_data['EGFR mutation status'].value_counts()
To_Categorical('EGFR mutation status')

In [36]:
# clinical_data['KRAS mutation status'].value_counts()
To_Categorical('KRAS mutation status')

In [37]:
clinical_data['Adjuvant Treatment'].value_counts()
# To_Categorical('Adjuvant Treatment')
clinical_data.replace({'Adjuvant Treatment': {'Yes':1, 'No':0}}, inplace=True)

In [38]:
clinical_data['Chemotherapy'].value_counts()
clinical_data.replace({'Chemotherapy': {'Yes':1, 'No':0}}, inplace=True)

In [39]:
clinical_data['Radiation'].value_counts()
clinical_data.replace({'Radiation': {'Yes':1, 'No':0}}, inplace=True)

In [40]:
clinical_data['Recurrence'].value_counts()
clinical_data.replace({'Recurrence': {'yes':1, 'no':0}}, inplace=True)
# To_Categorical('Recurrence')

In [41]:

# clinical_data['Recurrence Location'].value_counts()
To_Categorical('Recurrence Location')

In [42]:
# remove columns 'Date of Recurrence','Date of Last Known Alive', 'Survival Status', 'Date of Death',

clinical_data.drop(columns=['Date of Recurrence','Date of Last Known Alive', 'Survival Status', 'Date of Death', 'Time to Death (days)', 'Recurrence Location'], inplace=True)
clinical_data.drop(columns=['PET Date', 'CT Date'], inplace=True)
clinical_data.drop(columns=['Quit Smoking Year'], inplace=True)

In [43]:

clinical_data['Histology'].value_counts()
# To_Categorical('Histology')

Adenocarcinoma                         171
Squamous cell carcinoma                 35
NSCLC NOS (not otherwise specified)      4
Name: Histology, dtype: int64

In [44]:
clinical_data.columns

Index(['Age at Histological Diagnosis', 'Weight (lbs)', 'Gender', 'Ethnicity',
       'Smoking status', 'Pack Years', '%GG', 'Tumor Location (choice=RUL)',
       'Tumor Location (choice=RML)', 'Tumor Location (choice=RLL)',
       'Tumor Location (choice=LUL)', 'Tumor Location (choice=LLL)',
       'Tumor Location (choice=L Lingula)', 'Tumor Location (choice=Unknown)',
       'Pathological T stage', 'Pathological N stage', 'Pathological M stage',
       'Histopathological Grade', 'Lymphovascular invasion',
       'Pleural invasion (elastic, visceral, or parietal)',
       'EGFR mutation status', 'KRAS mutation status',
       'ALK translocation status', 'Adjuvant Treatment', 'Chemotherapy',
       'Radiation', 'Recurrence', 'Days between CT and surgery', 'Histology'],
      dtype='object')

In [45]:
# # ethnicity 49 not recorded
# clinical_data.Ethnicity.vaalue_counts()['Not Recorded In Database']

# #pack years - total 163
# clinical_data['Pack Years'].count()

# #Quit Smoking years - total 108
# clinical_data['Quit Smoking Year'].count()



In [46]:
clinical_data

Unnamed: 0,Age at Histological Diagnosis,Weight (lbs),Gender,Ethnicity,Smoking status,Pack Years,%GG,Tumor Location (choice=RUL),Tumor Location (choice=RML),Tumor Location (choice=RLL),...,"Pleural invasion (elastic, visceral, or parietal)",EGFR mutation status,KRAS mutation status,ALK translocation status,Adjuvant Treatment,Chemotherapy,Radiation,Recurrence,Days between CT and surgery,Histology
0,34,171.0,Male,,Nonsmoker,,Not Assessed,0,0,1,...,,Wildtype,Mutant,Wildtype,0,0,0,1,9,Adenocarcinoma
1,33,171.0,Female,,Nonsmoker,,Not Assessed,0,0,0,...,,Wildtype,Wildtype,,0,0,0,0,3,Adenocarcinoma
2,69,171.0,Female,,Nonsmoker,,Not Assessed,0,0,1,...,,Mutant,Wildtype,Wildtype,0,0,0,0,28,Adenocarcinoma
3,80,171.0,Female,,Nonsmoker,,Not Assessed,0,0,0,...,,Wildtype,Wildtype,Wildtype,0,0,0,0,47,Adenocarcinoma
4,76,171.0,Male,,Former,30,Not Assessed,0,0,0,...,,Mutant,Wildtype,Wildtype,0,0,0,1,2,Adenocarcinoma
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
206,75,184,Male,Caucasian,Former,55,Not Assessed,0,0,0,...,No,Unknown,Unknown,Unknown,0,0,0,0,14,Squamous cell carcinoma
207,61,231.5,Male,Caucasian,Former,12,Not Assessed,1,0,0,...,No,Unknown,Mutant,Unknown,1,1,0,0,72,Adenocarcinoma
208,52,171.0,Female,Caucasian,Former,7,Not Assessed,0,0,0,...,No,Wildtype,Wildtype,Wildtype,0,0,0,0,8,Adenocarcinoma
209,67,158,Male,Asian,Former,15,Not Assessed,0,0,0,...,No,Mutant,Wildtype,Wildtype,0,0,0,0,65,Adenocarcinoma


In [47]:
# clinical_data['Weight (lbs)']

In [48]:
#weight to numerical value
clinical_data['Weight (lbs)'] = pd.to_numeric(clinical_data['Weight (lbs)'])
# clinical_data.astype({'Weight (lbs)':'Int32'})

In [49]:
# clinical_data['Quit Smoking'] = clinical_data['Quit Smoking Year'].isnull()
# clinical_data['Quit Smoking Year'] = clinical_data.drop('Quit Smoking Year', axis=1)

In [50]:
clinical_data['Weight (lbs)']

0      171.0
1      171.0
2      171.0
3      171.0
4      171.0
       ...  
206    184.0
207    231.5
208    171.0
209    158.0
210    229.0
Name: Weight (lbs), Length: 210, dtype: float64

In [51]:
clinical_data['Recurrence'].value_counts()

0    156
1     54
Name: Recurrence, dtype: int64

In [52]:
156+54

210

Train Test Split

In [53]:
print(156/210)
print(54/210)

0.7428571428571429
0.2571428571428571


In [54]:
0.8*210

168.0

In [55]:
0.25 * 168

42.0

In [56]:
clinical_data['Recurrence'].value_counts()

0    156
1     54
Name: Recurrence, dtype: int64

In [57]:
# 42 no data from recurrence
# 168-42 = 126 yes data from recurrence

In [58]:
# get only reccurrence yes values
clinical_data['Recurrence'][clinical_data['Recurrence'] == 1]

0      1
4      1
10     1
11     1
33     1
37     1
38     1
43     1
52     1
53     1
55     1
58     1
60     1
64     1
66     1
68     1
70     1
72     1
74     1
77     1
78     1
79     1
81     1
83     1
84     1
98     1
99     1
102    1
113    1
116    1
120    1
123    1
125    1
136    1
139    1
141    1
146    1
147    1
151    1
157    1
159    1
161    1
167    1
170    1
173    1
176    1
189    1
194    1
195    1
196    1
197    1
199    1
201    1
210    1
Name: Recurrence, dtype: int64

In [59]:
# get only reccurrence yes values
clinical_data['Recurrence'][clinical_data['Recurrence'] == 0]

1      0
2      0
3      0
5      0
6      0
      ..
205    0
206    0
207    0
208    0
209    0
Name: Recurrence, Length: 156, dtype: int64

In [60]:
X = clinical_data.copy()
X.drop(columns=['Recurrence'], inplace=True)
Y = clinical_data['Recurrence']

## Need to create a mechanism to train test by myself

In [61]:
from sklearn.preprocessing import OneHotEncoder

In [62]:
X['Gender']

0        Male
1      Female
2      Female
3      Female
4        Male
        ...  
206      Male
207      Male
208    Female
209      Male
210      Male
Name: Gender, Length: 210, dtype: category
Categories (2, object): ['Female', 'Male']

In [63]:
one_hot_encoded_data = pd.get_dummies(X)

In [64]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(one_hot_encoded_data, Y, test_size = 0.30, random_state = 0)

In [65]:
clinical_data['Tumor Location (choice=RUL)']

0      0
1      0
2      0
3      0
4      0
      ..
206    0
207    1
208    0
209    0
210    0
Name: Tumor Location (choice=RUL), Length: 210, dtype: int64

Feature Scaling

In [66]:
clinical_data.describe()

Unnamed: 0,Age at Histological Diagnosis,Weight (lbs),Tumor Location (choice=RUL),Tumor Location (choice=RML),Tumor Location (choice=RLL),Tumor Location (choice=LUL),Tumor Location (choice=LLL),Tumor Location (choice=L Lingula),Tumor Location (choice=Unknown),Adjuvant Treatment,Chemotherapy,Radiation,Recurrence,Days between CT and surgery
count,210.0,210.0,210.0,210.0,210.0,210.0,210.0,210.0,210.0,210.0,210.0,210.0,210.0,210.0
mean,67.928571,169.904878,0.366667,0.085714,0.142857,0.22381,0.157143,0.02381,0.0,0.233333,0.233333,0.07619,0.257143,50.352381
std,10.017671,35.618808,0.483046,0.280611,0.350763,0.417792,0.364805,0.15282,0.0,0.423963,0.423963,0.265937,0.438103,62.486053
min,24.0,49.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
25%,64.0,152.145,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,20.25
50%,68.0,171.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,38.5
75%,75.0,184.75,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,65.75
max,87.0,317.52,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,735.0


In [67]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [68]:
np.array(y_train)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1])

In [69]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [70]:
from sklearn.naive_bayes import GaussianNB
nb_classifier = GaussianNB()
nb_classifier.fit(X_train, y_train)

from sklearn.svm import SVC
svm_classifier = SVC()
svm_classifier.fit(X_train, y_train)

from sklearn.tree import DecisionTreeClassifier
dt_classifier = DecisionTreeClassifier()
dt_classifier.fit(X_train, y_train)

from sklearn.ensemble import GradientBoostingClassifier
gb_classifier = GradientBoostingClassifier()
gb_classifier.fit(X_train, y_train)


In [71]:
from sklearn.metrics import confusion_matrix,accuracy_score,recall_score, precision_score
def predict(classifier):
    y_predict = classifier.predict(X_test)

    return y_predict

def print_evaluations(y_predict):
    cm = confusion_matrix(y_test, y_predict)
    ac = accuracy_score(y_test,y_predict)
    rs = recall_score(y_test,y_predict)
    ps = precision_score(y_test,y_predict)

    print(cm)
    print(ac)
    print(rs)
    print(ps)


In [72]:
nb_y_predict = predict(nb_classifier)
print('\nnb_y_predict')
print_evaluations(nb_y_predict)

svm_y_predict = predict(svm_classifier)
print('\nsvm_y_predict')
print_evaluations(svm_y_predict)

dt_y_predict = predict(dt_classifier)
print('\ndt_y_predict')
print_evaluations(dt_y_predict)

gb_y_predict = predict(gb_classifier)
print('\ngb_y_predict')
print_evaluations(gb_y_predict)


nb_y_predict
[[15 27]
 [ 7 14]]
0.4603174603174603
0.6666666666666666
0.34146341463414637

svm_y_predict
[[42  0]
 [21  0]]
0.6666666666666666
0.0
0.0

dt_y_predict
[[38  4]
 [15  6]]
0.6984126984126984
0.2857142857142857
0.6

gb_y_predict
[[40  2]
 [18  3]]
0.6825396825396826
0.14285714285714285
0.6


  _warn_prf(average, modifier, msg_start, len(result))


k-fold validation

In [73]:
from sklearn.model_selection import cross_val_score

In [74]:
cross_val_score(GaussianNB(), one_hot_encoded_data, Y)

array([0.26190476, 0.33333333, 0.45238095, 0.35714286, 0.4047619 ])

In [75]:
cross_val_score(SVC(), one_hot_encoded_data, Y)

array([0.73809524, 0.73809524, 0.73809524, 0.73809524, 0.76190476])

In [76]:
cross_val_score(DecisionTreeClassifier(), one_hot_encoded_data, Y)

array([0.69047619, 0.76190476, 0.52380952, 0.61904762, 0.71428571])

In [77]:
cross_val_score(GradientBoostingClassifier(), one_hot_encoded_data, Y)

array([0.73809524, 0.78571429, 0.64285714, 0.69047619, 0.83333333])

In [78]:
from sklearn.ensemble import AdaBoostClassifier
adaboost_classifier = AdaBoostClassifier()
adaboost_classifier.fit(X_train, y_train)

from sklearn.ensemble import RandomForestClassifier
rf_classifier = RandomForestClassifier()
rf_classifier.fit(X_train, y_train)


In [79]:
adaboost_y_predict = predict(adaboost_classifier)
print('\nadabost_y_predict')
print_evaluations(adaboost_y_predict)

rf_y_predict = predict(rf_classifier)
print('\nrf_y_predict')
print_evaluations(rf_y_predict)


adabost_y_predict
[[34  8]
 [16  5]]
0.6190476190476191
0.23809523809523808
0.38461538461538464

rf_y_predict
[[40  2]
 [19  2]]
0.6666666666666666
0.09523809523809523
0.5


In [80]:
from sklearn.ensemble import BaggingClassifier
bag_model = BaggingClassifier(
base_estimator=DecisionTreeClassifier(), 
    n_estimators=100, 
    max_samples=1.0, 
    bootstrap=True,
    oob_score=True,
    random_state=0
)
bag_model.fit(X_train, y_train)

In [81]:
bag_y_predict = predict(bag_model)
print('\nrf_y_predict')
print_evaluations(bag_y_predict)


rf_y_predict
[[39  3]
 [17  4]]
0.6825396825396826
0.19047619047619047
0.5714285714285714


In [82]:
cross_val_score(bag_model, one_hot_encoded_data, Y)

array([0.76190476, 0.78571429, 0.64285714, 0.69047619, 0.76190476])

**Feature Selection**
>dimentionality reduction

>RFE

In [117]:
from sklearn.feature_selection import RFE


rfe = RFE(RandomForestClassifier(n_estimators=100), n_features_to_select=30)

In [118]:
rfe.fit(one_hot_encoded_data, Y)

In [119]:
# correlated_features = set()
# correlation_matrix = clinical_data.drop('Recurrence', axis=1).corr()

# for i in range(len(correlation_matrix.columns)):
#     for j in range(i):
#         if abs(correlation_matrix.iloc[i, j]) > 0.8:
#             colname = correlation_matrix.columns[i]
#             correlated_features.add(colname)

In [120]:
one_hot_encoded_data

Unnamed: 0,Age at Histological Diagnosis,Weight (lbs),Tumor Location (choice=RUL),Tumor Location (choice=RML),Tumor Location (choice=RLL),Tumor Location (choice=LUL),Tumor Location (choice=LLL),Tumor Location (choice=L Lingula),Tumor Location (choice=Unknown),Adjuvant Treatment,...,EGFR mutation status_Wildtype,KRAS mutation status_Mutant,KRAS mutation status_Unknown,KRAS mutation status_Wildtype,ALK translocation status_Translocated,ALK translocation status_Unknown,ALK translocation status_Wildtype,Histology_Adenocarcinoma,Histology_NSCLC NOS (not otherwise specified),Histology_Squamous cell carcinoma
0,34,171.0,0,0,1,0,0,0,0,0,...,1,1,0,0,0,0,1,1,0,0
1,33,171.0,0,0,0,0,1,0,0,0,...,1,0,0,1,0,0,0,1,0,0
2,69,171.0,0,0,1,0,0,0,0,0,...,0,0,0,1,0,0,1,1,0,0
3,80,171.0,0,0,0,0,0,1,0,0,...,1,0,0,1,0,0,1,1,0,0
4,76,171.0,0,0,0,0,1,0,0,0,...,0,0,0,1,0,0,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
206,75,184.0,0,0,0,0,1,0,0,0,...,0,0,1,0,0,1,0,0,0,1
207,61,231.5,1,0,0,0,0,0,0,1,...,0,1,0,0,0,1,0,1,0,0
208,52,171.0,0,0,0,1,0,0,0,0,...,1,0,0,1,0,0,1,1,0,0
209,67,158.0,0,0,0,1,0,0,0,0,...,0,0,0,1,0,0,1,1,0,0


In [122]:
# correlated_features

In [123]:
one_hot_encoded_data

Unnamed: 0,Age at Histological Diagnosis,Weight (lbs),Tumor Location (choice=RUL),Tumor Location (choice=RML),Tumor Location (choice=RLL),Tumor Location (choice=LUL),Tumor Location (choice=LLL),Tumor Location (choice=L Lingula),Tumor Location (choice=Unknown),Adjuvant Treatment,...,EGFR mutation status_Wildtype,KRAS mutation status_Mutant,KRAS mutation status_Unknown,KRAS mutation status_Wildtype,ALK translocation status_Translocated,ALK translocation status_Unknown,ALK translocation status_Wildtype,Histology_Adenocarcinoma,Histology_NSCLC NOS (not otherwise specified),Histology_Squamous cell carcinoma
0,34,171.0,0,0,1,0,0,0,0,0,...,1,1,0,0,0,0,1,1,0,0
1,33,171.0,0,0,0,0,1,0,0,0,...,1,0,0,1,0,0,0,1,0,0
2,69,171.0,0,0,1,0,0,0,0,0,...,0,0,0,1,0,0,1,1,0,0
3,80,171.0,0,0,0,0,0,1,0,0,...,1,0,0,1,0,0,1,1,0,0
4,76,171.0,0,0,0,0,1,0,0,0,...,0,0,0,1,0,0,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
206,75,184.0,0,0,0,0,1,0,0,0,...,0,0,1,0,0,1,0,0,0,1
207,61,231.5,1,0,0,0,0,0,0,1,...,0,1,0,0,0,1,0,1,0,0
208,52,171.0,0,0,0,1,0,0,0,0,...,1,0,0,1,0,0,1,1,0,0
209,67,158.0,0,0,0,1,0,0,0,0,...,0,0,0,1,0,0,1,1,0,0


In [124]:
rfe.estimator_.feature_importances_

array([0.11072537, 0.12035385, 0.03187517, 0.01719659, 0.02328348,
       0.01837127, 0.02620746, 0.03017   , 0.03898115, 0.12497457,
       0.02478563, 0.01850369, 0.03932488, 0.01963648, 0.02282403,
       0.0276154 , 0.0237721 , 0.01637874, 0.02605753, 0.02887147,
       0.02600548, 0.02728591, 0.02134432, 0.01403968, 0.02011414,
       0.02172026, 0.01803641, 0.02413205, 0.0173938 , 0.02001909])

In [125]:
rfe.get_support()

array([ True,  True,  True, False,  True,  True,  True, False, False,
        True,  True,  True,  True, False,  True, False, False,  True,
       False, False, False,  True,  True, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False,  True, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False,  True, False,
       False, False, False,  True, False, False, False,  True, False,
       False, False, False,  True, False,  True, False, False,  True,
       False,  True,  True, False, False,  True, False,  True, False,
        True, False,  True, False, False,  True, False, False,  True,
        True, False, False])

In [127]:
selected_feat= one_hot_encoded_data.columns[(rfe.get_support())]
len(selected_feat)

30

In [128]:
selected_feat

Index(['Age at Histological Diagnosis', 'Weight (lbs)',
       'Tumor Location (choice=RUL)', 'Tumor Location (choice=RLL)',
       'Tumor Location (choice=LUL)', 'Tumor Location (choice=LLL)',
       'Adjuvant Treatment', 'Chemotherapy', 'Radiation',
       'Days between CT and surgery', 'Gender_Male', 'Ethnicity_Caucasian',
       'Smoking status_Former', 'Smoking status_Nonsmoker', 'Pack Years_50',
       '%GG_0%', '%GG_>0 - 25%', 'Pathological T stage_T2a',
       'Pathological N stage_N0', 'Pathological N stage_N2',
       'Pathological M stage_M1b',
       'Histopathological Grade_G2 Moderately differentiated',
       'Histopathological Grade_G3 Poorly differentiated',
       'Lymphovascular invasion_Absent',
       'Pleural invasion (elastic, visceral, or parietal)_No',
       'EGFR mutation status_Mutant', 'EGFR mutation status_Wildtype',
       'KRAS mutation status_Wildtype', 'ALK translocation status_Wildtype',
       'Histology_Adenocarcinoma'],
      dtype='object')

In [113]:
for i in range(X_train.shape[1]):
	print('Column: %d, Selected: %s, Rank: %.3f' % (i, rfe.support_[i], rfe.ranking_[i]))

Column: 0, Selected: True, Rank: 1.000
Column: 1, Selected: True, Rank: 1.000
Column: 2, Selected: True, Rank: 1.000
Column: 3, Selected: False, Rank: 9.000
Column: 4, Selected: True, Rank: 1.000
Column: 5, Selected: True, Rank: 1.000
Column: 6, Selected: True, Rank: 1.000
Column: 7, Selected: False, Rank: 30.000
Column: 8, Selected: False, Rank: 74.000
Column: 9, Selected: True, Rank: 1.000
Column: 10, Selected: True, Rank: 1.000
Column: 11, Selected: True, Rank: 1.000
Column: 12, Selected: True, Rank: 1.000
Column: 13, Selected: True, Rank: 1.000
Column: 14, Selected: True, Rank: 1.000
Column: 15, Selected: False, Rank: 20.000
Column: 16, Selected: False, Rank: 21.000
Column: 17, Selected: True, Rank: 1.000
Column: 18, Selected: False, Rank: 24.000
Column: 19, Selected: False, Rank: 37.000
Column: 20, Selected: False, Rank: 4.000
Column: 21, Selected: True, Rank: 1.000
Column: 22, Selected: True, Rank: 1.000
Column: 23, Selected: False, Rank: 63.000
Column: 24, Selected: False, Rank:

In [114]:
selected = one_hot_encoded_data == rfe.get_support()

In [115]:
len(selected.columns)

111

In [116]:
len(one_hot_encoded_data.columns)

111

In [131]:
selected_data = one_hot_encoded_data[selected_feat]

In [132]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(selected_data, Y, test_size = 0.30, random_state = 0)

In [133]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [134]:
from sklearn.naive_bayes import GaussianNB
nb_classifier = GaussianNB()
nb_classifier.fit(X_train, y_train)

from sklearn.svm import SVC
svm_classifier = SVC()
svm_classifier.fit(X_train, y_train)

from sklearn.tree import DecisionTreeClassifier
dt_classifier = DecisionTreeClassifier()
dt_classifier.fit(X_train, y_train)

from sklearn.ensemble import GradientBoostingClassifier
gb_classifier = GradientBoostingClassifier()
gb_classifier.fit(X_train, y_train)

In [135]:
nb_y_predict = predict(nb_classifier)
print('\nnb_y_predict')
print_evaluations(nb_y_predict)

svm_y_predict = predict(svm_classifier)
print('\nsvm_y_predict')
print_evaluations(svm_y_predict)

dt_y_predict = predict(dt_classifier)
print('\ndt_y_predict')
print_evaluations(dt_y_predict)

gb_y_predict = predict(gb_classifier)
print('\ngb_y_predict')
print_evaluations(gb_y_predict)


nb_y_predict
[[42  0]
 [17  4]]
0.7301587301587301
0.19047619047619047
1.0

svm_y_predict
[[42  0]
 [20  1]]
0.6825396825396826
0.047619047619047616
1.0

dt_y_predict
[[36  6]
 [16  5]]
0.6507936507936508
0.23809523809523808
0.45454545454545453

gb_y_predict
[[37  5]
 [17  4]]
0.6507936507936508
0.19047619047619047
0.4444444444444444


In [137]:
from sklearn.ensemble import AdaBoostClassifier
adaboost_classifier = AdaBoostClassifier()
adaboost_classifier.fit(X_train, y_train)

from sklearn.ensemble import RandomForestClassifier
rf_classifier = RandomForestClassifier()
rf_classifier.fit(X_train, y_train)


In [138]:
adaboost_y_predict = predict(adaboost_classifier)
print('\nadabost_y_predict')
print_evaluations(adaboost_y_predict)

rf_y_predict = predict(rf_classifier)
print('\nrf_y_predict')
print_evaluations(rf_y_predict)


adabost_y_predict
[[34  8]
 [15  6]]
0.6349206349206349
0.2857142857142857
0.42857142857142855

rf_y_predict
[[41  1]
 [17  4]]
0.7142857142857143
0.19047619047619047
0.8


In [139]:
from sklearn.ensemble import BaggingClassifier
bag_model = BaggingClassifier(
base_estimator=DecisionTreeClassifier(), 
    n_estimators=100, 
    max_samples=1.0, 
    bootstrap=True,
    oob_score=True,
    random_state=0
)
bag_model.fit(X_train, y_train)

In [140]:
bag_y_predict = predict(bag_model)
print('\nrf_y_predict')
print_evaluations(bag_y_predict)


rf_y_predict
[[40  2]
 [16  5]]
0.7142857142857143
0.23809523809523808
0.7142857142857143


In [144]:
from sklearn.model_selection import StratifiedKFold


scoring = {'acc': 'accuracy',
           'prec': 'precision_macro',
           'rec': 'recall_macro',
           'f1': 'f1_macro',
           'mse': 'neg_mean_squared_error',
           'rmse': 'neg_root_mean_squared_error'}
cv = StratifiedKFold(n_splits=10, random_state=50, shuffle=True)

In [147]:
from sklearn.model_selection import cross_validate


cross_validate(RandomForestClassifier(),one_hot_encoded_data, Y, cv=cv, scoring=scoring, n_jobs=1)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'fit_time': array([0.09113193, 0.08965516, 0.08113098, 0.08078289, 0.07884192,
        0.078336  , 0.07957196, 0.07849193, 0.07838798, 0.07972884]),
 'score_time': array([0.00907993, 0.00831318, 0.00731397, 0.00736809, 0.007231  ,
        0.00750208, 0.00718307, 0.00739312, 0.00791788, 0.00721502]),
 'test_acc': array([0.71428571, 0.71428571, 0.76190476, 0.71428571, 0.80952381,
        0.80952381, 0.71428571, 0.76190476, 0.76190476, 0.80952381]),
 'test_prec': array([0.35714286, 0.61842105, 0.72222222, 0.63235294, 0.9       ,
        0.75      , 0.375     , 0.64473684, 0.38095238, 0.75      ]),
 'test_rec': array([0.5       , 0.55      , 0.63333333, 0.6       , 0.6       ,
        0.66875   , 0.46875   , 0.56875   , 0.5       , 0.66875   ]),
 'test_f1': array([0.41666667, 0.53676471, 0.64646465, 0.60625   , 0.61111111,
        0.69117647, 0.41666667, 0.57142857, 0.43243243, 0.69117647]),
 'test_mse': array([-0.28571429, -0.28571429, -0.23809524, -0.28571429, -0.19047619,
        -0.19

In [148]:

cross_validate(RandomForestClassifier(),selected_data, Y, scoring=scoring, n_jobs=1)

  _warn_prf(average, modifier, msg_start, len(result))


{'fit_time': array([0.08522797, 0.07452989, 0.074862  , 0.07654309, 0.07625794]),
 'score_time': array([0.00802588, 0.00718307, 0.00704908, 0.00788093, 0.00710893]),
 'test_acc': array([0.73809524, 0.76190476, 0.73809524, 0.66666667, 0.78571429]),
 'test_prec': array([0.36904762, 0.71794872, 0.625     , 0.51428571, 0.70540541]),
 'test_rec': array([0.5       , 0.57478006, 0.52932551, 0.51026393, 0.61875   ]),
 'test_f1': array([0.42465753, 0.57142857, 0.49945829, 0.50505051, 0.63478261]),
 'test_mse': array([-0.26190476, -0.23809524, -0.26190476, -0.33333333, -0.21428571]),
 'test_rmse': array([-0.51176632, -0.48795004, -0.51176632, -0.57735027, -0.46291005])}

In [150]:


cross_validate(GradientBoostingClassifier(),selected_data, Y, scoring=scoring, n_jobs=1)

  _warn_prf(average, modifier, msg_start, len(result))


{'fit_time': array([0.05313706, 0.0508287 , 0.04411387, 0.04473519, 0.04331303]),
 'score_time': array([0.00523686, 0.00265217, 0.00253892, 0.00282598, 0.00256896]),
 'test_acc': array([0.73809524, 0.78571429, 0.66666667, 0.64285714, 0.78571429]),
 'test_prec': array([0.36904762, 0.72426471, 0.46486486, 0.49264706, 0.73076923]),
 'test_rec': array([0.5       , 0.67888563, 0.48093842, 0.4941349 , 0.584375  ]),
 'test_f1': array([0.42465753, 0.69392713, 0.45955882, 0.48987854, 0.59046587]),
 'test_mse': array([-0.26190476, -0.21428571, -0.33333333, -0.35714286, -0.21428571]),
 'test_rmse': array([-0.51176632, -0.46291005, -0.57735027, -0.5976143 , -0.46291005])}

In [152]:


from sklearn.linear_model import SGDClassifier


cross_validate(SGDClassifier(),selected_data, Y, scoring=scoring, n_jobs=1)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'fit_time': array([0.00637078, 0.00539279, 0.00484395, 0.00387907, 0.00401783]),
 'score_time': array([0.00697923, 0.00730014, 0.00647569, 0.00580096, 0.00639105]),
 'test_acc': array([0.73809524, 0.73809524, 0.26190476, 0.73809524, 0.76190476]),
 'test_prec': array([0.36904762, 0.36904762, 0.13095238, 0.36904762, 0.38095238]),
 'test_rec': array([0.5, 0.5, 0.5, 0.5, 0.5]),
 'test_f1': array([0.42465753, 0.42465753, 0.20754717, 0.42465753, 0.43243243]),
 'test_mse': array([-0.26190476, -0.26190476, -0.73809524, -0.26190476, -0.23809524]),
 'test_rmse': array([-0.51176632, -0.51176632, -0.85912469, -0.51176632, -0.48795004])}