In [13]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, cross_val_predict, StratifiedKFold, train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score, classification_report, multilabel_confusion_matrix
import tensorflow as tf
import datetime, os
from tensorflow.keras.layers import Input, Dense, Activation, Dropout, Flatten, concatenate
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.utils import plot_model
from tensorflow.keras.optimizers import SGD
from tensorflow.keras.constraints import MaxNorm
from sklearn.model_selection import GridSearchCV
from splitrepeat import splitrepeat_cv

config = tf.compat.v1.ConfigProto(gpu_options = tf.compat.v1.GPUOptions(per_process_gpu_memory_fraction=0.8))
config.gpu_options.allow_growth = True
session = tf.compat.v1.Session(config=config)
tf.compat.v1.keras.backend.set_session(session)

features = ['faq8', 'moca_orient', 'AVDELTOT', 'e_plan_pt', 'MMTREEDL', 'e_lang_cg', 'ADAS_Q11', 'LDELTOTAL', 'e_divatt_cg', 'e_memory_pt', 'faq4', 'faq1', 'e_plan_cg', 'ADAS_Q7', 'ADAS_Q8', 'faq3', 'e_visspat_pt', 'AVDEL30MIN', 'e_visspat_cg', 'faq2', 'e_organ_cg', 'moca_recall', 'faq10', 'e_lang_pt', 'CATANIMSC', 'e_memory_cg', 'ADAS_Q1', 'faq9', 'ADAS_Q4', 'tmab_time', 'e_divatt_pt']

categorical_features = np.in1d(features, ['MMONFLR', 'faq10', 'faq6', 'COPYSYM', 'COPYNUM', 'faq5', 'MMTREE', 'COPYHAND', 'moca_clock', 'MMSEASON', 'moca_letters', 
                                          'MMBALL', 'faq9', 'MMFLOOR', 'MMDRAW', 'MMMONTH', 'PXGENAPP', 'MMWATCH', 'CLOCKCIRC', 'faq8', 'MMHOSPIT', 'moca_naming', 
                                          'PXEXTREM', 'CLOCKNUM', 'PXMUSCUL', 'faq1', 'MMTREEDL', 'CLOCKTIME', 'PXABDOM', 'MMFLAG', 'COPYCIRC', 'MMAREA', 'faq3', 
                                          'moca_digits', 'CLOCKHAND', 'MMREAD', 'MMYEAR', 'MMREPEAT', 'moca_visuo_exec','MMHAND', 'MMBALLDL', 'PXCHEST', 'MMDATE', 
                                          'MMFLAGDL',  'MMSTATE','moca_repeat', 'MMFOLD', 'MMPENCIL', 'MMDAY',  'faq2', 'PXHEART', 'CLOCKSYM', 'faq4', 'moca_serial7', 
                                          'faq7', 'MMCITY', 'PXHEADEY', 'COPYTIME', 'PXPERIPH', 'PXSKIN', 'moca_fluency', 'moca_similarities', 'PXNECK', 'MMWRITE',
                                          'e_memory_pt', 'e_lang_pt', 'e_visspat_pt', 'e_plan_pt', 'e_organ_pt', 'e_divatt_pt','e_memory_cg', 'e_lang_cg', 
                                          'e_visspat_cg','e_plan_cg', 'e_organ_cg', 'e_divatt_cg'])

data = pd.read_csv('../data/interim/data_adni.csv')
X = data[features]
y = data['CDGLOBAL']
y.replace({2:0,1:1},inplace=True)

In [2]:
X, X_test, y, y_test = train_test_split(X, y, train_size=0.25, random_state=33433, stratify=y)
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.25, random_state=33433, stratify=y)     

# RF Classifier

In [4]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=100, max_features=.5, random_state=0)

%time \
df = splitrepeat_cv(X,y,rf,splits=[111,222,333,444],repeats=[111,222,333,444,555])
df.describe()

Wall time: 3.9 s


Unnamed: 0,Sensitivity0,Specificity0,Accuracy0,AUC0,Sensitivity1,Specificity1,Accuracy1,AUC1,Sensitivity,Specificity,Accuracy,AUC
count,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0
mean,0.7935,0.882812,0.837245,0.838156,0.882812,0.7935,0.837245,0.838156,0.882812,0.7935,0.837245,0.838156
std,0.036168,0.035906,0.012596,0.012569,0.035906,0.036168,0.012596,0.012569,0.035906,0.036168,0.012596,0.012569
min,0.72,0.822917,0.811224,0.811458,0.822917,0.72,0.811224,0.811458,0.822917,0.72,0.811224,0.811458
25%,0.7875,0.864583,0.831633,0.83224,0.864583,0.7875,0.831633,0.83224,0.864583,0.7875,0.831633,0.83224
50%,0.81,0.880208,0.836735,0.83875,0.880208,0.81,0.836735,0.83875,0.880208,0.81,0.836735,0.83875
75%,0.82,0.90625,0.846939,0.847552,0.90625,0.82,0.846939,0.847552,0.90625,0.82,0.846939,0.847552
max,0.82,0.947917,0.857143,0.858125,0.947917,0.82,0.857143,0.858125,0.947917,0.82,0.857143,0.858125


## Random Search

In [5]:
from sklearn.model_selection import RandomizedSearchCV

# Search parameters
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
max_features = ['sqrt', 'log2', .3, .5, .7, .9]
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]

search_params = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

In [6]:
rf_search_rand = RandomizedSearchCV(rf, search_params, n_iter=400,cv=3,verbose=2,random_state=33433, n_jobs=-1)
rf_search_rand.fit(X_train, y_train)

Fitting 3 folds for each of 400 candidates, totalling 1200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   12.0s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:   54.9s
[Parallel(n_jobs=-1)]: Done 349 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 632 tasks      | elapsed:  3.6min
[Parallel(n_jobs=-1)]: Done 997 tasks      | elapsed:  5.6min
[Parallel(n_jobs=-1)]: Done 1200 out of 1200 | elapsed:  6.7min finished


RandomizedSearchCV(cv=3,
                   estimator=RandomForestClassifier(max_features=0.5,
                                                    random_state=555),
                   n_iter=400, n_jobs=-1,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [10, 20, 30, 40, 50, 60,
                                                      70, 80, 90, 100, 110,
                                                      None],
                                        'max_features': ['sqrt', 'log2', 0.3,
                                                         0.5, 0.7, 0.9],
                                        'min_samples_leaf': [1, 2, 4],
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': [200, 400, 600, 800,
                                                         1000, 1200, 1400, 1600,
                                                        

In [7]:
rf_search_rand.best_params_

{'n_estimators': 800,
 'min_samples_split': 2,
 'min_samples_leaf': 1,
 'max_features': 'log2',
 'max_depth': 90,
 'bootstrap': False}

In [8]:
best = rf_search_rand.best_estimator_
df_rand = splitrepeat_cv(X,y,best,splits=[10,20,30],repeats=[10,20,30])
df_rand.describe()

Unnamed: 0,Sensitivity0,Specificity0,Accuracy0,AUC0,Sensitivity1,Specificity1,Accuracy1,AUC1,Sensitivity,Specificity,Accuracy,AUC
count,9.0,9.0,9.0,9.0,9.0,9.0,9.0,9.0,9.0,9.0,9.0,9.0
mean,0.816667,0.907407,0.861111,0.862037,0.907407,0.816667,0.861111,0.862037,0.907407,0.816667,0.861111,0.862037
std,0.070178,0.024675,0.025482,0.024584,0.024675,0.070178,0.025482,0.024584,0.024675,0.070178,0.025482,0.024584
min,0.72,0.875,0.826531,0.82875,0.875,0.72,0.826531,0.82875,0.875,0.72,0.826531,0.82875
25%,0.73,0.885417,0.831633,0.83375,0.885417,0.73,0.831633,0.83375,0.885417,0.73,0.831633,0.83375
50%,0.86,0.90625,0.872449,0.872708,0.90625,0.86,0.872449,0.872708,0.90625,0.86,0.872449,0.872708
75%,0.86,0.9375,0.882653,0.882917,0.9375,0.86,0.882653,0.882917,0.9375,0.86,0.882653,0.882917
max,0.87,0.9375,0.887755,0.888125,0.9375,0.87,0.887755,0.888125,0.9375,0.87,0.887755,0.888125


## Grid Search

In [9]:
from sklearn.model_selection import GridSearchCV

search_params = {'n_estimators': [700, 800, 900],
               'max_features': [.2, 'log2', 'sqrt'],
               'max_depth': [90, 100, 80],
               'min_samples_split': [1, 2, 3],
               'min_samples_leaf': [1, 2],
               'bootstrap': [False, True]}

rf_search_grid = GridSearchCV(rf, search_params, 
                          cv = 3, n_jobs = -1, verbose = 2)
rf_search_grid.fit(X_train, y_train)

Fitting 3 folds for each of 324 candidates, totalling 972 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    4.1s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:   22.8s
[Parallel(n_jobs=-1)]: Done 349 tasks      | elapsed:   54.8s
[Parallel(n_jobs=-1)]: Done 632 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 972 out of 972 | elapsed:  2.8min finished


GridSearchCV(cv=3,
             estimator=RandomForestClassifier(max_features=0.5,
                                              random_state=555),
             n_jobs=-1,
             param_grid={'bootstrap': [False, True], 'max_depth': [90, 100, 80],
                         'max_features': [0.2, 'log2', 'sqrt'],
                         'min_samples_leaf': [1, 2],
                         'min_samples_split': [1, 2, 3],
                         'n_estimators': [700, 800, 900]},
             verbose=2)

In [10]:
rf_search_grid.best_params_

{'bootstrap': False,
 'max_depth': 90,
 'max_features': 'log2',
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'n_estimators': 800}

In [11]:
best = rf_search_grid.best_estimator_
df_grid = splitrepeat_cv(X,y,best,splits=[10,20,30],repeats=[10,20,30])
df_grid.describe()

Unnamed: 0,Sensitivity0,Specificity0,Accuracy0,AUC0,Sensitivity1,Specificity1,Accuracy1,AUC1,Sensitivity,Specificity,Accuracy,AUC
count,9.0,9.0,9.0,9.0,9.0,9.0,9.0,9.0,9.0,9.0,9.0,9.0
mean,0.816667,0.907407,0.861111,0.862037,0.907407,0.816667,0.861111,0.862037,0.907407,0.816667,0.861111,0.862037
std,0.070178,0.024675,0.025482,0.024584,0.024675,0.070178,0.025482,0.024584,0.024675,0.070178,0.025482,0.024584
min,0.72,0.875,0.826531,0.82875,0.875,0.72,0.826531,0.82875,0.875,0.72,0.826531,0.82875
25%,0.73,0.885417,0.831633,0.83375,0.885417,0.73,0.831633,0.83375,0.885417,0.73,0.831633,0.83375
50%,0.86,0.90625,0.872449,0.872708,0.90625,0.86,0.872449,0.872708,0.90625,0.86,0.872449,0.872708
75%,0.86,0.9375,0.882653,0.882917,0.9375,0.86,0.882653,0.882917,0.9375,0.86,0.882653,0.882917
max,0.87,0.9375,0.887755,0.888125,0.9375,0.87,0.887755,0.888125,0.9375,0.87,0.887755,0.888125


In [12]:
rf = RandomForestClassifier(n_estimators=90, max_features= 'log2', max_depth = 90, min_samples_split = 2, min_samples_leaf = 1, bootstrap=False, random_state=33433)

%time \
df = splitrepeat_cv(X,y,rf,splits=[111,222,333,444],repeats=[111,222,333,444,555])
df.describe()

Wall time: 2.56 s


Unnamed: 0,Sensitivity0,Specificity0,Accuracy0,AUC0,Sensitivity1,Specificity1,Accuracy1,AUC1,Sensitivity,Specificity,Accuracy,AUC
count,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0
mean,0.8045,0.878646,0.840816,0.841573,0.878646,0.8045,0.840816,0.841573,0.878646,0.8045,0.840816,0.841573
std,0.031031,0.032988,0.019133,0.019159,0.032988,0.031031,0.019133,0.019159,0.032988,0.031031,0.019133,0.019159
min,0.75,0.822917,0.811224,0.811667,0.822917,0.75,0.811224,0.811667,0.822917,0.75,0.811224,0.811667
25%,0.7875,0.851562,0.826531,0.826979,0.851562,0.7875,0.826531,0.826979,0.851562,0.7875,0.826531,0.826979
50%,0.81,0.890625,0.836735,0.8375,0.890625,0.81,0.836735,0.8375,0.890625,0.81,0.836735,0.8375
75%,0.8225,0.90625,0.84949,0.850729,0.90625,0.8225,0.84949,0.850729,0.90625,0.8225,0.84949,0.850729
max,0.85,0.916667,0.877551,0.878125,0.916667,0.85,0.877551,0.878125,0.916667,0.85,0.877551,0.878125


In [14]:
# All samples

%time \
df = splitrepeat_cv(X,y,rf,splits=list(int(x)*42+42 for x in range(10)),repeats=list(int(x)*42+42 for x in range(10)), initial_split_seed=33433, initial_split_ratio=.25)
df.describe()

Wall time: 17.9 s


Unnamed: 0,Sensitivity0,Specificity0,Accuracy0,AUC0,Sensitivity1,Specificity1,Accuracy1,AUC1,Sensitivity,Specificity,Accuracy,AUC
count,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0
mean,0.84595,0.899688,0.87227,0.872819,0.899688,0.84595,0.87227,0.872819,0.899688,0.84595,0.87227,0.872819
std,0.026721,0.01394,0.014184,0.013992,0.01394,0.026721,0.014184,0.013992,0.01394,0.026721,0.014184,0.013992
min,0.775,0.864583,0.841837,0.842917,0.864583,0.775,0.841837,0.842917,0.864583,0.775,0.841837,0.842917
25%,0.83,0.890625,0.859694,0.860833,0.890625,0.83,0.859694,0.860833,0.890625,0.83,0.859694,0.860833
50%,0.8475,0.901042,0.872449,0.873281,0.901042,0.8475,0.872449,0.873281,0.901042,0.8475,0.872449,0.873281
75%,0.87,0.911458,0.885204,0.885339,0.911458,0.87,0.885204,0.885339,0.911458,0.87,0.885204,0.885339
max,0.885,0.9375,0.910714,0.91125,0.9375,0.885,0.910714,0.91125,0.9375,0.885,0.910714,0.91125


# Outputs

In [15]:
### Save outputs ###
q = 'CDR5_BorutaSHAP_2'

df['set'] = q
df.to_csv(('../models/outputs/' + q + '.csv'), index=False)