In [44]:
from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)

import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, cross_val_predict, StratifiedKFold, train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score, classification_report, multilabel_confusion_matrix
import tensorflow as tf
import datetime, os
from tensorflow.keras.layers import Input, Dense, Activation, Dropout, Flatten, concatenate
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.utils import plot_model
from tensorflow.keras.optimizers import SGD
from tensorflow.keras.constraints import MaxNorm
from sklearn.model_selection import GridSearchCV
from splitrepeat import splitrepeat_cv

# Build the TF1-compat session used by Keras: the per-process GPU memory
# fraction is set to 0.8 and allow_growth is enabled on the same options object.
gpu_options = tf.compat.v1.GPUOptions(
    per_process_gpu_memory_fraction=0.8,
    allow_growth=True,
)
config = tf.compat.v1.ConfigProto(gpu_options=gpu_options)
session = tf.compat.v1.Session(config=config)
tf.compat.v1.keras.backend.set_session(session)

# Predictor columns fed to the models (64 names, order matters: the mask below
# is positional).
features = ['e_visspat_cg', 'LDELTOTAL', 'ADAS_Q12', 'ADAS_Q9', 'ADAS_Q4', 'AVDELTOT', 'moca_clock', 'MMDAY', 'CLOCKTIME', 'moca_serial7', 'MMBALLDL', 'MMTREEDL', 'AVRECALL', 'faq7', 'faq1', 'CATANIMSC', 'TRABERRCOM', 'ADAS_Q5', 'MMMONTH', 'MMFLOOR', 'nbspan_forward', 'ADAS_Q11', 'faq8', 'faq6', 'faq3', 'ADAS_Q13', 'nbspan_backward', 'PXHEADEY', 'faq10', 'e_plan_pt', 'e_divatt_pt', 'MMFLAGDL', 'ADAS_Q7', 'AVDEL30MIN', 'CLOCKSYM', 'ADAS_Q1', 'tmab_time', 'faq9', 'moca_visuo_exec', 'ADAS_Q2', 'moca_recall', 'faq2', 'faq4', 'TRABERROM', 'moca_similarities', 'e_memory_cg', 'TRAAERRCOM', 'moca_orient', 'MMSPELL_late', 'ADAS_Q10', 'PXSKIN', 'AVDELERR2', 'e_divatt_cg', 'ADAS_Q8', 'e_organ_cg', 'CATVEGESC', 'e_plan_cg', 'MMDRAW', 'COPYTIME', 'ADAS_Q3', 'ADAS_Q6', 'PXHEART', 'MMDATE', 'e_lang_cg']

# Every column name treated as categorical. The list is a superset of what is
# actually selected in `features`; names that are not in `features` never match.
categorical_columns = ['MMONFLR', 'faq10', 'faq6', 'COPYSYM', 'COPYNUM', 'faq5', 'MMTREE', 'COPYHAND', 'moca_clock', 'MMSEASON', 'moca_letters',
                       'MMBALL', 'faq9', 'MMFLOOR', 'MMDRAW', 'MMMONTH', 'PXGENAPP', 'MMWATCH', 'CLOCKCIRC', 'faq8', 'MMHOSPIT', 'moca_naming',
                       'PXEXTREM', 'CLOCKNUM', 'PXMUSCUL', 'faq1', 'MMTREEDL', 'CLOCKTIME', 'PXABDOM', 'MMFLAG', 'COPYCIRC', 'MMAREA', 'faq3',
                       'moca_digits', 'CLOCKHAND', 'MMREAD', 'MMYEAR', 'MMREPEAT', 'moca_visuo_exec','MMHAND', 'MMBALLDL', 'PXCHEST', 'MMDATE',
                       'MMFLAGDL',  'MMSTATE','moca_repeat', 'MMFOLD', 'MMPENCIL', 'MMDAY',  'faq2', 'PXHEART', 'CLOCKSYM', 'faq4', 'moca_serial7',
                       'faq7', 'MMCITY', 'PXHEADEY', 'COPYTIME', 'PXPERIPH', 'PXSKIN', 'moca_fluency', 'moca_similarities', 'PXNECK', 'MMWRITE',
                       'e_memory_pt', 'e_lang_pt', 'e_visspat_pt', 'e_plan_pt', 'e_organ_pt', 'e_divatt_pt','e_memory_cg', 'e_lang_cg',
                       'e_visspat_cg','e_plan_cg', 'e_organ_cg', 'e_divatt_cg']

# Boolean mask aligned with `features` (True where the column is categorical);
# this is what gets handed to SMOTENC(categorical_features=...).
# np.isin replaces np.in1d, which is deprecated as of NumPy 2.0; for 1-D input
# the two return the same array.
categorical_features = np.isin(features, categorical_columns)

# Load the processed ADNI table and pull out the model inputs and the label.
data = pd.read_csv('../data/processed/data_adni.csv')
X = data[features]

# Recode the CDGLOBAL label: 1 -> 0 and 2 -> 1 (any other value is left as is).
# Series.replace is used in its returning form rather than inplace=True, so the
# column inside `data` is never touched; whether an in-place replace on a column
# pulled out of a DataFrame writes back to the frame depends on the pandas
# version / copy-on-write mode.
y = data['CDGLOBAL'].replace({1: 0, 2: 1})

In [None]:
from imblearn.over_sampling import SMOTENC

# Keep a stratified 25% of the samples for hyper-parameter tuning (the other
# 75% ends up in X_test / y_test), to reduce leakage when using split/repeat
# cross-validation.
# NOTE(review): X and y are rebound here, so the cell is not idempotent and
# every later cell that reads X / y sees the tuning subset, not the full data.
X, X_test, y, y_test = train_test_split(
    X, y,
    train_size=0.25,
    stratify=y,
    random_state=33433,
)

# Oversample the tuning subset with SMOTENC; sampling_strategy=.5 is the float
# form of the parameter (target minority/majority ratio per the imbalanced-learn
# documentation).
sm = SMOTENC(
    categorical_features=categorical_features,
    sampling_strategy=.5,
    random_state=33433,
)
X, y = sm.fit_resample(X, y)
y = pd.Series(y)

# NOTE(review): the resampling above happens before this split, so X_valid can
# contain synthetic samples derived from rows in X_train -- confirm this is
# intended before trusting validation scores computed on it.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y,
    test_size=0.25,
    stratify=y,
    random_state=33433,
)

# RF Classifier

In [35]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=100, max_features=.5, random_state=0)

%time \
df = splitrepeat_cv(X,y,rf,splits=[111,222,333,444],repeats=[111,222,333,444,555])
df.describe()

Wall time: 7.05 s


Unnamed: 0,Sensitivity0,Specificity0,Accuracy0,AUC0,Sensitivity1,Specificity1,Accuracy1,AUC1,Sensitivity,Specificity,Accuracy,AUC
count,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0
mean,0.983424,0.955978,0.974275,0.969701,0.955978,0.983424,0.974275,0.969701,0.955978,0.983424,0.974275,0.969701
std,0.010937,0.018818,0.006097,0.007495,0.018818,0.010937,0.006097,0.007495,0.018818,0.010937,0.006097,0.007495
min,0.967391,0.923913,0.967391,0.956522,0.923913,0.967391,0.967391,0.956522,0.923913,0.967391,0.967391,0.956522
25%,0.972826,0.951087,0.970109,0.964674,0.951087,0.972826,0.970109,0.964674,0.951087,0.972826,0.970109,0.964674
50%,0.98913,0.967391,0.972826,0.96875,0.967391,0.98913,0.972826,0.96875,0.967391,0.98913,0.972826,0.96875
75%,0.98913,0.967391,0.978261,0.973505,0.967391,0.98913,0.978261,0.973505,0.967391,0.98913,0.978261,0.973505
max,1.0,0.978261,0.985507,0.983696,0.978261,1.0,0.985507,0.983696,0.978261,1.0,0.985507,0.983696


## Random Search

In [36]:
from sklearn.model_selection import RandomizedSearchCV

# Candidate values for the randomized search, one list per forest hyper-parameter.
n_estimators = list(range(200, 2001, 200))        # 200, 400, ..., 2000
max_features = ['sqrt', 'log2', .3, .5, .7, .9]
max_depth = list(range(10, 111, 10)) + [None]     # 10, 20, ..., 110, then unlimited
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]

# Mapping of estimator keyword -> candidate list, in the form the search expects.
search_params = dict(
    n_estimators=n_estimators,
    max_features=max_features,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
    bootstrap=bootstrap,
)

In [37]:
# Randomized search over `search_params`: 400 sampled configurations, each
# scored with 3-fold CV on the training part of the tuning set, using all cores.
rf_search_rand = RandomizedSearchCV(
    estimator=rf,
    param_distributions=search_params,
    n_iter=400,
    cv=3,
    random_state=33433,
    n_jobs=-1,
    verbose=2,
)
rf_search_rand.fit(X_train, y_train)

Fitting 3 folds for each of 400 candidates, totalling 1200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:   13.4s
[Parallel(n_jobs=-1)]: Done 138 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done 341 tasks      | elapsed:  2.6min
[Parallel(n_jobs=-1)]: Done 624 tasks      | elapsed:  4.8min
[Parallel(n_jobs=-1)]: Done 989 tasks      | elapsed:  7.2min
[Parallel(n_jobs=-1)]: Done 1200 out of 1200 | elapsed:  8.7min finished


RandomizedSearchCV(cv=3, error_score=nan,
                   estimator=RandomForestClassifier(bootstrap=True,
                                                    ccp_alpha=0.0,
                                                    class_weight=None,
                                                    criterion='gini',
                                                    max_depth=None,
                                                    max_features=0.5,
                                                    max_leaf_nodes=None,
                                                    max_samples=None,
                                                    min_impurity_decrease=0.0,
                                                    min_impurity_split=None,
                                                    min_samples_leaf=1,
                                                    min_samples_split=2,
                                                    min_weight_fraction_leaf=0.0,
                  

In [38]:
# Show the parameter setting selected by the randomized search.
rf_search_rand.best_params_

{'n_estimators': 600,
 'min_samples_split': 2,
 'min_samples_leaf': 1,
 'max_features': 'sqrt',
 'max_depth': 80,
 'bootstrap': True}

In [39]:
# Score the randomized-search winner with split/repeat CV on the current X / y
# (3 x 3 = 9 result rows in the recorded output).
best = rf_search_rand.best_estimator_
df_rand = splitrepeat_cv(
    X, y, best,
    splits=[10, 20, 30],
    repeats=[10, 20, 30],
)
df_rand.describe()

Unnamed: 0,Sensitivity0,Specificity0,Accuracy0,AUC0,Sensitivity1,Specificity1,Accuracy1,AUC1,Sensitivity,Specificity,Accuracy,AUC
count,9.0,9.0,9.0,9.0,9.0,9.0,9.0,9.0,9.0,9.0,9.0,9.0
mean,0.985507,0.963768,0.978261,0.974638,0.963768,0.985507,0.978261,0.974638,0.963768,0.985507,0.978261,0.974638
std,0.002717,0.02369,0.008302,0.012076,0.02369,0.002717,0.008302,0.012076,0.02369,0.002717,0.008302,0.012076
min,0.983696,0.934783,0.967391,0.959239,0.934783,0.983696,0.967391,0.959239,0.934783,0.983696,0.967391,0.959239
25%,0.983696,0.934783,0.967391,0.959239,0.934783,0.983696,0.967391,0.959239,0.934783,0.983696,0.967391,0.959239
50%,0.983696,0.967391,0.981884,0.978261,0.967391,0.983696,0.981884,0.978261,0.967391,0.983696,0.981884,0.978261
75%,0.98913,0.98913,0.985507,0.986413,0.98913,0.98913,0.985507,0.986413,0.98913,0.98913,0.985507,0.986413
max,0.98913,0.98913,0.985507,0.986413,0.98913,0.98913,0.985507,0.986413,0.98913,0.98913,0.985507,0.986413


## Grid Search

In [40]:
from sklearn.model_selection import GridSearchCV

# Narrow grid around the randomized-search optimum
# (n_estimators=600, max_features='sqrt', max_depth=80, min_samples_split=2).
search_params = {'n_estimators': [500, 600, 700],
               'max_features': [.2, 'sqrt', 'log2'],
               'max_depth': [70, 80, 90],
               # An integer min_samples_split must be >= 2 in scikit-learn; the
               # value 1 makes every fit for that candidate fail and score NaN,
               # wasting a third of the grid. [2, 3, 4] keeps the grid at 324
               # candidates while only exploring valid settings.
               'min_samples_split': [2, 3, 4],
               'min_samples_leaf': [1, 2],
               'bootstrap': [False, True]}

# Exhaustive search over the narrowed grid, 3-fold CV per candidate, all cores.
rf_search_grid = GridSearchCV(
    estimator=rf,
    param_grid=search_params,
    cv=3,
    n_jobs=-1,
    verbose=2,
)
rf_search_grid.fit(X_train, y_train)

Fitting 3 folds for each of 324 candidates, totalling 972 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    5.1s
[Parallel(n_jobs=-1)]: Done 138 tasks      | elapsed:   19.6s
[Parallel(n_jobs=-1)]: Done 341 tasks      | elapsed:   45.6s
[Parallel(n_jobs=-1)]: Done 624 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 972 out of 972 | elapsed:  2.3min finished


GridSearchCV(cv=3, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features=0.5,
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=None,
                                              oob_score=False, random_state=555,
                                    

In [41]:
# Show the parameter setting selected by the grid search.
rf_search_grid.best_params_

{'bootstrap': True,
 'max_depth': 70,
 'max_features': 'sqrt',
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'n_estimators': 600}

In [42]:
# Score the grid-search winner with split/repeat CV on the current X / y,
# using the same seeds as for the randomized-search winner.
best = rf_search_grid.best_estimator_
df_grid = splitrepeat_cv(
    X, y, best,
    splits=[10, 20, 30],
    repeats=[10, 20, 30],
)
df_grid.describe()

Unnamed: 0,Sensitivity0,Specificity0,Accuracy0,AUC0,Sensitivity1,Specificity1,Accuracy1,AUC1,Sensitivity,Specificity,Accuracy,AUC
count,9.0,9.0,9.0,9.0,9.0,9.0,9.0,9.0,9.0,9.0,9.0,9.0
mean,0.985507,0.963768,0.978261,0.974638,0.963768,0.985507,0.978261,0.974638,0.963768,0.985507,0.978261,0.974638
std,0.002717,0.02369,0.008302,0.012076,0.02369,0.002717,0.008302,0.012076,0.02369,0.002717,0.008302,0.012076
min,0.983696,0.934783,0.967391,0.959239,0.934783,0.983696,0.967391,0.959239,0.934783,0.983696,0.967391,0.959239
25%,0.983696,0.934783,0.967391,0.959239,0.934783,0.983696,0.967391,0.959239,0.934783,0.983696,0.967391,0.959239
50%,0.983696,0.967391,0.981884,0.978261,0.967391,0.983696,0.981884,0.978261,0.967391,0.983696,0.981884,0.978261
75%,0.98913,0.98913,0.985507,0.986413,0.98913,0.98913,0.985507,0.986413,0.98913,0.98913,0.985507,0.986413
max,0.98913,0.98913,0.985507,0.986413,0.98913,0.98913,0.985507,0.986413,0.98913,0.98913,0.985507,0.986413


In [43]:
rf = RandomForestClassifier(n_estimators=600, max_features='sqrt', max_depth = 70, min_samples_split = 2, min_samples_leaf = 1, bootstrap=True, random_state=33433)

%time \
df = splitrepeat_cv(X,y,rf,splits=[111,222,333,444],repeats=[111,222,333,444,555])
df.describe()

Wall time: 25.4 s


Unnamed: 0,Sensitivity0,Specificity0,Accuracy0,AUC0,Sensitivity1,Specificity1,Accuracy1,AUC1,Sensitivity,Specificity,Accuracy,AUC
count,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0
mean,0.979348,0.972826,0.977174,0.976087,0.972826,0.979348,0.977174,0.976087,0.972826,0.979348,0.977174,0.976087
std,0.010068,0.018154,0.005771,0.007302,0.018154,0.010068,0.005771,0.007302,0.018154,0.010068,0.005771,0.007302
min,0.961957,0.945652,0.967391,0.967391,0.945652,0.961957,0.967391,0.967391,0.945652,0.961957,0.967391,0.967391
25%,0.971467,0.961957,0.973732,0.970109,0.961957,0.971467,0.973732,0.970109,0.961957,0.971467,0.973732,0.970109
50%,0.978261,0.978261,0.978261,0.975543,0.978261,0.978261,0.978261,0.975543,0.978261,0.978261,0.978261,0.975543
75%,0.985054,0.980978,0.981884,0.980978,0.980978,0.985054,0.981884,0.980978,0.980978,0.985054,0.981884,0.980978
max,0.994565,1.0,0.985507,0.98913,1.0,0.994565,0.985507,0.98913,1.0,0.994565,0.985507,0.98913


In [45]:
# All samples
rf = RandomForestClassifier(n_estimators=600, max_features='sqrt', max_depth = 70, min_samples_split = 2, min_samples_leaf = 1, bootstrap=True, random_state=33433)

%time \
df = splitrepeat_cv(X,y,rf,splits=list(int(x)*42+42 for x in range(10)),repeats=list(int(x)*42+42 for x in range(10)), imbalanced='over', categorical_features=categorical_features, initial_split_seed=33433, initial_split_ratio=.25)
df.describe()

Wall time: 2min 7s


Unnamed: 0,Sensitivity0,Specificity0,Accuracy0,AUC0,Sensitivity1,Specificity1,Accuracy1,AUC1,Sensitivity,Specificity,Accuracy,AUC
count,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0
mean,0.991626,0.631739,0.97051,0.811683,0.631739,0.991626,0.97051,0.811683,0.631739,0.991626,0.97051,0.811683
std,0.006389,0.075548,0.007173,0.037647,0.075548,0.006389,0.007173,0.037647,0.075548,0.006389,0.007173,0.037647
min,0.97561,0.434783,0.94898,0.717391,0.434783,0.97561,0.94898,0.717391,0.434783,0.97561,0.94898,0.717391
25%,0.98916,0.565217,0.969388,0.781254,0.565217,0.98916,0.969388,0.781254,0.565217,0.98916,0.969388,0.781254
50%,0.99187,0.608696,0.971939,0.802993,0.608696,0.99187,0.971939,0.802993,0.608696,0.99187,0.971939,0.802993
75%,0.99729,0.695652,0.977041,0.845116,0.695652,0.99729,0.977041,0.845116,0.695652,0.99729,0.977041,0.845116
max,1.0,0.73913,0.982143,0.866855,0.73913,1.0,0.982143,0.866855,0.73913,1.0,0.982143,0.866855


# Outputs

In [46]:
### Save outputs ###
# Label for this run; stored in the 'set' column and used as the CSV file stem.
q = 'CDR1_BorutaSHAP_2'

df['set'] = q

# Create the target folder if needed: to_csv does not create missing
# directories and would otherwise fail on a fresh checkout.
output_dir = '../models/outputs/'
os.makedirs(output_dir, exist_ok=True)
df.to_csv((output_dir + q + '.csv'), index=False)