In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, cross_val_predict, StratifiedKFold, train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score, classification_report, multilabel_confusion_matrix
import tensorflow as tf
import datetime, os
from tensorflow.keras.layers import Input, Dense, Activation, Dropout, Flatten, concatenate
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.utils import plot_model
from tensorflow.keras.optimizers import SGD
from tensorflow.keras.constraints import MaxNorm
from sklearn.model_selection import GridSearchCV
from splitrepeat import splitrepeat_cv

config = tf.compat.v1.ConfigProto(gpu_options = tf.compat.v1.GPUOptions(per_process_gpu_memory_fraction=0.8))
config.gpu_options.allow_growth = True
session = tf.compat.v1.Session(config=config)
tf.compat.v1.keras.backend.set_session(session)

features = ['ADAS_Q1', 'ADAS_Q4', 'LDELTOTAL','tmab_time', 
           'e_memory_pt', 'e_lang_pt', 'e_visspat_pt', 'e_plan_pt', 'e_organ_pt', 'e_divatt_pt','e_memory_cg', 'e_lang_cg', 
           'e_visspat_cg','e_plan_cg', 'e_organ_cg', 'e_divatt_cg', 'faq1','faq2','faq3','faq4','faq5','faq6','faq7','faq8','faq9','faq10']

categorical_features = np.in1d(features, ['MMONFLR', 'q_memory_pt', 'faq10', 'faq6', 'COPYSYM', 'q_judgmt_cg', 'COPYNUM', 'faq5', 'MMTREE', 'COPYHAND', 'moca_clock', 'MMSEASON', 'moca_letters', 'MMBALL', 'faq9', 'MMFLOOR', 'MMDRAW', 'MMMONTH', 'PXGENAPP', 'MMWATCH', 'CLOCKCIRC', 'faq8', 'MMHOSPIT', 'moca_naming', 'PXEXTREM', 'q_orient_pt', 'CLOCKNUM', 'PXMUSCUL', 'faq1', 'q_orient_cg', 'MMTREEDL', 'CLOCKTIME', 'PXABDOM', 'MMFLAG', 'COPYCIRC', 'MMAREA', 'faq3', 'moca_digits', 'CLOCKHAND', 'MMREAD', 'q_memory_cg', 'MMYEAR', 'MMREPEAT', 'q_homeact_cg', 'q_language_pt', 'moca_visuo_exec', 'q_judgmt_pt', 'MMHAND', 'MMBALLDL', 'PXCHEST', 'MMDATE', 'MMFLAGDL', 'q_outsideact_pt', 'MMSTATE', 'q_outsideact_cg', 'q_attention_cg', 'moca_repeat', 'q_homeact_pt', 'MMFOLD', 'MMPENCIL', 'q_language_cg', 'MMDAY', 'q_attention_pt', 'faq2', 'PXHEART', 'CLOCKSYM', 'faq4', 'moca_serial7', 'faq7', 'MMCITY', 'PXHEADEY', 'COPYTIME', 'PXPERIPH', 'PXSKIN', 'moca_fluency', 'moca_similarities', 'PXNECK', 'MMWRITE'])

data = pd.read_csv('../data/interim/data_adni.csv')
X = data[features]
y = data['CDGLOBAL']
y.replace({2:1},inplace=True)

In [2]:
_ = features
print(len(_))
print('[%s]' % ', '.join(map(str, _)))

26
[ADAS_Q1, ADAS_Q4, LDELTOTAL, tmab_time, e_memory_pt, e_lang_pt, e_visspat_pt, e_plan_pt, e_organ_pt, e_divatt_pt, e_memory_cg, e_lang_cg, e_visspat_cg, e_plan_cg, e_organ_cg, e_divatt_cg, faq1, faq2, faq3, faq4, faq5, faq6, faq7, faq8, faq9, faq10]


In [2]:
X, X_test, y, y_test = train_test_split(X, y, train_size=0.25, random_state=33433, stratify=y)
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.25, random_state=33433, stratify=y)     

# RF Classifier

In [4]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=100, max_features=.5, random_state=0)

%time \
df = splitrepeat_cv(X,y,rf,splits=[111,222,333,444],repeats=[111,222,333,444,555])
df.describe()

Wall time: 4.69 s


Unnamed: 0,Sensitivity0,Specificity0,Accuracy0,AUC0,Sensitivity1,Specificity1,Accuracy1,AUC1,Sensitivity,Specificity,Accuracy,AUC
count,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0
mean,0.809091,0.922222,0.871429,0.865657,0.922222,0.809091,0.871429,0.865657,0.922222,0.809091,0.871429,0.865657
std,0.024622,0.019329,0.014488,0.014797,0.019329,0.024622,0.014488,0.014797,0.019329,0.024622,0.014488,0.014797
min,0.761364,0.888889,0.841837,0.834386,0.888889,0.761364,0.841837,0.834386,0.888889,0.761364,0.841837,0.834386
25%,0.792614,0.907407,0.864796,0.858112,0.907407,0.792614,0.864796,0.858112,0.907407,0.792614,0.864796,0.858112
50%,0.818182,0.925926,0.875,0.869213,0.925926,0.818182,0.875,0.869213,0.925926,0.818182,0.875,0.869213
75%,0.829545,0.935185,0.882653,0.875894,0.935185,0.829545,0.882653,0.875894,0.935185,0.829545,0.882653,0.875894
max,0.852273,0.953704,0.892857,0.886995,0.953704,0.852273,0.892857,0.886995,0.953704,0.852273,0.892857,0.886995


## Random Search

In [5]:
from sklearn.model_selection import RandomizedSearchCV

# Search parameters
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
max_features = ['sqrt', 'log2', .3, .5, .7, .9]
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]

search_params = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

In [6]:
rf_search_rand = RandomizedSearchCV(rf, search_params, n_iter=400,cv=3,verbose=2,random_state=33433, n_jobs=-1)
rf_search_rand.fit(X_train, y_train)

Fitting 3 folds for each of 400 candidates, totalling 1200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:   12.3s
[Parallel(n_jobs=-1)]: Done 138 tasks      | elapsed:   46.4s
[Parallel(n_jobs=-1)]: Done 341 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 624 tasks      | elapsed:  3.2min
[Parallel(n_jobs=-1)]: Done 989 tasks      | elapsed:  5.1min
[Parallel(n_jobs=-1)]: Done 1200 out of 1200 | elapsed:  6.1min finished


RandomizedSearchCV(cv=3, error_score=nan,
                   estimator=RandomForestClassifier(bootstrap=True,
                                                    ccp_alpha=0.0,
                                                    class_weight=None,
                                                    criterion='gini',
                                                    max_depth=None,
                                                    max_features=0.5,
                                                    max_leaf_nodes=None,
                                                    max_samples=None,
                                                    min_impurity_decrease=0.0,
                                                    min_impurity_split=None,
                                                    min_samples_leaf=1,
                                                    min_samples_split=2,
                                                    min_weight_fraction_leaf=0.0,
                  

In [7]:
rf_search_rand.best_params_

{'n_estimators': 200,
 'min_samples_split': 2,
 'min_samples_leaf': 2,
 'max_features': 0.7,
 'max_depth': 50,
 'bootstrap': True}

In [8]:
best = rf_search_rand.best_estimator_
df_rand = splitrepeat_cv(X,y,best,splits=[10,20,30],repeats=[10,20,30])
df_rand.describe()

Unnamed: 0,Sensitivity0,Specificity0,Accuracy0,AUC0,Sensitivity1,Specificity1,Accuracy1,AUC1,Sensitivity,Specificity,Accuracy,AUC
count,9.0,9.0,9.0,9.0,9.0,9.0,9.0,9.0,9.0,9.0,9.0,9.0
mean,0.871212,0.923868,0.900227,0.89754,0.923868,0.871212,0.900227,0.89754,0.923868,0.871212,0.900227,0.89754
std,0.027249,0.01012,0.010239,0.011598,0.01012,0.027249,0.010239,0.011598,0.01012,0.027249,0.010239,0.011598
min,0.840909,0.907407,0.882653,0.87984,0.907407,0.840909,0.882653,0.87984,0.907407,0.840909,0.882653,0.87984
25%,0.852273,0.916667,0.892857,0.889099,0.916667,0.852273,0.892857,0.889099,0.916667,0.852273,0.892857,0.889099
50%,0.852273,0.925926,0.897959,0.893729,0.925926,0.852273,0.897959,0.893729,0.925926,0.852273,0.897959,0.893729
75%,0.897727,0.935185,0.908163,0.907197,0.935185,0.897727,0.908163,0.907197,0.935185,0.897727,0.908163,0.907197
max,0.909091,0.935185,0.913265,0.912879,0.935185,0.909091,0.913265,0.912879,0.935185,0.909091,0.913265,0.912879


## Grid Search

In [9]:
from sklearn.model_selection import GridSearchCV

search_params = {'n_estimators': [100, 200, 300],
               'max_features': [.6, .7, .8],
               'max_depth': [40, 50, 60],
               'min_samples_split': [1,2,3],
               'min_samples_leaf': [1, 2, 3],
               'bootstrap': [False, True]}

rf_search_grid = GridSearchCV(rf, search_params, 
                          cv = 3, n_jobs = -1, verbose = 2)
rf_search_grid.fit(X_train, y_train)

Fitting 3 folds for each of 486 candidates, totalling 1458 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    0.6s
[Parallel(n_jobs=-1)]: Done 198 tasks      | elapsed:    8.5s
[Parallel(n_jobs=-1)]: Done 401 tasks      | elapsed:   17.1s
[Parallel(n_jobs=-1)]: Done 684 tasks      | elapsed:   28.8s
[Parallel(n_jobs=-1)]: Done 1049 tasks      | elapsed:   45.9s
[Parallel(n_jobs=-1)]: Done 1458 out of 1458 | elapsed:  1.1min finished


GridSearchCV(cv=3, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features=0.5,
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=None,
                                              oob_score=False, random_state=555,
                                    

In [10]:
rf_search_grid.best_params_

{'bootstrap': True,
 'max_depth': 40,
 'max_features': 0.6,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'n_estimators': 100}

In [11]:
best = rf_search_grid.best_estimator_
df_grid = splitrepeat_cv(X,y,best,splits=[10,20,30],repeats=[10,20,30])
df_grid.describe()

Unnamed: 0,Sensitivity0,Specificity0,Accuracy0,AUC0,Sensitivity1,Specificity1,Accuracy1,AUC1,Sensitivity,Specificity,Accuracy,AUC
count,9.0,9.0,9.0,9.0,9.0,9.0,9.0,9.0,9.0,9.0,9.0,9.0
mean,0.864899,0.930041,0.900794,0.89747,0.930041,0.864899,0.900794,0.89747,0.930041,0.864899,0.900794,0.89747
std,0.023041,0.010467,0.011152,0.012001,0.010467,0.023041,0.011152,0.012001,0.010467,0.023041,0.011152,0.012001
min,0.840909,0.916667,0.887755,0.883418,0.916667,0.840909,0.887755,0.883418,0.916667,0.840909,0.887755,0.883418
25%,0.852273,0.925926,0.892857,0.889099,0.925926,0.852273,0.892857,0.889099,0.925926,0.852273,0.892857,0.889099
50%,0.852273,0.925926,0.897959,0.893729,0.925926,0.852273,0.897959,0.893729,0.925926,0.852273,0.897959,0.893729
75%,0.886364,0.935185,0.908163,0.90404,0.935185,0.886364,0.908163,0.90404,0.935185,0.886364,0.908163,0.90404
max,0.897727,0.944444,0.923469,0.921086,0.944444,0.897727,0.923469,0.921086,0.944444,0.897727,0.923469,0.921086


In [12]:
rf = RandomForestClassifier(n_estimators=100, max_features=.6, max_depth = 40, min_samples_split = 2, min_samples_leaf = 1, bootstrap=True, random_state=33433)

%time \
df = splitrepeat_cv(X,y,rf,splits=[111,222,333,444],repeats=[111,222,333,444,555])
df.describe()

Wall time: 4.27 s


Unnamed: 0,Sensitivity0,Specificity0,Accuracy0,AUC0,Sensitivity1,Specificity1,Accuracy1,AUC1,Sensitivity,Specificity,Accuracy,AUC
count,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0
mean,0.808523,0.923611,0.871939,0.866067,0.923611,0.808523,0.871939,0.866067,0.923611,0.808523,0.871939,0.866067
std,0.026656,0.016143,0.015072,0.015736,0.016143,0.026656,0.015072,0.015736,0.016143,0.026656,0.015072,0.015736
min,0.75,0.888889,0.841837,0.833333,0.888889,0.75,0.841837,0.833333,0.888889,0.75,0.841837,0.833333
25%,0.792614,0.914352,0.862245,0.855798,0.914352,0.792614,0.862245,0.855798,0.914352,0.792614,0.862245,0.855798
50%,0.818182,0.925926,0.877551,0.871002,0.925926,0.818182,0.877551,0.871002,0.925926,0.818182,0.877551,0.871002
75%,0.829545,0.935185,0.882653,0.876684,0.935185,0.829545,0.882653,0.876684,0.935185,0.829545,0.882653,0.876684
max,0.852273,0.953704,0.897959,0.893729,0.953704,0.852273,0.897959,0.893729,0.953704,0.852273,0.897959,0.893729


In [2]:
# All samples
rf = RandomForestClassifier(n_estimators=100, max_features=.6, max_depth = 40, min_samples_split = 2, min_samples_leaf = 1, bootstrap=True, random_state=33433)

%time \
df = splitrepeat_cv(X,y,rf,splits=list(int(x)*42+42 for x in range(20)),repeats=list(int(x)*42+42 for x in range(20)), avg_strategy='weighted', initial_split_seed=33433, initial_split_ratio=.25)
df.describe()

Wall time: 1min 59s


Unnamed: 0,Sensitivity0,Specificity0,Accuracy0,AUC0,Sensitivity1,Specificity1,Accuracy1,AUC1,Sensitivity,Specificity,Accuracy,AUC
count,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0
mean,0.846899,0.930636,0.893895,0.888768,0.930636,0.846899,0.893895,0.888768,0.930636,0.846899,0.893895,0.888768
std,0.0287,0.019358,0.015613,0.016317,0.019358,0.0287,0.015613,0.016317,0.019358,0.0287,0.015613,0.016317
min,0.782946,0.878788,0.840136,0.834743,0.878788,0.782946,0.840136,0.834743,0.878788,0.782946,0.840136,0.834743
25%,0.821705,0.915152,0.884354,0.877519,0.915152,0.821705,0.884354,0.877519,0.915152,0.821705,0.884354,0.877519
50%,0.844961,0.933333,0.894558,0.889147,0.933333,0.844961,0.894558,0.889147,0.933333,0.844961,0.894558,0.889147
75%,0.868217,0.945455,0.904762,0.899084,0.945455,0.868217,0.904762,0.899084,0.945455,0.868217,0.904762,0.899084
max,0.914729,0.969697,0.931973,0.929246,0.969697,0.914729,0.931973,0.929246,0.969697,0.914729,0.931973,0.929246


# Outputs

In [3]:
### Save outputs ###
q = 'CDR_Imp_B_EFAQ_400'

df['set'] = q
df.to_csv(('../models/outputs/' + q + '.csv'), index=False)