In [1]:
import datetime, os

import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score, classification_report, multilabel_confusion_matrix
from sklearn.model_selection import cross_val_score, cross_val_predict, StratifiedKFold, train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.constraints import MaxNorm
from tensorflow.keras.layers import Input, Dense, Activation, Dropout, Flatten, concatenate
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.optimizers import SGD
from tensorflow.keras.utils import plot_model

from splitrepeat import splitrepeat_cv

# TF1-style session setup through the compat.v1 API: the session is created
# with per_process_gpu_memory_fraction=0.8 and allow_growth enabled, then
# registered as the Keras backend session.
config = tf.compat.v1.ConfigProto(gpu_options = tf.compat.v1.GPUOptions(per_process_gpu_memory_fraction=0.8))
config.gpu_options.allow_growth = True
session = tf.compat.v1.Session(config=config)
tf.compat.v1.keras.backend.set_session(session)

# Predictor columns used by every model in this notebook.
features = ['e_memory_pt', 'LDELTOTAL', 'e_memory_cg', 'tmab_time']

data = pd.read_csv('../data/interim/data_adni.csv')
X = data[features]
# Target: CDGLOBAL with label 2 merged into label 1.
# Built with a non-inplace replace so `y` is a fresh Series and the loaded
# `data` frame is never modified through a column selection (an inplace
# replace on `data['CDGLOBAL']` may or may not write back into `data`,
# depending on the pandas version / copy-on-write mode).
y = data['CDGLOBAL'].replace({2: 1})

In [2]:
# First split: train_size=0.25 keeps a stratified quarter of the rows as the
# new X / y; the remaining rows become X_test / y_test.
# NOTE(review): X and y are rebound here, so running this cell a second time
# splits the already-reduced data again. Also confirm that train_size=0.25
# (rather than test_size=0.25) is what was intended.
X, X_test, y, y_test = train_test_split(
    X, y,
    train_size=0.25,
    random_state=33433,
    stratify=y,
)
# Second split: stratified train / validation split of the reduced X / y,
# holding out 25% for validation.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y,
    test_size=0.25,
    random_state=33433,
    stratify=y,
)

# RF Classifier

In [4]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=100, max_features=.5, random_state=0)

%time \
df = splitrepeat_cv(X,y,rf,splits=[111,222,333,444],repeats=[111,222,333,444,555])
df.describe()

Wall time: 3.7 s


Unnamed: 0,Sensitivity0,Specificity0,Accuracy0,AUC0,Sensitivity1,Specificity1,Accuracy1,AUC1,Sensitivity,Specificity,Accuracy,AUC
count,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0
mean,0.825568,0.934722,0.885714,0.880145,0.934722,0.825568,0.885714,0.880145,0.934722,0.825568,0.885714,0.880145
std,0.03156,0.028257,0.01883,0.018948,0.028257,0.03156,0.01883,0.018948,0.028257,0.03156,0.01883,0.018948
min,0.772727,0.898148,0.846939,0.840067,0.898148,0.772727,0.846939,0.840067,0.898148,0.772727,0.846939,0.840067
25%,0.795455,0.916667,0.875,0.87158,0.916667,0.795455,0.875,0.87158,0.916667,0.795455,0.875,0.87158
50%,0.835227,0.925926,0.887755,0.88447,0.925926,0.835227,0.887755,0.88447,0.925926,0.835227,0.887755,0.88447
75%,0.852273,0.951389,0.894133,0.889362,0.951389,0.852273,0.894133,0.889362,0.951389,0.852273,0.894133,0.889362
max,0.863636,0.981481,0.913265,0.905513,0.981481,0.863636,0.913265,0.905513,0.981481,0.863636,0.913265,0.905513


## Random Search

In [5]:
# Candidate values for the randomized hyperparameter search.
# The integer grids are built with range() so they are exact ints by
# construction (no float linspace values truncated through int()).
n_estimators = list(range(200, 2001, 200))      # 200, 400, ..., 2000
max_features = ['sqrt', 'log2', .3, .5, .7, .9]
max_depth = [*range(10, 111, 10), None]         # 10, 20, ..., 110, plus None (no depth limit)
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]

# Parameter name -> list of candidate values, as consumed by the search object.
search_params = {
    'n_estimators': n_estimators,
    'max_features': max_features,
    'max_depth': max_depth,
    'min_samples_split': min_samples_split,
    'min_samples_leaf': min_samples_leaf,
    'bootstrap': bootstrap,
}

In [6]:
# Randomized search over `search_params`: 400 sampled candidates, each scored
# with 3-fold CV (1200 fits in the saved log), using all cores and a fixed
# sampling seed.
rf_search_rand = RandomizedSearchCV(
    rf,
    search_params,
    n_iter=400,
    cv=3,
    verbose=2,
    random_state=33433,
    n_jobs=-1,
)
# Fit on the training portion only; the fitted search object is the cell output.
rf_search_rand.fit(X_train, y_train)

Fitting 3 folds for each of 400 candidates, totalling 1200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:   11.6s
[Parallel(n_jobs=-1)]: Done 138 tasks      | elapsed:   46.3s
[Parallel(n_jobs=-1)]: Done 341 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 624 tasks      | elapsed:  2.9min
[Parallel(n_jobs=-1)]: Done 989 tasks      | elapsed:  4.5min
[Parallel(n_jobs=-1)]: Done 1200 out of 1200 | elapsed:  5.5min finished


RandomizedSearchCV(cv=3, error_score=nan,
                   estimator=RandomForestClassifier(bootstrap=True,
                                                    ccp_alpha=0.0,
                                                    class_weight=None,
                                                    criterion='gini',
                                                    max_depth=None,
                                                    max_features=0.5,
                                                    max_leaf_nodes=None,
                                                    max_samples=None,
                                                    min_impurity_decrease=0.0,
                                                    min_impurity_split=None,
                                                    min_samples_leaf=1,
                                                    min_samples_split=2,
                                                    min_weight_fraction_leaf=0.0,
                  

In [7]:
# Display the hyperparameter combination selected by the randomized search.
rf_search_rand.best_params_

{'n_estimators': 200,
 'min_samples_split': 5,
 'min_samples_leaf': 4,
 'max_features': 0.3,
 'max_depth': 100,
 'bootstrap': True}

In [8]:
# Re-score the randomized-search winner with the split/repeat harness
# (3 split seeds x 3 repeat seeds; the saved summary reports count = 9).
# NOTE(review): X / y still contain the X_train rows the search was fitted on;
# confirm this is acceptable for comparing against the baseline.
best = rf_search_rand.best_estimator_
df_rand = splitrepeat_cv(
    X, y, best,
    splits=[10, 20, 30],
    repeats=[10, 20, 30],
)
df_rand.describe()

Unnamed: 0,Sensitivity0,Specificity0,Accuracy0,AUC0,Sensitivity1,Specificity1,Accuracy1,AUC1,Sensitivity,Specificity,Accuracy,AUC
count,9.0,9.0,9.0,9.0,9.0,9.0,9.0,9.0,9.0,9.0,9.0,9.0
mean,0.857323,0.954733,0.910998,0.906028,0.954733,0.857323,0.910998,0.906028,0.954733,0.857323,0.910998,0.906028
std,0.027315,0.014962,0.01751,0.018211,0.014962,0.027315,0.01751,0.018211,0.014962,0.027315,0.01751,0.018211
min,0.818182,0.935185,0.887755,0.881313,0.935185,0.818182,0.887755,0.881313,0.935185,0.818182,0.887755,0.881313
25%,0.829545,0.944444,0.897959,0.891625,0.944444,0.829545,0.897959,0.891625,0.944444,0.829545,0.897959,0.891625
50%,0.863636,0.953704,0.908163,0.90404,0.953704,0.863636,0.908163,0.90404,0.953704,0.863636,0.908163,0.90404
75%,0.875,0.962963,0.928571,0.922559,0.962963,0.875,0.928571,0.922559,0.962963,0.875,0.928571,0.922559
max,0.897727,0.981481,0.938776,0.934975,0.981481,0.897727,0.938776,0.934975,0.981481,0.897727,0.938776,0.934975


## Grid Search

In [9]:
# Exhaustive grid placed around the optimum reported by the randomized search
# (n_estimators=200, max_features=0.3, max_depth=100, min_samples_split=5,
# min_samples_leaf=4). GridSearchCV is already imported in the notebook's
# first cell, so it is not imported again here.
# Note: this rebinds `search_params`, replacing the randomized-search space.
search_params = {
    'n_estimators': [100, 200, 300],
    'max_features': [.2, .3, .4],
    'max_depth': [90, 100, 110],
    'min_samples_split': [4, 5, 6],
    'min_samples_leaf': [3, 4, 5],
    'bootstrap': [False, True],
}

# 3 * 3 * 3 * 3 * 3 * 2 = 486 candidates, each scored with 3-fold CV.
# NOTE(review): the saved repr of this search shows the estimator with
# random_state=555 although `rf` was created with random_state=0 - presumably
# splitrepeat_cv modifies the estimator it is given; verify.
rf_search_grid = GridSearchCV(rf, search_params, cv=3, n_jobs=-1, verbose=2)
rf_search_grid.fit(X_train, y_train)

Fitting 3 folds for each of 486 candidates, totalling 1458 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    5.6s
[Parallel(n_jobs=-1)]: Done 138 tasks      | elapsed:   10.7s
[Parallel(n_jobs=-1)]: Done 341 tasks      | elapsed:   19.1s
[Parallel(n_jobs=-1)]: Done 624 tasks      | elapsed:   29.6s
[Parallel(n_jobs=-1)]: Done 989 tasks      | elapsed:   50.0s
[Parallel(n_jobs=-1)]: Done 1434 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 1458 out of 1458 | elapsed:  1.3min finished


GridSearchCV(cv=3, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features=0.5,
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=None,
                                              oob_score=False, random_state=555,
                                    

In [10]:
# Display the hyperparameter combination selected by the grid search.
rf_search_grid.best_params_

{'bootstrap': True,
 'max_depth': 90,
 'max_features': 0.2,
 'min_samples_leaf': 3,
 'min_samples_split': 4,
 'n_estimators': 300}

In [11]:
# Re-score the grid-search winner with the same harness settings used for the
# randomized-search winner (3 split seeds x 3 repeat seeds).
# Note: `best` is rebound here and now refers to the grid-search estimator.
best = rf_search_grid.best_estimator_
df_grid = splitrepeat_cv(
    X, y, best,
    splits=[10, 20, 30],
    repeats=[10, 20, 30],
)
df_grid.describe()

Unnamed: 0,Sensitivity0,Specificity0,Accuracy0,AUC0,Sensitivity1,Specificity1,Accuracy1,AUC1,Sensitivity,Specificity,Accuracy,AUC
count,9.0,9.0,9.0,9.0,9.0,9.0,9.0,9.0,9.0,9.0,9.0,9.0
mean,0.859848,0.952675,0.910998,0.906262,0.952675,0.859848,0.910998,0.906262,0.952675,0.859848,0.910998,0.906262
std,0.028409,0.011753,0.017877,0.018774,0.011753,0.028409,0.017877,0.018774,0.011753,0.028409,0.017877,0.018774
min,0.829545,0.935185,0.892857,0.886995,0.935185,0.829545,0.892857,0.886995,0.935185,0.829545,0.892857,0.886995
25%,0.829545,0.944444,0.897959,0.891625,0.944444,0.829545,0.897959,0.891625,0.944444,0.829545,0.897959,0.891625
50%,0.852273,0.953704,0.908163,0.902988,0.953704,0.852273,0.908163,0.902988,0.953704,0.852273,0.908163,0.902988
75%,0.886364,0.962963,0.933673,0.929293,0.962963,0.886364,0.933673,0.929293,0.962963,0.886364,0.933673,0.929293
max,0.897727,0.972222,0.933673,0.930345,0.972222,0.897727,0.933673,0.930345,0.972222,0.897727,0.933673,0.930345


In [12]:
rf = RandomForestClassifier(n_estimators=300, max_features=.2, max_depth = 90, min_samples_split = 4, min_samples_leaf = 3, bootstrap=True, random_state=33433)

%time \
df = splitrepeat_cv(X,y,rf,splits=[111,222,333,444],repeats=[111,222,333,444,555])
df.describe()

Wall time: 9.66 s


Unnamed: 0,Sensitivity0,Specificity0,Accuracy0,AUC0,Sensitivity1,Specificity1,Accuracy1,AUC1,Sensitivity,Specificity,Accuracy,AUC
count,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0
mean,0.820455,0.939352,0.885969,0.879903,0.939352,0.820455,0.885969,0.879903,0.939352,0.820455,0.885969,0.879903
std,0.030313,0.032421,0.015901,0.015438,0.032421,0.030313,0.015901,0.015438,0.032421,0.030313,0.015901,0.015438
min,0.772727,0.87963,0.867347,0.86069,0.87963,0.772727,0.867347,0.86069,0.87963,0.772727,0.867347,0.86069
25%,0.795455,0.925926,0.872449,0.866372,0.925926,0.795455,0.872449,0.866372,0.925926,0.795455,0.872449,0.866372
50%,0.8125,0.939815,0.880102,0.875105,0.939815,0.8125,0.880102,0.875105,0.939815,0.8125,0.880102,0.875105
75%,0.838068,0.960648,0.90051,0.896044,0.960648,0.838068,0.90051,0.896044,0.960648,0.838068,0.90051,0.896044
max,0.875,0.981481,0.913265,0.905513,0.981481,0.875,0.913265,0.905513,0.981481,0.875,0.913265,0.905513


In [2]:
# All samples
rf = RandomForestClassifier(n_estimators=300, max_features=.2, max_depth = 90, min_samples_split = 4, min_samples_leaf = 3, bootstrap=True, random_state=33433)

%time \
df = splitrepeat_cv(X,y,rf,splits=list(int(x)*42+42 for x in range(20)),repeats=list(int(x)*42+42 for x in range(20)), avg_strategy='weighted', initial_split_seed=33433, initial_split_ratio=.25)
df.describe()

Wall time: 2min 57s


Unnamed: 0,Sensitivity0,Specificity0,PPV0,NPV0,Accuracy0,Sensitivity1,Specificity1,PPV1,NPV1,Accuracy1,Sensitivity,Specificity,PPV,NPV,F1_Score,Accuracy
count,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0
mean,0.924333,0.884318,0.841143,0.945348,0.899626,0.884318,0.924333,0.945348,0.841143,0.899626,0.945348,0.841143,0.884318,0.924333,0.898928,0.899626
std,0.026874,0.017417,0.028189,0.02149,0.013737,0.017417,0.026874,0.02149,0.028189,0.013737,0.02149,0.028189,0.017417,0.026874,0.013838,0.013737
min,0.867769,0.849462,0.782946,0.89697,0.863946,0.849462,0.867769,0.89697,0.782946,0.863946,0.89697,0.782946,0.849462,0.867769,0.863379,0.863946
25%,0.904762,0.868852,0.813953,0.927273,0.891156,0.868852,0.904762,0.927273,0.813953,0.891156,0.927273,0.813953,0.868852,0.904762,0.890484,0.891156
50%,0.929204,0.885057,0.844961,0.951515,0.897959,0.885057,0.929204,0.951515,0.844961,0.897959,0.951515,0.844961,0.885057,0.929204,0.897108,0.897959
75%,0.945946,0.897727,0.868217,0.963636,0.908163,0.897727,0.945946,0.963636,0.868217,0.908163,0.963636,0.868217,0.897727,0.945946,0.908037,0.908163
max,0.972477,0.91716,0.891473,0.981818,0.928571,0.91716,0.972477,0.981818,0.891473,0.928571,0.981818,0.891473,0.91716,0.972477,0.928133,0.928571


# Outputs

In [3]:
### Save outputs ###
# Tag for this run; used both as the value of the 'set' column and as the
# CSV file name.
q = 'CDR_Imp_Boruta4_400'

# Label every row with the run tag, then write the table without the index.
df['set'] = q
df.to_csv(f'../models/outputs/{q}.csv', index=False)