## Random search

In [1]:
import numpy as np
import pandas as pd
import datetime as dt
import mp_utils as mp

import xgboost as xgb
from sklearn import metrics
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RandomizedSearchCV

import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('.\data\data_compressed\df_data.csv')
co = pd.read_csv('.\data\data_compressed\df_cohort.csv')
df_static = pd.read_csv('.\data\data_compressed\df_static_data.csv')
df_death = pd.read_csv('.\data\data_compressed\df_death.csv')
co = co.drop(co.columns[0], axis=1)
df_static = df_static.drop(df_static.columns[0], axis=1)
df_death = df_death.drop(df_death.columns[0], axis=1)

### Baseline model 1.

In [3]:
var_min, var_max, var_first, var_last, var_sum, var_first_early, var_last_early, var_static = mp.vars_of_interest()
sid = np.sort(np.unique(df_death['subject_id'].values))


W = 24
W_extra = 24
y_outcome_label = 'death_in_hospital'

df_tmp=co.copy().set_index('icustay_id')
time_dict = df_tmp.copy()
time_dict['windowtime'] = W
time_dict = time_dict['windowtime'].to_dict()

In [4]:
current_study = 'baseline'
exclFcn = lambda x: x.loc[x['inclusion_stay_ge_24hr']& ( (x['censortime_hours'].isnull()) | (x['censortime_hours']>=24) ) ,'icustay_id'].values
    
# Data preparation
df_data = mp.get_design_matrix(df, time_dict, W=W, W_extra=W_extra)
iid_keep = exclFcn(co)
df_data = df_data.reindex(index = iid_keep)
X = df_data.merge(df_static.set_index('icustay_id')[var_static], how='left', left_index=True, right_index=True)
X = X.merge(co.set_index('icustay_id')[[y_outcome_label]], left_index=True, right_index=True)
X = X.merge(co.set_index('icustay_id')[['subject_id']], left_index=True, right_index=True)
idxMap = np.searchsorted(sid, X['subject_id'].values)

# assign k-fold
K = 5
np.random.seed(871)
idxK_sid = np.random.permutation(sid.shape[0])
idxK_sid = np.mod(idxK_sid,K)

idxK = idxK_sid[idxMap]
X.drop('subject_id',axis=1,inplace=True)
X = X.values
y = X[:,-1]
X = X[:,0:-1]

# Model evaluation
mdl_val = list()
results_val = list() # initialize list for scores
pred_val = list()
tar_val = list()

# Hyperparameters tuning
# no pre-processing of data necessary for xgb
model_pipeline = Pipeline([('xgb', xgb.XGBClassifier())])

param_grid = {
        'xgb__max_depth': [1, 3, 6, 8, 10],
        'xgb__learning_rate': [0.001, 0.05, 0.01, 0.1],
        'xgb__subsample': [0.5, 0.7, 0.9, 1.0],
        'xgb__colsample_bytree': [0.4, 0.6, 0.8, 1.0],
        'xgb__colsample_bylevel': [0.5, 0.7, 0.9, 1.0],
        'xgb__min_child_weight': [0.5, 1.0, 3.0, 5.0, 7.0, 10.0],
        'xgb__gamma': [0, 0.25, 0.5, 1.0],
        'xgb__reg_lambda': [0.1, 1.0, 5.0, 10.0, 50.0, 100.0],
        'xgb__n_estimators': [100, 200, 300, 400]}
fit_params = {'xgb__eval_metric': 'logloss'}


estimator = RandomizedSearchCV(model_pipeline, param_grid, cv=5)

for k in range(K):
    # train the model using all but the kth fold
    curr_mdl = estimator.fit(X[idxK != k, :],y[idxK != k], **fit_params)
    curr_prob = curr_mdl.predict_proba(X[idxK == k, :])
    curr_prob = curr_prob[:,1]

    pred_val.append(curr_prob)
    tar_val.append(y[idxK == k])

    # calculate score (AUROC)
    curr_score = metrics.roc_auc_score(y[idxK == k], curr_prob)
    results_val.append(curr_score)

    print('{} - Finished fold {} of {}. AUROC {:0.3f}.'.format(dt.datetime.now(), k+1, K, curr_score))

# Final results print
best_params = estimator.best_params_
print('')
print("Best params: {}".format(best_params))
print('')
print('StudyName,SampleSize',end='')
print(', xgb ',end='')
print('')
print( '{},{}'.format(current_study, X.shape[0] ), end='' )
print(',{:0.6f}'.format(np.mean(results_val) ), end='')
print('\n')

2021-04-19 14:06:46.925729 - Finished fold 1 of 5. AUROC 0.885.
2021-04-19 14:17:26.212363 - Finished fold 2 of 5. AUROC 0.887.
2021-04-19 14:32:32.428986 - Finished fold 3 of 5. AUROC 0.878.
2021-04-19 14:54:47.993055 - Finished fold 4 of 5. AUROC 0.892.
2021-04-19 15:05:37.829748 - Finished fold 5 of 5. AUROC 0.894.

Best params: {'xgb__subsample': 0.7, 'xgb__reg_lambda': 0.1, 'xgb__n_estimators': 300, 'xgb__min_child_weight': 3.0, 'xgb__max_depth': 8, 'xgb__learning_rate': 0.1, 'xgb__gamma': 0.5, 'xgb__colsample_bytree': 0.6, 'xgb__colsample_bylevel': 0.9}

StudyName,SampleSize, xgb 
baseline,38687,0.887036



In [5]:
# Original model cv AUCROC scores:
##  Fold 1 of 5. AUROC 0.880.
##  Fold 2 of 5. AUROC 0.889.
##  Fold 3 of 5. AUROC 0.888.
##  Fold 4 of 5. AUROC 0.886.
##  Fold 5 of 5. AUROC 0.897.

# Original params: max_depth=3, n_estimators=300, learning_rate=0.05

# Original model final result:
# StudyName,SampleSize,xgb
# baseline,38687,0.887931

### Baseline model 2.

In [9]:
current_study = 'baseline_withdrawal'
exclFcn = lambda x: x.loc[x['inclusion_stay_ge_24hr']& ( (x['censortime_hours'].isnull()) | (x['censortime_hours']>=24) ) ,'icustay_id'].values    
# Data preparation
df_data = mp.get_design_matrix(df, time_dict, W=W, W_extra=W_extra)
iid_keep = exclFcn(co)
df_data = df_data.reindex(index = iid_keep)
X = df_data.merge(df_static.set_index('icustay_id')[var_static], how='left', left_index=True, right_index=True)
X = X.merge(co.set_index('icustay_id')[[y_outcome_label]], left_index=True, right_index=True)
X = X.merge(co.set_index('icustay_id')[['subject_id']], left_index=True, right_index=True)
idxMap = np.searchsorted(sid, X['subject_id'].values)

# assign k-fold
K = 5
np.random.seed(871)
idxK_sid = np.random.permutation(sid.shape[0])
idxK_sid = np.mod(idxK_sid,K)

idxK = idxK_sid[idxMap]
X.drop('subject_id',axis=1,inplace=True)
X = X.values
y = X[:,-1]
X = X[:,0:-1]


# Model evaluation
mdl_val = list()
results_val = list() # initialize list for scores
pred_val = list()
tar_val = list()

# Hyperparameters tuning
# no pre-processing of data necessary for xgb
model_pipeline = Pipeline([('xgb', xgb.XGBClassifier())])


param_grid = {
        'xgb__max_depth': [1, 3, 6, 8, 10],
        'xgb__learning_rate': [0.001, 0.05, 0.01, 0.1],
        'xgb__subsample': [0.5, 0.7, 0.9, 1.0],
        'xgb__colsample_bytree': [0.4, 0.6, 0.8, 1.0],
        'xgb__colsample_bylevel': [0.5, 0.7, 0.9, 1.0],
        'xgb__min_child_weight': [0.5, 1.0, 3.0, 5.0, 7.0, 10.0],
        'xgb__gamma': [0, 0.25, 0.5, 1.0],
        'xgb__reg_lambda': [0.1, 1.0, 5.0, 10.0, 50.0, 100.0],
        'xgb__n_estimators': [100, 200, 300, 400]}

fit_params = {'xgb__eval_metric': 'logloss'}

estimator = RandomizedSearchCV(model_pipeline, param_grid, cv=5)

for k in range(K):
    # train the model using all but the kth fold
    curr_mdl = estimator.fit(X[idxK != k, :],y[idxK != k], **fit_params)
    curr_prob = curr_mdl.predict_proba(X[idxK == k, :])
    curr_prob = curr_prob[:,1]

    pred_val.append(curr_prob)
    tar_val.append(y[idxK == k])

    # calculate score (AUROC)
    curr_score = metrics.roc_auc_score(y[idxK == k], curr_prob)
    results_val.append(curr_score)

    print('{} - Finished fold {} of {}. AUROC {:0.3f}.'.format(dt.datetime.now(), k+1, K, curr_score))
        


# Final results print
best_params = estimator.best_params_
print('')
print("Best params: {}".format(best_params))
print('')
print('StudyName,SampleSize',end='')
print(', xgb ',end='')
print('')
print( '{},{}'.format(current_study, X.shape[0] ), end='' )
print(',{:0.6f}'.format(np.mean(results_val) ), end='')
print('\n')

2021-04-19 15:36:52.718529 - Finished fold 1 of 5. AUROC 0.885.
2021-04-19 15:45:21.796123 - Finished fold 2 of 5. AUROC 0.887.
2021-04-19 15:55:15.375815 - Finished fold 3 of 5. AUROC 0.878.
2021-04-19 16:06:56.798799 - Finished fold 4 of 5. AUROC 0.892.
2021-04-19 16:16:18.863232 - Finished fold 5 of 5. AUROC 0.894.

Best params: {'xgb__subsample': 0.7, 'xgb__reg_lambda': 0.1, 'xgb__n_estimators': 300, 'xgb__min_child_weight': 3.0, 'xgb__max_depth': 8, 'xgb__learning_rate': 0.1, 'xgb__gamma': 0.5, 'xgb__colsample_bytree': 0.6, 'xgb__colsample_bylevel': 0.9}

StudyName,SampleSize, xgb 
baseline_withdrawal,38687,0.887036



In [7]:
# Original model cv AUCROC scores:
##  Fold 1 of 5. AUROC 0.880.
##  Fold 2 of 5. AUROC 0.889.
##  Fold 3 of 5. AUROC 0.888.
##  Fold 4 of 5. AUROC 0.886.
##  Fold 5 of 5. AUROC 0.897.

# Original params: max_depth=3, n_estimators=300, learning_rate=0.05

# Original model final result:
# StudyName,SampleSize,xgb
# baseline_withdrawal,38687,0.887931