<img src = "../../Data/bgsedsc_0.jpg">

# Project: Decision trees

In [30]:
## Set up ----
%matplotlib inline
import matplotlib.pylab as plt
import seaborn as sns

import random
import time
import scipy
import datetime
import pandas as pd
import numpy as np
import sklearn
import pandas as pd
import numpy as np
import time

from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold

from sklearn.preprocessing import StandardScaler, OneHotEncoder,  scale
import category_encoders as ce
from sklearn.compose import make_column_transformer, ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline

# kernel approximators
from sklearn.kernel_approximation import Nystroem, RBFSampler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler

import gc
from sklearn.metrics import accuracy_score

# Reproducibility: one seed, applied to NumPy's global random generator.
rand_state = 1111
np.random.seed(rand_state)

# Raw CSV tables: the training set (which carries the `LOS` column) and the
# LOS test set. Read in the same order as listed.
data, data_test = (
    pd.read_csv(csv_path)
    for csv_path in ('../Data/mimic_train.csv', '../Data/mimic_test_los.csv')
)

In [31]:
## Outcome variable ----
# The `LOS` column of the training table is the prediction target.
y = data.loc[:, 'LOS']

## Pre-processing

The pre-processing lives in a separate notebook (`preproc.ipynb`) because it is quite extensive and, kept separate, is easier to share across prediction problems. That notebook saves its output to disk, and the cells below read the saved files so that pre-processing does not have to be re-run every time. If needed, pre-processing can be re-run by uncommenting the code in the cell below.

In [32]:
#%run ./preproc.ipynb

In [33]:
# Load the saved pre-processing output: feature matrices for train and test.
X = pd.read_csv("../Data/los/X_preproc.csv")
X_test = pd.read_csv("../Data/los/X_test_preproc.csv")

# Target column taken from the table currently bound to `data`.
y = data['LOS']

# Rebind `data` to target + pre-processed features, joined column-wise
# (pd.concat aligns the two objects on their row index).
data = pd.concat([y, X], axis='columns')

In [34]:
# Sub-sampling:
#data = data.sample(
#    frac=0.1, random_state=rand_state
#)

In [35]:
# Sanity check: dimensions of train features, test features and target,
# printed one per line.
print(X.shape, X_test.shape, y.shape, sep="\n")

(20885, 43)
(5221, 43)
(20885,)


# Prediction

In [36]:
# Re-check dimensions before modelling (train features, test features, target).
print("\n".join(str(obj.shape) for obj in (X, X_test, y)))

(20885, 43)
(5221, 43)
(20885,)


## Grid search

In [37]:
from xgboost import XGBRegressor
# Base estimator built with the library's default settings. Despite the name
# `classif`, this is a regressor; it is the estimator handed to GridSearchCV
# in the grid-search cell, which supplies the hyperparameter candidates.
classif = XGBRegressor()

In [None]:
%%time
# Candidate hyperparameters. Single-element lists pin a value; the remaining
# lists give 2 * 2 * 2 * 3 * 3 * 3 = 216 combinations, each evaluated with
# 5-fold cross-validation.
grid_values = dict(
    nthread=[1],  # when use hyperthread, xgboost may become slower
    objective=['reg:squarederror'],
    learning_rate=[0.01, 0.02],  # so called `eta` value
    max_depth=[3, 6],
    min_child_weight=[20],
    subsample=[0.5, 0.75],
    colsample_bytree=[0.7],
    n_estimators=[100, 200, 300],  # number of trees
    reg_lambda=[0.5, 1, 1.5],
    reg_alpha=[0, 0.1, 0.5],
)

# Exhaustive search scored by negative RMSE (higher, i.e. closer to 0, is better).
grid_acc = GridSearchCV(
    estimator=classif,
    param_grid=grid_values,
    cv=5,
    scoring='neg_root_mean_squared_error',
)
grid_acc.fit(X, y)

Let us analyse the grid search in some more detail:

In [None]:
# Report best choices.
# Values are stored as strings: they are concatenated into the messages below,
# and several of them (score, n_estimators, learning rate, max depth,
# subsample) are reused later to build the output file name.
best_model = grid_acc.best_estimator_
n_est_best = str(best_model.n_estimators)
lr_best = str(best_model.learning_rate)
max_depth_best = str(best_model.max_depth)
subsample_best = str(best_model.subsample)
reg_best = str(best_model.reg_lambda)
min_child = str(best_model.min_child_weight)
alpha = str(best_model.reg_alpha)
# The search is scored with 'neg_root_mean_squared_error', so best_score_ is
# the mean cross-validated *negative* RMSE of the best candidate, not an accuracy.
score_best = str(np.round(grid_acc.best_score_, 5))
print('Best n_estimators parameter : '+ n_est_best)
print('Best learning rate: '+ lr_best)
print('Best maximum depth: '+ max_depth_best)
print('Best subsample size: '+ subsample_best)
print('Best regularization param: '+ reg_best)
print('Best min child weight: '+ min_child)
print('Best L1 reg: '+ alpha)
print('Best CV score (negative RMSE): ' + score_best)

In [None]:
# Visualise the grid-search results along the `reg_alpha` axis.
# NOTE(review): `GridSearch_table_plot` is neither defined nor imported in the
# visible cells of this notebook, so this call raises NameError on a fresh
# kernel unless the helper is provided elsewhere — confirm and import/define
# it explicitly.
# NOTE(review): the search is scored with 'neg_root_mean_squared_error' while
# `negative=False` is passed here; verify the flag against the helper's definition.
GridSearch_table_plot(grid_acc, "reg_alpha", negative=False, display_all_params=False)

#### Export output

In [None]:
# Predict on the test features with the fitted grid search. GridSearchCV is
# created with its default `refit`, so `predict` uses the best estimator
# refitted on the full training data.
y_hat = grid_acc.predict(X_test)
# Display the predictions as the cell output.
y_hat

In [None]:
# Test dataset (its `icustay_id` column is used to label the predictions).
# Read the LOS test file, the same one loaded in the set-up cell, instead of
# the death-prediction test file: the ids attached to the LOS predictions
# should come from the LOS test table (presumably the table the pre-processed
# test features were derived from — confirm against the pre-processing notebook).
data_test = pd.read_csv('../Data/mimic_test_los.csv')

In [None]:
from pathlib import Path

# Pair each test-set stay id with its predicted LOS.
predictions = pd.DataFrame({'icustay_id': data_test.icustay_id.values, 'LOS': y_hat})

# Encode the CV score and the main tuned hyperparameters in the file name so
# that exports from different runs do not overwrite each other.
# NOTE(review): the name has no `.csv` extension and contains ':' characters,
# which some file systems (e.g. Windows) do not accept in file names — consider renaming.
output_name = f"output/predictions_score:{score_best}_nEst:{n_est_best}_lr:{lr_best}_maxDepth:{max_depth_best}_subsample:{subsample_best}"

# DataFrame.to_csv does not create missing directories, so make sure the
# target folder exists before writing.
Path(output_name).parent.mkdir(parents=True, exist_ok=True)
predictions.to_csv(output_name, index=False)

In [None]:
# Summary statistics of the exported table, as a quick sanity check of the
# predicted LOS values.
predictions.describe()