<img src = "../../Data/bgsedsc_0.jpg">

# Project: Decision trees

In [1]:
## Set up ----
%matplotlib inline
import matplotlib.pylab as plt
import seaborn as sns

import random
import time
import scipy
import datetime
import pandas as pd
import numpy as np
import sklearn
import pandas as pd
import numpy as np
import time

from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold

from sklearn.preprocessing import StandardScaler, OneHotEncoder,  scale
import category_encoders as ce
from sklearn.compose import make_column_transformer, ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline

# kernel approximators
from sklearn.kernel_approximation import Nystroem, RBFSampler

from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler

import gc
from sklearn.metrics import accuracy_score

# Random state
rand_state = 1111
np.random.seed(rand_state) # impose random seed for reproducibility

# Training dataset
data=pd.read_csv('../Data/mimic_train.csv')
data_test=pd.read_csv('../Data/mimic_test_los.csv')

In [2]:
## Outcome variable ----
y = data['LOS']

## Pre-processing

I have moved the preprocessing to a separate file as it's quite extensive and this way it's more easily shared across prediction problems. It saves data which I read below in order to not have to run pre-processing each time. If need be, pre-processing can be run by uncommenting the code in the cell below.

In [3]:
#%run ./preproc.ipynb

In [4]:
# Read pre-processed data:
y = data.loc[:,'HOSPITAL_EXPIRE_FLAG']
X = pd.read_csv("../Data/los/X_preproc.csv")
X_test = pd.read_csv("../Data/los/X_test_preproc.csv")
data = pd.concat([y,X], axis=1)

In [5]:
# Sub-sampling:
#data = data.sample(
#    frac=0.1, random_state=rand_state
#)

In [6]:
# Check:
print(X.shape)
print(X_test.shape)
print(y.shape)

(20885, 35)
(5221, 35)
(20885,)


# Prediction

In [7]:
# Check:
print(X.shape)
print(X_test.shape)
print(y.shape)

(20885, 35)
(5221, 35)
(20885,)


## Grid search

In [8]:
from xgboost import XGBRegressor

# Balancing:
# ros = RandomUnderSampler(random_state=rand_state)
# X, y = ros.fit_resample(X, y)

classif = XGBRegressor()

In [None]:
%%time
grid_values = {
    'nthread':[1], #when use hyperthread, xgboost may become slower
    'objective':['reg:squarederror'],
    'learning_rate': [0.03,0.06,0.09], #so called `eta` value
    'max_depth': [3,6,9,12],
    'min_child_weight': [11],
    'subsample': [0.5,0.8],
    'colsample_bytree': [0.7],
    'n_estimators': [100,500], #number of trees 
    'missing':[-999],
    'reg_lambda':[1.5],
}
grid_acc = GridSearchCV(
    classif, 
    param_grid = grid_values,
    scoring = 'neg_mean_squared_error', cv=5
)
grid_acc.fit(X, y)

In [217]:
# Report best choices:
n_est_best = str(grid_acc.best_estimator_.n_estimators)
lr_best = str(grid_acc.best_estimator_.learning_rate)
max_depth_best = str(grid_acc.best_estimator_.max_depth)
subsample_best = str(grid_acc.best_estimator_.subsample)
score_best = str(np.round(grid_acc.best_score_, 5))
print('Best n_estimators parameter : '+ n_est_best)
print('Best learning rate: '+ lr_best)
print('Best maximumg depth: '+ max_depth_best)
print('Best subsample size: '+ subsample_best)
print('Accuracy score:' + score_best)

Best n_estimators parameter : 500
Best learning rate: 0.03
Best maximumg depth: 3
Best subsample size: 0.5
Accuracy score:-24.37524


#### Export output

In [218]:
#Predict values based on optimized parameters
y_hat = grid_acc.predict(X_test)
y_hat

array([ 4.307766 , 11.744605 ,  6.5130134, ...,  2.8215551,  1.9722649,
        1.8441986], dtype=float32)

In [219]:
# Test dataset (to produce predictions)
data_test=pd.read_csv('../Data/mimic_test_death.csv')

In [220]:
predictions = pd.DataFrame({'icustay_id': data_test.icustay_id.values, 'LOS': y_hat})
output_name = f"output/predictions_score:{score_best}_nEst:{n_est_best}_lr:{lr_best}_maxDepth:{max_depth_best}_subsample:{subsample_best}" 
    
predictions.to_csv(output_name, index=False)

In [221]:
predictions.describe()

Unnamed: 0,icustay_id,LOS
count,5221.0,5221.0
mean,249925.293239,3.707606
std,28763.030681,1.604979
min,200011.0,-0.388591
25%,225118.0,2.596703
50%,249759.0,3.521484
75%,274576.0,4.575222
max,299979.0,18.009064
