# Model Selection


In [5]:
# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import psycopg2
import getpass
from scipy.stats import ks_2samp
from scipy import interp
from modshogun import *

# below imports are used to print out pretty pandas dataframes
from IPython.display import display, HTML

%matplotlib inline
plt.style.use('ggplot')

In [2]:
# SQL database config
# Connection settings consumed by the connection cell. The user and host are
# blank in this copy of the notebook; the password is prompted for
# interactively rather than being written into the notebook.
dbname, schema_name = 'MIMIC3', 'mimiciii'
sqluser, hostname = '', ''
port = 5432
pwd = getpass.getpass()

········


In [35]:
# Connect to local postgres version of mimic.
# The port is taken from the config cell's `port` variable instead of repeating
# the literal, so changing it there actually takes effect here.
con = psycopg2.connect(dbname=dbname, user=sqluser, host=hostname, port=port, password=pwd)
cur = con.cursor()
# Set the search path to the configured schema for the rest of this session.
cur.execute('SET search_path to ' + schema_name)

In [120]:
# SQL that assembles one row per ICU stay. Three CTEs are defined first:
#   serv - service rows joined to each ICU stay, with a `surgical` flag
#   co   - demographics, lengths of stay and mortality flags
#   excl - exclusion flags (short stay, age < 16, not first stay, surgical)
# The final SELECT left-joins the first-day vitals, labs, GCS, urine output and
# ventilation tables with `co`, height/weight and `excl` on icustay_id.
query = \
"""
-- Table #3: Services
with serv AS
(
SELECT icu.hadm_id, icu.icustay_id, se.curr_service
, CASE
    WHEN curr_service like '%SURG' then 1
    WHEN curr_service = 'ORTHO' then 1
    ELSE 0 END
  as surgical
, RANK() OVER (PARTITION BY icu.hadm_id ORDER BY se.transfertime DESC) as rank
FROM icustays icu
LEFT JOIN services se
 ON icu.hadm_id = se.hadm_id
AND se.transfertime < icu.intime + interval '12' hour
)

-- Table #4: Clinical data + demographics
, co AS
(
SELECT icu.subject_id, icu.hadm_id, icu.icustay_id, first_careunit, admission_type
, icu.los as icu_los
, round((EXTRACT(EPOCH FROM (adm.dischtime-adm.admittime))/60/60/24) :: NUMERIC, 4) as hosp_los
, EXTRACT('epoch' from icu.intime - pat.dob) / 60.0 / 60.0 / 24.0 / 365.242 as age_icu_in
, pat.gender
, RANK() OVER (PARTITION BY icu.subject_id ORDER BY icu.intime) AS icustay_id_order
, hospital_expire_flag
, CASE WHEN pat.dod IS NOT NULL 
       AND pat.dod >= icu.intime - interval '6 hour'
       AND pat.dod <= icu.outtime + interval '6 hour' THEN 1 
       ELSE 0 END AS icu_expire_flag
, CASE WHEN pat.dod IS NOT NULL
    AND pat.dod < adm.admittime + interval '30' day THEN 1 
    ELSE 0 END as hospital30day_expire_flag
, CASE WHEN pat.dod IS NOT NULL
    AND pat.dod < adm.admittime + interval '1' year THEN 1 
    ELSE 0 END as hospital1year_expire_flag      
FROM icustays icu
INNER JOIN patients pat
  ON icu.subject_id = pat.subject_id
INNER JOIN admissions adm
ON adm.hadm_id = icu.hadm_id    
)

-- Table #5: Exclusions
, excl AS
(
SELECT
  co.subject_id, co.hadm_id, co.icustay_id, co.icu_los, co.hosp_los
  , co.age_icu_in
  , co.gender
  , co.icustay_id_order
  , serv.curr_service
  , co.first_careunit
  , co.hospital_expire_flag
  , co.icu_expire_flag
  , CASE
        WHEN co.icu_los < 1 then 1
    ELSE 0 END
    AS exclusion_los
  , CASE
        WHEN co.age_icu_in < 16 then 1
    ELSE 0 END
    AS exclusion_age
  , CASE 
        WHEN co.icustay_id_order != 1 THEN 1
    ELSE 0 END 
    AS exclusion_first_stay
  , CASE
        WHEN serv.surgical = 1 THEN 1
    ELSE 0 END
    as exclusion_surgical
FROM co
LEFT JOIN serv
  ON  co.icustay_id = serv.icustay_id
  AND serv.rank = 1
)

SELECT vital.icustay_id, vital.subject_id, vital.hadm_id
-- vital signs for the first 24 hours of the icu stay
, HeartRate_Min
, HeartRate_Mean
, HeartRate_Max
, DiasBP_Min
, DiasBP_Max
, SysBP_Min
, SysBP_Max
, MeanBP_Min
, MeanBP_Mean
, MeanBP_Max
, RespRate_Min
, RespRate_Mean
, RespRate_Max
, TempC_Min
, TempC_Max
, SpO2_Min
, SpO2_Max

-- Glasgow coma score
, MinGCS
, GCSMotor
, GCSVerbal
, GCSEyes

-- lab values
, ANIONGAP_min
, ANIONGAP_max
, ALBUMIN_min
, ALBUMIN_max
, BANDS_min
, BANDS_max
, BICARBONATE_min
, BICARBONATE_max
, BILIRUBIN_min
, BILIRUBIN_max
, CREATININE_min
, CREATININE_max
, CHLORIDE_min
, CHLORIDE_max
, lab.GLUCOSE_min
, lab.GLUCOSE_max
, HEMATOCRIT_min
, HEMATOCRIT_max
, HEMOGLOBIN_min
, HEMOGLOBIN_max
, LACTATE_min
, LACTATE_max
, PLATELET_min
, PLATELET_max
, POTASSIUM_min
, POTASSIUM_max
, PTT_min
, PTT_max
, INR_min
, INR_max
, PT_min
, PT_max
, SODIUM_min
, SODIUM_max
, BUN_min
, BUN_max
, WBC_min
, WBC_max

, urineoutput

-- whether the patient is ventilated on the first day of the ICU stay
, vent

-- demographic data
, co.age_icu_in, co.first_careunit, co.gender, co.admission_type
, hw.height_first, hw.weight_first

-- outcomes
, co.hospital_expire_flag, co.icu_expire_flag
, co.hosp_los, co.icu_los, co.icustay_id_order
, co.hospital1year_expire_flag, hospital30day_expire_flag

-- exclusions
, excl.exclusion_los, excl.exclusion_age
, excl.exclusion_first_stay, excl.exclusion_surgical

FROM mimiciii_dev.vitalsfirstday vital
LEFT JOIN mimiciii_dev.labsfirstday lab
  ON vital.icustay_id = lab.icustay_id
LEFT JOIN mimiciii_dev.gcsfirstday gcs
  ON vital.icustay_id = gcs.icustay_id
LEFT JOIN mimiciii_dev.uofirstday uo
  ON vital.icustay_id = uo.icustay_id
LEFT JOIN mimiciii_dev.ventfirstday vent
  ON vital.icustay_id = vent.icustay_id
left join co
  ON vital.icustay_id = co.icustay_id
left join public.heightweight hw
  ON vital.icustay_id = hw.icustay_id
left join excl
  on vital.icustay_id = excl.icustay_id;
"""

# Run the query over the open connection `con` and load the result into a
# DataFrame; the last expression shows the first rows as the cell output.
query_output = pd.read_sql_query(query,con) #.dropna().reset_index(drop=True)
query_output.head()

Unnamed: 0,icustay_id,subject_id,hadm_id,heartrate_min,heartrate_mean,heartrate_max,diasbp_min,diasbp_max,sysbp_min,sysbp_max,...,icu_expire_flag,hosp_los,icu_los,icustay_id_order,hospital1year_expire_flag,hospital30day_expire_flag,exclusion_los,exclusion_age,exclusion_first_stay,exclusion_surgical
0,263738,13,143045,60.0,83.6,124.0,53.0,84.0,102.0,151.0,...,0,6.8556,3.666,1,0,0,0,0,0,0
1,211832,71,111944,98.0,112.444444,137.0,31.0,130.0,94.0,157.0,...,0,4.6618,2.8609,1,0,0,0,0,0,0
2,233150,78,100536,56.0,63.117647,73.0,78.0,119.0,134.0,206.0,...,0,3.9139,1.4891,1,0,0,0,0,0,0
3,233111,101,175533,72.0,87.034483,117.0,19.0,85.0,57.0,182.0,...,0,15.7785,9.8919,1,1,1,0,0,0,0
4,212246,103,130744,42.0,51.5,60.0,40.0,71.0,112.0,161.0,...,0,7.7347,3.0237,1,1,1,0,0,0,1


In [75]:
# Load the materialized feature table from disk. This rebinds `query_output`,
# replacing the frame produced by the SQL query cell.
query_output = pd.read_csv('./full-features-materialized.csv') #.dropna().reset_index(drop=True)

# Encode the categorical columns as integer category codes.
for categorical_column in ['first_careunit', 'gender', 'admission_type']:
    query_output[categorical_column] = pd.Categorical(query_output[categorical_column]).codes

# Apply the cohort exclusions (short stay, age, not first ICU stay).
# `.copy()` makes the filtered frame independent of the original, so the
# in-place age adjustment below is an unambiguous write and not a write into a
# possible view (which pandas reports as SettingWithCopyWarning).
# NOTE(review): `exclusion_surgical` is not applied here - confirm that
# surgical patients are meant to stay in the cohort.
keep_rows = ((query_output.exclusion_los == 0)
             & (query_output.exclusion_age == 0)
             & (query_output.exclusion_first_stay == 0))
query_output = query_output[keep_rows].copy()

# Median age of patients > 89 is 91.6
query_output.loc[query_output.age_icu_in > 89, 'age_icu_in'] = 91.6

mortality_names = ['hospital_expire_flag', 'icu_expire_flag', 
                   'hospital1year_expire_flag', 'hospital30day_expire_flag']

los_names = ['hosp_los', 'icu_los']

feature_names = [
            # Demographic information
            'age_icu_in', 'gender', 'first_careunit',
            'admission_type',
    
            # Vital signs
            'heartrate_min', 'heartrate_max', 
            'meanbp_min', 'meanbp_max', 
            'diasbp_min', 'diasbp_max',
            'sysbp_min', 'sysbp_max',
            'resprate_min', 'resprate_max',
            'mingcs', 'gcsmotor',
            'gcsverbal', 'gcseyes',             
            'tempc_min', 'tempc_max',
            'spo2_min', 'spo2_max',
                 
            # Laboratory measurements
            # (the commented-out labs are deliberately left out of the feature set)
            'aniongap_min', 'aniongap_max',
            #'albumin_min', 'albumin_max',
            #'bands_min', 'bands_max',
            'bicarbonate_min', 'bicarbonate_max',
            #'bilirubin_min', 'bilirubin_max',
            'creatinine_min', 'creatinine_max',
            'chloride_min', 'chloride_max',
            'glucose_min', 'glucose_max',
            'hematocrit_min', 'hematocrit_max',
            'hemoglobin_min', 'hemoglobin_max',
            #'lactate_min', 'lactate_max',
            'platelet_min', 'platelet_max',
            'potassium_min', 'potassium_max',
            #'ptt_min', 'ptt_max',
            #'inr_min', 'inr_max',
            #'pt_min', 'pt_max',
            'sodium_min', 'sodium_max',
            'bun_min', 'bun_max',
            'wbc_min', 'wbc_max',                 
            
            # Other
            'vent', 
            'urineoutput'
            ]

# Keep only the outcome and feature columns, then replace missing values with
# the column mean and renumber the rows 0..n-1.
# NOTE(review): the means are computed over every row, including rows that
# later end up in the test split - confirm this is acceptable.
query_output = query_output.loc[:, mortality_names + feature_names]
query_output = query_output.fillna(query_output.mean()).reset_index(drop=True)

outcomes = query_output.loc[:, mortality_names]
features = query_output.loc[:, query_output.columns.difference(mortality_names + los_names)]

# Feature matrix, and in-hospital mortality labels recoded from {0, 1} to {-1, +1}.
X = features
y = outcomes['hospital_expire_flag'].replace(0, -1)

print(len(query_output))
query_output.tail()

32290


Unnamed: 0,hospital_expire_flag,icu_expire_flag,hospital1year_expire_flag,hospital30day_expire_flag,age_icu_in,gender,first_careunit,admission_type,heartrate_min,heartrate_max,...,potassium_min,potassium_max,sodium_min,sodium_max,bun_min,bun_max,wbc_min,wbc_max,vent,urineoutput
32285,0,0,0,0,70.826046,0,5,0,62.0,119.0,...,3.1,3.9,135.0,144.0,7.0,7.0,3.3,4.5,1,1995.0
32286,0,0,0,0,63.893494,0,2,1,60.0,95.0,...,3.5,3.9,139.0,143.0,16.0,31.0,5.3,8.6,0,1400.0
32287,0,0,0,0,43.594414,0,0,1,58.0,80.0,...,3.8,4.1,139.0,140.0,12.0,13.0,11.7,11.8,0,1250.0
32288,1,1,1,1,86.957864,1,2,1,52.0,68.0,...,5.0,5.7,123.0,128.0,41.0,42.0,11.4,13.3,1,730.0
32289,0,0,0,0,77.48827,1,0,1,28.0,74.0,...,4.4,4.9,135.0,138.0,21.0,22.0,10.8,11.0,0,1800.0


## Data preprocessing

Because our features are quite different and vary in categories, we'll standardize the features by subtracting the mean and removing features with zero variance.

In [76]:
# Positional 70/30 split: the first 70% of rows become the training set and
# the remaining rows the test set (no shuffling is done here).
split = int(0.7 * len(X))

# Each feature slice is transposed before being handed to RealFeatures.
X_train, X_test = [
    RealFeatures(np.array(rows.T))
    for rows in (X.iloc[:split], X.iloc[split:])
]

y_train, y_test = [
    BinaryLabels(np.array(targets))
    for targets in (y.iloc[:split], y.iloc[split:])
]

print("Number of training samples:", y_train.get_num_labels())
print("Number of testing samples:", y_test.get_num_labels())

('Number of training samples:', 22603)
('Number of testing samples:', 9687)


In [77]:
# Fit the preprocessor on the training features only, then apply that one
# fitted transform to both sets. It is intentionally NOT re-initialised on
# X_test: doing so would recompute the statistics from the test data (leaking
# test information) and could prune a different set of features than the one
# the model is trained on.
preprocessor = PruneVarSubMean(True)
preprocessor.init(X_train)
X_train.add_preprocessor(preprocessor)
X_train.apply_preprocessor(True)

X_test.add_preprocessor(preprocessor)
X_test.apply_preprocessor(True)

True

In [27]:
%%time

C = 1
kernel = GaussianKernel(2, 0.001)
kernel.init(X_train, X_train)
kernel.set_normalizer(SqrtDiagKernelNormalizer())
svm = LibSVM(C, kernel, y_train)

svm.train()

roc = ROCEvaluation()

y_pred = svm.apply(X_train)

roc.evaluate(y_pred, y_train)

auc = roc.get_auROC()

print(auc)

1.0
CPU times: user 14.1 s, sys: 88 ms, total: 14.1 s
Wall time: 5.09 s


In [80]:
%%time

gauss_kernel = GaussianKernel(X_train, X_train, 15)
#gauss_kernel.init(X_train, X_train)
#gauss_kernel.set_normalizer(SqrtDiagKernelNormalizer())

# Parameters to svm
C = 0.1
epsilon = 0.001

svm = LibSVM(C, gauss_kernel, y_train)
svm.set_epsilon(epsilon)

svm.train()

y_pred = svm.apply(X_test)

roc = ROCEvaluation()
roc.evaluate(y_pred, y_test)

auc = roc.get_auROC()
print("Area under ROC(%):", auc)

('Area under ROC(%):', 0.8489579509987765)
CPU times: user 22min 33s, sys: 5.16 s, total: 22min 38s
Wall time: 9min 26s


## Gridsearch to determine the best parameters

In [53]:
# Kernel and SVM to be tuned. The kernel is created BEFORE the parameter tree
# so that the tree and the SVM share one kernel object, instead of the tree
# pointing at a kernel left over from an earlier cell.
gauss_kernel = GaussianKernel(X_train, X_train, 15)

# Parameters to svm
C = 0.1
epsilon = 0.001

svm = LibSVM(C, gauss_kernel, y_train)
svm.set_epsilon(epsilon)

# NOTE(review): the parameter names are bound to notebook-level variables so
# the string objects stay alive after this cell finishes. Presumably the
# wrapped library keeps a pointer to the name and does not copy it, which
# would explain the garbage parameter name in the SystemError recorded for
# the select_model cell - TODO confirm.
kernel_node_name = "kernel"
width_param_name = "log_width"

# Parameter tree: root -> kernel -> log_width
param_tree_root = ModelSelectionParameters()
param_gaussian_kernel = ModelSelectionParameters(kernel_node_name, gauss_kernel)
gaussian_kernel_width = ModelSelectionParameters(width_param_name)
gaussian_kernel_width.build_values(0.1, 6.0, R_LINEAR, 0.5, 2.0)

param_gaussian_kernel.append_child(gaussian_kernel_width)
param_tree_root.append_child(param_gaussian_kernel)

# k-fold stratified splitting of the training labels
k = 10
stratified_split = StratifiedCrossValidationSplitting(y_train, k)

# Use a fresh evaluation criterion; do not rely on a `roc` variable from
# another cell, because the later search loops rebind `roc` to the return
# value of evaluate().
evaluation_criterion = ROCEvaluation()

# cross validation instance used
cross_validation = CrossValidation(svm, X_train, y_train, stratified_split, evaluation_criterion)
cross_validation.set_num_runs(1)

# model selection instance
model_selection = GridSearchModelSelection(cross_validation, param_tree_root)

In [55]:
# Run the grid search on the `model_selection` object.
# In the recorded run shown in this cell's output, the call raised a
# SystemError reported from Shogun's Parameter.cpp.
model_selection.select_model(True)

SystemError: [1;31m[ERROR][0m In file /build/shogun-v9ad6W/shogun-6.0.0+1SNAPSHOT201704270057/src/shogun/base/Parameter.cpp line 2748: Character 0 of parameter with name "<IDS|MSG>" is illegal (only alnum or underscore is allowed)


In [None]:
%%time

sigmas = [1, 10] #5, 10, 50, 100, 500]
gammas = [1/(2 * sigma**2) for sigma in sigmas]
C = [0.5, 1, 2, 5, 10, 20, 50, 100]

parameters = {'C': C, 'sigma': sigmas}

best_auc = -1

scores = {}

for c in C:
    for sigma in sigmas:
        for i in range(k):
            train_idx = stratified_split.generate_subset_inverse(i)
            test_idx = stratified_split.generate_subset_indices(i)
            X_train = RealFeatures(np.array(X.loc[train_idx]).T)
            y_train = BinaryLabels(np.array(y.loc[train_idx]))    

            X_test = RealFeatures(np.array(X.loc[test_idx]).T)
            y_test = BinaryLabels(np.array(y.loc[test_idx]))

            preprocessor = PruneVarSubMean(True)
            preprocessor.init(X_train)
            X_train.add_preprocessor(preprocessor)
            X_train.apply_preprocessor(True)

            preprocessor.init(X_test)
            X_test.add_preprocessor(preprocessor)
            X_test.apply_preprocessor(True)  
            gauss_kernel = GaussianKernel(X_train, X_train, sigma)

            epsilon = 0.001

            svm = LibSVM(c, gauss_kernel, y_train)
            svm.set_epsilon(epsilon)
            svm.train()

            y_pred = svm.apply(X_test)

            metric = ROCEvaluation()

            roc = metric.evaluate(y_pred, y_test)

            auc = metric.get_auROC()
            aucs.append(auc)
            
        mean_auc = np.mean(aucs)
        if mean_auc > best_auc:
            best_auc = mean_auc
            best_parameters = (c, sigma)

        scores[(c, sigma)] = mean_auc
        print((c, sigma), mean_auc)

((0.5, 1), 0.81213351183232674)
((0.5, 10), 0.82323128845700877)
((1, 1), 0.81225600919725272)


In [82]:
# k-fold stratified cross-validation of a linear SVM (LibLinear, L2-regularised
# L2-loss SVC) over a range of C values, scored by mean AUC across folds.
labels = BinaryLabels(np.array(y))

k = 10
stratified_split = StratifiedCrossValidationSplitting(labels, k)
stratified_split.build_subsets()

C = [0.01, 0.1, 0.5, 1, 2, 5, 10, 20, 50, 100]
scores = {}

for c in C:
    aucs = []
    for i in range(k):
        train_idx = stratified_split.generate_subset_inverse(i)
        test_idx = stratified_split.generate_subset_indices(i)
        X_train = RealFeatures(np.array(X.loc[train_idx]).T)
        y_train = BinaryLabels(np.array(y.loc[train_idx]))    

        X_test = RealFeatures(np.array(X.loc[test_idx]).T)
        y_test = BinaryLabels(np.array(y.loc[test_idx]))

        # Fit the preprocessor on the training fold only and reuse it,
        # unchanged, for the validation fold (no re-initialisation on X_test).
        preprocessor = PruneVarSubMean(True)
        preprocessor.init(X_train)
        X_train.add_preprocessor(preprocessor)
        X_train.apply_preprocessor(True)

        X_test.add_preprocessor(preprocessor)
        X_test.apply_preprocessor(True)      
    
        epsilon = 0.001
        svm = LibLinear(c, X_train, y_train)
        svm.set_liblinear_solver_type(L2R_L2LOSS_SVC)
        svm.set_epsilon(epsilon)
        svm.train()

        y_pred = svm.apply(X_test)

        # The return value of evaluate() is not assigned to `roc`, so a
        # ROCEvaluation object of that name from another cell is not
        # overwritten.
        metric = ROCEvaluation()
        metric.evaluate(y_pred, y_test)

        auc = metric.get_auROC()
        aucs.append(auc)

    # Record the mean AUC per C so the results remain available after the loop.
    scores[c] = np.mean(aucs)
    print((c), scores[c])

(0.01, 0.84507765525877987)
(0.1, 0.84508086982827013)
(0.5, 0.84509060954496462)
(1, 0.84508942998811187)
(2, 0.84509031453440853)
(5, 0.84509090261403463)
(10, 0.84509217921341528)
(20, 0.84509257223724232)
(50, 0.84509286676242679)
(100, 0.84509257247992797)
