## Environment setup

In [1]:
# Libs and environment customizations

# Data wrangling
import pandas as pd
import numpy as np

# Data import
from google.colab import files

# Data visualization
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore', category=DeprecationWarning)

In [2]:
# Show up to 100 rows when a DataFrame or Series is displayed
pd.set_option('display.max_rows', 100)

In [3]:
# Getting reproducible results across multiple executions
# `seed` is the value passed as `random_state` to the train/test splits further down.
# NOTE(review): `rng` is seeded with 0 while `seed` is 42, and no use of `rng` is visible
# in this part of the notebook - confirm it is still needed.
rng = np.random.RandomState(0)
seed = 42

# Data query

This study's dataset is a subset of MIMIC-IV in which each row represents the latest (and therefore unique) stay of a patient in the intensive care unit. A stay is the event of being in one hospital sector during a certain period of time.

MIMIC (Medical Information Mart for Intensive Care) is a large, freely-available database comprising deidentified health-related data from patients who were admitted to the critical care units of the Beth Israel Deaconess Medical Center.

MIMIC-IV contains data from 2008-2019. The admission vital signs, clinical measurements, and laboratory values were defined as either the first value recorded after or closest to the index CICU admission. Vital signs were recorded every 15 min during the first hour after ICU admission from Metavision bedside monitors. Admission diagnoses were defined as all International Classification of Diseases‐9 diagnostic codes.


Ref: https://mimic.mit.edu/docs/iv/

### Database queries

In [4]:
## Intermediate table 1
# DROP TABLE IF EXISTS mimiciv_derived.cohort_distinct_subject;
# CREATE TABLE mimiciv_derived.cohort_distinct_subject AS
# SELECT lab.subject_id,
# icu.admission_age,
# icu.gender,
# icu.race,
# h.height,
# w.weight,
# icu.los_icu,
# lab.stay_id,
# vs.sbp_mean,
# lab.creatinine_max,
# lab.bun_max,
# CASE
#     WHEN icu.dod IS NOT NULL THEN 1
#     ELSE 0 END AS hospital_death,
# DENSE_RANK() OVER (PARTITION BY icu.subject_id ORDER BY icu.stay_id ASC) AS icustay_seq
# FROM mimiciv_derived.first_day_lab lab
# LEFT JOIN mimiciv_derived.first_day_vitalsign vs
# ON lab.stay_id = vs.stay_id
# LEFT JOIN mimiciv_derived.icustay_detail icu
# ON lab.stay_id = icu.stay_id
# LEFT JOIN mimiciv_derived.first_day_height h
# ON lab.stay_id = h.stay_id
# LEFT JOIN mimiciv_derived.first_day_weight w
# ON lab.stay_id = w.stay_id;

## Intermediate table 2
# DROP TABLE IF EXISTS mimiciv_derived.cohort_icds;
# CREATE TABLE mimiciv_derived.cohort_icds AS
# SELECT DISTINCT subject_id
# FROM mimiciv_hosp.diagnoses_icd
# WHERE icd_code in ('40201', '40211', '40291', '40401', '40403', '40411', '40413', '40491', '40493', '42801', '42802','42803','42804', '51881', '51882', '51884') AND icd_version=9;

## Final table
# DROP TABLE IF EXISTS mimiciv_derived.cohort_subjects_included;
# CREATE TABLE mimiciv_derived.cohort_subjects_included AS
# SELECT * from mimiciv_derived.cohort_distinct_subject where icustay_seq=1 and subject_id in (
# SELECT distinct icd.subject_id
# FROM mimiciv_derived.cohort_distinct_subject subject
# LEFT JOIN mimiciv_derived.cohort_icds icd
# ON subject.subject_id = icd.subject_id
# );

# Relevant counts:

# Number of stays available to analyse:
# Number of distinct patients:
# Number of patients with ICD-9 codes of interest:


In [5]:
# # Including more data

# DROP TABLE IF EXISTS mimiciv_derived.cohort_distinct_subject;
# CREATE TABLE mimiciv_derived.cohort_distinct_subject AS
# SELECT lab.subject_id,
# lab.stay_id,
# icu.admission_age,
# h.height,
# w.weight,
# vs.sbp_mean,
# vs.dbp_mean,
# vs.mbp_mean,
# vs.resp_rate_mean,
# vs.temperature_mean,
# vs.spo2_mean,
# uo.urineoutput,
# lab.hematocrit_max,
# lab.hemoglobin_max,
# lab.wbc_max,
# lab.platelets_max,
# lab.abs_neutrophils_max,
# lab.abs_basophils_max,
# lab.abs_lymphocytes_max,
# lab.pt_max,
# lab.inr_max,
# lab.ptt_max,
# lab.ck_cpk_max,
# lab.creatinine_max,
# lab.bun_max,
# lab.glucose_max,
# lab.potassium_max,
# lab.sodium_max,
# lab.calcium_max,
# lab.chloride_max,
# lab.aniongap_max,
# bg.ph_min,
# bg.bicarbonate_min,
# bg.lactate_max,
# bg.pco2_max,
# icu.gender,
# icu.race,
# CASE
#     WHEN icu.dod < icu.dischtime THEN 1
#     ELSE 0 END AS hospital_death,
# DENSE_RANK() OVER (PARTITION BY icu.subject_id ORDER BY icu.stay_id ASC) AS icustay_seq
# FROM mimiciv_derived.first_day_lab lab
# LEFT JOIN mimiciv_derived.first_day_vitalsign vs
# ON lab.stay_id = vs.stay_id
# LEFT JOIN mimiciv_derived.icustay_detail icu
# ON lab.stay_id = icu.stay_id
# LEFT JOIN mimiciv_derived.first_day_height h
# ON lab.stay_id = h.stay_id
# LEFT JOIN mimiciv_derived.first_day_weight w
# ON lab.stay_id = w.stay_id
# LEFT JOIN mimiciv_derived.first_day_urine_output uo
# ON lab.stay_id = uo.stay_id
# LEFT JOIN mimiciv_derived.first_day_bg_art bg
# ON lab.stay_id = bg.stay_id;

### Database access

In [7]:
# We are using a private repository with access token to enable multiple users to run this notebook. This will not be available in the final project.
!pip install python-dotenv

# Load .env file using:
from dotenv import load_dotenv
load_dotenv()

import os
import json, requests, urllib, io

user=os.getenv("GIT_USER")
pao=os.getenv("GIT_PAO")

github_session = requests.Session()
github_session.auth = (user, pao)

# providing raw url to download csv from github
csv_url = 'https://raw.githubusercontent.com/pedrogemal/mimic-iv-data/main/data/hf_cohort_060623.csv'

download = github_session.get(csv_url).content
df = pd.read_csv(io.StringIO(download.decode('utf-8')))

df_raw = df.copy()



### Basic dataset information

In [8]:
# List the column names of the loaded table
df.columns

Index(['subject_id', 'hadm_id', 'stay_id', 'admission_age', 'height', 'weight',
       'sbp_mean', 'dbp_mean', 'mbp_mean', 'resp_rate_mean',
       'temperature_mean', 'spo2_mean', 'urineoutput', 'hematocrit_max',
       'hemoglobin_max', 'wbc_max', 'platelets_max', 'abs_neutrophils_max',
       'abs_basophils_max', 'abs_lymphocytes_max', 'pt_max', 'inr_max',
       'ptt_max', 'ck_cpk_max', 'creatinine_max', 'bun_max', 'glucose_max',
       'potassium_max', 'sodium_max', 'calcium_max', 'chloride_max',
       'aniongap_max', 'ph_min', 'bicarbonate_min', 'magnesium_min',
       'lactate_max', 'pco2_max', 'gender', 'race', 'hospital_death',
       'chronic_pulmonary_disease', 'diabetes_without_cc', 'renal_disease',
       'myocardial_infarct', 'peripheral_vascular_disease',
       'cerebrovascular_disease', 'heart_rate_max', 'mch_max', 'mchc_max',
       'mcv_max', 'ntprobnp_max'],
      dtype='object')

In [None]:
# Checking for NaN values

# Per-column count of missing values, and the same count as a percentage of all rows
na_mask = df.isna()
num_na = na_mask.sum()
perc_na = num_na * 100 / len(df)

# One row per column of df, most incomplete columns first
df_na = pd.DataFrame(
    {'Column': df.columns, 'Absent data': num_na, 'Percentage': perc_na}
)

df_na.sort_values('Percentage', ascending=False)

In [None]:
# Number of rows (r) and columns (c) of the table
r, c = df.shape
print(r)
print(c)

## Data research parameters

### Categorical data encoding

#### Race data encoding

In [None]:
# Data encoding for race

def return_white_race_value(df):
    """Return 1 if the row's 'race' is one of the WHITE categories, otherwise 0.

    Despite its name, `df` is a single row: the function is applied with axis=1.
    """
    white_labels = (
        'WHITE',
        'WHITE - BRAZILIAN',
        'WHITE - EASTERN EUROPEAN',
        'WHITE - OTHER EUROPEAN',
        'WHITE - RUSSIAN',
    )
    return 1 if df['race'] in white_labels else 0

def create_white_race_column(df):
    """Add a 0/1 'race_white' column to `df` in place, computed row by row."""
    white_flags = df.apply(return_white_race_value, axis='columns')
    df['race_white'] = white_flags

def return_black_race_value(df):
    """Return 1 if the row's 'race' is one of the BLACK categories, otherwise 0.

    Despite its name, `df` is a single row: the function is applied with axis=1.
    """
    black_labels = (
        'BLACK/AFRICAN',
        'BLACK/AFRICAN AMERICAN',
        'BLACK/CAPE VERDEAN',
        'BLACK/CARIBBEAN ISLAND',
    )
    return 1 if df['race'] in black_labels else 0

def create_black_race_column(df):
    """Add a 0/1 'race_black' column to `df` in place, computed row by row."""
    black_flags = df.apply(return_black_race_value, axis='columns')
    df['race_black'] = black_flags

In [None]:
# Add the binary 'race_white' and 'race_black' columns to df (in place)
create_white_race_column(df)
create_black_race_column(df)

In [None]:
# Remove the original 'race' column now that it has been encoded.
# inplace=True is required here: without it (or an assignment) drop() returns a new
# frame and df itself keeps the text column.
df.drop(['race'], axis=1, inplace=True, errors='ignore')

#### Gender data encoding

In [None]:
# Data treatment for categorical variables
# Data encoding for gender

def return_male_gender_value(df):
    """Return 1 if the row's 'gender' is 'M', otherwise 0 (`df` is a single row)."""
    return 1 if df['gender'] == 'M' else 0

def create_male_gender_column(df):
    """Add a 0/1 'gender_male' column to `df` in place, computed row by row."""
    male_flags = df.apply(return_male_gender_value, axis='columns')
    df['gender_male'] = male_flags

def return_female_gender_value(df):
    """Return 1 if the row's 'gender' is 'F', otherwise 0 (`df` is a single row)."""
    return 1 if df['gender'] == 'F' else 0

def create_female_gender_column(df):
    """Add a 0/1 'gender_female' column to `df` in place, computed row by row."""
    female_flags = df.apply(return_female_gender_value, axis='columns')
    df['gender_female'] = female_flags

In [None]:
# Add the binary 'gender_male' and 'gender_female' columns to df (in place)
create_male_gender_column(df)
create_female_gender_column(df)

In [None]:
# Remove the original 'gender' column in place; errors='ignore' makes re-running the cell harmless
df.drop(['gender'], axis=1, inplace=True, errors='ignore')

# Exploratory analysis

### Data distribution profile

In [None]:
# import matplotlib.pyplot as plt

# df[['sbp_mean', 'creatinine_max', 'bun_max']].dropna().plot.kde(figsize=[12,8])
# plt.legend(['SysABP (mmHg)', 'Creatinine (mg/dL)', 'BUN (mg/dL)'])
# plt.xlim([-30,250])

In [None]:
# df[['sbp_mean', 'creatinine_max', 'bun_max']].boxplot(whis=3)
# plt.show()

In [None]:
# # col_list = df.columns.values.tolist()
# # print(col_list)

## for i, column in enumerate(df[['admission_age', 'height', 'weight', 'sbp_mean', 'dbp_mean', 'mbp_mean', 'resp_rate_mean', 'temperature_mean', 'spo2_mean', 'urineoutput', 'hematocrit_max', 'hemoglobin_max', 'wbc_max', 'platelets_max', 'abs_neutrophils_max', 'abs_basophils_max', 'abs_lymphocytes_max', 'pt_max', 'inr_max', 'ptt_max', 'ck_cpk_max', 'creatinine_max', 'bun_max', 'glucose_max', 'potassium_max', 'sodium_max', 'calcium_max', 'chloride_max', 'aniongap_max', 'ph_min', 'bicarbonate_min', 'lactate_max', 'pco2_max']].columns, 1):
# for i, column in enumerate(df.columns, 1):

#     plt.subplot(6,6,i+1)
#     sns.histplot(df[column])

# Feature evaluation

### Identify highly correlated features

In [None]:
# Create correlation matrix (absolute values) from the numeric columns only:
# text columns cannot be correlated and make DataFrame.corr() raise on pandas >= 2.0
corr_matrix = df.select_dtypes(include=['number', 'bool']).corr().abs()

# Select upper triangle of correlation matrix (k=1 excludes the diagonal),
# so every pair of columns is considered exactly once
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1))

# Find features with correlation greater than 0.95
to_drop = [column for column in upper.columns if (upper[column] > 0.95).any()]
print(to_drop)

# Drop features
# df.drop(to_drop, axis=1, inplace=True, errors='ignore')

### Evaluate features with low variance

In [None]:
from sklearn.feature_selection import VarianceThreshold

def variance_threshold(df,th):
    """Return the columns of `df` retained by a VarianceThreshold fitted on `df`.

    Parameters
    ----------
    df : pandas.DataFrame
        Data the selector is fitted on.
    th : float
        Value passed as `threshold` to VarianceThreshold.

    Returns
    -------
    pandas.DataFrame
        The subset of `df`'s columns flagged by the selector's support mask.
    """
    selector = VarianceThreshold(threshold=th)
    selector.fit(df)
    keep_mask = selector.get_support()
    return df.iloc[:, keep_mask]

In [None]:
# variance_threshold(df, 0.8)

### Checking for data consistency and fixing if needed

In [None]:
# (df.urineoutput < 0).sum()

# Replace negative urine output values with 0 (in place)
df.loc[df["urineoutput"] < 0, "urineoutput"] = 0

In [None]:
# Sanity check: number of negative urine output values remaining
(df.urineoutput < 0).sum()

# Dataframe filtering

In [None]:
# To meet inclusion criteria
# Keep only rows with a value in every listed column (how='any' drops a row when any of them is NaN)

df_only_available_pacients = df.dropna(subset=['sbp_mean', 'bun_max', 'creatinine_max', 'sodium_max', 'admission_age', 'heart_rate_max', 'race_black', 'chronic_pulmonary_disease'], how = 'any')

In [None]:
# From here on df is the filtered cohort; the copy keeps it independent of df_only_available_pacients
df = df_only_available_pacients.copy()

In [None]:
# Declare feature vector and target variable
# Created columns related to the scores: 'adhere_low', 'adhere_int1', 'adhere_int2', 'adhere_int3', 'adhere_high', 'adhere_class', 'gwtg_1', 'gwtg_2', 'gwtg_3', 'gwtg_4', 'gwtg_5', 'gwtg_6', 'gwtg_7', 'gwtg_8', 'gwtg_9', 'gwtg_score'

all_features = ['admission_age', 'height', 'weight', 'sbp_mean', 'dbp_mean', 'mbp_mean', 'resp_rate_mean', 'temperature_mean', 'spo2_mean', 'urineoutput', 'hematocrit_max', 'hemoglobin_max', 'wbc_max', 'platelets_max', 'abs_neutrophils_max', 'abs_basophils_max', 'abs_lymphocytes_max', 'pt_max', 'inr_max', 'ptt_max', 'ck_cpk_max', 'creatinine_max', 'bun_max', 'glucose_max', 'potassium_max', 'sodium_max', 'calcium_max', 'chloride_max', 'aniongap_max', 'ph_min', 'bicarbonate_min', 'magnesium_min', 'lactate_max', 'pco2_max', 'gender_male', 'race_black', 'hospital_death', 'chronic_pulmonary_disease', 'diabetes_without_cc', 'renal_disease', 'myocardial_infarct', 'peripheral_vascular_disease', 'cerebrovascular_disease', 'heart_rate_max', 'mch_max', 'mchc_max', 'mcv_max', 'ntprobnp_max']

# features_included = ['admission_age', 'height', 'weight', 'sbp_mean', 'dbp_mean', 'mbp_mean', 'resp_rate_mean', 'temperature_mean', 'spo2_mean', 'magnesium_min', 'urineoutput', 'hematocrit_max', 'wbc_max', 'platelets_max', 'abs_neutrophils_max', 'abs_basophils_max', 'abs_lymphocytes_max', 'pt_max', 'ptt_max', 'ck_cpk_max', 'creatinine_max', 'bun_max', 'glucose_max', 'potassium_max', 'sodium_max', 'calcium_max', 'chloride_max', 'aniongap_max', 'ph_min', 'bicarbonate_min', 'lactate_max', 'pco2_max', 'heart_rate_max', 'race_black', 'chronic_pulmonary_disease']
features_included = ['admission_age', 'weight', 'sbp_mean', 'dbp_mean', 'mbp_mean', 'resp_rate_mean', 'temperature_mean', 'spo2_mean', 'magnesium_min', 'urineoutput', 'hematocrit_max', 'wbc_max', 'platelets_max', 'pt_max', 'ptt_max', 'creatinine_max', 'bun_max', 'glucose_max', 'potassium_max', 'sodium_max', 'calcium_max', 'chloride_max', 'aniongap_max', 'bicarbonate_min', 'heart_rate_max', 'race_black', 'chronic_pulmonary_disease']

# Testing magnesium influence

# X holds the columns in features_included; X_all holds every column in all_features
# NOTE(review): all_features lists 'hospital_death', so X_all also carries the target -
# confirm X_all is never used as model input.
X = df[features_included]
X_all = df[all_features]

# Target variable
y = df['hospital_death']

# Testing different feature selection methods

In [None]:
# from sklearn.model_selection import train_test_split

# X_train_fs, X_test_fs, y_train_fs, y_test_fs = train_test_split(X_all, y, test_size=0.3, random_state=seed)

In [None]:
# # Steps needed to perform feature selection

# from sklearn.impute import KNNImputer
# imputer = KNNImputer()

# X_train_fs_imputed = pd.DataFrame(imputer.fit_transform(X_train_fs), columns=X_train_fs.columns)

#### Lasso

In [None]:
# from sklearn.preprocessing import StandardScaler
# from sklearn.pipeline import Pipeline
# from sklearn.model_selection import GridSearchCV
# from sklearn.linear_model import Lasso

# pipeline = Pipeline([
#                      ('scaler',StandardScaler()),
#                      ('model',Lasso())
# ])

In [None]:
# search = GridSearchCV(pipeline,
#                       {'model__alpha':np.arange(0.1,10,0.1)},
#                       cv = 5, scoring="neg_mean_squared_error",verbose=3
#                       )

In [None]:
# features = X_train_fs.columns

In [None]:
# search.fit(X_train_fs_imputed, y_train_fs)

In [None]:
# search.best_params_

In [None]:
# coefficients = search.best_estimator_.named_steps['model'].coef_

In [None]:
# coefficients

In [None]:
# from sklearn.feature_selection import SelectFromModel
# from sklearn.linear_model import Lasso, LogisticRegression

# sel_ = SelectFromModel(
#     LogisticRegression(C=0.5, penalty='l1', solver='liblinear', random_state=10))

# sel_.fit(X_train_fs_imputed, y_train_fs)

In [None]:
# selected_feat = X_train_fs_imputed.columns[(sel_.get_support())]

# print('total features: {}'.format((X_train_fs_imputed.shape[1])))
# print('selected features: {}'.format(len(selected_feat)))

In [None]:
# from sklearn.linear_model import RidgeCV, LassoCV, Ridge, Lasso

# reg = LassoCV()
# reg.fit(X_train_fs_imputed, y_train_fs)
# print("Best alpha using built-in LassoCV: %f" % reg.alpha_)
# print("Best score using built-in LassoCV: %f" %reg.score(X_train_fs_imputed,y_train_fs))
# coef = pd.Series(reg.coef_, index = X_train_fs_imputed.columns)

In [None]:
# print("Lasso picked " + str(sum(coef != 0)) + " variables and eliminated the other " +  str(sum(coef == 0)) + " variables")

In [None]:
# imp_coef = coef.sort_values(ascending=True).to_csv("lasso_fs.csv", index = True)

In [None]:
# imp_coef = coef.sort_values()
# plt.rcParams['figure.figsize'] = (8.0, 10.0)
# imp_coef.plot(kind = "barh")
# plt.title("Feature importance using Lasso Model")

#### Univariate Selection

In [None]:
# from sklearn.feature_selection import SelectKBest
# from sklearn.feature_selection import f_classif

# test = SelectKBest(score_func=f_classif, k=30)
# fit = test.fit(X_train_fs_imputed, y_train_fs)

In [None]:
# from numpy import set_printoptions

# # summarize scores
# set_printoptions(precision=3)
# # print(fit.scores_)
# features = fit.transform(X_train_fs_imputed)
# # summarize selected features
# print(features[0:5,:])


#### PCA

In [None]:
# # WIP

# from sklearn.decomposition import PCA

# nf = 30
# #pca = PCA()
# pca = PCA(n_components=nf)
# fit = pca.fit(X_all_imputed)
# # X_train_pca = pca.fit_transform(X_train)
# # X_test_pca = pca.transform(X_test)

In [None]:
# print(X_train_pca.shape)
# print(X_train_pca)

In [None]:
# print("Explained Variance: %s" % fit.explained_variance_ratio_)

In [None]:
# print(fit.components_)

#### RFE

In [None]:
# from sklearn.feature_selection import RFE
# from sklearn.tree import DecisionTreeClassifier
# # define RFE
# rfe = RFE(estimator=DecisionTreeClassifier(), n_features_to_select=30)
# # fit RFE
# rfe.fit(X_all_imputed, y)
# # summarize all features
# for i in range(X.shape[1]):
#  print('Column: %d, Selected %s, Rank: %.3f' % (i, rfe.support_[i], rfe.ranking_[i]))

# Data preprocessing

## Train and test split

In [None]:
from sklearn.model_selection import train_test_split

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.44, stratify=y)
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)
# 80/20 split stratified on y; random_state=seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=seed)

In [None]:
# For table 1
# Same split settings (test size, stratification, random_state) applied to X_all

X_train_all, X_test_all, y_train_all, y_test_all = train_test_split(X_all, y, test_size=0.2, stratify=y, random_state=seed)

## Train data imputation

In [None]:
from sklearn.impute import KNNImputer

# KNN imputation with 10 neighbours, fitted on the training split
imputer = KNNImputer(n_neighbors=10)
# X_train_imputed = pd.DataFrame(imputer.fit_transform(X_train_scaled), columns=X_train_scaled.columns)
# NOTE(review): only the column names are passed on, so the new frame presumably gets a fresh
# 0..n-1 index while y_train keeps the original row labels - confirm nothing downstream
# aligns the two by index.
X_train_imputed = pd.DataFrame(imputer.fit_transform(X_train), columns=X_train.columns)

In [None]:
# X_test_imputed = pd.DataFrame(imputer.fit_transform(X_test_scaled), columns=X_test_scaled.columns)
# Reuse the imputer fitted on the training split (transform only, no refit on test data)
X_test_imputed = pd.DataFrame(imputer.transform(X_test), columns=X_test.columns)

## Train data resampling (disabled)

In [None]:
# # https://imbalanced-learn.org/stable/references/generated/imblearn.over_sampling.SMOTENC.html#imblearn.over_sampling.SMOTENC
# import imblearn
# # from imblearn.over_sampling import SMOTE
# from imblearn.over_sampling import SMOTENC

In [None]:
# idx_dic = {}
# for col in X_train_imputed.columns:
#     idx_dic[col] = X_train_imputed.columns.get_loc(col)
# print(idx_dic)

In [None]:
# # smote = SMOTENC(categorical_features=[33, 34])
# smote = SMOTENC(categorical_features=[25, 26], random_state=seed)

# X_train_sm, y_train_sm = smote.fit_resample(X_train_imputed, y_train)

In [None]:
# X_train_sm = pd.DataFrame(X_train_sm)
# X_train_sm.columns = X_train_sm.keys().tolist()
# y_train_sm = pd.DataFrame(y_train_sm)

In [None]:
# X_test_sm, y_test_sm = smote.fit_resample(X_test_imputed, y_test)

In [None]:
# X_test_sm = pd.DataFrame(X_test_sm)
# X_test_sm.columns = X_test_sm.keys().tolist()
# y_test_sm = pd.DataFrame(y_test_sm)

In [None]:
# X_train_sm.shape

In [None]:
# X_test.shape

In [None]:
# To override oversampling
# The *_sm names are plain copies of the imputed features and the original targets
X_train_sm = X_train_imputed.copy()
y_train_sm = y_train.copy()
X_test_sm = X_test_imputed.copy()
y_test_sm = y_test.copy()

## Data handling

### Adhere score definitions

In [None]:
def create_adhere_score(df):
  """Add the five 0/1 ADHERE indicator columns to `df` in place.

  Any existing 'adhere_*' indicator / 'adhere_class' columns are removed first,
  then the indicators are rebuilt from 'sbp_mean', 'bun_max' and
  'creatinine_max' using the cut-offs 115.0, 43.0 and 2.75.
  """
  stale_columns = ['adhere_low', 'adhere_int1', 'adhere_int2', 'adhere_int3', 'adhere_high', 'adhere_class']
  df.drop(columns=stale_columns, inplace=True, errors='ignore')

  # Elementary conditions shared by the five groups
  sbp_low = df['sbp_mean'] < 115.0
  sbp_ok = df['sbp_mean'] >= 115.0
  bun_high = df['bun_max'] >= 43.0
  bun_ok = df['bun_max'] < 43.0
  creat_high = df['creatinine_max'] >= 2.75
  creat_ok = df['creatinine_max'] < 2.75

  df['adhere_low'] = np.where(sbp_ok & bun_ok, 1, 0)
  df['adhere_int1'] = np.where(sbp_low & bun_high & creat_ok, 1, 0)
  df['adhere_int2'] = np.where(sbp_ok & bun_high, 1, 0)
  df['adhere_int3'] = np.where(sbp_low & bun_ok, 1, 0)
  df['adhere_high'] = np.where(sbp_low & bun_high & creat_high, 1, 0)

In [None]:
def create_adhere_class(df):
  """Add an 'adhere_class' text column to `df` in place.

  Each row gets the name of the first indicator column equal to 1, checked in
  the order adhere_low, adhere_int1, adhere_int2, adhere_int3; rows matching
  none of these are labelled 'adhere_high' (same rule as return_adhere_class).

  The previous nested np.where version could not run: its innermost call had
  no fallback value, and the adhere_int3 test was duplicated with the label
  'adhere_int2'.
  """
  labels = ['adhere_low', 'adhere_int1', 'adhere_int2', 'adhere_int3']
  conditions = [(df[label] == 1).to_numpy() for label in labels]

  # np.select picks the label of the first true condition for every row
  df['adhere_class'] = np.select(conditions, labels, default='adhere_high')

In [None]:
def return_adhere_class(df):
    """Return the ADHERE class name for one row (`df` is a single row).

    The indicator columns are checked in a fixed order and the first one equal
    to 1 gives the label; if none of the four matches, 'adhere_high' is returned.
    """
    for label in ('adhere_low', 'adhere_int1', 'adhere_int2', 'adhere_int3'):
        if df[label] == 1:
            return label
    return 'adhere_high'

In [None]:
# def return_adhere_bin_death(df):
#     if df['adhere_low'] == 1:
#         return 0
#     elif df['adhere_int1'] == 1:
#         return 1
#     elif df['adhere_int2'] == 1:
#         return 0
#     elif df['adhere_int3'] == 1:
#         return 0
#     else:
#         return 1

In [None]:
def return_adhere_bin_score(df):
    """Return an ordinal 1-5 ADHERE score for one row, or 0 if no indicator is set.

    `df` is a single row. Indicators are checked in the order
    low (1), int3 (2), int2 (3), int1 (4), high (5); the first one equal to 1 wins.
    """
    ordered_scores = (
        ('adhere_low', 1),
        ('adhere_int3', 2),
        ('adhere_int2', 3),
        ('adhere_int1', 4),
        ('adhere_high', 5),
    )
    for column, score in ordered_scores:
        if df[column] == 1:
            return score
    return 0

In [None]:
def create_adhere_label(df):
    """Add an 'adhere_class' column to `df` in place, one label per row via return_adhere_class."""
    class_labels = df.apply(return_adhere_class, axis='columns')
    df['adhere_class'] = class_labels

In [None]:
def create_adhere_bin_stats(df):
    """Add an 'adhere_score' column to `df` in place, one value per row via return_adhere_bin_score."""
    ordinal_scores = df.apply(return_adhere_bin_score, axis='columns')
    df['adhere_score'] = ordinal_scores

In [None]:
# def create_adhere_bin_death(df):
#     df['adhere_bin_death'] = df.apply(return_adhere_bin_death, axis=1)

In [None]:
def delete_all_adhere(df):
    """Remove the five ADHERE indicator columns and 'adhere_class' from `df` in place.

    Columns that are not present are skipped silently.
    """
    adhere_columns = [
        'adhere_low',
        'adhere_int1',
        'adhere_int2',
        'adhere_int3',
        'adhere_high',
        'adhere_class',
    ]
    df.drop(columns=adhere_columns, inplace=True, errors='ignore')

In [None]:
def delete_adhere_label(df):
    """Remove the 'adhere_class' column from `df` in place, if it exists."""
    df.drop(columns=['adhere_class'], inplace=True, errors='ignore')

### GWTG definitions

In [None]:
def calculate_gwtg_score(row):
  """Return the GWTG point score for one patient row.

  The score is the sum of the points for systolic blood pressure, BUN, sodium,
  age, heart rate, race and chronic pulmonary disease. The five continuous
  inputs are rounded to the nearest integer before being matched to a bracket.

  Parameters
  ----------
  row : mapping (e.g. a pandas Series or a dict)
      Must provide 'sbp_mean', 'bun_max', 'sodium_max', 'admission_age',
      'heart_rate_max', 'race_black' and 'chronic_pulmonary_disease'.

  Returns
  -------
  int
      Total number of points.
  """

  def bracket_points(value, brackets, default):
    # `brackets` lists (lower_bound, points) from the highest bound down;
    # the first bound reached by the rounded value wins, else `default`.
    rounded = round(value)
    for lower_bound, points in brackets:
      if rounded >= lower_bound:
        return points
    return default

  # Systolic blood pressure: values below 50 and values of 200 or more both score 0
  sbp_brackets = ((200, 0), (190, 2), (180, 4), (170, 6), (160, 8), (150, 9),
                  (140, 11), (130, 13), (120, 15), (110, 17), (100, 19), (90, 21),
                  (80, 23), (70, 24), (60, 26), (50, 28))
  bun_brackets = ((150, 28), (140, 27), (130, 25), (120, 23), (110, 21), (100, 19),
                  (90, 17), (80, 15), (70, 13), (60, 11), (50, 9), (40, 8),
                  (30, 6), (20, 4), (10, 2))
  # Sodium: <=130 -> 4, 131-133 -> 3, 134-136 -> 2, 137-138 -> 1, >=139 -> 0.
  # The 131-133 bracket used to test sbp_mean instead of sodium_max, which gave
  # the wrong points to most rows with sodium above 130.
  sodium_brackets = ((139, 0), (137, 1), (134, 2), (131, 3))
  age_brackets = ((110, 28), (100, 25), (90, 22), (80, 19), (70, 17), (60, 14),
                  (50, 11), (40, 8), (30, 6), (20, 3))
  hr_brackets = ((105, 8), (100, 6), (95, 5), (90, 4), (85, 3), (80, 1))

  sbp_points = bracket_points(row['sbp_mean'], sbp_brackets, 0)
  bun_points = bracket_points(row['bun_max'], bun_brackets, 0)
  sodium_points = bracket_points(row['sodium_max'], sodium_brackets, 4)
  age_points = bracket_points(row['admission_age'], age_brackets, 0)
  hr_points = bracket_points(row['heart_rate_max'], hr_brackets, 0)

  # NOTE(review): 3 points are given when race_black is not 0 - confirm this
  # direction against the published GWTG-HF point table.
  race_points = 0 if row['race_black'] == 0 else 3
  copd_points = 0 if row['chronic_pulmonary_disease'] == 0 else 2

  return sbp_points + bun_points + sodium_points + age_points + hr_points + race_points + copd_points

In [None]:
def calculate_gwtg_bin(row):
  """Return the GWTG probability group (1-9) for one patient row.

  The GWTG point score is computed from systolic blood pressure, BUN, sodium,
  age, heart rate, race and chronic pulmonary disease (continuous inputs are
  rounded to the nearest integer first) and then mapped to a group:
  0-33 -> 1, 34-50 -> 2, 51-57 -> 3, 58-61 -> 4, 62-65 -> 5, 66-70 -> 6,
  71-74 -> 7, 75-78 -> 8, anything else -> 9.

  Parameters
  ----------
  row : mapping (e.g. a pandas Series or a dict)
      Must provide 'sbp_mean', 'bun_max', 'sodium_max', 'admission_age',
      'heart_rate_max', 'race_black' and 'chronic_pulmonary_disease'.

  Returns
  -------
  int
      Group number between 1 and 9.
  """

  def first_bracket(value, brackets, default):
    # `brackets` lists (lower_bound, result) from the highest bound down;
    # the first bound reached by `value` wins, else `default`.
    for lower_bound, result in brackets:
      if value >= lower_bound:
        return result
    return default

  # Systolic blood pressure: values below 50 and values of 200 or more both score 0
  sbp_brackets = ((200, 0), (190, 2), (180, 4), (170, 6), (160, 8), (150, 9),
                  (140, 11), (130, 13), (120, 15), (110, 17), (100, 19), (90, 21),
                  (80, 23), (70, 24), (60, 26), (50, 28))
  bun_brackets = ((150, 28), (140, 27), (130, 25), (120, 23), (110, 21), (100, 19),
                  (90, 17), (80, 15), (70, 13), (60, 11), (50, 9), (40, 8),
                  (30, 6), (20, 4), (10, 2))
  # Sodium: <=130 -> 4, 131-133 -> 3, 134-136 -> 2, 137-138 -> 1, >=139 -> 0.
  # The 131-133 bracket used to test sbp_mean instead of sodium_max, which gave
  # the wrong points to most rows with sodium above 130.
  sodium_brackets = ((139, 0), (137, 1), (134, 2), (131, 3))
  age_brackets = ((110, 28), (100, 25), (90, 22), (80, 19), (70, 17), (60, 14),
                  (50, 11), (40, 8), (30, 6), (20, 3))
  hr_brackets = ((105, 8), (100, 6), (95, 5), (90, 4), (85, 3), (80, 1))

  sbp_points = first_bracket(round(row['sbp_mean']), sbp_brackets, 0)
  bun_points = first_bracket(round(row['bun_max']), bun_brackets, 0)
  sodium_points = first_bracket(round(row['sodium_max']), sodium_brackets, 4)
  age_points = first_bracket(round(row['admission_age']), age_brackets, 0)
  hr_points = first_bracket(round(row['heart_rate_max']), hr_brackets, 0)

  # NOTE(review): 3 points are given when race_black is not 0 - confirm this
  # direction against the published GWTG-HF point table.
  race_points = 0 if row['race_black'] == 0 else 3
  copd_points = 0 if row['chronic_pulmonary_disease'] == 0 else 2

  gwtg_score = sbp_points + bun_points + sodium_points + age_points + hr_points + race_points + copd_points

  # Lower bound of each probability group, highest first; scores outside 0-78 fall in group 9
  group_brackets = ((79, 9), (75, 8), (71, 7), (66, 6), (62, 5), (58, 4), (51, 3), (34, 2), (0, 1))
  return first_bracket(gwtg_score, group_brackets, 9)

In [None]:
# def return_gwtg_bin_death(df):
#     if df['gwtg_bin'] == 1:
#         return 0
#     elif df['gwtg_bin'] == 2:
#         return 0
#     elif df['gwtg_bin'] == 3:
#         return 1
#     elif df['gwtg_bin'] == 4:
#         return 1
#     elif df['gwtg_bin'] == 5:
#         return 1
#     elif df['gwtg_bin'] == 6:
#         return 1
#     elif df['gwtg_bin'] == 7:
#         return 1
#     elif df['gwtg_bin'] == 8:
#         return 1
#     else:
#         return 1

In [None]:
# def create_gwtg_bin_death(df):
#     df['gwtg_bin_death'] = df.apply(return_gwtg_bin_death, axis=1)

In [None]:
def create_gwtg_bin_label(df):
    """Add a ``gwtg_bin`` column with the GWTG risk group of every row.

    ``calculate_gwtg_bin`` is applied row by row and its result is stored in
    ``df['gwtg_bin']``. The frame is modified in place; nothing is returned.
    """
    risk_groups = df.apply(calculate_gwtg_bin, axis='columns')
    df['gwtg_bin'] = risk_groups

In [None]:
def delete_gwtg_bin_label(df):
    """Drop the ``gwtg_bin`` column from ``df`` in place; a missing column is ignored."""
    df.drop(columns=['gwtg_bin'], errors='ignore', inplace=True)

In [None]:
def create_gwtg_bin_encoded(df):
    """One-hot encode ``gwtg_bin`` into the columns ``gwtg_1`` ... ``gwtg_9``.

    Column ``gwtg_<k>`` is 1 where ``df['gwtg_bin'] == k`` and 0 elsewhere.
    The frame is modified in place; nothing is returned.
    """
    for group in range(1, 10):
        df['gwtg_' + str(group)] = np.where(df['gwtg_bin'] == group, 1, 0)

In [None]:
def create_gwtg_score(df):
    """Add a ``gwtg_score`` column computed row by row with ``calculate_gwtg_score``.

    The frame is modified in place; nothing is returned.
    """
    row_scores = df.apply(calculate_gwtg_score, axis='columns')
    df['gwtg_score'] = row_scores

In [None]:
def delete_all_gwtg(df):
    """Drop every GWTG-derived column from ``df`` in place.

    Removes ``gwtg_bin``, the indicators ``gwtg_1`` ... ``gwtg_9`` and
    ``gwtg_score``; columns that are not present are silently skipped.
    """
    gwtg_columns = ['gwtg_bin'] + ['gwtg_%d' % group for group in range(1, 10)] + ['gwtg_score']
    df.drop(columns=gwtg_columns, errors='ignore', inplace=True)

### Adhere score labeling

In [None]:
# ADHERE score for the raw cohort, every split and the full dataset
for _frame in (df_only_available_pacients, X_train_imputed, X_train_sm, X_test, X_all):
    create_adhere_score(_frame)

# Bin statistics are only created for the test split and the full dataset
for _frame in (X_test, X_all):
    create_adhere_bin_stats(_frame)

create_adhere_label(X_all)

### GWTG score labeling

In [None]:
# GWTG score for the raw cohort, every split and the full dataset
for _frame in (df_only_available_pacients, X_train_imputed, X_train_sm, X_test, X_all):
    create_gwtg_score(_frame)

In [None]:
# GWTG risk group (gwtg_bin) for the raw cohort and every split
for _frame in (df_only_available_pacients, X_train_imputed, X_train_sm, X_test):
    create_gwtg_bin_label(_frame)
# create_gwtg_bin_label(X_test_sm)

In [None]:
# One-hot encode gwtg_bin into gwtg_1 ... gwtg_9 on the same frames
for _frame in (df_only_available_pacients, X_train_imputed, X_train_sm, X_test):
    create_gwtg_bin_encoded(_frame)
# create_gwtg_bin_encoded(X_test_sm)

In [None]:
# The integer gwtg_bin label is no longer needed once it has been one-hot encoded
for _frame in (df_only_available_pacients, X_train_imputed, X_train_sm, X_test):
    delete_gwtg_bin_label(_frame)
# delete_gwtg_bin_label(X_test_sm)

### Scores sanity check

In [None]:
# df['invalid_adhere'] = df.apply(lambda x: x.adhere_low == x.adhere_int1 == x.adhere_int2 == x.adhere_int3 == x.adhere_high, axis=1)

In [None]:
# df['invalid_adhere'].value_counts()

In [None]:
# df['invalid_gwtg'] = df.apply(lambda x: x.gwtg_1 == x.gwtg_2 == x.gwtg_3 == x.gwtg_4 == x.gwtg_5 == x.gwtg_6 == x.gwtg_7 == x.gwtg_8 == x.gwtg_9, axis=1)

In [None]:
# df['invalid_gwtg'].value_counts()

## Data normalization (disabled)

In [None]:
# from sklearn.preprocessing import MinMaxScaler
# scaler = MinMaxScaler()

In [None]:
# X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train_sm), columns = X_train_sm.columns)

In [None]:
# X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns = X_test.columns)

In [None]:
# from sklearn.preprocessing import MinMaxScaler

# scaler = MinMaxScaler(feature_range=(0, 1))
# # X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train_sm), columns = X_train_sm.columns)

# # X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train), columns = X_train.columns)

# # X_train_scaled = X_train
# # X_train_sm[['admission_age', 'height', 'weight', 'sbp_mean', 'dbp_mean', 'mbp_mean', 'resp_rate_mean', 'temperature_mean', 'spo2_mean', 'magnesium_min', 'urineoutput', 'hematocrit_max', 'wbc_max', 'platelets_max', 'abs_neutrophils_max', 'abs_basophils_max', 'abs_lymphocytes_max', 'pt_max', 'ptt_max', 'ck_cpk_max', 'creatinine_max', 'bun_max', 'glucose_max', 'potassium_max', 'sodium_max', 'calcium_max', 'chloride_max', 'aniongap_max', 'ph_min', 'bicarbonate_min', 'lactate_max', 'pco2_max', 'heart_rate_max']] = scaler.fit_transform(X_train_sm[['admission_age', 'height', 'weight', 'sbp_mean', 'dbp_mean', 'mbp_mean', 'resp_rate_mean', 'temperature_mean', 'spo2_mean', 'magnesium_min', 'urineoutput', 'hematocrit_max', 'wbc_max', 'platelets_max', 'abs_neutrophils_max', 'abs_basophils_max', 'abs_lymphocytes_max', 'pt_max', 'ptt_max', 'ck_cpk_max', 'creatinine_max', 'bun_max', 'glucose_max', 'potassium_max', 'sodium_max', 'calcium_max', 'chloride_max', 'aniongap_max', 'ph_min', 'bicarbonate_min', 'lactate_max', 'pco2_max', 'heart_rate_max']])
# # X_train_sm[['admission_age', 'weight', 'sbp_mean', 'dbp_mean', 'mbp_mean', 'resp_rate_mean', 'temperature_mean', 'spo2_mean', 'magnesium_min', 'urineoutput', 'hematocrit_max', 'wbc_max', 'platelets_max', 'pt_max', 'ptt_max', 'creatinine_max', 'bun_max', 'glucose_max', 'potassium_max', 'sodium_max', 'calcium_max', 'chloride_max', 'aniongap_max', 'bicarbonate_min', 'heart_rate_max']] = scaler.fit_transform(X_train_sm[['admission_age', 'weight', 'sbp_mean', 'dbp_mean', 'mbp_mean', 'resp_rate_mean', 'temperature_mean', 'spo2_mean', 'magnesium_min', 'urineoutput', 'hematocrit_max', 'wbc_max', 'platelets_max', 'pt_max', 'ptt_max', 'creatinine_max', 'bun_max', 'glucose_max', 'potassium_max', 'sodium_max', 'calcium_max', 'chloride_max', 'aniongap_max', 'bicarbonate_min', 'heart_rate_max']])

In [None]:
# # X_test_scaled = pd.DataFrame(scaler.transform(X_test_sm), columns = X_test_sm.columns)
# # X_test_sm[['admission_age', 'weight', 'sbp_mean', 'dbp_mean', 'mbp_mean', 'resp_rate_mean', 'temperature_mean', 'spo2_mean', 'magnesium_min', 'urineoutput', 'hematocrit_max', 'wbc_max', 'platelets_max', 'pt_max', 'ptt_max', 'creatinine_max', 'bun_max', 'glucose_max', 'potassium_max', 'sodium_max', 'calcium_max', 'chloride_max', 'aniongap_max', 'bicarbonate_min', 'heart_rate_max']] = scaler.fit_transform(X_test_sm[['admission_age', 'weight', 'sbp_mean', 'dbp_mean', 'mbp_mean', 'resp_rate_mean', 'temperature_mean', 'spo2_mean', 'magnesium_min', 'urineoutput', 'hematocrit_max', 'wbc_max', 'platelets_max', 'pt_max', 'ptt_max', 'creatinine_max', 'bun_max', 'glucose_max', 'potassium_max', 'sodium_max', 'calcium_max', 'chloride_max', 'aniongap_max', 'bicarbonate_min', 'heart_rate_max']])

In [None]:
# Normalization is disabled: the "scaled" frames are plain copies of the unscaled inputs
X_train_scaled, X_test_scaled = X_train_sm.copy(), X_test.copy()

## Data verification

In [None]:
# Verifying scores

# To print the specific row:
# print(df_only_available_pacients.iloc[1])

# To check the calculated score, change the number to the specific row of the spreadsheet, beginning with row 0, i.e. first row = 0, second row = 1...
# print(calculate_gwtg_score(df_only_available_pacients.iloc[1]))

## Data exporting

In [None]:
from google.colab import drive

# Flag every row with the split it belongs to (1 = member, 0 = not a member)
for _frame, _in_train in ((X_train_imputed, 1), (X_train_scaled, 1), (X_test, 0)):
    _frame['train'] = _in_train
    _frame['test'] = 1 - _in_train

# TODO: df_imputed - include the scores and the train/test columns; join with the same df_test
df_imputed_train = X_train_imputed.join(y_train)
df_imputed_test = X_test_imputed.join(y_test)
df_imputed_full = pd.concat([df_imputed_train, df_imputed_test], axis=0)

# Resampled training rows stacked on top of the test rows
df_resampled_train = X_train_scaled.join(y_train_sm)
df_resampled_test = X_test.join(y_test)
df_resampled_full = pd.concat([df_resampled_train, df_resampled_test], axis=0)

In [None]:
# df_raw.to_excel('df_raw.xlsx', index=False)
# df_only_available_pacients.to_excel('df_raw_with_scores.xlsx', index=False)
# df_imputed_full.to_excel('df_imputed.xlsx', index=False)
# df_resampled_full.to_excel('df_resampled_13072023.xlsx', index=False)

In [None]:
# files.download('df_raw.xlsx')
# files.download('df_raw_with_scores.xlsx')
# files.download('df_imputed.xlsx')
# files.download('df_resampled_13072023.xlsx')

In [None]:
# Keep the test-split scores as arrays before the score columns are dropped from X_test
X_test_adhere_scores, X_test_gwtg_scores = (
    X_test[score_column].to_numpy() for score_column in ('adhere_score', 'gwtg_score')
)

### Patient data tables

In [None]:
# Configurations for the Tableone library
# Ref: https://pypi.org/project/tableone/

try:
   import tableone
except ModuleNotFoundError:
   print("module 'tableone' is not installed")
   !pip install tableone

import tableone
from tableone import TableOne

In [None]:
# # Ref: https://github.com/tompollard/tableone/blob/main/tableone.ipynb

# columns = ['admission_age', 'race_white', 'gender_male', 'height', 'weight', 'sbp_mean', 'creatinine_max', 'bun_max', 'temperature_mean', 'hospital_death']
# groupby = 'hospital_death'
# nonnormal = ['age']
# categorical = ['race_white', 'gender_male']

# table1 = TableOne(df, columns=columns, categorical=categorical,
#                    groupby=groupby, nonnormal=nonnormal, pval=True, smd=True,
#                   htest_name=True)

In [None]:
# table1

In [None]:
# columns = ['adhere_low', 'adhere_int1', 'adhere_int2', 'adhere_int3', 'adhere_high', 'hospital_death']
# groupby = 'hospital_death'
# labels={'hospital_death': 'Mortality'}

# table2 = TableOne(df, columns=columns, rename=labels,
#                    groupby=groupby)

In [None]:
# table2

In [None]:
# Display the column names of X_all
X_all.columns

In [None]:
# Variables summarised in the table
columns = ['admission_age', 'height', 'weight', 'sbp_mean', 'dbp_mean', 'mbp_mean',
       'resp_rate_mean', 'temperature_mean', 'spo2_mean', 'urineoutput',
       'hematocrit_max', 'hemoglobin_max', 'wbc_max', 'platelets_max',
       'abs_neutrophils_max', 'abs_basophils_max', 'abs_lymphocytes_max',
       'pt_max', 'inr_max', 'ptt_max', 'ck_cpk_max', 'creatinine_max',
       'bun_max', 'glucose_max', 'potassium_max', 'sodium_max', 'calcium_max',
       'chloride_max', 'aniongap_max', 'ph_min', 'bicarbonate_min',
       'magnesium_min', 'lactate_max', 'pco2_max', 'gender_male', 'race_black',
       'hospital_death', 'chronic_pulmonary_disease', 'diabetes_without_cc',
       'renal_disease', 'myocardial_infarct', 'peripheral_vascular_disease',
       'cerebrovascular_disease', 'heart_rate_max', 'mch_max', 'mchc_max',
       'mcv_max', 'ntprobnp_max', 'adhere_class', 'gwtg_score']
# The table is stratified by the outcome
groupby = ['hospital_death']
# rename the outcome column in the rendered table
labels = {'hospital_death': 'Mortality'}
nonnormal = ['admission_age']
# Indicator / class variables are summarised as counts (%) instead of mean (SD).
# gender_male is a 0/1 indicator like race_black, so it is listed here as well;
# otherwise TableOne would treat it as a continuous variable.
categorical = ['gender_male', 'race_black', 'chronic_pulmonary_disease', 'diabetes_without_cc',
       'renal_disease', 'myocardial_infarct', 'peripheral_vascular_disease',
       'cerebrovascular_disease', 'adhere_class']

# # limit the binary variable "death" to a single row
# limit = {"death": 1}

# # set the order of the categorical variables
# order = {"ICU": ["MICU", "SICU", "CSRU", "CCU"]}

# set decimal places for age to 0
decimals = {"admission_age": 0}

# display minimum and maximum for listed variables
min_max = ['magnesium_min']


table1 = TableOne(X_all, columns=columns, categorical=categorical, rename=labels,
                   groupby=groupby, decimals=decimals, nonnormal=nonnormal, min_max=min_max, pval=True, smd=True,
                  htest_name=True)

In [None]:
# Render the summary table built in the previous cell
table1

In [None]:
## Save to Excel
# fn1 = 'tableone.xlsx'
# table1.to_excel(fn1)

# Machine learning experiments

In [None]:
# Data cleaning before further manipulation

label_list = ['train', 'test']
scores_columns = ['adhere_score', 'adhere_low', 'adhere_int1', 'adhere_int2', 'adhere_int3', 'adhere_high', 'gwtg_1', 'gwtg_2', 'gwtg_3', 'gwtg_4', 'gwtg_5', 'gwtg_6', 'gwtg_7', 'gwtg_8', 'gwtg_9', 'gwtg_score']

# Strip the split flags and the score-derived columns from the model inputs;
# columns that are not present are ignored.
for _frame in (X_train_scaled, X_test):
    _frame.drop(columns=label_list + scores_columns, inplace=True, errors='ignore')

## Feature selection

In [None]:
# Added to test_sm for feature selection
# Only X_test is cleaned here; the calls for the scaled frames are disabled.

# delete_all_adhere(X_train_scaled)
# delete_all_adhere(X_test_scaled)
delete_all_adhere(X_test)

# delete_all_gwtg(X_train_scaled)
# delete_all_gwtg(X_test_scaled)
delete_all_gwtg(X_test)

In [None]:
from sklearn.feature_selection import SelectKBest, chi2

# Keep the 3 features with the highest chi-squared score against the resampled labels
selector = SelectKBest(score_func=chi2, k=3)
selector.fit(X_train_scaled, y_train_sm)
X_train_best3feat = selector.transform(X_train_scaled)
# X_test_best3feat = selector.transform(X_test_scaled)

In [None]:
# Names and scores of the features kept by the selector
names = X_train_scaled.columns.values[selector.get_support()]
scores = selector.scores_[selector.get_support()]
names_scores = [(feat_name, feat_score) for feat_name, feat_score in zip(names, scores)]
ns_df = pd.DataFrame(names_scores, columns=['Feat_names', 'F_Scores'])
# Highest score first; ties are ordered alphabetically by feature name
ns_df_sorted = ns_df.sort_values(by=['F_Scores', 'Feat_names'], ascending=[False, True])
print(ns_df_sorted)

In [None]:
# Test-split columns matching the selected features, taken by name from X_test_imputed
X_test_best3feat = X_test_imputed[names]

In [None]:
# Same selection as above, keeping the 7 highest-scoring features
selector = SelectKBest(score_func=chi2, k=7)
selector.fit(X_train_scaled, y_train_sm)
X_train_best7feat = selector.transform(X_train_scaled)
# X_test_best7feat = selector.transform(X_test_scaled)

In [None]:
# Names and scores of the features kept by the selector
names = X_train_scaled.columns.values[selector.get_support()]
scores = selector.scores_[selector.get_support()]
names_scores = [(feat_name, feat_score) for feat_name, feat_score in zip(names, scores)]
ns_df = pd.DataFrame(names_scores, columns=['Feat_names', 'F_Scores'])
# Highest score first; ties are ordered alphabetically by feature name
ns_df_sorted = ns_df.sort_values(by=['F_Scores', 'Feat_names'], ascending=[False, True])
print(ns_df_sorted)

In [None]:
# Test-split columns matching the selected features, taken by name from X_test_imputed
X_test_best7feat = X_test_imputed[names]

## Model selection and evaluation

In [None]:
# from sklearn.preprocessing import MinMaxScaler
# from sklearn.compose import ColumnTransformer
# from sklearn.pipeline import Pipeline

# numeric_features = ['admission_age', 'weight', 'sbp_mean', 'dbp_mean', 'mbp_mean', 'resp_rate_mean', 'temperature_mean', 'spo2_mean', 'magnesium_min', 'urineoutput', 'hematocrit_max', 'wbc_max', 'platelets_max', 'pt_max', 'ptt_max', 'creatinine_max', 'bun_max', 'glucose_max', 'potassium_max', 'sodium_max', 'calcium_max', 'chloride_max', 'aniongap_max', 'bicarbonate_min', 'heart_rate_max']
# numeric_transformer = Pipeline(steps=['scaler', MinMaxScaler()])

# categorical_features = ['race_black', 'chronic_pulmonary_disease']
# categorical_transformer = Pipeline(steps=[])

# preprocessor = ColumnTransformer(
#     transformers=[
#         ('num', numeric_transformer, numeric_features),
#         ('cat', categorical_transformer, categorical_features)])

In [None]:
# Build a pipeline for training
from sklearn.pipeline import Pipeline

# gradient boosting - Ref: https://github.com/dmlc/xgboost
import xgboost as xgb

# Single-step pipeline: the preprocessing step is currently disabled
opt = Pipeline(steps=[
    # ('preprocessor', preprocessor),
    ('clf', xgb.XGBClassifier(n_estimators=180, max_depth=9, learning_rate=0.1)),
    # alternative: ("clf", xgb.XGBClassifier(max_depth=3, n_estimators=300, learning_rate=0.05))
])

## Cross-validation to evaluate estimator performance

In [None]:
# # define data_dmatrix
# data_dmatrix = xgb.DMatrix(data=X_train_scaled,label=y_train_sm)

In [None]:
# from xgboost import cv

# params = {"objective":"binary:logistic",'colsample_bytree': 0.3,'learning_rate': 0.1,
#                 'max_depth': 5, 'alpha': 10}

# xgb_cv_5_folds = cv(dtrain=data_dmatrix, params=params, nfold=5,
#                     num_boost_round=50, early_stopping_rounds=10, metrics="auc", as_pandas=True, seed=seed)

# xgb_cv_10_folds = cv(dtrain=data_dmatrix, params=params, nfold=10,
#                     num_boost_round=50, early_stopping_rounds=10, metrics="auc", as_pandas=True, seed=seed)

In [None]:
# xgb_cv_5_folds.head()

In [None]:
# xgb_cv_10_folds.head()

## Tuning the hyper-parameters of the estimator

In [None]:
# estimator = xgb.XGBClassifier(learning_rate=0.1, max_depth=9 , n_estimators=180)

In [None]:
# parameters = {
#     'max_depth': range (2, 10, 1),
#     'n_estimators': range(60, 220, 40),
#     'learning_rate': [0.1, 0.01, 0.05]
# }

In [None]:
# from sklearn.model_selection import GridSearchCV

# grid_search = GridSearchCV(
#     estimator=estimator,
#     param_grid=parameters,
#     scoring = 'roc_auc',
#     n_jobs = -1,
#     cv = 5,
#     verbose=True
# )

In [None]:
# opt = grid_search

## Metrics and scoring

### XGB model fitted with all variables

In [None]:
# Quick change to test influence of a certain feature
# X_train_scaled.drop([''], inplace=True, axis=1, errors='ignore')
# X_test.drop([''], inplace=True, axis=1, errors='ignore')

In [None]:
# X_train_scaled.columns

In [None]:
# X_test.columns

In [None]:
# define the datasets to evaluate each iteration
# Order matters: the first pair is reported as 'validation_0' (train) and the
# second as 'validation_1' (test) in the evaluation results plotted below.
evalset = [(X_train_scaled, y_train_sm), (X_test, y_test)]

In [None]:
# Configure the evaluation metrics on the estimator instead of passing them to fit():
# XGBoost deprecated the eval_metric argument of fit() in 1.6 and removed it in later
# releases, so routing clf__eval_metric through the pipeline fails on current versions.
opt.named_steps['clf'].set_params(eval_metric=['logloss', 'error'])
opt.fit(X_train_scaled, y_train_sm, clf__eval_set=evalset)

In [None]:
# opt.best_estimator_

In [None]:
# print("Best parameter (CV score=%0.3f):" % opt.best_score_)

In [None]:
# print(opt.best_params_)

In [None]:
# Score of the fitted pipeline on the test split
opt.score(X_test_imputed, y_test)

In [None]:
# Class predictions and class probabilities for the test split;
# the probabilities are used for the ROC curve further down.
pred_xgb = opt.predict(X_test_imputed)
pred_prob_xgb = opt.predict_proba(X_test_imputed)

In [None]:
# # export grid search results
# gs = pd.DataFrame(opt.cv_results_)
# #gs = df.sort_values("rank_test_score")
# gs.to_csv("cv_results.csv", index = False)
# files.download('cv_results.csv')

In [None]:
# retrieve performance metrics recorded for each entry of eval_set during fit
results = opt.named_steps.clf.evals_result()
# plot learning curves: validation_0 is the training set, validation_1 the test set
plt.plot(results['validation_0']['logloss'], label='train')
plt.plot(results['validation_1']['logloss'], label='test')
# label the figure so it can be read on its own
plt.xlabel('Boosting round')
plt.ylabel('Log loss')
plt.title('XGBoost log loss')
# show the legend
plt.legend()
# show the plot
plt.show()

In [None]:
# One classification-error value per boosting round for each evaluation set
epochs = len(results["validation_0"]["error"])
x_axis = range(epochs)

fig, ax = plt.subplots(figsize=(12, 12))
for _eval_key, _split_label in (("validation_0", "Train"), ("validation_1", "Test")):
    ax.plot(x_axis, results[_eval_key]["error"], label=_split_label)
ax.legend()
ax.set_ylabel("Classification Error")
ax.set_title("XGBoost Classification Error")
plt.show()

##### Tree visualization

In [None]:
# Draw the first tree (index 0) of the fitted booster, laid out left to right
fig, ax = plt.subplots(figsize=(30, 30))
xgb.plot_tree(opt.named_steps['clf'], num_trees=0, rankdir='LR', ax=ax)
plt.show()

#### Feature importance from XGBoost

##### Xgboost Built-in Feature Importance

In [None]:
# Feature names as stored in the fitted booster
feature_names = opt.named_steps['clf'].get_booster().feature_names

In [None]:
# Display the booster's feature names
feature_names

In [None]:
# Human-readable labels assigned to the booster's features in the next cell.
# NOTE(review): the list is positional and hard-coded; it presumably has to follow the
# exact order of the columns the model was trained on - verify against feature_names.
feature_names_renamed = ['Age', 'Weight', 'SBP', 'DBP', 'MBP', 'RR', 'Temperature', 'SpO2', 'Magnesium', 'Urine output', 'Hematocrit', 'WBC', 'Platelets', 'PT', 'PTT', 'Creatinine', 'BUN', 'Glucose', 'Potassium', 'Sodium', 'Calcium', 'Chloride', 'Anion gap', 'Bicarbonate', 'HR', 'Black race', 'COPD']

In [None]:
# Overwrite the booster's feature names with the display labels.
# NOTE(review): from here on the booster's names differ from the DataFrame column
# names - confirm that nothing predicts with `opt` on a DataFrame before it is refit.
opt.named_steps['clf'].get_booster().feature_names = feature_names_renamed

In [None]:
# Renaming columns

# X_train_scaled = X_train_scaled.rename(mapper={'admission_age': 'Age', 'height': 'Height', 'weight': 'Weight', 'sbp_mean': 'Systolic blood pressure', 'dbp_mean': 'Diastolic blood pressure', 'mbp_mean': 'Mean blood pressure', 'resp_rate_mean': 'Respiratory rate'},
#                 axis='columns')

In [None]:
# xgb.plot_importance(opt.named_steps['clf'], importance_type='weight', title='Xgboost Built-in Feature Importance type weight')
# plt.figure(figsize = (16, 12))
# plt.show()

In [None]:
# Set the figure size and create the figure BEFORE plotting, then draw on its axes.
# Previously the size was changed and a new empty figure was opened after
# plot_importance had already drawn, so the chart kept the default size and an
# extra blank figure was displayed.
plt.rcParams["figure.figsize"] = (25,12)
fig, ax = plt.subplots()
xgb.plot_importance(opt.named_steps['clf'], ax=ax, importance_type='gain', title='XGB selected predictors', show_values=False,  xlabel='Variable importance', height=0.5)
plt.show()

In [None]:
### STOP

In [None]:
# xgb.plot_importance(opt.named_steps['clf'], importance_type='cover', title='Xgboost Built-in Feature Importance type cover')
# plt.figure(figsize = (16, 12))
# plt.show()

In [None]:
# feature_important = opt.named_steps['clf'].get_booster().get_score(importance_type='gain')
# keys = list(feature_important.keys())
# values = list(feature_important.values())

# xgb_fi = pd.DataFrame(data=values, index=keys, columns=["score"]).sort_values(by = "score", ascending=False)

In [None]:
# xgb_fi.to_csv('xgb_fi.csv', encoding='utf-8', index=True)

##### Permutation Based Feature Importance

In [None]:
# from sklearn.inspection import permutation_importance
# perm_importance = permutation_importance(opt.named_steps['clf'], X_test, y_test)

In [None]:
# sorted_idx = perm_importance.importances_mean.argsort()
# plt.barh(X_test.columns[sorted_idx], perm_importance.importances_mean[sorted_idx])
# plt.xlabel("Permutation Importance")

##### SHAP analysis

In [None]:
# # Ref: https://shap.readthedocs.io/en/latest/example_notebooks/tabular_examples/tree_based_models/Front%20page%20example%20(XGBoost).html

# try:
#    import shap
# except ModuleNotFoundError:
#    print("module 'shap' is not installed")
#    !pip install shap

# import shap

In [None]:
# # explain the model's predictions using SHAP values
# # (same syntax works for LightGBM, CatBoost, and scikit-learn models)
# background = shap.maskers.Independent(X_train)
# def f(x):
#     return shap.links.identity(opt.predict_proba(x)[:,1])
# explainer = shap.Explainer(f, background, link=shap.links.logit)
# shap_values = explainer(X_train[:100])

# # visualize the first prediction's explanation
# shap.plots.waterfall(shap_values[0])

In [None]:
# # plot the global importance of each feature
# shap.plots.bar(shap_values)

#### Classification report

In [None]:
from sklearn.metrics import classification_report

# Display names for the two classes, passed as target_names to the report
target_names = ['survival', 'death']

# Report for the model fitted with all variables (pred_xgb) on the test split
print(classification_report(y_test, pred_xgb, target_names=target_names))

### XGB model with the highest 3 important features

In [None]:
# Refit the same pipeline object on the 3 selected features.
# This replaces the previously fitted model held by `opt`.
opt.fit(X_train_best3feat, y_train_sm)

In [None]:
# Score of the 3-feature model on the test split
opt.score(X_test_best3feat, y_test)

In [None]:
# Class predictions and class probabilities of the 3-feature model;
# the probabilities are used for the ROC curve.
pred_xgb_3_var = opt.predict(X_test_best3feat)
pred_prob_xgb_3_var = opt.predict_proba(X_test_best3feat)

#### SHAP analysis

In [None]:
# # explain the model's predictions using SHAP values
# # (same syntax works for LightGBM, CatBoost, and scikit-learn models)
# background = shap.maskers.Independent(X_train_best3feat)
# def f(x):
#     return shap.links.identity(opt.predict_proba(x)[:,1])
# explainer = shap.Explainer(f, background, link=shap.links.logit)
# shap_values = explainer(X_train_best3feat[:100])

# # visualize the first prediction's explanation
# shap.plots.waterfall(shap_values[0])

In [None]:
# # plot the global importance of each feature
# shap.plots.bar(shap_values)

### XGB model with the highest 7 important features

In [None]:
# Refit the same pipeline object on the 7 selected features.
# This replaces the previously fitted model held by `opt`.
opt.fit(X_train_best7feat, y_train_sm)

In [None]:
# Score of the 7-feature model on the test split
opt.score(X_test_best7feat, y_test)

In [None]:
# Class predictions and class probabilities of the 7-feature model;
# the probabilities are used for the ROC curve.
pred_xgb_7_var = opt.predict(X_test_best7feat)
pred_prob_xgb_7_var = opt.predict_proba(X_test_best7feat)

#### SHAP analysis

In [None]:
# # explain the model's predictions using SHAP values
# # (same syntax works for LightGBM, CatBoost, and scikit-learn models)
# background = shap.maskers.Independent(X_train_best7feat)
# def f(x):
#     return shap.links.identity(opt.predict_proba(x)[:,1])
# explainer = shap.Explainer(f, background, link=shap.links.logit)
# shap_values = explainer(X_train_best7feat[:100])

# # visualize the first prediction's explanation
# shap.plots.waterfall(shap_values[0])

In [None]:
# # plot the global importance of each feature
# shap.plots.bar(shap_values)

### XGB model fitted with the same variables of Adhere's score

In [None]:
# Variables used by the ADHERE score
adhere_features = ['sbp_mean', 'bun_max', 'creatinine_max']

# Take the training rows from X_train_scaled (the copy of the resampled X_train_sm):
# the model is fitted against y_train_sm, which is the label vector paired with that
# frame. X_train_imputed is paired with y_train instead, so using it here would
# mismatch features and labels.
X_train_adhere = X_train_scaled[adhere_features]
X_test_adhere = X_test_imputed[adhere_features]

In [None]:
# Refit the same pipeline object on the ADHERE variables.
# This replaces the previously fitted model held by `opt`.
opt.fit(X_train_adhere, y_train_sm)

In [None]:
# Score of the ADHERE-variables model on the test split
opt.score(X_test_adhere, y_test)

In [None]:
# Class predictions and class probabilities of the ADHERE-variables model;
# the probabilities are used for the ROC curve.
pred_xgb_adhere = opt.predict(X_test_adhere)
pred_prob_xgb_adhere = opt.predict_proba(X_test_adhere)

### XGB model fitted with the same variables of Gwtg's score

In [None]:
# TODO: Models with different features
# Variables used by the GWTG score
gwtg_features = ['sbp_mean', 'bun_max', 'sodium_max', 'admission_age', 'heart_rate_max', 'race_black', 'chronic_pulmonary_disease']

# Take the training rows from X_train_scaled (the copy of the resampled X_train_sm):
# the model is fitted against y_train_sm, which is the label vector paired with that
# frame. X_train_imputed is paired with y_train instead, so using it here would
# mismatch features and labels.
X_train_gwtg = X_train_scaled[gwtg_features]
X_test_gwtg = X_test_imputed[gwtg_features]

In [None]:
# Refit the same pipeline object on the GWTG variables.
# This replaces the previously fitted model held by `opt`.
opt.fit(X_train_gwtg, y_train_sm)

In [None]:
# Score of the GWTG-variables model on the test split
opt.score(X_test_gwtg, y_test)

In [None]:
# Class predictions and class probabilities of the GWTG-variables model;
# the probabilities are used for the ROC curve.
pred_xgb_gwtg = opt.predict(X_test_gwtg)
pred_prob_xgb_gwtg = opt.predict_proba(X_test_gwtg)

# Metrics evaluation

In [None]:
from sklearn.metrics import roc_curve

# ROC points for each XGB model, computed from the probability of class 1
# (curves 1 and 2, for the Adhere and GWTG scores themselves, are disabled)
# fpr1, tpr1, thresh1 = roc_curve(y_true, y_pred_adhere, pos_label=1)
# fpr2, tpr2, thresh2 = roc_curve(y_true, y_pred_gwtg, pos_label=1)
fpr3, tpr3, thresh3 = roc_curve(y_test, pred_prob_xgb_7_var[:, 1], pos_label=1)
fpr4, tpr4, thresh4 = roc_curve(y_test, pred_prob_xgb_3_var[:, 1], pos_label=1)
fpr5, tpr5, thresh5 = roc_curve(y_test, pred_prob_xgb[:, 1], pos_label=1)
fpr6, tpr6, thresh6 = roc_curve(y_test, pred_prob_xgb_adhere[:, 1], pos_label=1)
fpr7, tpr7, thresh7 = roc_curve(y_test, pred_prob_xgb_gwtg[:, 1], pos_label=1)

# roc curve for tpr = fpr: the same constant score for every test sample
random_probs = [0] * len(y_test)
p_fpr, p_tpr, _ = roc_curve(y_test, random_probs, pos_label=1)

In [None]:
from sklearn.metrics import roc_auc_score

# AUC of each XGB model on the test split (scores 1 and 2 are disabled)
# auc_score1 = roc_auc_score(y_true, y_pred_adhere)
# auc_score2 = roc_auc_score(y_true, y_pred_gwtg)
auc_score3 = roc_auc_score(y_test, pred_prob_xgb_7_var[:, 1])
auc_score4 = roc_auc_score(y_test, pred_prob_xgb_3_var[:, 1])
auc_score5 = roc_auc_score(y_test, pred_prob_xgb[:, 1])
auc_score6 = roc_auc_score(y_test, pred_prob_xgb_adhere[:, 1])
auc_score7 = roc_auc_score(y_test, pred_prob_xgb_gwtg[:, 1])

# print(auc_score1)
# print(auc_score2)
for _auc in (auc_score3, auc_score4, auc_score5, auc_score6, auc_score7):
    print(_auc)

In [None]:
# The bundled seaborn style was renamed to 'seaborn-v0_8' in matplotlib 3.6 and the
# old 'seaborn' name was later removed; use whichever name this installation provides.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')

# One dashed curve per model, labelled with its AUC
# plt.plot(fpr1, tpr1, linestyle='--',color='orange', label='Adhere model: AUC = %0.2f' % auc_score1)
# plt.plot(fpr2, tpr2, linestyle='--',color='green', label='GWTG model: AUC = %0.2f' % auc_score2)
plt.plot(fpr3, tpr3, linestyle='--',color='red', label='XGB model with best 7 variables: AUC = %0.2f' % auc_score3)
plt.plot(fpr4, tpr4, linestyle='--',color='black', label='XGB model with best 3 variables: AUC = %0.2f' % auc_score4)
plt.plot(fpr5, tpr5, linestyle='--',color='purple', label='XGB model with all variables: AUC = %0.2f' % auc_score5)
plt.plot(fpr6, tpr6, linestyle='--',color='blue', label='XGB model with Adhere variables: AUC = %0.2f' % auc_score6)
plt.plot(fpr7, tpr7, linestyle='--',color='pink', label='XGB model with Gwtg variables: AUC = %0.2f' % auc_score7)

# Reference line of the constant-score classifier
plt.plot(p_fpr, p_tpr, linestyle='--', color='gray', label='Random classifier')
# title
plt.title('ROC curve')
# x label
plt.xlabel('False Positive Rate')
# y label
plt.ylabel('True Positive Rate')

plt.legend(loc='best')
# Save the figure to disk before showing it
plt.savefig('ROC',dpi=300)
plt.show();

## Results export

In [None]:
# One row per test sample: both scores, each model's prediction and class-1
# probability, and the observed outcome. Numbers are written with a decimal comma.
pd.DataFrame(data={
    'adhere_score': X_test_adhere_scores,
    'gwtg_score': X_test_gwtg_scores,
    'y_allvar': pred_xgb,
    'y_allvar_prob': pred_prob_xgb[:, 1],
    'y_b7': pred_xgb_7_var,
    'y_b7_prob': pred_prob_xgb_7_var[:, 1],
    'y_b3': pred_xgb_3_var,
    'y_b3_prob': pred_prob_xgb_3_var[:, 1],
    'y_gwtg': pred_xgb_gwtg,
    'y_gwtg_prob': pred_prob_xgb_gwtg[:, 1],
    'y_adhere': pred_xgb_adhere,
    'y_adhere_prob': pred_prob_xgb_adhere[:, 1],
    'hospital_death': y_test,
}).to_csv('roc_data.csv', index=False, decimal=',')

In [None]:
# Download the exported CSV through the Colab files helper
files.download('roc_data.csv')

In [None]:
# pd.DataFrame(data={'predictions': pred_xgb_7_var, 'actual': y_test}).to_excel('prediction_xgb_best_7_variables.xlsx', index=False)
# pd.DataFrame(data={'predictions': pred_xgb_3_var, 'actual': y_test}).to_excel('prediction_xgb_best_3_variables.xlsx', index=False)
# pd.DataFrame(data={'predictions': pred_xgb, 'actual': y_test}).to_excel('prediction_xgb_all_variables.xlsx', index=False)
# pd.DataFrame(data={'predictions': pred_xgb_adhere, 'actual': y_test}).to_excel('prediction_xgb_adhere_variables.xlsx', index=False)
# pd.DataFrame(data={'predictions': pred_xgb_gwtg, 'actual': y_test}).to_excel('prediction_xgb_gwtg_variables.xlsx', index=False)

In [None]:
# pd.DataFrame(data={'y_predict': pred_xgb_7_var, 'y_predict_proba': pred_prob_xgb_7_var[:,1], 'y_true': y_test}).to_csv('prediction_xgb_best_7_variables.csv', index=False)
# pd.DataFrame(data={'y_predict': pred_xgb_3_var, 'y_predict_proba': pred_prob_xgb_3_var[:,1], 'y_true': y_test}).to_csv('prediction_xgb_best_3_variables.csv', index=False)
# pd.DataFrame(data={'y_predict': pred_xgb, 'y_predict_proba': pred_prob_xgb[:,1], 'y_true': y_test}).to_csv('prediction_xgb_all_variables.csv', index=False)
# pd.DataFrame(data={'y_predict': pred_xgb_adhere, 'y_predict_proba': pred_prob_xgb_adhere[:,1], 'y_true': y_test}).to_csv('prediction_xgb_adhere_variables.csv', index=False)
# pd.DataFrame(data={'y_predict': pred_xgb_gwtg, 'y_predict_proba': pred_prob_xgb_gwtg[:,1], 'y_true': y_test}).to_csv('prediction_xgb_gwtg_variables.csv', index=False)

In [None]:
# files.download('prediction_xgb_best_7_variables.csv')
# files.download('prediction_xgb_best_3_variables.csv')
# files.download('prediction_xgb_all_variables.csv')
# files.download('prediction_xgb_adhere_variables.csv')
# files.download('prediction_xgb_gwtg_variables.csv')

# Model comparison sandbox

In [None]:
# # Import the classifiers
# from sklearn.linear_model import LogisticRegression
# from sklearn.naive_bayes import GaussianNB
# from sklearn.neighbors import KNeighborsClassifier
# from sklearn.tree import DecisionTreeClassifier
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.neural_network import MLPClassifier
# from sklearn.svm import SVC

# from sklearn.metrics import roc_curve, roc_auc_score

# # Instantiate the classfiers and make a list
# classifiers = [
#                LogisticRegression(fit_intercept=True),
#                SVC(kernel = 'rbf',gamma='scale', probability=True),
#                DecisionTreeClassifier(criterion = "entropy"),
#                RandomForestClassifier(n_estimators=100, criterion="entropy"),
#                GaussianNB(),
#                MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=1),
#                KNeighborsClassifier(n_neighbors=5, metric="minkowski",p=2),
#                xgb.XGBClassifier(learning_rate=0.1, max_depth=9 , n_estimators=180)]

# # Define a result table as a DataFrame
# result_table = pd.DataFrame(columns=['classifiers', 'fpr','tpr','auc'])

# # Train the models and record the results
# for cls in classifiers:
#     model = cls.fit(X_train_scaled, y_train_sm)
#     yproba = model.predict_proba(X_test_scaled)
#    # yproba = model.predict_proba(X_test)

#     fpr, tpr, _ = roc_curve(y_test_sm, yproba[:,1])
#     auc = roc_auc_score(y_test_sm, yproba[:,1])

#     # fpr, tpr, _ = roc_curve(y_test, yproba[:,1])
#     # auc = roc_auc_score(y_test, yproba[:,1])

#     result_table = result_table.append({'classifiers':cls.__class__.__name__,
#                                         'fpr':fpr,
#                                         'tpr':tpr,
#                                         'auc':auc}, ignore_index=True)

# # Set name of the classifiers as index labels
# result_table.set_index('classifiers', inplace=True)

In [None]:
# fig = plt.figure(figsize=(8,6))

# for i in result_table.index:
#     plt.plot(result_table.loc[i]['fpr'],
#              result_table.loc[i]['tpr'],
#              label="{}, AUC={:.3f}".format(i, result_table.loc[i]['auc']))

# plt.plot([0,1], [0,1], color='orange', linestyle='--')

# plt.xticks(np.arange(0.0, 1.1, step=0.1))
# plt.xlabel("Flase Positive Rate", fontsize=15)

# plt.yticks(np.arange(0.0, 1.1, step=0.1))
# plt.ylabel("True Positive Rate", fontsize=15)

# plt.title('ROC Curve Analysis', fontweight='bold', fontsize=15)
# plt.legend(prop={'size':13}, loc='lower right')

# plt.show()