# Implementation of the solution to project 2 from scratch
To note:
- For clarity the df_test was renamed to df_val as the test word was used when splitting the labeled data into train and test. 
    - Val stands for validation

To try out:
- Preprocessing
    - Tweak data imputer
    - Tweak scaler (Robust scaler, minmax, etc..)
    - Tweak feature selection parameter
    - Tweak order of operations above to see the effect
- Modelling
    - XGBoost
    - SVM
    
Following conversation with Gian:
* Rewrite code such that XGBoost is used everywhere
* RandomUnderSampler (without replacement, but should not be a problem)
* Use one line per patient
* Don't do imputation
* Run 150 fits per label

## Import modules

In [1]:
from __future__ import print_function

import argparse
import logging
import os
import shutil
import sys
import zipfile
import time
import sys
import torch

import xgboost as xgb
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from tqdm.notebook import tqdm
from collections import Counter
from imblearn.over_sampling import ADASYN, SMOTE
from imblearn.under_sampling import ClusterCentroids, RandomUnderSampler

from torch.utils.data import Dataset, DataLoader
from sklearn.feature_selection import SelectKBest, f_regression, chi2, f_classif
from sklearn.metrics import f1_score, mean_squared_error, accuracy_score, r2_score, roc_auc_score, recall_score, precision_score
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.metrics import label_ranking_average_precision_score as LRAPS
from sklearn.metrics import label_ranking_loss as LRL
from sklearn.svm import SVC
from sklearn.linear_model import LinearRegression, BayesianRidge, LogisticRegression, LogisticRegressionCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer, SimpleImputer

from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets

## Define global variables

In [2]:
# Global variables
IDENTIFIERS = ["pid", "Time"]
MEDICAL_TESTS = [
    "LABEL_BaseExcess",
    "LABEL_Fibrinogen",
    "LABEL_AST",
    "LABEL_Alkalinephos",
    "LABEL_Bilirubin_total",
    "LABEL_Lactate",
    "LABEL_TroponinI",
    "LABEL_SaO2",
    "LABEL_Bilirubin_direct",
    "LABEL_EtCO2",
]
VITAL_SIGNS = ["LABEL_RRate", "LABEL_ABPm", "LABEL_SpO2", "LABEL_Heartrate"]
SEPSIS = ["LABEL_Sepsis"]
ESTIMATOR = {"bayesian": BayesianRidge(), "decisiontree": DecisionTreeRegressor(max_features="sqrt", random_state=0), 
                "extratree": ExtraTreesRegressor(n_estimators=10, random_state=0), 
                "knn": KNeighborsRegressor(n_neighbors=10, weights="distance")}

FEATURES_MNAR = ["EtCO2", "PTT", "BUN", "Lactate", "Hgb", "HCO3", "BaseExcess",
                          "Fibrinogen", "Phosphate", "WBC", "Creatinine", "PaCO2", "AST",
                          "FiO2", "Platelets", "SaO2", "Glucose", "Magnesium", "Potassium",
                          "Calcium", "Alkalinephos", "Bilirubin_direct", "Chloride", "Hct",
                          "Bilirubin_total", "TroponinI", "pH"]

DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(DEVICE)

cuda:0


In [3]:
def sigmoid_f(x):
    """To get predictions as confidence level, the model predicts for all 12 sets of measures for
    each patient a distance to the hyperplane ; it is then transformed into a confidence level using
    the sigmoid function ; the confidence level reported is the mean of all confidence levels for a
    single patient

    Args:
        x (float): input of the sigmoid function

    Returns:
       float: result of the sigmoid computation.

    """
    return 1 / (1 + np.exp(-x))

## Load Data

In [16]:
df_train = pd.read_csv(r"data/train_features.csv")
df_train_label = pd.read_csv(r"data/train_labels.csv")


In [80]:
df_val = pd.read_csv(r"data/test_features.csv")

## Data imputation methodology

In [17]:
# Adding engineered features
mnar_columns = [
        sub + "_presence" for sub in FEATURES_MNAR
    ]
pid = df_train["pid"].unique()
for patient in tqdm(pid):
    for column in FEATURES_MNAR:
        presence = int(df_train.loc[
            df_train["pid"] == patient
            ][column].any())
        df_train.at[patient, column] = presence
print("Done adding features about MNAR features")

HBox(children=(FloatProgress(value=0.0, max=18995.0), HTML(value='')))


Done adding features about MNAR features


In [81]:
# Adding engineered features
pid = df_val["pid"].unique()
for patient in tqdm(pid):
    for column in FEATURES_MNAR:
        presence = int(df_val.loc[
            df_val["pid"] == patient
            ][column].any())
        df_val.at[patient, column] = presence
print("Done adding features about MNAR features")

HBox(children=(FloatProgress(value=0.0, max=12664.0), HTML(value='')))


Done adding features about MNAR features


In [19]:
columns_for_regression = ["Temp", "Hgb", "RRate", "BaseExcess", "WBC", "PaCO2", "FiO2", "Glucose", "ABPm", "ABPd", "SpO2", "Hct", "Heartrate", "ABPs", "pH"]
columns_for_regression_trend = [
        sub + "_trend" for sub in columns_for_regression
    ]
columns_for_regression_std = [
        sub + "_std" for sub in columns_for_regression
    ]
columns_for_regression_min = [
        sub + "_min" for sub in columns_for_regression
    ]
columns_for_regression_max = [
        sub + "_max" for sub in columns_for_regression
    ]
cols_to_add = columns_for_regression_trend + columns_for_regression_std + columns_for_regression_min + columns_for_regression_max

df_train = df_train.reindex(
        df_train.columns.tolist() + cols_to_add,
        axis=1,
    )

pid = df_train["pid"].unique()

for patient in tqdm(pid):
    for column in columns_for_regression:
        if df_train.loc[df_train["pid"] == patient][column].isna().sum() <= 8:
            series = df_train.loc[df_train["pid"] == patient][column]
            # Fill missing values between two non nans with their average
            series = (series.ffill() + series.bfill()) / 2
            # Drop the rest of the value
            series = series.dropna()
            standard_deviation = series.std()
            minimum = series.min()
            maximum = series.max()
            X = [i for i in range(0, len(series))]
            X = np.reshape(X, (len(X), 1))
            y = series
            model = LinearRegression()
            try:
                model.fit(X, y)
                df_train.at[patient, column + "_trend"] = model.coef_
            except ValueError:
                df_train.at[patient, column + "_trend"] = 0
            df_train.at[patient, column + "_std"] = standard_deviation
            df_train.at[patient, column + "_min"] = minimum
            df_train.at[patient, column + "_max"] = maximum

    # fill rest of values with 0 for trends col umns
    df_train[columns_for_regression_trend] = df_train[
        columns_for_regression_trend
    ].fillna(value=0)
    df_train[columns_for_regression_std] = df_train[
        columns_for_regression_std
    ].fillna(value=0)
    df_train[columns_for_regression_min] = df_train[
        columns_for_regression_min
    ].fillna(value=0)
    df_train[columns_for_regression_max] = df_train[
        columns_for_regression_max
    ].fillna(value=0)

HBox(children=(FloatProgress(value=0.0, max=18995.0), HTML(value='')))




In [82]:
pid = df_val["pid"].unique()

df_val = df_val.reindex(
        df_val.columns.tolist() + cols_to_add,
        axis=1,
    )

In [83]:
for patient in tqdm(pid):
    for column in columns_for_regression:
        if df_val.loc[df_val["pid"] == patient][column].isna().sum() <= 8:
            series = df_val.loc[df_val["pid"] == patient][column]
            # Fill missing values between two non nans with their average
            series = (series.ffill() + series.bfill()) / 2
            # Drop the rest of the value
            series = series.dropna()
            standard_deviation = series.std()
            minimum = series.min()
            maximum = series.max()
            X = [i for i in range(0, len(series))]
            X = np.reshape(X, (len(X), 1))
            y = series
            model = LinearRegression()
            try:
                model.fit(X, y)
                df_val.at[patient, column + "_trend"] = model.coef_
            except ValueError:
                df_val.at[patient, column + "_trend"] = 0
            df_val.at[patient, column + "_std"] = standard_deviation
            df_val.at[patient, column + "_min"] = minimum
            df_val.at[patient, column + "_max"] = maximum

    # fill rest of values with 0 for trends col umns
    df_val[columns_for_regression_trend] = df_val[
        columns_for_regression_trend
    ].fillna(value=0)
    df_val[columns_for_regression_std] = df_val[
        columns_for_regression_std
    ].fillna(value=0)
    df_val[columns_for_regression_min] = df_val[
        columns_for_regression_min
    ].fillna(value=0)
    df_val[columns_for_regression_max] = df_val[
        columns_for_regression_max
    ].fillna(value=0)

HBox(children=(FloatProgress(value=0.0, max=12664.0), HTML(value='')))




In [84]:
df_train_grouped = pd.DataFrame(index=df_train["pid"].unique(), columns=df_train.columns)

for patient in tqdm(df_train["pid"].unique()):
    for column in df_train.columns:
        patient_timeseries = df_train.loc[df_train["pid"] == patient][column]
        if patient_timeseries.isnull().all():
            df_train_grouped.at[patient, column] = np.nan
        elif column is not "pid":
            df_train_grouped.at[patient, column] = patient_timeseries.mean()
df_train = df_train_grouped

HBox(children=(FloatProgress(value=0.0, max=18995.0), HTML(value='')))




In [85]:
df_val_grouped = pd.DataFrame(index=df_val["pid"].unique(), columns=df_val.columns)

for patient in tqdm(df_val["pid"].unique()):
    for column in df_val.columns:
        patient_timeseries = df_val.loc[df_val["pid"] == patient][column]
        if patient_timeseries.isnull().all():
            df_val_grouped.at[patient, column] = np.nan
        elif column is not "pid":
            df_val_grouped.at[patient, column] = patient_timeseries.mean()
df_val = df_val_grouped

HBox(children=(FloatProgress(value=0.0, max=12664.0), HTML(value='')))




In [88]:
df_train.to_csv("df_train_philip.csv")
df_val.to_csv("df_val_philip.csv")

## Data formatting

In [89]:
df_train_preprocessed = pd.read_csv("df_train_philip.csv")
df_val_preprocessed = pd.read_csv("df_val_philip.csv")

In [90]:
train_pids = np.unique(df_val_preprocessed["pid"].values)
val_pids = np.unique(df_val_preprocessed["pid"].values)

In [91]:
df_train_preprocessed = df_train_preprocessed.sort_values(by=["pid"])
df_train_preprocessed = df_train_preprocessed.drop(columns=IDENTIFIERS)
df_val_preprocessed = df_val_preprocessed.sort_values(by=["pid"])
df_val_preprocessed = df_val_preprocessed.drop(columns=IDENTIFIERS)
df_train_label = df_train_label.sort_values(by=["pid"])

In [92]:
# Data formatting
X_train = df_train_preprocessed.values
X_val = df_val_preprocessed.values
# Create list with different label for each medical test
print("Creating a list of labels for each medical test")
y_train_medical_tests = []
for test in MEDICAL_TESTS:
    y_train_medical_tests.append(df_train_label[test].astype(int).values)

# Create list with different label for sepsis
print("Creating a list of labels for sepsis")
y_train_sepsis = []
for sepsis in SEPSIS:
    y_train_sepsis.append(df_train_label[sepsis].astype(int).values)

# Create list with different label for each vital sign
print("Creating a list of labels for each vital sign")
y_train_vital_signs = []
for sign in VITAL_SIGNS:
    y_train_vital_signs.append(df_train_label[sign].astype(int).values)

Creating a list of labels for each medical test
Creating a list of labels for sepsis
Creating a list of labels for each vital sign


## Scaler

In [93]:
# Scale data 
scaler = StandardScaler(with_mean=True, with_std=True)
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

## Modelling medical tests

In [101]:
# Modelling using extreme gradient boosting
clf = xgb.XGBClassifier(objective="binary:logistic", n_thread=-1)
models = []
losses = []
feature_selector_medical_tests = []
for i, test in enumerate(MEDICAL_TESTS):
    print(f"Fitting model for {test}.")
    X_train, X_test, y_train, y_test = train_test_split(
    X_train_scaled, y_train_medical_tests[i], test_size=0.10, random_state=42, shuffle=True
    )
    
    print("Resampling")
    sampler = RandomUnderSampler(random_state=42)
    X_train_res, y_train_res = sampler.fit_resample(X_train, y_train)
    
    print("Fitting coarse model")
    # Coarse parameter grid not optimized at all yet
    coarse_param_grid = {
        "booster": ["dart"],
        "eta": np.arange(0,1,0.1),
        "min_child_weight": range(1, 10, 1),
        "max_depth": range(4, 10, 1),
        "gamma": range(0, 100, 1),
        "max_delta_step": range(1, 10, 1),
        "subsample": np.arange(0.1, 1, 0.05),
        "colsample_bytree": np.arange(0.3, 1, 0.05),
        "n_estimators": range(50, 150, 1),
        "scale_pos_weight": [1],
        "reg_lambda": [0, 1], # Ridge regularization
        "reg_alpha": [0, 1], # Lasso regularization
        "eval_metric": ["error"],
        "verbosity": [1]
    }
    coarse_search = RandomizedSearchCV(estimator=clf,
            param_distributions=coarse_param_grid, scoring="roc_auc",
            n_jobs=-1, cv=10, n_iter=10, verbose=1)
    coarse_search.fit(X_train_res, y_train_res)
    print(coarse_search.best_estimator_.predict_proba(X_test)[:,1])
    print(f"ROC score on test set {roc_auc_score(y_test, coarse_search.best_estimator_.predict_proba(X_test)[:,1])}")
    print(f"CV score {coarse_search.best_score_}")
    best_params = coarse_search.best_params_
    print(f"ROC score on test set {roc_auc_score(y_test, coarse_search.best_estimator_.predict_proba(X_test)[:,1])}")
    print(f"CV score {coarse_search.best_score_}")
    
    
    
    models.append(coarse_search.best_estimator_)
    
print(f"Finished test for medical tests.")

Fitting model for LABEL_BaseExcess.
Resampling
Fitting coarse model
Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:   18.6s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   53.0s finished


[0.9192943  0.03941115 0.0429096  ... 0.87684566 0.03840721 0.76820076]
ROC score on test set 0.9212526847831162
CV score 0.9142709197202634
ROC score on test set 0.9212526847831162
CV score 0.9142709197202634
Fitting model for LABEL_Fibrinogen.
Resampling
Fitting coarse model
Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    4.3s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   20.4s finished


[0.85556257 0.3864742  0.30642387 ... 0.8257802  0.30642387 0.33315018]
ROC score on test set 0.7606602816901408
CV score 0.7779127706692913
ROC score on test set 0.7606602816901408
CV score 0.7779127706692913
Fitting model for LABEL_AST.
Resampling
Fitting coarse model
Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:   17.4s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   53.1s finished


[0.76991904 0.599469   0.32829252 ... 0.5248063  0.3760242  0.4299633 ]
ROC score on test set 0.7509185296214378
CV score 0.7110957763236169
ROC score on test set 0.7509185296214378
CV score 0.7110957763236169
Fitting model for LABEL_Alkalinephos.
Resampling
Fitting coarse model
Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:   11.4s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   47.8s finished


[0.79145104 0.5689337  0.44030726 ... 0.5926671  0.42972842 0.35894385]
ROC score on test set 0.7496564160271546
CV score 0.7205849316152287
ROC score on test set 0.7496564160271546
CV score 0.7205849316152287
Fitting model for LABEL_Bilirubin_total.
Resampling
Fitting coarse model
Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:   23.5s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   58.0s finished


[0.8335465  0.71473825 0.3518789  ... 0.66789967 0.3129973  0.43256885]
ROC score on test set 0.7332163345410627
CV score 0.7041664446694017
ROC score on test set 0.7332163345410627
CV score 0.7041664446694017
Fitting model for LABEL_Lactate.
Resampling
Fitting coarse model
Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    7.0s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   45.3s finished


[0.77554786 0.15303849 0.18401696 ... 0.743133   0.15504944 0.78095853]
ROC score on test set 0.7815666824205799
CV score 0.7837864854008978
ROC score on test set 0.7815666824205799
CV score 0.7837864854008978
Fitting model for LABEL_TroponinI.
Resampling
Fitting coarse model
Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    7.8s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   21.7s finished


[0.13606526 0.9004724  0.82134575 ... 0.12675641 0.40537813 0.10361692]
ROC score on test set 0.878312215320911
CV score 0.8716603569497025
ROC score on test set 0.878312215320911
CV score 0.8716603569497025
Fitting model for LABEL_SaO2.
Resampling
Fitting coarse model
Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:   15.3s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  1.0min finished


[0.5971507  0.242005   0.2411923  ... 0.80634934 0.2411923  0.6802159 ]
ROC score on test set 0.8196282021782482
CV score 0.8077480428282527
ROC score on test set 0.8196282021782482
CV score 0.8077480428282527
Fitting model for LABEL_Bilirubin_direct.
Resampling
Fitting coarse model
Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    2.1s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    9.7s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.


[0.9866333  0.6103902  0.6540704  ... 0.22768466 0.34102306 0.00376799]
ROC score on test set 0.7038407494145199
CV score 0.7144712962078519
ROC score on test set 0.7038407494145199
CV score 0.7144712962078519
Fitting model for LABEL_EtCO2.
Resampling
Fitting coarse model
Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    2.2s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    8.1s finished


[0.05827218 0.5136681  0.35009816 ... 0.23681766 0.19505642 0.03189045]
ROC score on test set 0.9286404220779221
CV score 0.9264675148121094
ROC score on test set 0.9286404220779221
CV score 0.9264675148121094
Finished test for medical tests.


In [102]:
import joblib
for i, model in enumerate(models):
    joblib.dump(models[i], f"xgboost_fine_{MEDICAL_TESTS[i]}.pkl")

In [105]:
# Get predictions for medical tests
df_pred_medical_test = pd.DataFrame(index=val_pids, columns=MEDICAL_TESTS)
for i, test in enumerate(MEDICAL_TESTS):
    model_for_test = models[i]
#     print(model_for_test.predict_proba(X_val_vital_sign))
    y_pred = model_for_test.predict_proba(X_val_scaled)[:, 1]
    df_pred_medical_test[test] = y_pred

df_pred_medical_test = df_pred_medical_test.reset_index().rename(columns={"index": "pid"})

## Modelling sepsis

In [107]:
# Model and predict sepsis

clf = xgb.XGBClassifier(objective="binary:logistic", n_thread=-1)


X_train, X_test, y_train, y_test = train_test_split(
    X_train_scaled, y_train_sepsis[0], test_size=0.10, random_state=42, shuffle=True
)


param_grid = {
        "booster": ["dart"],
        "eta": np.arange(0,1,0.1),
        "min_child_weight": range(1, 10, 1),
        "max_depth": range(4, 10, 1),
        "gamma": range(0, 100, 1),
        "max_delta_step": range(1, 10, 1),
        "subsample": np.arange(0.1, 1, 0.05),
        "colsample_bytree": np.arange(0.3, 1, 0.05),
        "n_estimators": range(50, 150, 1),
        "scale_pos_weight": [1],
        "reg_lambda": [0, 1], # Ridge regularization
        "reg_alpha": [0, 1], # Lasso regularization
        "eval_metric": ["error"],
        "verbosity": [1]
    }

print("Resampling")
sampler = RandomUnderSampler(random_state=42)
X_train, y_train = sampler.fit_resample(X_train, y_train)

print("Fitting model")
coarse_search = RandomizedSearchCV(estimator=clf,
        param_distributions=param_grid, scoring="roc_auc",
        n_jobs=-1, cv=10, n_iter=10, verbose=1)
print(y_train_res)
coarse_search.fit(X_train, y_train)

sepsis_model = coarse_search.best_estimator_
print(f"ROC score on test set {roc_auc_score(y_test, coarse_search.best_estimator_.predict_proba(X_test)[:,1])}")
print(f"CV score {coarse_search.best_score_}")
print(f"Finished test for medical tests.")


Resampling
Fitting model
[0 0 0 ... 1 1 1]
Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    4.1s


ROC score on test set 0.7255442706556066
CV score 0.6844090404040404
Finished test for medical tests.


[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   12.2s finished


In [110]:
y_pred = sepsis_model.predict_proba(X_val_scaled)[:,1]
df_pred_sepsis = pd.DataFrame(y_pred, index=val_pids, columns=SEPSIS)
df_pred_sepsis = df_pred_sepsis.reset_index().rename(columns={"index": "pid"})

In [111]:
joblib.dump(sepsis_model, f"xgboost_fine_sepsis.pkl")

['xgboost_fine_sepsis.pkl']

## Modelling vital signs

In [115]:
# Modelling of vital signs
models = []
losses = []
feature_selectors_vital_signs = []
clf = xgb.XGBRegressor(objective="reg:squarederror", n_thread=-1)

for i, sign in enumerate(VITAL_SIGNS):
    print(f"Fitting model for {sign}.")
    X_train, X_test, y_train, y_test = train_test_split(
        X_train_scaled, y_train_vital_signs[i], test_size=0.10, random_state=42, shuffle=True
    )

    print("Fitting model")
    
    param_grid = {
        "booster": ["dart"],
        "eta": np.arange(0,1,0.1),
        "min_child_weight": range(1, 10, 1),
        "max_depth": range(4, 10, 1),
        "gamma": range(0, 100, 1),
        "max_delta_step": range(1, 10, 1),
        "subsample": np.arange(0.1, 1, 0.05),
        "colsample_bytree": np.arange(0.3, 1, 0.05),
        "n_estimators": range(50, 150, 1),
        "scale_pos_weight": [1],
        "reg_lambda": [0, 1], # Ridge regularization
        "reg_alpha": [0, 1], # Lasso regularization
        "eval_metric": ["error"],
        "verbosity": [1]
    }


    
    coarse_search = RandomizedSearchCV(estimator=clf,
            param_distributions=param_grid, scoring="r2",
            n_jobs=-1, cv=10, n_iter=10, verbose=1)
    coarse_search.fit(X_train, y_train)
    models.append(coarse_search.best_estimator_)
    print(f"CV score {coarse_search.best_score_}")
    print(f"Test score is {r2_score(y_test, coarse_search.best_estimator_.predict(X_test))}")
    print(f"Finished test for medical tests.")

Fitting model for LABEL_RRate.
Fitting model
Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:   38.8s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  2.4min finished


CV score 0.4051125661920107
Test score is 0.39979092274094374
Finished test for medical tests.
Fitting model for LABEL_ABPm.
Fitting model
Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:   30.3s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  1.7min finished


CV score 0.5836022226887561
Test score is 0.6043359316452224
Finished test for medical tests.
Fitting model for LABEL_SpO2.
Fitting model
Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:   13.2s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   31.2s finished


CV score 0.32253994174038814
Test score is 0.34612475097599815
Finished test for medical tests.
Fitting model for LABEL_Heartrate.
Fitting model
Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:   31.9s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  2.3min finished


CV score 0.6042153104770616
Test score is 0.609124975655355
Finished test for medical tests.


In [116]:
for i, model in enumerate(models):
    joblib.dump(models[i], f"xgboost_fine_{VITAL_SIGNS[i]}.pkl")

In [117]:
# Get predictions for vital signs using ANN
df_pred_vital_signs = pd.DataFrame(index=val_pids, columns=VITAL_SIGNS)
for model in models:
    y_pred = model.predict(X_val_scaled)
    df_pred_vital_signs[test] = y_pred

df_pred_vital_signs = df_pred_vital_signs.reset_index().rename(columns={"index": "pid"})

## Export to ZIP file

In [118]:
df_predictions = pd.merge(df_pred_medical_test, df_pred_sepsis, on="pid")
df_predictions = pd.merge(df_predictions, df_pred_vital_signs, on="pid")
print("Export predictions DataFrame to a zip file")
print(df_predictions)
df_predictions.to_csv(
    "predictions.csv",
    index=None,
    sep=",",
    header=True,
    encoding="utf-8-sig",
    float_format="%.2f",
)

with zipfile.ZipFile("predictions.zip", "w", compression=zipfile.ZIP_DEFLATED) as zf:
    zf.write("predictions.csv")
os.remove("predictions.csv")

Export predictions DataFrame to a zip file
           pid  LABEL_BaseExcess  LABEL_Fibrinogen  LABEL_AST  \
0          0.0          0.879785          0.802845   0.866690   
1          3.0          0.097245          0.306424   0.580433   
2          5.0          0.141181          0.306424   0.408596   
3          7.0          0.943669          0.855563   0.921647   
4          9.0          0.340743          0.489300   0.399829   
...        ...               ...               ...        ...   
12659  31647.0          0.111564          0.306424   0.408596   
12660  31649.0          0.862636          0.381854   0.816294   
12661  31651.0          0.881238          0.503553   0.326502   
12662  31652.0          0.026464          0.306424   0.438133   
12663  31655.0          0.180528          0.514729   0.503258   

       LABEL_Alkalinephos  LABEL_Bilirubin_total  LABEL_Lactate  \
0                0.798704               0.747181       0.799121   
1                0.522864               0.