# Implementation of the solution to project 2 from scratch
To note:
- For clarity the df_test was renamed to df_val as the test word was used when splitting the labeled data into train and test. 
    - Val stands for validation

To try out:
- Preprocessing
    - Tweak data imputer
    - Tweak scaler (Robust scaler, minmax, etc..)
    - Tweak feature selection parameter
    - Tweak order of operations above to see the effect
- Modelling
    - XGBoost
    - SVM

## Import modules

In [2]:
import argparse
import logging
import os
import shutil
import sys
import zipfile
import time
import sys
import torch

import xgboost as xgb
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from tqdm import tqdm
from collections import Counter
from imblearn.over_sampling import ADASYN, SMOTE
from imblearn.under_sampling import ClusterCentroids, RandomUnderSampler

from torch.utils.data import Dataset, DataLoader
from torch.utils.tensorboard import SummaryWriter
from sklearn.feature_selection import SelectKBest, f_regression, chi2, f_classif
from sklearn.metrics import f1_score, mean_squared_error, accuracy_score, r2_score, roc_auc_score, recall_score, precision_score
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.metrics import label_ranking_average_precision_score as LRAPS
from sklearn.metrics import label_ranking_loss as LRL
from sklearn.svm import SVC
from sklearn.linear_model import LinearRegression, BayesianRidge, LogisticRegression, LogisticRegressionCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer, SimpleImputer

## Define global variables

In [3]:
# Global variables
IDENTIFIERS = ["pid", "Time"]
MEDICAL_TESTS = [
    "LABEL_BaseExcess",
    "LABEL_Fibrinogen",
    "LABEL_AST",
    "LABEL_Alkalinephos",
    "LABEL_Bilirubin_total",
    "LABEL_Lactate",
    "LABEL_TroponinI",
    "LABEL_SaO2",
    "LABEL_Bilirubin_direct",
    "LABEL_EtCO2",
]
VITAL_SIGNS = ["LABEL_RRate", "LABEL_ABPm", "LABEL_SpO2", "LABEL_Heartrate"]
SEPSIS = ["LABEL_Sepsis"]
ESTIMATOR = {"bayesian": BayesianRidge(), "decisiontree": DecisionTreeRegressor(max_features="sqrt", random_state=0), 
                "extratree": ExtraTreesRegressor(n_estimators=10, random_state=0), 
                "knn": KNeighborsRegressor(n_neighbors=10, weights="distance")}
DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(DEVICE)

cuda:0


In [4]:
def sigmoid_f(x):
    """To get predictions as confidence level, the model predicts for all 12 sets of measures for
    each patient a distance to the hyperplane ; it is then transformed into a confidence level using
    the sigmoid function ; the confidence level reported is the mean of all confidence levels for a
    single patient

    Args:
        x (float): input of the sigmoid function

    Returns:
       float: result of the sigmoid computation.

    """
    return 1 / (1 + np.exp(-x))

## Load Data

In [5]:
df_train = pd.read_csv(r"data/train_features.csv")
df_train_label = pd.read_csv(r"data/train_labels.csv")
df_val = pd.read_csv(r"data/test_features.csv")

## Data imputation methodology

In [None]:
# Fit imputer to missing data
pid_train = df_train["pid"].unique()
columns = df_train.columns
df_train_preprocessed = pd.DataFrame(columns=columns, index=pid_train)

imputer = SimpleImputer()
columns = df_train.columns
df_train = imputer.fit_transform(df_train.values)
df_train = pd.DataFrame(df_train, columns=columns)
for patient in tqdm(pid_train):
    for column in df_train.columns:
        df_train_preprocessed.at[patient, column] = df_train.loc[
            df_train["pid"] == patient
        ][column].mean()

In [None]:
# Tranform test data according to same imputer
pid_val = df_val["pid"].unique()
columns = df_val.columns
df_val_preprocessed = pd.DataFrame(columns=columns, index=pid_val)

columns = df_val.columns
df_val = imputer.transform(df_val.values)
df_val = pd.DataFrame(df_val, columns=columns)
for patient in tqdm(pid_val):
    for column in df_val.columns:
        df_val_preprocessed.at[patient, column] = df_val.loc[
            df_val["pid"] == patient
        ][column].mean()

In [None]:
df_train_preprocessed.to_csv("df_train_philip.csv")
df_val_preprocessed.to_csv("df_val_philip.csv")

## Data formatting

In [6]:
df_train_preprocessed = pd.read_csv("df_train_philip.csv")
df_val_preprocessed = pd.read_csv("df_val_philip.csv")

In [36]:
# Data formatting
X_train = df_train_preprocessed.drop(columns=IDENTIFIERS).values
X_val = df_val_preprocessed.drop(columns=IDENTIFIERS).values
# Create list with different label for each medical test
print("Creating a list of labels for each medical test")
y_train_medical_tests = []
for test in MEDICAL_TESTS:
    y_train_medical_tests.append(df_train_label[test].astype(int).values)

# Create list with different label for sepsis
print("Creating a list of labels for each medical test")
y_train_sepsis = []
for sepsis in SEPSIS:
    y_train_sepsis.append(df_train_label[sepsis].astype(int).values)

# Create list with different label for each vital sign
print("Creating a list of labels for each vital sign")
y_train_vital_signs = []
for sign in VITAL_SIGNS:
    y_train_vital_signs.append(df_train_label[sign].astype(int).values)

Creating a list of labels for each medical test
Creating a list of labels for each medical test
Creating a list of labels for each vital sign


## Scaler

In [37]:
# Scale data 
scaler = StandardScaler(with_mean=True, with_std=True)
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

## Modelling medical tests

In [22]:
# # Modelling of medical tests using logistic regression with cross validation
# models = []
# losses = []
# columns_medical_tests = []
# for i, test in enumerate(MEDICAL_TESTS):
#     print(f"Fitting model for {test}.")

#     print("Applying feature selection")
#     feature_selector = SelectKBest(score_func=f_classif, k=3)
#     X_train = feature_selector.fit_transform(X_train, y_train_medical_tests[i])
#     X_test = feature_selector.transform(X_test)
#     columns = feature_selector.get_support(indices=True)
#     columns_medical_tests.append(columns)

#     print("Fitting model")
#     clf = LogisticRegressionCV(cv=5, random_state=42).fit(X_train, y_train_medical_tests[i])
#     models.append(clf)
#     print(roc_auc_score(y_test, clf.predict_proba(X_test)[:,1]))
#     print(f"Finished test for medical tests.")

In [15]:
# Modelling using extreme gradient boosting
clf = xgb.XGBClassifier(objective="binary:logistic", n_thread=-1)
models = []
losses = []
feature_selector_medical_tests = []
for i, test in enumerate(MEDICAL_TESTS):
    print(f"Fitting model for {test}.")
    X_train, X_test, y_train, y_test = train_test_split(
    X_train_scaled, y_train_medical_tests[i], test_size=0.10, random_state=42, shuffle=True
    )
    # Coarse parameter grid not optimized at all yet
    param_grid = {
        "booster": ["dart", "gbtree", "gblinear"],
        "eta": np.arange(0.01, 0.4, 0.015),
        "min_child_weight": range(1, 10, 1),
        "max_depth": range(3, 8, 1),
        "gamma": range(0, 100, 2),
        "max_delta_step": range(1, 10, 1),
        "subsample": np.arange(0.1, 1, 0.05),
        "colsample_bytree": np.arange(0.3, 1, 0.05),
        "n_estimators": range(50, 120, 2),
        "scale_pos_weight": [1],
        "reg_lambda": [0, 1], # Ridge regularization
        "reg_alpha": [0, 1], # Lasso regularization
        "eval_metric": ["error"],
        "verbosity": [1]
    }
    
    print("Resampling")
    sampler = RandomUnderSampler(random_state=42)
    X_train_res, y_train_res = sampler.fit_resample(X_train, y_train)
    
    print("Applying feature selection")
    feature_selector = SelectKBest(score_func=f_classif, k=5)
    X_train_selected = feature_selector.fit_transform(X_train_res, y_train_res)
    X_test = feature_selector.transform(X_test)
    feature_selector_medical_tests.append(feature_selector)

    print("Fitting model")
    coarse_search = RandomizedSearchCV(estimator=clf,
            param_distributions=param_grid, scoring="roc_auc",
            n_jobs=-1, cv=5, n_iter=100, verbose=1)
    coarse_search.fit(X_train_selected, y_train_res)
    
    models.append(coarse_search.best_estimator_)
    print(coarse_search.best_estimator_.predict_proba(X_test)[:,1])
    print(f"ROC score on test set {roc_auc_score(y_test, coarse_search.best_estimator_.predict_proba(X_test)[:,1])}")
    print(f"CV score {coarse_search.best_score_}")
print(f"Finished test for medical tests.")

Fitting model for LABEL_BaseExcess.
Resampling
Applying feature selection
Fitting model
Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    1.2s
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:    8.8s
[Parallel(n_jobs=-1)]: Done 426 tasks      | elapsed:   20.4s
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:   24.3s finished


[0.2007269  0.15766926 0.7120504  ... 0.2723564  0.22081883 0.21603867]
ROC score on test set 0.8637883581727395
CV score 0.8614883692850748
Fitting model for LABEL_Fibrinogen.
Resampling
Applying feature selection
Fitting model
Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  28 tasks      | elapsed:    0.6s
[Parallel(n_jobs=-1)]: Done 328 tasks      | elapsed:    4.3s
[Parallel(n_jobs=-1)]: Done 477 out of 500 | elapsed:    5.9s remaining:    0.3s
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:    6.1s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.


[0.355957   0.3287823  0.62840647 ... 0.6543695  0.343452   0.3434726 ]
ROC score on test set 0.7571069414918199
CV score 0.7337220367413428
Fitting model for LABEL_AST.
Resampling
Applying feature selection
Fitting model
Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    3.8s
[Parallel(n_jobs=-1)]: Done 200 tasks      | elapsed:   10.7s
[Parallel(n_jobs=-1)]: Done 450 tasks      | elapsed:   21.6s
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:   24.2s finished


[0.3520337  0.29511055 0.27799216 ... 0.6747026  0.3986621  0.34382242]
ROC score on test set 0.7188894287127875
CV score 0.69814238805551
Fitting model for LABEL_Alkalinephos.
Resampling
Applying feature selection
Fitting model
Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    1.4s
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:    9.6s
[Parallel(n_jobs=-1)]: Done 426 tasks      | elapsed:   22.0s
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:   25.3s finished


[0.34052482 0.35821488 0.3142905  ... 0.64606506 0.36751089 0.350552  ]
ROC score on test set 0.7209169397186102
CV score 0.70713486050038
Fitting model for LABEL_Bilirubin_total.
Resampling
Applying feature selection
Fitting model
Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 tasks      | elapsed:    2.4s
[Parallel(n_jobs=-1)]: Done 224 tasks      | elapsed:   13.0s
[Parallel(n_jobs=-1)]: Done 474 tasks      | elapsed:   25.7s
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:   26.9s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.


[0.34983394 0.3609466  0.2783976  ... 0.64436424 0.38911775 0.34983394]
ROC score on test set 0.7286266962138997
CV score 0.7159922726347869
Fitting model for LABEL_Lactate.
Resampling
Applying feature selection
Fitting model
Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    1.1s
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:    7.9s
[Parallel(n_jobs=-1)]: Done 426 tasks      | elapsed:   17.5s
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:   20.1s finished


[0.6617697  0.2889779  0.6748714  ... 0.72084343 0.29953173 0.34955317]
ROC score on test set 0.7643579273144993
CV score 0.7541476305699097
Fitting model for LABEL_TroponinI.
Resampling
Applying feature selection
Fitting model
Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  28 tasks      | elapsed:    0.8s
[Parallel(n_jobs=-1)]: Done 328 tasks      | elapsed:    5.7s
[Parallel(n_jobs=-1)]: Done 477 out of 500 | elapsed:    8.3s remaining:    0.4s
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:    8.5s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.


[0.28923148 0.64543474 0.36900583 ... 0.25142214 0.60242796 0.22499003]
ROC score on test set 0.7173066052971575
CV score 0.7307102328512185
Fitting model for LABEL_SaO2.
Resampling
Applying feature selection
Fitting model
Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    2.4s
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:   11.2s
[Parallel(n_jobs=-1)]: Done 426 tasks      | elapsed:   27.2s
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:   30.4s finished


[0.6241678  0.39949998 0.7719695  ... 0.3062304  0.38566342 0.17879733]
ROC score on test set 0.7549642873545678
CV score 0.7497046076580526
Fitting model for LABEL_Bilirubin_direct.
Resampling
Applying feature selection
Fitting model
Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  29 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done 328 tasks      | elapsed:    2.3s
[Parallel(n_jobs=-1)]: Done 477 out of 500 | elapsed:    3.3s remaining:    0.2s
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:    3.5s finished


[0.41245383 0.27477345 0.37068772 ... 0.5520532  0.27477345 0.36429635]
ROC score on test set 0.7674199843871976
CV score 0.6987457964381653
Fitting model for LABEL_EtCO2.
Resampling
Applying feature selection
Fitting model
Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  29 tasks      | elapsed:    0.6s
[Parallel(n_jobs=-1)]: Done 328 tasks      | elapsed:    4.0s
[Parallel(n_jobs=-1)]: Done 477 out of 500 | elapsed:    5.6s remaining:    0.3s


[0.33455676 0.2665775  0.11448496 ... 0.3534253  0.17914297 0.17474297]
ROC score on test set 0.8407192524989135
CV score 0.803455114638448
Finished test for medical tests.


[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:    5.9s finished


In [16]:
# Get predictions for medical tests
val_pids = np.unique(df_val["pid"].values)
df_pred_medical_test = pd.DataFrame(index=val_pids, columns=MEDICAL_TESTS)
for i, test in enumerate(MEDICAL_TESTS):
    col_for_medical_test = columns_medical_tests[i]
    feature_selector = feature_selector_medical_tests[i]
    X_val_vital_sign = feature_selector.transform(X_val_scaled)
    model_for_test = models[i]
#     print(model_for_test.predict_proba(X_val_vital_sign))
    y_pred = model_for_test.predict_proba(X_val_vital_sign)[:, 1]
    df_pred_medical_test[test] = y_pred

df_pred_medical_test = df_pred_medical_test.reset_index().rename(columns={"index": "pid"})

## Modelling sepsis

In [42]:
# Model and predict sepsis

clf = xgb.XGBClassifier(objective="binary:logistic", n_thread=-1)


X_train, X_test, y_train, y_test = train_test_split(
    X_train_scaled, y_train_sepsis[0], test_size=0.10, random_state=42, shuffle=True
)


param_grid = {
        "booster": ["dart", "gbtree", "gblinear"],
        "eta": np.arange(0.01, 0.4, 0.015),
        "min_child_weight": range(1, 10, 1),
        "max_depth": range(3, 8, 1),
        "gamma": range(0, 100, 2),
        "max_delta_step": range(1, 10, 1),
        "subsample": np.arange(0.1, 1, 0.05),
        "colsample_bytree": np.arange(0.3, 1, 0.05),
        "n_estimators": range(50, 120, 2),
        "scale_pos_weight": [1],
        "reg_lambda": [0, 1], # Ridge regularization
        "reg_alpha": [0, 1], # Lasso regularization
        "eval_metric": ["error"],
        "verbosity": [1]
    }

print("Resampling")
sampler = RandomUnderSampler()
X_train_res, y_train_res = sampler.fit_resample(X_train, y_train)

print("Applying feature selection")
feature_selector = SelectKBest(score_func=f_classif, k=30)
X_train = feature_selector.fit_transform(X_train_res, y_train_res)
X_test = feature_selector.transform(X_test)


print("Fitting model")
coarse_search = RandomizedSearchCV(estimator=clf,
        param_distributions=param_grid, scoring="roc_auc",
        n_jobs=-1, cv=5, n_iter=200, verbose=1)
print(y_train_res)
coarse_search.fit(X_train, y_train_res)

sepsis_model = coarse_search.best_estimator_
print(f"ROC score on test set {roc_auc_score(y_test, coarse_search.best_estimator_.predict_proba(X_test)[:,1])}")
print(f"CV score {coarse_search.best_score_}")
print(f"Finished test for medical tests.")

Resampling
Applying feature selection
Fitting model
[0 0 0 ... 1 1 1]
Fitting 5 folds for each of 200 candidates, totalling 1000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 tasks      | elapsed:    1.0s
[Parallel(n_jobs=-1)]: Done 328 tasks      | elapsed:    8.5s
[Parallel(n_jobs=-1)]: Done 606 tasks      | elapsed:   17.8s
[Parallel(n_jobs=-1)]: Done 956 tasks      | elapsed:   27.6s
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:   29.0s finished


ROC score on test set 0.7158712541620421
CV score 0.7174778083868992
Finished test for medical tests.


In [43]:
X_val_sepsis = feature_selector.transform(X_val_scaled)
y_pred = sepsis_model.predict_proba(X_val_sepsis)[:,1]
# y_pred = model.predict_proba(X_test)[:,1]
df_pred_sepsis = pd.DataFrame(y_pred, index=val_pids, columns=SEPSIS)
df_pred_sepsis = df_pred_sepsis.reset_index().rename(columns={"index": "pid"})

(12664, 30)

## Modelling vital signsy_train_sepsis

In [46]:
# Modelling of vital signs
models = []
losses = []
feature_selectors_vital_signs = []
clf = xgb.XGBRegressor(objective="reg:squarederror", n_thread=-1)

for i, sign in enumerate(VITAL_SIGNS):
    print(f"Fitting model for {sign}.")
    X_train, X_test, y_train, y_test = train_test_split(
        X_train_scaled, y_train_vital_signs[i], test_size=0.10, random_state=42, shuffle=True
    )

    print("Applying feature selection")
    feature_selector = SelectKBest(score_func=f_classif, k=5)
    X_train_selected = feature_selector.fit_transform(X_train, y_train)
    X_test_selected = feature_selector.transform(X_test)
    feature_selectors_vital_signs.append(feature_selector)

    print("Fitting model")
    
    param_grid = {
        "booster": ["dart", "gbtree", "gblinear"],
        "eta": np.arange(0.01, 0.4, 0.015),
        "min_child_weight": range(1, 10, 1),
        "max_depth": range(3, 8, 1),
        "gamma": range(0, 100, 2),
        "max_delta_step": range(1, 10, 1),
        "subsample": np.arange(0.1, 1, 0.05),
        "colsample_bytree": np.arange(0.3, 1, 0.05),
        "n_estimators": range(50, 120, 2),
        "scale_pos_weight": [1],
        "reg_lambda": [0], # Ridge regularization
        "reg_alpha": [1], # Lasso regularization
        "eval_metric": ["error"],
        "verbosity": [1]
    }
    
    coarse_search = RandomizedSearchCV(estimator=clf,
            param_distributions=param_grid, scoring="r2",
            n_jobs=-1, cv=5, n_iter=100, verbose=1)
    coarse_search.fit(X_train_selected, y_train)
    models.append(coarse_search.best_estimator_)
    print(f"CV score {coarse_search.best_score_}")
    print(f"Test score is {r2_score(y_test, coarse_search.best_estimator_.predict(X_test_selected))}")
    print(f"Finished test for medical tests.")

Fitting model for LABEL_RRate.
Applying feature selection
Fitting model
Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    5.2s
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:   18.6s
[Parallel(n_jobs=-1)]: Done 426 tasks      | elapsed:   52.3s
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  1.0min finished


CV score 0.3785672394199969
Test score is 0.3918968001089592
Finished test for medical tests.
Fitting model for LABEL_ABPm.
Applying feature selection
Fitting model
Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    1.7s
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:   14.1s
[Parallel(n_jobs=-1)]: Done 426 tasks      | elapsed:   34.5s
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:   40.5s finished


CV score 0.5795911299758122
Test score is 0.5640360411805216
Finished test for medical tests.
Fitting model for LABEL_SpO2.
Applying feature selection
Fitting model
Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    2.3s
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:   14.8s
[Parallel(n_jobs=-1)]: Done 426 tasks      | elapsed:   28.9s
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:   38.1s finished


CV score 0.30082646302476773
Test score is 0.229747787761021
Finished test for medical tests.
Fitting model for LABEL_Heartrate.
Applying feature selection
Fitting model
Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    1.6s
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:   12.6s
[Parallel(n_jobs=-1)]: Done 426 tasks      | elapsed:   30.6s
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:   35.2s finished


CV score 0.5989895392555413
Test score is 0.59719616504937
Finished test for medical tests.


In [49]:
# Get predictions for medical tests
val_pids = np.unique(df_val["pid"].values)
df_pred_vital_signs = pd.DataFrame(index=val_pids, columns=VITAL_SIGNS)
for i, test in enumerate(VITAL_SIGNS):
    feature_selector = feature_selectors_vital_signs[i]
    X_val_vital_sign = feature_selector.transform(X_val_scaled)
    model_for_test = models[i]
    y_pred = model_for_test.predict(X_val_vital_sign)
    df_pred_vital_signs[test] = y_pred

df_pred_vital_signs = df_pred_vital_signs.reset_index().rename(columns={"index": "pid"})

## Export to ZIP file

In [50]:
df_predictions = pd.merge(df_pred_medical_test, df_pred_sepsis, on="pid")
df_predictions = pd.merge(df_predictions, df_pred_vital_signs, on="pid")
print("Export predictions DataFrame to a zip file")
print(df_predictions)
df_predictions.to_csv(
    "predictions.csv",
    index=None,
    sep=",",
    header=True,
    encoding="utf-8-sig",
    float_format="%.2f",
)

with zipfile.ZipFile("predictions.zip", "w", compression=zipfile.ZIP_DEFLATED) as zf:
    zf.write("predictions.csv")
os.remove("predictions.csv")

Export predictions DataFrame to a zip file
         pid  LABEL_BaseExcess  LABEL_Fibrinogen  LABEL_AST  \
0          0          0.861471          0.694429   0.895492   
1          3          0.125926          0.304176   0.583918   
2          5          0.032484          0.317638   0.416464   
3          7          0.294058          0.415359   0.523530   
4          9          0.212313          0.360426   0.474583   
...      ...               ...               ...        ...   
12659  31647          0.607426          0.388649   0.486032   
12660  31649          0.637514          0.716042   0.302426   
12661  31651          0.883313          0.335106   0.383662   
12662  31652          0.951792          0.767715   0.657612   
12663  31655          0.919913          0.361826   0.295301   

       LABEL_Alkalinephos  LABEL_Bilirubin_total  LABEL_Lactate  \
0                0.875957               0.890575       0.804769   
1                0.560731               0.551155       0.256450   