# Implementation of the solution to project 2 from scratch
To note:
- For clarity the df_test was renamed to df_val as the test word was used when splitting the labeled data into train and test. 
    - Val stands for validation

To try out:
- Preprocessing
    - Tweak data imputer
    - Tweak scaler (Robust scaler, minmax, etc..)
    - Tweak feature selection parameter
    - Tweak order of operations above to see the effect
- Modelling
    - XGBoost
    - SVM

## Import modules

In [1]:
import argparse
import logging
import os
import shutil
import sys
import zipfile
import time
import sys
import torch

import xgboost as xgb
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from tqdm import tqdm
from collections import Counter
from imblearn.over_sampling import ADASYN, SMOTE
from imblearn.under_sampling import ClusterCentroids, RandomUnderSampler

from torch.utils.data import Dataset, DataLoader
from torch.utils.tensorboard import SummaryWriter
from sklearn.feature_selection import SelectKBest, f_regression, chi2, f_classif
from sklearn.metrics import f1_score, mean_squared_error, accuracy_score, r2_score, roc_auc_score, recall_score, precision_score
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.metrics import label_ranking_average_precision_score as LRAPS
from sklearn.metrics import label_ranking_loss as LRL
from sklearn.svm import SVC
from sklearn.linear_model import LinearRegression, BayesianRidge, LogisticRegression, LogisticRegressionCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer, SimpleImputer

## Define global variables

In [2]:
# Global variables
IDENTIFIERS = ["pid", "Time"]
MEDICAL_TESTS = [
    "LABEL_BaseExcess",
    "LABEL_Fibrinogen",
    "LABEL_AST",
    "LABEL_Alkalinephos",
    "LABEL_Bilirubin_total",
    "LABEL_Lactate",
    "LABEL_TroponinI",
    "LABEL_SaO2",
    "LABEL_Bilirubin_direct",
    "LABEL_EtCO2",
]
VITAL_SIGNS = ["LABEL_RRate", "LABEL_ABPm", "LABEL_SpO2", "LABEL_Heartrate"]
SEPSIS = ["LABEL_Sepsis"]
ESTIMATOR = {"bayesian": BayesianRidge(), "decisiontree": DecisionTreeRegressor(max_features="sqrt", random_state=0), 
                "extratree": ExtraTreesRegressor(n_estimators=10, random_state=0), 
                "knn": KNeighborsRegressor(n_neighbors=10, weights="distance")}
DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(DEVICE)

cpu


In [3]:
def sigmoid_f(x):
    """To get predictions as confidence level, the model predicts for all 12 sets of measures for
    each patient a distance to the hyperplane ; it is then transformed into a confidence level using
    the sigmoid function ; the confidence level reported is the mean of all confidence levels for a
    single patient

    Args:
        x (float): input of the sigmoid function

    Returns:
       float: result of the sigmoid computation.

    """
    return 1 / (1 + np.exp(-x))

## Load Data

In [4]:
df_train = pd.read_csv(r"data/train_features.csv")
df_train_label = pd.read_csv(r"data/train_labels.csv")
df_val = pd.read_csv(r"data/test_features.csv")

## Data imputation methodology

In [None]:
# Fit imputer to missing data
pid_train = df_train["pid"].unique()
columns = df_train.columns
df_train_preprocessed = pd.DataFrame(columns=columns, index=pid_train)

imputer = SimpleImputer()
columns = df_train.columns
df_train = imputer.fit_transform(df_train.values)
df_train = pd.DataFrame(df_train, columns=columns)
for patient in tqdm(pid_train):
    for column in df_train.columns:
        df_train.groupby(["pid"], as_index=False).mean()
        """df_train_preprocessed.at[patient, column] = df_train.loc[
            df_train["pid"] == patient
        ][column].mean()"""

In [None]:
# Tranform test data according to same imputer
pid_val = df_val["pid"].unique()
columns = df_val.columns
df_val_preprocessed = pd.DataFrame(columns=columns, index=pid_val)

df_val = imputer.transform(df_val.values)
df_val = pd.DataFrame(df_val, columns=columns)
for patient in tqdm(pid_val):
    for column in df_val.columns:
        df_val_preprocessed.at[patient, column] = df_val.loc[
            df_val["pid"] == patient
        ][column].mean()

In [None]:
df_train_preprocessed.to_csv("df_train_philip.csv")
df_val_preprocessed.to_csv("df_val_philip.csv")

## Data formatting

In [70]:
df_train_preprocessed = pd.read_csv("df_train_philip.csv")
df_val_preprocessed = pd.read_csv("df_val_philip.csv")
df_train_preprocessed = df_train_preprocessed.sort_values(by=["pid"])
df_val_preprocessed = df_val_preprocessed.sort_values(by=["pid"])
df_train_label = df_train_label.sort_values(by=["pid"])

In [74]:
# Data formatting
X_train = df_train_preprocessed.drop(columns=IDENTIFIERS+['Unnamed: 0']).values
X_val = df_val_preprocessed.drop(columns=IDENTIFIERS+['Unnamed: 0']).values
# Create list with different label for each medical test
print("Creating a list of labels for each medical test")
y_train_medical_tests = []
for test in MEDICAL_TESTS:
    y_train_medical_tests.append(df_train_label[test].astype(int).values)

# Create list with different label for sepsis
print("Creating a list of labels for each medical test")
y_train_sepsis = []
for sepsis in SEPSIS:
    y_train_sepsis.append(df_train_label[sepsis].astype(int).values)

# Create list with different label for each vital sign
print("Creating a list of labels for each vital sign")
y_train_vital_signs = []
for sign in VITAL_SIGNS:
    y_train_vital_signs.append(df_train_label[sign].astype(int).values)

Creating a list of labels for each medical test
Creating a list of labels for each medical test
Creating a list of labels for each vital sign


## Scaler

In [80]:
# Scale data 
scaler = StandardScaler(with_mean=True, with_std=True)
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

## Modelling medical tests

In [22]:
# # Modelling of medical tests using logistic regression with cross validation
# models = []
# losses = []
# columns_medical_tests = []
# for i, test in enumerate(MEDICAL_TESTS):
#     print(f"Fitting model for {test}.")

#     print("Applying feature selection")
#     feature_selector = SelectKBest(score_func=f_classif, k=3)
#     X_train = feature_selector.fit_transform(X_train, y_train_medical_tests[i])
#     X_test = feature_selector.transform(X_test)
#     columns = feature_selector.get_support(indices=True)
#     columns_medical_tests.append(columns)

#     print("Fitting model")
#     clf = LogisticRegressionCV(cv=5, random_state=42).fit(X_train, y_train_medical_tests[i])
#     models.append(clf)
#     print(roc_auc_score(y_test, clf.predict_proba(X_test)[:,1]))
#     print(f"Finished test for medical tests.")

In [81]:
# Modelling using extreme gradient boosting
clf = xgb.XGBClassifier(objective="binary:logistic", n_thread=-1)
models = []
losses = []
feature_selector_medical_tests = []
for i, test in enumerate(MEDICAL_TESTS):
    print(f"Fitting model for {test}.")
    X_train, X_test, y_train, y_test = train_test_split(
    X_train_scaled, y_train_medical_tests[i], test_size=0.10, random_state=42, shuffle=True
    )
    # Coarse parameter grid not optimized at all yet
    param_grid = {
        "booster": ["dart", "gbtree", "gblinear"],
        "eta": [0.4],
        "min_child_weight": range(1, 10, 1),
        "max_depth": range(3, 8, 1),
        "gamma": range(0, 100, 2),
        "max_delta_step": range(1, 10, 1),
        "subsample": np.arange(0.1, 1, 0.05),
        "colsample_bytree": np.arange(0.3, 1, 0.05),
        "n_estimators": range(50, 120, 2),
        "scale_pos_weight": [1],
        "reg_lambda": [0, 1], # Ridge regularization
        "reg_alpha": [0, 1], # Lasso regularization
        "eval_metric": ["error"],
        "verbosity": [1]
    }
    
    print("Resampling")
    sampler = RandomUnderSampler(random_state=42)
    X_train_res, y_train_res = sampler.fit_resample(X_train, y_train)
    
    print("Applying feature selection")
    feature_selector = SelectKBest(score_func=f_classif, k=5)
    X_train_selected = feature_selector.fit_transform(X_train_res, y_train_res)
    X_test = feature_selector.transform(X_test)
    feature_selector_medical_tests.append(feature_selector)

    print("Fitting model")
    coarse_search = RandomizedSearchCV(estimator=clf,
            param_distributions=param_grid, scoring="roc_auc",
            n_jobs=-1, cv=5, n_iter=10, verbose=1)
    coarse_search.fit(X_train_selected, y_train_res)
    
    models.append(coarse_search.best_estimator_)
    print(coarse_search.best_estimator_.predict_proba(X_test)[:,1])
    print(f"ROC score on test set {roc_auc_score(y_test, coarse_search.best_estimator_.predict_proba(X_test)[:,1])}")
    print(f"CV score {coarse_search.best_score_}")
print(f"Finished test for medical tests.")

Fitting model for LABEL_BaseExcess.
Resampling
Applying feature selection
Fitting model
Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    4.8s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:    5.8s finished


[0.9133649  0.15137842 0.35653293 ... 0.8404726  0.23370351 0.7504211 ]
ROC score on test set 0.8720664367091563
CV score 0.8499881710752918
Fitting model for LABEL_Fibrinogen.
Resampling
Applying feature selection
Fitting model
Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    1.2s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:    1.6s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


[0.9167671  0.55924815 0.41805124 ... 0.43052113 0.17055517 0.38804427]
ROC score on test set 0.7164732394366197
CV score 0.7069711649365629
Fitting model for LABEL_AST.
Resampling
Applying feature selection
Fitting model
Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    7.0s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:    9.4s finished


[0.7652198  0.598451   0.38443807 ... 0.59787625 0.44218227 0.55905485]
ROC score on test set 0.735198420677427
CV score 0.6865353955978584
Fitting model for LABEL_Alkalinephos.
Resampling
Applying feature selection
Fitting model
Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    4.0s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:    6.4s finished


[0.85528696 0.62137836 0.3826815  ... 0.5930465  0.46166366 0.55182856]
ROC score on test set 0.7346409509051555
CV score 0.707020405683737
Fitting model for LABEL_Bilirubin_total.
Resampling
Applying feature selection
Fitting model
Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    5.3s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:    7.6s finished


[0.95152473 0.7586239  0.46238422 ... 0.35689822 0.37892026 0.4866535 ]
ROC score on test set 0.7109291968599034
CV score 0.6742333694448884
Fitting model for LABEL_Lactate.
Resampling
Applying feature selection
Fitting model
Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    4.2s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:    5.4s finished


[0.7189623  0.28091273 0.3274959  ... 0.52449685 0.28091273 0.52449685]
ROC score on test set 0.7413826979550004
CV score 0.7381401210895115
Fitting model for LABEL_TroponinI.
Resampling
Applying feature selection
Fitting model
Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    2.3s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:    3.0s finished


[0.06050481 0.72526693 0.8421724  ... 0.27953982 0.55598676 0.16569886]
ROC score on test set 0.7206476190476191
CV score 0.737759430773391
Fitting model for LABEL_SaO2.
Resampling
Applying feature selection
Fitting model
Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    4.1s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:    5.2s finished


[0.70416594 0.2661018  0.43171635 ... 0.5904     0.31374696 0.5670996 ]
ROC score on test set 0.7477238009408397
CV score 0.7326060895552016
Fitting model for LABEL_Bilirubin_direct.
Resampling
Applying feature selection
Fitting model
Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:    1.5s finished


[0.84498245 0.70148903 0.15177833 ... 0.34292573 0.4206406  0.5721898 ]
ROC score on test set 0.6785011709601874
CV score 0.6895317878818028
Fitting model for LABEL_EtCO2.
Resampling
Applying feature selection
Fitting model
Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    1.7s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:    2.0s finished


[0.39310133 0.39310133 0.29227838 ... 0.49772987 0.29227838 0.3871921 ]
ROC score on test set 0.7955803571428572
CV score 0.7573856790289858
Finished test for medical tests.


In [82]:
val_pids = np.unique(df_val["pid"].values)

In [84]:
# Get predictions for medical tests
df_pred_medical_test = pd.DataFrame(index=val_pids, columns=MEDICAL_TESTS)
for i, test in enumerate(MEDICAL_TESTS):
    feature_selector = feature_selector_medical_tests[i]
    X_val_vital_sign = feature_selector.transform(X_val_scaled)
    model_for_test = models[i]
#     print(model_for_test.predict_proba(X_val_vital_sign))
    y_pred = model_for_test.predict_proba(X_val_vital_sign)[:, 1]
    df_pred_medical_test[test] = y_pred

df_pred_medical_test = df_pred_medical_test.reset_index().rename(columns={"index": "pid"})

In [85]:
df_pred_medical_test

Unnamed: 0,pid,LABEL_BaseExcess,LABEL_Fibrinogen,LABEL_AST,LABEL_Alkalinephos,LABEL_Bilirubin_total,LABEL_Lactate,LABEL_TroponinI,LABEL_SaO2,LABEL_Bilirubin_direct,LABEL_EtCO2
0,0,0.871570,0.414509,0.734938,0.797877,0.940729,0.727136,0.186444,0.632642,0.746171,0.393101
1,3,0.191440,0.262532,0.447075,0.462094,0.482383,0.280913,0.704378,0.347416,0.454505,0.292278
2,5,0.346440,0.262532,0.486703,0.442523,0.507434,0.327496,0.428069,0.469386,0.454505,0.292278
3,7,0.871570,0.993759,0.861413,0.887471,0.840414,0.554652,0.279540,0.590400,0.905170,0.393101
4,9,0.272734,0.281472,0.486703,0.555995,0.628334,0.532791,0.190030,0.257057,0.495337,0.292278
...,...,...,...,...,...,...,...,...,...,...,...
12659,31647,0.335368,0.262532,0.447075,0.391918,0.396152,0.280913,0.082862,0.330013,0.454505,0.292278
12660,31649,0.233704,0.473012,0.739733,0.742696,0.491331,0.409006,0.704378,0.313747,0.834073,0.292278
12661,31651,0.863581,0.347140,0.447075,0.491088,0.870510,0.608233,0.303133,0.684031,0.348651,0.393101
12662,31652,0.188422,0.244060,0.442182,0.461664,0.243381,0.280913,0.664075,0.266102,0.148458,0.393101


## Modelling sepsis

In [87]:
# Model and predict sepsis

clf = xgb.XGBClassifier(objective="binary:logistic", n_thread=-1)


X_train, X_test, y_train, y_test = train_test_split(
    X_train_scaled, y_train_sepsis[0], test_size=0.10, random_state=42, shuffle=True
)


param_grid = {
        "booster": ["dart", "gbtree", "gblinear"],
        "eta": [0.01],
        "min_child_weight": range(1, 10, 1),
        "max_depth": range(3, 8, 1),
        "gamma": range(0, 100, 2),
        "max_delta_step": range(1, 10, 1),
        "subsample": np.arange(0.1, 1, 0.05),
        "colsample_bytree": np.arange(0.3, 1, 0.05),
        "n_estimators": range(50, 120, 2),
        "scale_pos_weight": [1],
        "reg_lambda": [0, 1], # Ridge regularization
        "reg_alpha": [0, 1], # Lasso regularization
        "eval_metric": ["error"],
        "verbosity": [1]
    }

print("Resampling")
sampler = RandomUnderSampler()
X_train_res, y_train_res = sampler.fit_resample(X_train, y_train)

print("Applying feature selection")
feature_selector = SelectKBest(score_func=f_classif, k=30)
X_train = feature_selector.fit_transform(X_train_res, y_train_res)
X_test = feature_selector.transform(X_test)


print("Fitting model")
coarse_search = RandomizedSearchCV(estimator=clf,
        param_distributions=param_grid, scoring="roc_auc",
        n_jobs=-1, cv=5, n_iter=50, verbose=1)
print(y_train_res)
coarse_search.fit(X_train, y_train_res)

sepsis_model = coarse_search.best_estimator_
print(f"ROC score on test set {roc_auc_score(y_test, coarse_search.best_estimator_.predict_proba(X_test)[:,1])}")
print(f"CV score {coarse_search.best_score_}")
print(f"Finished test for medical tests.")

Resampling
Applying feature selection
Fitting model
[0 0 0 ... 1 1 1]
Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    8.2s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   24.6s
[Parallel(n_jobs=-1)]: Done 250 out of 250 | elapsed:   29.7s finished


ROC score on test set 0.7518721421525136
CV score 0.6942415326633167
Finished test for medical tests.


In [88]:
X_val_sepsis = feature_selector.transform(X_val_scaled)
y_pred = sepsis_model.predict_proba(X_val_sepsis)[:,1]
df_pred_sepsis = pd.DataFrame(y_pred, index=val_pids, columns=SEPSIS)
df_pred_sepsis = df_pred_sepsis.reset_index().rename(columns={"index": "pid"})

## Modelling vital signsy_train_sepsis

In [91]:
# Modelling of vital signs
models = []
losses = []
feature_selectors_vital_signs = []
clf = xgb.XGBRegressor(objective="reg:squarederror", n_thread=-1)

for i, sign in enumerate(VITAL_SIGNS):
    print(f"Fitting model for {sign}.")
    X_train, X_test, y_train, y_test = train_test_split(
        X_train_scaled, y_train_vital_signs[i], test_size=0.10, random_state=42, shuffle=True
    )

    print("Applying feature selection")
    feature_selector = SelectKBest(score_func=f_classif, k=5)
    X_train_selected = feature_selector.fit_transform(X_train, y_train)
    X_test_selected = feature_selector.transform(X_test)
    feature_selectors_vital_signs.append(feature_selector)

    print("Fitting model")
    
    param_grid = {
        "booster": ["dart", "gbtree", "gblinear"],
        "eta": [0.4],
        "min_child_weight": range(1, 10, 1),
        "max_depth": range(3, 8, 1),
        "gamma": range(0, 100, 2),
        "max_delta_step": range(1, 10, 1),
        "subsample": np.arange(0.1, 1, 0.05),
        "colsample_bytree": np.arange(0.3, 1, 0.05),
        "n_estimators": range(50, 120, 2),
        "scale_pos_weight": [1],
        "reg_lambda": [0], # Ridge regularization
        "reg_alpha": [1], # Lasso regularization
        "eval_metric": ["error"],
        "verbosity": [1]
    }
    
    coarse_search = RandomizedSearchCV(estimator=clf,
            param_distributions=param_grid, scoring="r2",
            n_jobs=-1, cv=5, n_iter=50, verbose=1)
    coarse_search.fit(X_train_selected, y_train)
    models.append(coarse_search.best_estimator_)
    print(f"CV score {coarse_search.best_score_}")
    print(f"Test score is {r2_score(y_test, coarse_search.best_estimator_.predict(X_test_selected))}")
    print(f"Finished test for medical tests.")

Fitting model for LABEL_RRate.
Applying feature selection
Fitting model
Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    6.6s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   57.2s
[Parallel(n_jobs=-1)]: Done 250 out of 250 | elapsed:  1.3min finished


CV score 0.37595759997833506
Test score is 0.35855085021886113
Finished test for medical tests.
Fitting model for LABEL_ABPm.
Applying feature selection
Fitting model
Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    6.5s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   42.8s
[Parallel(n_jobs=-1)]: Done 250 out of 250 | elapsed:  1.1min finished


CV score 0.5726707286905983
Test score is 0.5834844838976045
Finished test for medical tests.
Fitting model for LABEL_SpO2.
Applying feature selection
Fitting model
Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    7.1s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   38.9s
[Parallel(n_jobs=-1)]: Done 250 out of 250 | elapsed:   56.3s finished


CV score 0.30585736553756443
Test score is 0.3310165791896157
Finished test for medical tests.
Fitting model for LABEL_Heartrate.
Applying feature selection
Fitting model
Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   12.9s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   56.0s
[Parallel(n_jobs=-1)]: Done 250 out of 250 | elapsed:  1.2min finished


CV score 0.5960190629449306
Test score is 0.6037441796227132
Finished test for medical tests.


In [92]:
# Get predictions for medical tests
df_pred_vital_signs = pd.DataFrame(index=val_pids, columns=VITAL_SIGNS)
for i, test in enumerate(VITAL_SIGNS):
    feature_selector = feature_selectors_vital_signs[i]
    X_val_vital_sign = feature_selector.transform(X_val_scaled)
    model_for_test = models[i]
    y_pred = model_for_test.predict(X_val_vital_sign)
    df_pred_vital_signs[test] = y_pred

df_pred_vital_signs = df_pred_vital_signs.reset_index().rename(columns={"index": "pid"})

## Export to ZIP file

In [93]:
df_predictions = pd.merge(df_pred_medical_test, df_pred_sepsis, on="pid")
df_predictions = pd.merge(df_predictions, df_pred_vital_signs, on="pid")
print("Export predictions DataFrame to a zip file")
print(df_predictions)
df_predictions.to_csv(
    "predictions.csv",
    index=None,
    sep=",",
    header=True,
    encoding="utf-8-sig",
    float_format="%.2f",
)

with zipfile.ZipFile("predictions.zip", "w", compression=zipfile.ZIP_DEFLATED) as zf:
    zf.write("predictions.csv")
os.remove("predictions.csv")

Export predictions DataFrame to a zip file
         pid  LABEL_BaseExcess  LABEL_Fibrinogen  LABEL_AST  \
0          0          0.871570          0.414509   0.734938   
1          3          0.191440          0.262532   0.447075   
2          5          0.346440          0.262532   0.486703   
3          7          0.871570          0.993759   0.861413   
4          9          0.272734          0.281472   0.486703   
...      ...               ...               ...        ...   
12659  31647          0.335368          0.262532   0.447075   
12660  31649          0.233704          0.473012   0.739733   
12661  31651          0.863581          0.347140   0.447075   
12662  31652          0.188422          0.244060   0.442182   
12663  31655          0.191440          0.274955   0.506486   

       LABEL_Alkalinephos  LABEL_Bilirubin_total  LABEL_Lactate  \
0                0.797877               0.940729       0.727136   
1                0.462094               0.482383       0.280913   