In [None]:
import pandas as pd
import numpy as np
import re
import pickle

In [None]:
# Read the labelled and unlabelled competition files; the first CSV column
# is used as the index of each frame.
train, test = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ("../data/TrainingWiDS2021.csv", "../data/UnlabeledWiDS2021.csv")
)
# The data dictionary is read with pandas' default index.
data_dict = pd.read_csv("../data/DataDictionaryWiDS2021.csv")

# Row counts of each set, recorded before the two frames are stacked.
n_train, n_test = len(train), len(test)

# Stack train on top of test (row-wise) so both share one frame.
train_test = pd.concat([train, test])

## Dummy variables for measurements in the first hour and the first 24 hours

In [None]:
# Description text of every data-dictionary row, cast to str so that
# non-string entries can be searched as well.
description_text = data_dict["Description"].astype(str)

# True where the description contains the phrase "24 hours" / "first hour".
first_24h = description_text.str.contains("24 hours", na=False)
first_1h = description_text.str.contains("first hour", na=False)

# Store both flags as new boolean columns of the data dictionary.
data_dict["first_24h"], data_dict["first_1h"] = first_24h, first_1h

## Dummy variables for terms in the description (oxygen, invasively, non-invasively)

In [None]:
# Description text of every data-dictionary row, cast to str so that
# non-string entries can be searched as well.
term_text = data_dict["Description"].astype(str)

# One boolean flag per search term.
oxygen = term_text.str.contains("oxygen", na=False)
# The leading space keeps this pattern from matching inside "non-invasively";
# a description that uses both wordings sets both flags.
invasively = term_text.str.contains(" invasively", na=False)
non_invasively = term_text.str.contains("non-invasively", na=False)

# Store the flags as new boolean columns of the data dictionary.
for flag_column, flag_values in (
    ("oxygen", oxygen),
    ("invasively", invasively),
    ("non_invasively", non_invasively),
):
    data_dict[flag_column] = flag_values

## Lists of variables by condition (24h, 1h, category, term)

In [None]:
def _variable_names(row_mask):
    """Return the "Variable Name" values of the data-dictionary rows where ``row_mask`` is True."""
    return data_dict.loc[row_mask.tolist(), "Variable Name"].tolist()


def _variables_in_category(category):
    """Return the "Variable Name" values of the rows whose "Category" equals ``category``."""
    return _variable_names(data_dict["Category"] == category)


# Variables selected by the time-window flags.
features_24h = _variable_names(data_dict["first_24h"])
features_1h = _variable_names(data_dict["first_1h"])

# Variables selected by data-dictionary category.
features_apache = _variables_in_category("APACHE covariate")
features_vitals = _variables_in_category("vitals")
features_labs = _variables_in_category("labs")
features_labs_blood_gas = _variables_in_category("labs blood gas")
features_comorbidity = _variables_in_category("APACHE comorbidity")

# Variables selected by the description-term flags.
features_oxygen = _variable_names(data_dict["oxygen"])
features_invasively = _variable_names(data_dict["invasively"])
features_non_invasively = _variable_names(data_dict["non_invasively"])

## Creating new features

In [None]:
def _recorded_count(columns):
    """Per-row number of non-missing values among ``columns`` of ``train_test``."""
    return train_test.loc[:, columns].notna().sum(axis=1)


# Each new column counts how many variables of one group are non-missing in a row.
for new_column, group in (
    ("exams_24h", features_24h),
    ("exams_1h", features_1h),
    ("exams_apache", features_apache),
    ("exams_vitals", features_vitals),
    ("exams_labs", features_labs),
    ("exams_labs_blood_gas", features_labs_blood_gas),
):
    train_test[new_column] = _recorded_count(group)

# Unlike the counts above, this adds up the comorbidity values themselves
# (presumably 0/1 indicators -- TODO confirm against the data).
train_test["comorbidity"] = train_test.loc[:, features_comorbidity].sum(axis=1)

for new_column, group in (
    ("oxygen", features_oxygen),
    ("invasively", features_invasively),
    ("non_invasively", features_non_invasively),
):
    train_test[new_column] = _recorded_count(group)

# Non-missing count over the four measurement categories combined.
all_exam_features = np.concatenate(
    [features_apache, features_vitals, features_labs, features_labs_blood_gas]
)
train_test["exams_total"] = _recorded_count(all_exam_features)

In [None]:
# Split the stacked frame back apart: the first n_train rows are the
# training rows and the last n_test rows are the test rows.
train_new, test_new = train_test.head(n_train), train_test.tail(n_test)

# NOTE(review): this rebinds `data_dict`, which earlier in the notebook holds
# the data-dictionary DataFrame, so the cells above cannot be re-run after
# this one without reloading the dictionary. The name is kept because the
# save cell reads it.
data_dict = dict(train=train_new, test=test_new)

In [None]:
# Write the object currently bound to `data_dict` to a pickle file.
with open("../data/data_fe.pkl", "wb") as pickle_file:
    pickle.dump(data_dict, pickle_file)