In [None]:
import cudf
import pandas as pd
import numpy as np
import missingno as msno
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import re
from preprocessing_tools import inspect_missing_data, get_top_abs_correlations

In [None]:
# Load the labelled training set twice from the same CSV: once on the GPU
# (cudf) and once as a regular pandas frame, both using the first CSV column
# as the index.
train_csv = "../data/TrainingWiDS2021.csv"
train = cudf.read_csv(train_csv, index_col=0)
train_pd = pd.read_csv(train_csv, index_col=0)
print(train.shape)
train.head(5)

In [None]:
# Load the unlabelled set twice from the same CSV (cudf and pandas), both
# using the first CSV column as the index.
test_csv = "../data/UnlabeledWiDS2021.csv"
test = cudf.read_csv(test_csv, index_col=0)
test_pd = pd.read_csv(test_csv, index_col=0)
print(test.shape)
test.head(5)

In [None]:
# Data dictionary CSV; later cells read its "Variable Name", "Category" and
# "Description" columns.
data_dict = pd.read_csv(filepath_or_buffer="../data/DataDictionaryWiDS2021.csv")
data_dict.head(5)

In [None]:
# Stack the pandas train and test frames row-wise; columns missing from one
# frame are filled with NaN by the concat.
train_test = pd.concat([train_pd, test_pd], axis="index")
train_test

## Columns present in train but not in test

In [None]:
# Column names that appear in train but not in test.
set(train.columns).difference(test.columns)

## Distribution of response variable

In [None]:
# Relative frequency of each value of the response column in the training set.
target = train["diabetes_mellitus"]
target.value_counts(normalize=True)

## Missing values

In [None]:
# Per-column missing-value summary over the combined train + test frame.
# Only columns with at least one missing value are kept, sorted from most to
# least missing.
#
# `prop` is the share of rows missing in `train_test`. The counts come from
# `train_test`, so the denominator must be its row count too (dividing by the
# train row count alone overstates the proportion and can exceed 1).
missing_data = (
    train_test.isna()
    .sum()
    .rename_axis("variable")
    .reset_index(name="n_missing")
    .sort_values(by="n_missing", ascending=False)
    .query("n_missing > 0")
    .assign(prop=lambda counts: counts["n_missing"] / len(train_test))
)
missing_data

- **Some exams are performed together**
- **Hypothesis: if an exam value is NaN, it is because the patient did not take that exam**
- **Hypothesis: if a patient did not take an exam, it is because the exam was not necessary (for example, an exam used to check the status of patients with diabetes). So imputing with an outlier value (-999) should be a good option**

In [None]:
# Nullity-correlation heatmap over every column of the combined frame
# (axis labels switched off).
msno.heatmap(df=train_test, labels=False)

In [None]:
# Nullity matrix restricted to the three body-measurement columns.
body_cols = ["bmi", "height", "weight"]
msno.matrix(train_test[body_cols])

In [None]:
# Histogram of the per-column missing proportion (`prop` column of `missing_data`).
px.histogram(data_frame=missing_data, x="prop")

## Checking levels of categorical variable

In [None]:
# Distinct values of the ethnicity column (train + test).
pd.unique(train_test["ethnicity"])

In [None]:
# Distinct values of the gender column (train + test).
pd.unique(train_test["gender"])

## Constant variables

### Numerical and categorical column lists

In [None]:
# Split the training columns by dtype: "object" columns are treated as
# categorical, everything else as numerical.
column_dtypes = train.dtypes
numerical_cols = train.columns[column_dtypes != "object"].values
categorical_cols = train.columns[column_dtypes == "object"].values

### Numerical (zero standard deviation) and categorical (number of levels)

In [None]:
# Standard deviation of every numerical column; a value of exactly 0 marks a
# constant column.
df_std = train_pd.loc[:, numerical_cols].std()
df_std[df_std.eq(0)]

In [None]:
# Number of distinct levels per categorical column. Values are cast to str
# first, so missing entries count as their own level.
categorical_as_str = train_pd.loc[:, categorical_cols].astype(str)
categorical_as_str.nunique()

## Correlation matrix of numerical covariables

In [None]:
# NOTE: this rebinds `missing_data`, replacing the table built in the
# "Missing values" section. `inspect_missing_data` is a project helper; it is
# presumed to return a missing-value summary plus the columns it suggests
# dropping (TODO confirm against preprocessing_tools).
missing_data, to_drop = inspect_missing_data(train.loc[:, numerical_cols])

# Also exclude two columns that are not wanted in the correlation analysis.
to_drop = np.append(to_drop, ["readmission_status", "encounter_id"])

# Filter while preserving the original column order. A set difference would
# give an arbitrary order that changes between kernel runs, making the
# correlation heatmap axes non-reproducible.
excluded_cols = set(to_drop)
selected_cols = [col for col in numerical_cols if col not in excluded_cols]

# Cast to float64 and impute every missing value with its column mean.
train_numerical = train_pd.loc[:, selected_cols].astype("float64")
mean_values = train_numerical.mean()
train_numerical = train_numerical.fillna(mean_values)

In [None]:
# Absolute pairwise correlations of the mean-imputed numerical columns.
df_corr = train_numerical.corr().abs()

cols = df_corr.columns.tolist()

# Hide the upper triangle (diagonal included) so each pair is drawn once.
# The mask is built from the matrix *shape*, not its values: deriving it from
# the correlations themselves would leave any upper-triangle cell whose
# correlation is exactly 0 unmasked.
mask = np.triu(np.ones(df_corr.shape, dtype=bool))

f, ax = plt.subplots(figsize=(18, 14))
sns.heatmap(df_corr, cmap="YlOrRd", xticklabels=cols, yticklabels=cols, mask=mask, ax=ax)

In [None]:
# `get_top_abs_correlations` is a project helper; `n` is set very large,
# presumably so that no pair is cut off (TODO confirm). Only pairs with an
# absolute correlation of at least 0.8 are displayed.
corr_info = get_top_abs_correlations(train_numerical, n=10_000_000)
is_highly_correlated = corr_info["corr_abs"] >= 0.8
corr_info[is_highly_correlated]

## Variable hospital_id

In [None]:
# For every distinct training hospital_id, flag whether it also appears in the
# test set, then count the True/False flags.
# Uses pandas `Series.isin` on the pandas frames: `np.isin` returns a plain
# NumPy array, which has no `.value_counts()` method.
train_hospital_ids = pd.Series(train_pd["hospital_id"].unique())
hospital_train_in_test = train_hospital_ids.isin(test_pd["hospital_id"].unique())
hospital_train_in_test.value_counts()

In [None]:
# For every distinct training icu_id, flag whether it also appears in the test
# set, then count the True/False flags.
# Uses pandas `Series.isin` on the pandas frames: `np.isin` returns a plain
# NumPy array, which has no `.value_counts()` method.
train_icu_ids = pd.Series(train_pd["icu_id"].unique())
icu_train_in_test = train_icu_ids.isin(test_pd["icu_id"].unique())
icu_train_in_test.value_counts()

## Exploring new features

## Dummy variables for measurements in the first 24 hours and in the first hour

In [None]:
# Flag data-dictionary rows whose description mentions the first-24-hours or
# first-hour window. Descriptions are cast to str so missing descriptions are
# searched as text instead of raising.
descriptions = data_dict["Description"].astype(str)

first_24h = descriptions.str.contains("24 hours", regex=True)
first_1h = descriptions.str.contains("first hour", regex=True)

data_dict["first_24h"] = first_24h
data_dict["first_1h"] = first_1h

## Dummy variables about terms (oxygen, invasively, non-invasively)

In [None]:
# Flag data-dictionary rows by terms found in their description. The leading
# space in " invasively" keeps "non-invasively" from matching on its own.
descriptions = data_dict["Description"].astype(str)

oxygen = descriptions.str.contains("oxygen", regex=True)
invasively = descriptions.str.contains(" invasively", regex=True)
non_invasively = descriptions.str.contains("non-invasively", regex=True)

data_dict["oxygen"] = oxygen
data_dict["invasively"] = invasively
data_dict["non_invasively"] = non_invasively

In [None]:
# Preview the data dictionary with the new flag columns.
data_dict.head(n=5)

## List of variables in condition (24h, 1h, category)

**Hypothesis: some patients didn't stay for the full 24h/1h window. Certain exams are applied only to specific groups of patients**

In [None]:
# Build lists of variable names from the data dictionary, grouped by
# measurement window, by dictionary category, and by description term.
variable_names = data_dict["Variable Name"]
category = data_dict["Category"]

# Measurement window (flags derived from the description text).
features_24h = variable_names[data_dict["first_24h"]].tolist()
features_1h = variable_names[data_dict["first_1h"]].tolist()

# Dictionary category.
features_apache = variable_names[category == "APACHE covariate"].tolist()
features_vitals = variable_names[category == "vitals"].tolist()
features_labs = variable_names[category == "labs"].tolist()
features_labs_blood_gas = variable_names[category == "labs blood gas"].tolist()
features_comorbidity = variable_names[category == "APACHE comorbidity"].tolist()

# Description term.
features_oxygen = variable_names[data_dict["oxygen"]].tolist()
features_invasively = variable_names[data_dict["invasively"]].tolist()
features_non_invasively = variable_names[data_dict["non_invasively"]].tolist()

## Missing correlations

In [None]:
# Nullity correlation among the first-hour variables.
msno.heatmap(train_test[features_1h], labels=False)

In [None]:
# Nullity correlation among the APACHE covariates.
msno.heatmap(train_test[features_apache], labels=False)

In [None]:
# Nullity correlation among the vitals variables.
msno.heatmap(train_test[features_vitals], labels=False)

In [None]:
# Nullity correlation among the labs variables.
msno.heatmap(train_test[features_labs], labels=False)

In [None]:
# Nullity correlation among the labs blood gas variables.
msno.heatmap(train_test[features_labs_blood_gas], labels=False)

In [None]:
# Nullity correlation among the variables whose description mentions oxygen.
msno.heatmap(train_test[features_oxygen], labels=False)

In [None]:
# Nullity correlation among the variables flagged as invasively measured.
msno.heatmap(train_test[features_invasively], labels=False)

In [None]:
# Nullity correlation among the variables flagged as non-invasively measured.
msno.heatmap(train_test[features_non_invasively], labels=False)

## Creating variables and checking their distributions

### Number of 24h exams

In [None]:
# Per row: how many first-24-hours variables are present (non-missing),
# then how many rows share each count.
df_temp = train_test[features_24h]
n_present = df_temp.notna().sum(axis="columns")
n_present.value_counts(sort=False)

### Number of 1h exams

In [None]:
# Per row: how many first-hour variables are present (non-missing),
# then how many rows share each count.
df_temp = train_test[features_1h]
n_present = df_temp.notna().sum(axis="columns")
n_present.value_counts(sort=False)

### Number of apache exams

In [None]:
# Per row: how many APACHE covariates are present (non-missing),
# then how many rows share each count.
df_temp = train_test[features_apache]
n_present = df_temp.notna().sum(axis="columns")
n_present.value_counts(sort=False)

### Number of vital exams

In [None]:
# Per row: how many vitals variables are present (non-missing),
# then how many rows share each count.
df_temp = train_test[features_vitals]
n_present = df_temp.notna().sum(axis="columns")
n_present.value_counts(sort=False)

### Number of labs exams

In [None]:
# Per row: how many labs variables are present (non-missing),
# then how many rows share each count.
df_temp = train_test[features_labs]
n_present = df_temp.notna().sum(axis="columns")
n_present.value_counts(sort=False)

### Number of labs blood gas exams

In [None]:
# Per row: how many labs blood gas variables are present (non-missing),
# then how many rows share each count.
df_temp = train_test[features_labs_blood_gas]
n_present = df_temp.notna().sum(axis="columns")
n_present.value_counts(sort=False)

### Number of comorbidity

In [None]:
# Per row: sum of the comorbidity columns (values are summed, not counted as
# present/missing), then how many rows share each total.
df_temp = train_test[features_comorbidity]
comorbidity_total = df_temp.sum(axis="columns")
comorbidity_total.value_counts(sort=False)

### Number of oxygen exams

In [None]:
# Per row: how many oxygen-related variables are present (non-missing),
# then how many rows share each count.
df_temp = train_test[features_oxygen]
n_present = df_temp.notna().sum(axis="columns")
n_present.value_counts(sort=False)

### Number of invasively exams

In [None]:
# Per row: how many invasively-measured variables are present (non-missing),
# then how many rows share each count.
df_temp = train_test[features_invasively]
n_present = df_temp.notna().sum(axis="columns")
n_present.value_counts(sort=False)

### Number of non-invasively exams

In [None]:
# Per row: how many non-invasively-measured variables are present
# (non-missing), then how many rows share each count.
df_temp = train_test[features_non_invasively]
n_present = df_temp.notna().sum(axis="columns")
n_present.value_counts(sort=False)

### Number of total exams

In [None]:
# Per row: how many variables are present (non-missing) across the APACHE
# covariate, vitals, labs and labs blood gas groups combined, then how many
# rows share each count.
all_exam_features = [
    *features_apache,
    *features_vitals,
    *features_labs,
    *features_labs_blood_gas,
]
df_temp = train_test.loc[:, all_exam_features]
n_present = df_temp.notna().sum(axis="columns")
n_present.value_counts(sort=False)