In [None]:
import cudf
import pandas as pd
import numpy as np
import missingno as msno
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
from preprocessing_tools import inspect_missing_data, get_top_abs_correlations

In [None]:
# Read the training CSV twice: once as a cuDF frame (`train`) and once as a
# pandas frame (`train_pd`) for the cells below that call pandas APIs.
# In both cases the first CSV column is used as the index.
train_csv = "../data/TrainingWiDS2021.csv"
train = cudf.read_csv(train_csv, index_col=0)
train_pd = pd.read_csv(train_csv, index_col=0)
print(train.shape)
train.head()

In [None]:
# Read the unlabeled CSV as a cuDF frame, using its first column as the index,
# then show its dimensions and first rows.
test_csv = "../data/UnlabeledWiDS2021.csv"
test = cudf.read_csv(test_csv, index_col=0)
print(test.shape)
test.head()

## Columns in train that are missing from test

In [None]:
# Column names that appear in `train` but not in `test`
# (one direction only: test-only columns are not reported here).
train_only_cols = set(train.columns.values).difference(test.columns.values)
train_only_cols

In [None]:
# Relative frequency of each value in the `diabetes_mellitus` column.
diabetes_mellitus_share = train["diabetes_mellitus"].value_counts(normalize=True)
diabetes_mellitus_share

## Missing values

In [None]:
# Per-column count of missing values, most affected columns first, plus that
# count expressed as a share of the number of rows in `train`.
n_rows = train.shape[0]
missing_data = (
    train_pd.isna()
    .sum()
    .reset_index()
    .sort_values(by=0, ascending=False)
    .rename({"index": "variable", 0: "n_missing"}, axis=1)
)
missing_data["prop"] = missing_data["n_missing"] / n_rows
missing_data.head()

In [None]:
# Histogram of the per-column missing-value proportions computed above.
missing_hist = px.histogram(data_frame=missing_data, x="prop")
missing_hist

## Constant variables

### Numerical and categorical column names

In [None]:
# Split the column names by dtype: columns stored as "object" are treated as
# categorical, every other column as numerical.
is_object_dtype = train.dtypes == "object"
numerical_cols = train.columns[~is_object_dtype].values
categorical_cols = train.columns[is_object_dtype].values

### Numerical (zero standard deviation) and categorical (number of unique values)

In [None]:
# Standard deviation of every numerical column; show only the columns whose
# standard deviation is exactly zero.
df_std = train_pd[numerical_cols].std()
has_zero_std = df_std.eq(0)
df_std[has_zero_std]

In [None]:
# Number of distinct values in each categorical column, counted after casting
# every value to its string form.
categorical_as_str = train_pd[categorical_cols].astype(str)
categorical_as_str.apply(lambda column: np.unique(column).size)

## Correlation matrix of numerical covariables

In [None]:
# Ask the project helper for the numerical columns to discard.
# NOTE(review): the selection rule lives in preprocessing_tools and is not
# visible here — presumably based on the amount of missing data; confirm.
# This call also rebinds `missing_data`, replacing the table built in the
# "Missing values" section.
missing_data, to_drop = inspect_missing_data(train.loc[:, numerical_cols])

# `readmission_status` and `encounter_id` are excluded by hand as well.
to_drop = np.append(to_drop, ["readmission_status", "encounter_id"])

# Keep the surviving columns in their original order. A set difference would
# yield an arbitrary order that can change between kernel restarts (string
# hashing is randomized), shuffling the heatmap axes and the correlation
# listing from one run to the next.
excluded_cols = set(to_drop)
selected_cols = [col for col in numerical_cols if col not in excluded_cols]

# Cast to float64 and replace each column's missing values with that
# column's mean.
train_numerical = train_pd.loc[:, selected_cols].astype("float64")
mean_values = train_numerical.mean()
train_numerical = train_numerical.fillna(mean_values)

In [None]:
# Absolute pairwise correlations between the columns of `train_numerical`.
df_corr = train_numerical.corr().abs()

cols = df_corr.columns.tolist()

f, ax = plt.subplots(figsize=(18, 14))
# Hide the diagonal and the upper triangle with an explicit boolean mask.
# Passing the correlation values themselves (np.triu(df_corr)) makes masking
# depend on value truthiness, so a correlation of exactly 0.0 in the upper
# triangle would be left visible.
mask = np.triu(np.ones(df_corr.shape, dtype=bool))
# Draw on the axes created above instead of relying on the implicit
# "current axes".
sns.heatmap(
    df_corr,
    cmap="YlOrRd",
    xticklabels=cols,
    yticklabels=cols,
    mask=mask,
    ax=ax,
)

In [None]:
# Fetch the ranked absolute correlations from the project helper (with `n`
# set to ten million), then show only the rows whose `corr_abs` is at least 0.8.
corr_info = get_top_abs_correlations(train_numerical, n=10_000_000)
is_strongly_correlated = corr_info["corr_abs"] >= 0.8
corr_info[is_strongly_correlated]