# Explore feature distribution & missingness

In [None]:
import os

import pandas as pd
from pyarrow import parquet as pq

import matplotlib.pyplot as plt
import seaborn as sns

import resource
# Set this process's address-space limit (RLIMIT_AS): a soft limit of 4e18
# bytes and an unlimited hard limit.
address_space_soft_limit = 4000000000000000000
address_space_hard_limit = resource.RLIM_INFINITY
resource.setrlimit(
    resource.RLIMIT_AS,
    (address_space_soft_limit, address_space_hard_limit),
)

# Initial configuration

This notebook assumes that the raw eICU `.csv` files are available in the `data/raw/eICU` directory and that the cohort `.parquet` files are available in `data/cohorts/sepsis_eicu_extended`.

In [None]:
# Directory containing the cohort parquet files read below (sta.parquet,
# dyn.parquet, outc.parquet). The path is relative, so it is resolved against
# the working directory the notebook is run from.
path_to_cohorts = "../data/cohorts/sepsis_eicu_extended"

# Load cohort data

In [None]:
def _read_cohort_table(file_name):
    """Read one parquet file from ``path_to_cohorts`` into a pandas DataFrame."""
    table_path = os.path.join(path_to_cohorts, file_name)
    return pq.read_table(table_path).to_pandas()


# One DataFrame per cohort file: static, dynamic and outcome tables.
eICU_cohort_static_data = _read_cohort_table("sta.parquet")
eICU_cohort_dynamic_data = _read_cohort_table("dyn.parquet")
eICU_cohort_outcome_data = _read_cohort_table("outc.parquet")

# Calculate missingness

In [None]:
def _missing_percentage(frame):
    """Return, per column of ``frame``, the percentage of null entries."""
    return frame.isnull().mean() * 100


def _missing_info_table(percentages, feature_column):
    """Build a two-column DataFrame (feature name, missing percentage).

    ``percentages`` is a Series indexed by feature name; ``feature_column``
    is the header used for the feature-name column.
    """
    return pd.DataFrame(
        {
            feature_column: percentages.index,
            "MissingPercentage": percentages.values,
        }
    )


# Missingness percentage per column, computed across the whole dataset
missing_percentage_static_data = _missing_percentage(eICU_cohort_static_data)
missing_percentage_dynamic_data = _missing_percentage(eICU_cohort_dynamic_data)
missing_percentage_outcome_data = _missing_percentage(eICU_cohort_outcome_data)
print(missing_percentage_static_data)

# One summary table per cohort file, printed in static / dynamic / outcome order
missing_info_static = _missing_info_table(missing_percentage_static_data, "Static Feature")
print(missing_info_static)

missing_info_dynamic = _missing_info_table(missing_percentage_dynamic_data, "Dynamic Feature")
print(missing_info_dynamic)

missing_info_outcome = _missing_info_table(missing_percentage_outcome_data, "Outcome Feature")
print(missing_info_outcome)

In [None]:
# Stack the three per-table missingness Series into a single Series indexed by
# feature name. pd.concat keeps every label, so a column name that exists in
# more than one table (presumably the `stay_id` join key) appears once per table.
per_table_missingness = [
    missing_percentage_static_data,
    missing_percentage_dynamic_data,
    missing_percentage_outcome_data,
]
combined_missing_percentage = pd.concat(per_table_missingness)
combined_missing_percentage

# Visualize feature distribution

1. Combine data

In [None]:
# Left-join the static table onto the dynamic table by stay, then left-join the
# outcome table onto that result; rows of the dynamic table are always kept.
eICU_cohort_static_and_dynamic_data = eICU_cohort_dynamic_data.merge(
    eICU_cohort_static_data,
    how='left',
    on='stay_id',
)
eICU_cohort_complete_data = eICU_cohort_static_and_dynamic_data.merge(
    eICU_cohort_outcome_data,
    how='left',
    on='stay_id',
)

In [None]:
# A notebook cell only displays its last expression, so the shapes are gathered
# into one mapping to make all four (rows, columns) tuples visible together.
{
    "static": eICU_cohort_static_data.shape,
    "dynamic": eICU_cohort_dynamic_data.shape,
    "outcome": eICU_cohort_outcome_data.shape,
    "complete": eICU_cohort_complete_data.shape,
}

In [None]:
# Columns of the merged frame whose dtype is object or category; the plotting
# cell uses this to choose between a count plot and a box plot.
categorical_dtypes = ["object", "category"]
categorical_columns_only = eICU_cohort_complete_data.select_dtypes(include=categorical_dtypes)
categorical_features = categorical_columns_only.columns
categorical_features

In [None]:
# Draw one subplot per feature, split by hospital: a count plot for categorical
# features and a box plot for everything else.

# `hospitalid` is the grouping variable, so it is not plotted as a feature.
# Filtering it out up front keeps the grid free of an empty cell for it.
features_to_plot = [
    column for column in eICU_cohort_complete_data.columns if column != "hospitalid"
]

# Grid size: at most 5 columns, as many rows as needed (ceiling division).
# max(1, ...) keeps the grid valid even if there is nothing to plot.
num_features = len(features_to_plot)
num_cols = max(1, min(num_features, 5))
num_rows = max(1, -(-num_features // num_cols))

# squeeze=False always yields a 2-D array of axes, so 1x1 and 1xN grids are
# handled the same way as larger ones.
fig, axes = plt.subplots(
    num_rows, num_cols, figsize=(5 * num_cols, 5 * num_rows), squeeze=False
)
flat_axes = axes.ravel()

# A column name present in several cohort tables occurs several times in
# `combined_missing_percentage`; label lookup would then return a Series, which
# cannot be formatted with ":.2f". Keep the first entry per name.
missing_lookup = combined_missing_percentage[
    ~combined_missing_percentage.index.duplicated(keep="first")
]

static_columns = set(eICU_cohort_static_data.columns)

for ax, feature in zip(flat_axes, features_to_plot):
    # Static features are read from the static table, as before. Every other
    # feature is absent from that table, so it is read from the merged frame.
    if feature in static_columns:
        source_frame = eICU_cohort_static_data
    else:
        source_frame = eICU_cohort_complete_data
    feature_data = source_frame.loc[:, [feature, "hospitalid"]]

    title = f"{feature} - Missing (accross whole dataset): {missing_lookup[feature]:.2f}%"

    if feature in categorical_features:
        sns.countplot(x=feature, hue="hospitalid", data=feature_data, ax=ax)
        ax.set_xlabel(feature)
    else:
        sns.boxplot(x="hospitalid", y=feature, data=feature_data, ax=ax)
        ax.set_xlabel("Hospital ID")
        ax.set_ylabel(feature)
    ax.set_title(title)

# Hide grid cells that did not receive a plot.
for unused_ax in flat_axes[num_features:]:
    unused_ax.set_visible(False)

# Lay out the figure after drawing so that titles and labels are accounted for.
fig.tight_layout(pad=4.0)
