# Explore feature value distributions & missingness

In [None]:
import os

import pandas as pd
from pyarrow import parquet as pq

import matplotlib.pyplot as plt
import seaborn as sns

# Initial configuration

This notebook assumes that the cohort data file is available at `data/processed/features.parquet`.

In [None]:
# Directory holding the processed cohort data; a relative path, so it is
# resolved against the notebook's current working directory.
path_to_data = "../data/processed"

# Load cohort data

In [None]:
# Read the cohort feature table from parquet and convert it to a pandas DataFrame
features_file = os.path.join(path_to_data, "features.parquet")
sepsis_cohort_features = pq.read_table(features_file).to_pandas()

# Calculate missingness

In [None]:
# Share of missing (null) values per column across the whole dataset, in percent
missing_percentage = sepsis_cohort_features.isna().mean().mul(100)

# Tabulate the result: one row per feature with its missingness percentage
missing_info = missing_percentage.rename_axis("Feature").reset_index(
    name="MissingPercentage"
)
print(missing_info)

# Visualize clinical concept value distributions

In [None]:
# Names of the columns whose dtype is `object` or `category`; these are drawn
# as count plots instead of box plots further down.
categorical_dtypes = ["object", "category"]
categorical_columns_only = sepsis_cohort_features.select_dtypes(include=categorical_dtypes)
categorical_features = categorical_columns_only.columns
categorical_features

In [None]:
# Plot one subplot per feature, split by hospital: count plots for categorical
# features and box plots for all other features.

# The grouping column is not a feature to plot; exclude it up front so the grid
# is sized for the features actually drawn and no subplot slot is left blank.
hospital_column = "hospitalid"
features_to_plot = [
    column for column in sepsis_cohort_features.columns if column != hospital_column
]

# Set the number of rows and columns for subplots (at most 5 per row).
# max(1, ...) keeps the grid valid even when there is nothing to plot.
num_features = len(features_to_plot)
num_cols = max(1, min(num_features, 5))
num_rows = max(1, -(-num_features // num_cols))  # ceiling division

# Create subplots; squeeze=False always returns a 2-D array of axes, so the
# same handling works for a single row or a single subplot.
fig, axes = plt.subplots(
    num_rows, num_cols, figsize=(5 * num_cols, 5 * num_rows), squeeze=False
)
grid_axes = axes.ravel()

# Loop through each feature and draw it on its own axis (row-major order)
for ax, feature in zip(grid_axes, features_to_plot):
    # Subset data for the current feature
    feature_data = sepsis_cohort_features.loc[:, [feature, hospital_column]]

    if feature in categorical_features:
        sns.countplot(x=feature, hue=hospital_column, data=feature_data, ax=ax)
        ax.set_xlabel(feature)
    else:
        sns.boxplot(x=hospital_column, y=feature, data=feature_data, ax=ax)
        ax.set_xlabel("Hospital ID")
        ax.set_ylabel(feature)

    ax.set_title(
        f"{feature} - Missing (accross whole dataset): {missing_percentage[feature]:.2f}%"
    )

# Hide grid cells that have no feature assigned
for unused_ax in grid_axes[num_features:]:
    unused_ax.set_visible(False)

# Apply the layout after drawing so titles and labels are taken into account
fig.tight_layout(pad=4.0)
