# Post-Modeling Exploratory Data Analysis (EDA)

## Import Requisite Libraries

In [None]:
######################## Standard Library Imports ##############################
import pandas as pd
import numpy as np
import os
import sys

# Add the parent directory to sys.path to access 'functions.py'
sys.path.append(os.path.join(os.pardir))


from eda_toolkit import ensure_directory, generate_table1

######################## Modeling Library Imports ##############################
import model_tuner
import eda_toolkit
import matplotlib.pyplot as plt

from functions import crosstab_plot


# NOTE: the parent directory is already appended to sys.path near the top of
# this cell (before `functions` is imported), so it is not appended again here.

# Report the interpreter and key library versions used by this notebook
print(
    "This project uses: \n \n "
    f"Python {sys.version.split()[0]} \n "
    f"model_tuner {model_tuner.__version__} \n "
    f"eda_toolkit {eda_toolkit.__version__}"
)

## Set Paths & Read in the Data

In [None]:
# `base_path` is the parent directory of the current working directory
base_path = os.pardir

# Data folder plus the PNG/SVG output folders for the "modeling" figures,
# all located under the parent directory
data_path = os.path.join(base_path, "data")
image_path_png = os.path.join(base_path, "images", "png_images", "modeling")
image_path_svg = os.path.join(base_path, "images", "svg_images", "modeling")

# Pass each folder to eda_toolkit's ensure_directory helper
# (presumably creates the folder when it is missing)
for directory in (data_path, image_path_png, image_path_svg):
    ensure_directory(directory)

In [None]:
# Folder holding saved models (relative to the notebook's working directory)
model_path = "../mlruns/models/"
# Re-point data_path at the processed-data subfolder; this replaces the
# parent-level "data" folder assigned to data_path earlier in the notebook
data_path = "../data/processed/"

In [None]:
# Load the feature matrix and the outcome table from the processed-data folder
X = pd.read_parquet(os.path.join(data_path, "X.parquet"))
y = pd.read_parquet(os.path.join(data_path, "y_Bleeding_Edema_Outcome.parquet"))

# Build the analysis frame from X rather than reading X.parquet from disk a
# second time. DataFrame.join returns a new object, so X itself is untouched.
# `on="patient_id"` matches the caller's patient_id (column or index level)
# against y's index; the inner join keeps only patients present in both.
df = X.join(y, how="inner", on="patient_id")
df.head()

In [None]:
# Show every column name of the joined frame as a plain Python list
df.columns.tolist()

## Bin Age For EDA

In [None]:
# Discretize the continuous Age_years column into age bands so it is easier
# to visualize and tabulate.
bin_ages = [18, 30, 40, 50, 60, 70, 80, 90, 100]

# One label per pair of consecutive edges: (18, 30) -> "18-29", ...,
# (90, 100) -> "90-99"
label_ages = [
    f"{lower}-{upper - 1}" for lower, upper in zip(bin_ages[:-1], bin_ages[1:])
]

df["age_group"] = pd.cut(
    df["Age_years"],
    bins=bin_ages,
    labels=label_ages,
    right=False,  # each interval includes its left edge, excludes its right
    include_lowest=True,  # keep the lowest edge (18) inside the first band
)

## Rename DataFrame columns to be more readable

In [None]:
# Work on a separate copy so the display-friendly column names applied to
# df_rename never touch the original df
df_rename = df.copy(deep=True)

In [None]:
# Raw column name -> human-readable name used for plot labels
readable_names = {
    "Intraoperative_Blood_Loss_ml": "Intraoperative Blood Loss",
    "age_group": "Age Group",
    "Surgical_Time_min": "Surgical Time (min)",
    "Intraop_Mean_Heart_Rate_bpm": "Intraoperative Mean Heart Rate (BPM)",
}
df_rename = df_rename.rename(columns=readable_names)

## Age Group vs. Clinical Metrics (BMI, Blood Loss, Heart Rate, Surgical Time)

In [None]:
from eda_toolkit import box_violin_plot

# Measurements to plot, using the readable names from df_rename
metrics_list = [
    "BMI",
    "Intraoperative Blood Loss",
    "Intraoperative Mean Heart Rate (BPM)",
    "Surgical Time (min)",
]
# Comparison variable(s); only metrics_comp is passed to the call below
metrics_boxplot_comp = ["Age Group"]
metrics_comp = ["Age Group"]

# Boxplots of every metric against Age Group, shown as a grid and saved to
# the PNG and SVG image folders
box_violin_plot(
    df=df_rename,
    metrics_list=metrics_list,
    metrics_comp=metrics_comp,
    plot_type="boxplot",
    show_plot="grid",
    show_legend=False,
    xlabel_rot=90,
    text_wrap=50,
    save_plots=True,
    image_path_png=image_path_png,
    image_path_svg=image_path_svg,
)

## Outcome Value Counts

In [None]:
# Bar chart of the outcome distribution, with the 0/1 codes shown as text
outcome_names = {0: "No Complications", 1: "Complications"}
outcome_counts = df["Bleeding_Edema_Outcome"].map(outcome_names).value_counts()
outcome_counts.plot(kind="bar", rot=0, title="Complications Outcome")
plt.xlabel("Outcome")
plt.ylabel("Count")

# Write the figure to the SVG folder before showing it
outcome_svg = os.path.join(image_path_svg, "outcome_count.svg")
plt.savefig(outcome_svg, bbox_inches="tight")
plt.show()

# Counts per raw outcome code, displayed as the cell's output
df["Bleeding_Edema_Outcome"].value_counts()

## Table 1 (With Respect to Outcome)

In [None]:
# Table 1 restricted to continuous variables, grouped by the outcome column.
# Optional arguments currently disabled: apply_bonferroni=True,
# apply_bh_fdr=True, use_fisher_exact=True
table1_cont_outcome = generate_table1(
    df,
    groupby_col="Bleeding_Edema_Outcome",
    include_types="continuous",
    value_counts=True,
    decimal_places=2,
)

In [None]:
# Columns removed from the continuous table before it is displayed
hidden_columns = ["Type", "Missing (n)", "Missing (%)", "Proportion (%)"]
table1_cont_outcome = table1_cont_outcome.drop(columns=hidden_columns)

table1_cont_outcome

In [None]:
# Table 1 restricted to categorical variables, grouped by the outcome column.
# Optional arguments currently disabled: apply_bonferroni=True,
# apply_bh_fdr=True, use_fisher_exact=True
table1_cat_outcome = generate_table1(
    df,
    groupby_col="Bleeding_Edema_Outcome",
    include_types="categorical",
    value_counts=True,
    decimal_places=2,
)

In [None]:
# Columns removed from the categorical table before it is displayed
hidden_columns = ["Type", "Missing (n)", "Missing (%)"]
table1_cat_outcome = table1_cat_outcome.drop(columns=hidden_columns)
table1_cat_outcome

## Table 1 (With Respect to Surgical Technique)

In [None]:
# Table 1 restricted to continuous variables, grouped by Surgical_Technique,
# with the bookkeeping columns dropped in the same expression.
# Optional arguments currently disabled: apply_bonferroni=True,
# apply_bh_fdr=True, use_fisher_exact=True
table1_cont_surgical_technique = generate_table1(
    df,
    groupby_col="Surgical_Technique",
    include_types="continuous",
    value_counts=True,
    decimal_places=2,
).drop(columns=["Type", "Missing (n)", "Missing (%)", "Proportion (%)"])

table1_cont_surgical_technique

In [None]:
# Table 1 restricted to categorical variables, grouped by Surgical_Technique,
# with the bookkeeping columns dropped in the same expression.
# Optional arguments currently disabled: apply_bonferroni=True,
# apply_bh_fdr=True, use_fisher_exact=True
table1_cat_surgical_technique = generate_table1(
    df,
    groupby_col="Surgical_Technique",
    include_types="categorical",
    value_counts=True,
    decimal_places=2,
).drop(columns=["Type", "Missing (n)", "Missing (%)"])

# Rows describing the grouping variable itself are filtered out of the table
rows_to_drop = [
    "Surgical_Technique",
    "Surgical_Technique = 0",
    "Surgical_Technique = 1",
]
keep_mask = ~table1_cat_surgical_technique["Variable"].isin(rows_to_drop)
table1_cat_surgical_technique = table1_cat_surgical_technique.loc[keep_mask]

table1_cat_surgical_technique

### Numerical Variable Boxplot

In [None]:
# Every column of df; the boxplot cell below selects from this list
variable_list = df.columns.tolist()

In [None]:
# Wide figure that becomes the current figure for the boxplot below
fig = plt.figure(figsize=(19, 6))

# Horizontal boxplot drawn on the current axes
ax = df[variable_list].boxplot(vert=False)

# Apply a heavier line width to every line artist on the axes
for artist in ax.get_lines():
    artist.set_linewidth(2.5)  # change thickness as needed

# Flip the y-axis so the first variable appears at the top
ax.invert_yaxis()

# Tick formatting, axis labels and title
plt.xticks(rotation=0, ha="right")
ax.set_xlabel("Value")
ax.set_ylabel("Variable")
ax.set_title("Distributions of Numeric Variables")

# Tighten the layout, then write the SVG before showing the figure
fig.tight_layout()
boxplot_svg = os.path.join(image_path_svg, "numeric_variable_distributions.svg")
fig.savefig(boxplot_svg, bbox_inches="tight")

plt.show()

## STROBE Diagram

### Original STROBE (for Paper)

In [None]:
from graphviz import Digraph

# STROBE flow diagram laid out top-to-bottom and rendered as SVG into
# image_path_svg
dot = Digraph(
    comment="Enhanced STROBE Diagram with Modality Highlight",
    format="svg",
    directory=image_path_svg,
)
dot.attr(rankdir="TB", size="10")

# (node id, label) pairs in the order they appear in the flow
strobe_nodes = [
    ("A", "Patients Evaluated\n(n = 202)"),
    ("B", "Excluded: Under 18\n(n = 8)"),
    ("C", "Final Study Cohort:\nAdult Males ≥ 18\n(n = 194)"),
    (
        "D",
        "Data Preprocessing\n• Feature engineering\n• Comorbidity filtering\n• No missing data",
    ),
    ("D2", "Surgical Modality\n• Traditional (n = 132)\n• Laser (n = 62)"),
    (
        "E",
        "Modeling and Evaluation\n• LR, RF, SVM\n• 10-fold CV\n• Balanced class weights",
    ),
    ("F", "Model Calibration\n• Platt scaling\n• Threshold tuning (𝛽 = 1, 2)"),
    (
        "G",
        "Primary Outcome\nBleeding, Edema, Pain, or Infection\nwithin 7 days",
    ),
    ("H", "Final Sample Used for Modeling\n(n = 194, 100%)"),
]

# Every node is drawn as a box
for node_id, node_label in strobe_nodes:
    dot.node(node_id, node_label, shape="box")

# Chain each node to the next one, added one edge at a time
for (tail, _), (head, _) in zip(strobe_nodes, strobe_nodes[1:]):
    dot.edge(tail, head)

# Render to file, then display the graph as the cell output
dot.render("strobe_modality_emphasized_final")
dot

In [None]:
from graphviz import Digraph

# Variant of the STROBE flow diagram: a vertical spine A -> D2 -> E, with
# E, F, G and H placed side by side on one rank. Rendered as SVG into
# image_path_svg.
dot = Digraph(
    comment="STROBE Diagram split after E",
    format="svg",
    directory=image_path_svg,
)
dot.attr(rankdir="TB", size="10")

# 1–4: vertical spine
# (loop variable is `node_id`, not `id`, so the builtin id() is not shadowed
# in the notebook namespace)
for node_id, label in [
    ("A", "Patients Evaluated\n(n = 202)"),
    ("B", "Excluded: Under 18\n(n = 8)"),
    ("C", "Final Cohort:\nAdult Males ≥ 18\n(n = 194)"),
    (
        "D",
        "Data Preprocessing\n• Feature eng\n• Comorbidity filtering\n• No missing data",
    ),
]:
    dot.node(node_id, label, shape="box")

# 5: branch point
dot.node("D2", "Surgical Modality\n• Traditional (n=132)\n• Laser (n=62)", shape="box")

# 6–9: downstream boxes
for node_id, label in [
    ("E", "Modeling & Evaluation\n• LR, RF, SVM\n• 10-fold CV\n• Balanced weights"),
    ("F", "Model Calibration\n• Platt scaling\n• Threshold tuning (β=1,2)"),
    ("G", "Primary Outcomes\nBleeding, Edema, Pain, Infection\nwithin 7 days"),
    ("H", "Final Sample for Modeling\n(n=194, 100%)"),
]:
    dot.node(node_id, label, shape="box")

# connect vertical spine through D2 → E
dot.edge("A", "B")
dot.edge("B", "C")
dot.edge("C", "D")
dot.edge("D", "D2")
dot.edge("D2", "E")

# now force E, F, G, H all on the same rank (i.e. horizontal)
with dot.subgraph() as s:
    s.attr(rank="same")
    for n in ("E", "F", "G", "H"):
        s.node(n)

# chain them left→right
dot.edge("E", "F")
dot.edge("F", "G")
dot.edge("G", "H")

# Render to file, then display the graph as the cell output
dot.render("strobe_split_after_E")
dot

In [None]:
# Display the analysis DataFrame as the cell output
df

## Outcome by Risk Factors and Surgical Techniques

In [None]:
# Features passed to the crosstab plots: the three BMI category columns
# followed by the surgical technique column
bar_list = [
    f"BMI_Category_{level}" for level in ("Overweight", "Underweight", "Obese")
] + ["Surgical_Technique"]

In [None]:
# Copy of df in which the 0/1 codes of selected columns become text labels
df_binary_list = df.copy()

# column -> {code: label}
value_labels = {
    "BMI_Category_Overweight": {0: "Not Overweight", 1: "Overweight"},
    "BMI_Category_Underweight": {0: "Not Underweight", 1: "Underweight"},
    "BMI_Category_Obese": {0: "Not Obese", 1: "Obese"},
    "Surgical_Technique": {
        0: "Traditional Circumcision",
        1: "Laser Circumcision",
    },
}
for column, code_to_label in value_labels.items():
    df_binary_list[column] = df_binary_list[column].map(code_to_label)

In [None]:
## custom color schema
surg_tech_color = {"Surgical_Technique": ["#1f77b4", "#203764"]}

### Count Crosstab Plot

In [None]:
# Crosstab plot of each feature in bar_list against the outcome, using raw
# counts (normalize=False); saved under the "outcome_by_feature" name
crosstab_plot(
    df=df_binary_list,
    list_name=bar_list,
    outcome="Bleeding_Edema_Outcome",
    label1="No Complications",
    label2="Complications",
    normalize=False,
    show_value_counts=True,
    color_schema=surg_tech_color,
    figsize=(12, 10),
    bbox_to_anchor=(0.5, -0.25),
    w_pad=4,
    h_pad=4,
    save_plots=True,
    image_path_svg=image_path_svg,
    string="outcome_by_feature",
)

### Normalized Crosstab Plot

In [None]:
# Same crosstab plot with normalize=True; saved under the
# "outcome_by_feature_normalized" name
crosstab_plot(
    df=df_binary_list,
    list_name=bar_list,
    outcome="Bleeding_Edema_Outcome",
    label1="No Complications",
    label2="Complications",
    normalize=True,
    show_value_counts=True,
    color_schema=surg_tech_color,
    figsize=(12, 10),
    bbox_to_anchor=(0.5, -0.25),
    w_pad=4,
    h_pad=4,
    save_plots=True,
    image_path_svg=image_path_svg,
    string="outcome_by_feature_normalized",
)

In [None]:
# Display the analysis DataFrame as the cell output
df

In [None]:
# Separate copy of df whose outcome column holds text labels instead of 0/1,
# so the labels can serve as hue categories
outcome_text = {0: "No Complications", 1: "Complications"}
df_copy = df.copy()
df_copy["Bleeding_Edema_Outcome"] = df_copy["Bleeding_Edema_Outcome"].map(
    outcome_text
)

In [None]:
from eda_toolkit import scatter_fit_plot

# Variables to plot: numeric columns with more than two distinct values.
# The selection is made entirely on df_copy, the frame passed to the plot,
# instead of indexing df with column names taken from df_copy. The relabeled
# outcome column has at most two distinct values, so it is never selected.
multi_valued_cols = [col for col in df_copy.columns if df_copy[col].nunique() > 2]
continuous_vars = (
    df_copy[multi_valued_cols].select_dtypes(np.number).columns.to_list()
)

# Scatter plots over the selected variables with best-fit lines and
# correlation shown, colored by outcome; the grid is saved as PNG and SVG
scatter_fit_plot(
    df=df_copy,
    all_vars=continuous_vars,
    show_legend=True,
    show_plot="grid",
    label_fontsize=14,
    tick_fontsize=12,
    add_best_fit_line=True,
    scatter_color="#000000",
    show_correlation=True,
    text_wrap=40,
    hue="Bleeding_Edema_Outcome",
    hue_palette={"No Complications": "#0000F2", "Complications": "red"},
    image_path_png=image_path_png,
    image_path_svg=image_path_svg,
    save_plots="grid",
)