# Exploratory Data Analysis

## Import Requisite Libraries

In [None]:
######################## Standard Library Imports ##############################
import pandas as pd
import numpy as np
from scipy.stats import gaussian_kde
from itertools import combinations
import os
import sys

########################## Plotting Libraries ##################################
import matplotlib.pyplot as plt
import seaborn as sns
import eda_toolkit
from eda_toolkit import (
    ensure_directory,
    kde_distributions,
    box_violin_plot,
    stacked_crosstab_plot,
    flex_corr_matrix,
    box_violin_plot,
    highlight_columns,
    scatter_fit_plot,
    generate_table1,
)

################################################################################

# Add the parent directory to sys.path to access 'functions.py'
sys.path.append(os.path.join(os.pardir))

from python_scripts.functions import *  # import custom functions

print(f"This project uses Python {sys.version.split()[0]}.")
print(f"This project uses EDA_Toolkit {eda_toolkit.__version__}.")

## Read in the Data

In [None]:
# Project directories, all resolved relative to `base_path`, the parent
# directory of the current working directory
base_path = os.pardir
data_path = os.path.join(base_path, "data")
image_path_png = os.path.join(base_path, "images", "png_images")
image_path_svg = os.path.join(base_path, "images", "svg_images")

# Run ensure_directory on the data folder and on both image folders
for directory in (data_path, image_path_png, image_path_svg):
    ensure_directory(directory)

In [None]:
# Load the EDA extract from `data_path`, then index the rows by "patient_id"
circ_eda = pd.read_csv(os.path.join(data_path, "circ_eda.csv"))
circ_eda = circ_eda.set_index("patient_id")

In [None]:
circ_eda.columns.to_list()

In [None]:
circ_eda.head()  # inspect first five rows of dataframe

In [None]:
circ_eda = circ_eda.drop(columns=["Birthday"])  # drop unused col

In [None]:
circ_eda = circ_eda[circ_eda["Age_years"] >= 18]

In [None]:
circ_eda.columns

In [None]:
table1_cont = generate_table1(
    circ_eda, include_types="continuous", groupby_col="Surgical_Technique"
)

In [None]:
print(table1_cont)

In [None]:
# Summary table restricted to the categorical variables.
# NOTE(review): this rebinds `table1_cont`, which an earlier cell assigned to
# the *continuous* summary table; a distinct name (e.g. `table1_cat`) would
# avoid the clobbering — confirm no later cell relies on the current name
table1_cont = generate_table1(circ_eda, include_types="categorical")
table1_cont

## Define Age Group

In [None]:
# Create bins for age along with labels so that age, a continuous series, can
# be converted into decade-wide groups for visualization and analysis
bin_ages = [18, 30, 40, 50, 60, 70, 80, 90, 100]
label_ages = [
    "18-29",
    "30-39",
    "40-49",
    "50-59",
    "60-69",
    "70-79",
    "80-89",
    "90-99",
]

# right=False makes every bin closed on the left and open on the right
# ([18, 30), [30, 40), ...), which is what the labels describe. With the
# pd.cut default (right=True) an 18-year-old falls outside the first bin and
# becomes NaN, and a 30-year-old is labelled "18-29".
circ_eda["age_group"] = pd.cut(
    circ_eda["Age_years"],
    bins=bin_ages,
    labels=label_ages,
    right=False,
)

## Clinical Characteristics

### Prevalence of Comorbidities

In [None]:
# Bar chart of the number of patients per Comorbidity_Flag value
comorbid_color = ["#1f77b4", "#c8544c"]
comorbid_flag = {0: "No Comorbidities", 1: "Comorbidities"}
comorb_val_counts = circ_eda["Comorbidity_Flag"].value_counts()
comorb_val_counts.index = comorb_val_counts.index.map(comorbid_flag)

# value_counts() orders the bars by frequency, so tie each colour to its label
# rather than to the bar position; otherwise the two colours swap whenever
# "Comorbidities" happens to be the more frequent class
comorbid_palette = dict(zip(comorbid_flag.values(), comorbid_color))
ax = comorb_val_counts.plot(
    kind="bar",
    rot=0,
    width=0.99,
    color=[
        comorbid_palette.get(label, "gray") for label in comorb_val_counts.index
    ],
)

# Write each count just below the top of its bar
for i, v in enumerate(comorb_val_counts):
    ax.text(i, v - 20, str(v), ha="center", color="yellow")
ax.set_title("Prevalence of Comorbidities")
ax.set_xlabel("Comorbidity Flag")
ax.set_ylabel("Comorbidity Count")
plt.savefig(
    os.path.join(image_path_png, "comorbidities_vs_no_comorbidities.png"),
)
plt.savefig(
    os.path.join(image_path_svg, "comorbidities_vs_no_comorbidities.svg"),
)
plt.show()

### Comorbidities by Age Group

In [None]:
pd.crosstab(
    circ_eda["Comorbidities"],
    circ_eda["age_group"],
    margins=True,
    margins_name="Total",
)

In [None]:
circ_eda["age_group"].unique()

In [None]:
# Drop the rows whose Comorbidities value is the string "0"
# (presumably the "no comorbidity" marker — verify against the data dictionary)
filtered_df = circ_eda[circ_eda["Comorbidities"] != "0"]

# Fixed, ascending column order for the heatmap
age_group_order = [
    "18-29",
    "30-39",
    "40-49",
    "50-59",
    "60-69",
    "70-79",
    "80-89",
    "90-99",
]

# fill_value=0 keeps the table integer-typed when an age group has no rows.
# Without it reindex() inserts NaN for the missing group, the counts become
# floats, and the fmt="d" annotation format below raises a ValueError.
sorted_crosstab = pd.crosstab(
    filtered_df["Comorbidities"], filtered_df["age_group"]
).reindex(columns=age_group_order, fill_value=0)


plt.figure(figsize=(9, 6))

# Create the heatmap using the sorted crosstab
sns.heatmap(sorted_crosstab, annot=True, cmap="rocket_r", fmt="d")
# plt.title("Comorbidities by Age Group")
plt.xlabel("Age Group")

# Save the image, assuming image_path_png and image_path_svg are already defined
plt.savefig(
    os.path.join(image_path_png, "comorbidities_by_age_group.png"),
    bbox_inches="tight",
)
plt.savefig(
    os.path.join(image_path_svg, "comorbidities_by_age_group.svg"),
    bbox_inches="tight",
)
plt.show()

### Comorbidities by Geographical Origin

In [None]:
plt.figure(figsize=(9, 6))

# Heatmap of the crosstab between patient comorbidities and geographical
# origin; each cell is an integer count, hence fmt="d" for the annotations
sns.heatmap(
    pd.crosstab(circ_eda["Comorbidities"], circ_eda["Geographical_Origin"]),
    annot=True,
    cmap="rocket_r",
    fmt="d",
)

# Title spelling corrected ("Comorbidites" -> "Comorbidities")
plt.title("Comorbidities by Geographical Origin")
plt.xlabel("Geographical Origin")

# Save both raster and vector versions before showing the figure
plt.savefig(
    os.path.join(image_path_png, "comorbidities_by_geog_origin.png"),
    bbox_inches="tight",
)
plt.savefig(
    os.path.join(image_path_svg, "comorbidities_by_geog_origin.svg"),
    bbox_inches="tight",
)

plt.show()

## Overall Distributions

In [None]:
dist_list_1 = [
    "Age_years",
    "BMI",
    "Comorbidities",
    "Preop_drugs_antibiotic",
    "Preop_Heart_Rate_bpm",
    "Preop_Pulse_Ox_Percent",
    "Surgical_Technique",
    "Intraoperative_Blood_Loss_ml",
]

dist_list_2 = [
    "Intraop_Mean_Heart_Rate_bpm",
    "Intraop_Mean_Pulse_Ox_Percent",
    "Surgical_Time_min",
    "Functional_Outcomes_Pain",
    "Functional_Outcomes_Bleeding",
    "Functional_Outcomes_Edema",
    "Functional_Outcomes_Infection",
    "Functional_Outcomes_Fast_Recovery",
]

dist_list_3 = [
    "Functional_Outcomes_Cosmetic_Satisfaction",
    "Cost_of_Procedure_euros",
    "Preop_MAP",
    "Intraop_MAP",
    "Anesthesia_Type_lidocaine",
    "SBP",
    "DBP",
    "Comorbidity_Flag",
]

In [None]:
len(dist_list_1) + len(dist_list_2) + len(dist_list_3)

In [None]:
# Names of all numeric columns in circ_eda.
# NOTE(review): `dist_list` is not passed to the call below, which plots
# `dist_list_1` instead — confirm whether this assignment is still needed
dist_list = circ_eda.select_dtypes(np.number).columns.to_list()

# Distribution plots for the variables in dist_list_1 on a 2 x 4 grid; the
# PNG/SVG folders and the "numeric_distributions_1" filename are handed to
# kde_distributions
kde_distributions(
    df=circ_eda,
    fill=True,
    n_rows=2,
    n_cols=4,
    h_pad=5,
    fill_alpha=0.60,
    text_wrap=40,
    # grid_figsize=(50, 25),  # Size of the overall grid figure
    vars_of_interest=dist_list_1,
    image_path_png=image_path_png,
    image_path_svg=image_path_svg,
    plot_type="both",
    image_filename="numeric_distributions_1",
    bbox_inches="tight",
    # y_axis_label=" ",
    # plot_title=None,
    bins=10,
    tick_fontsize=14,
    # custom_xlabels={"Age_years": "Age"},
    # custom_titles={"Age_years": None},
    label_fontsize=16,
)

In [None]:
# Names of all numeric columns in circ_eda.
# NOTE(review): `dist_list` is not passed to the call below, which plots
# `dist_list_2` instead — confirm whether this assignment is still needed
dist_list = circ_eda.select_dtypes(np.number).columns.to_list()

# Distribution plots for the variables in dist_list_2 on a 2 x 4 grid; the
# PNG/SVG folders and the "numeric_distributions_2" filename are handed to
# kde_distributions
kde_distributions(
    df=circ_eda,
    fill=True,
    n_rows=2,
    n_cols=4,
    h_pad=5,
    fill_alpha=0.60,
    text_wrap=30,
    # grid_figsize=(50, 25),  # Size of the overall grid figure
    vars_of_interest=dist_list_2,
    image_path_png=image_path_png,
    image_path_svg=image_path_svg,
    plot_type="both",
    image_filename="numeric_distributions_2",
    bbox_inches="tight",
    y_axis_label="Density",
    bins=10,
    tick_fontsize=14,
    label_fontsize=16,
)

In [None]:
# Names of all numeric columns in circ_eda.
# NOTE(review): `dist_list` is not passed to the call below, which plots
# `dist_list_3` instead — confirm whether this assignment is still needed
dist_list = circ_eda.select_dtypes(np.number).columns.to_list()

# Distribution plots for the variables in dist_list_3 on a 2 x 4 grid; the
# PNG/SVG folders and the "numeric_distributions_3" filename are handed to
# kde_distributions
kde_distributions(
    df=circ_eda,
    fill=True,
    n_rows=2,
    n_cols=4,
    h_pad=5,
    fill_alpha=0.60,
    text_wrap=40,
    # grid_figsize=(50, 25),  # Size of the overall grid figure
    vars_of_interest=dist_list_3,
    image_path_png=image_path_png,
    image_path_svg=image_path_svg,
    plot_type="both",
    image_filename="numeric_distributions_3",
    bbox_inches="tight",
    y_axis_label="Density",
    bins=10,
    tick_fontsize=14,
    label_fontsize=16,
)

In [None]:
kde_distributions(
    df=circ_eda,
    figsize=(10, 6),
    text_wrap=50,
    hist_color="brown",
    bbox_inches="tight",
    vars_of_interest=["Surgical_Time_min"],
    y_axis_label="Density",
    bins=10,
    fill_alpha=0.40,
    plot_type="both",
    stat="Density",
    label_fontsize=16,  # Font size for axis labels
    tick_fontsize=14,  # Font size for tick labels
    plot_mean=True,
    plot_median=True,
    mean_color="blue",
    image_filename="surgical_time_distribution",
    image_path_svg=image_path_svg,
    custom_xlabels=None,  # New parameter to customize x-axis labels
    custom_titles={
        "Surgical_Time_min": "Surgical Time in Minutes"
    },  # New parameter to customize plot titles
    image_path_png=image_path_png,
    std_dev_levels=[
        1,
        2,
        3,
    ],
    std_color=[
        "purple",
        "green",
        "silver",
    ],
    # title="Age Distribution",
)

## Correlation Matrix

In [None]:
# Function to create a mock dataset
def create_mock_dataset(rows=100, seed=42):
    """Build a reproducible DataFrame of synthetic patient measurements.

    A private ``RandomState`` is used instead of ``np.random.seed`` so that
    calling this function no longer reseeds NumPy's global random generator
    as a side effect. The generated values are the same ones the global
    legacy generator yields for the same seed.

    Parameters
    ----------
    rows : int, default 100
        Number of rows to generate.
    seed : int or None, default 42
        Seed for the private random generator.

    Returns
    -------
    pandas.DataFrame
        One column per simulated measurement, ``rows`` rows long.
    """
    rng = np.random.RandomState(seed)
    # Draw order matters: every column consumes the generator in sequence
    data = {
        "Age_years": rng.randint(18, 65, size=rows),
        "Weight_kg": rng.uniform(50, 120, size=rows),
        "BMI": rng.uniform(18, 35, size=rows),
        "Comorbidities": rng.choice([0, 1], size=rows),
        "Preop_Heart_Rate_bpm": rng.randint(60, 100, size=rows),
        "Intraop_Mean_Heart_Rate_bpm": rng.randint(70, 110, size=rows),
        "Intraop_Mean_Pulse_Ox_Percent": rng.uniform(90, 100, size=rows),
        "Surgical_Time_min": rng.randint(30, 300, size=rows),
        "Cost_of_Procedure_euros": rng.uniform(5000, 20000, size=rows),
        "SBP": rng.randint(100, 140, size=rows),
        "DBP": rng.randint(60, 90, size=rows),
    }
    return pd.DataFrame(data)


# Generate the dataset
mock_dataset = create_mock_dataset(rows=100)

In [None]:
mock_dataset

In [None]:
from eda_toolkit import flex_corr_matrix

feature_list = [
    "Age_years",
    "BMI",
    "Surgical_Technique",
    "Intraoperative_Blood_Loss_ml",
    "Intraop_Mean_Heart_Rate_bpm",
    "Intraop_Mean_Pulse_Ox_Percent",
    "Surgical_Time_min",
    "Diabetes",
    "BMI_Category_Obese",
    "BMI_Category_Overweight",
    "BMI_Category_Underweight",
    "Intraop_SBP",
    "Intraop_DBP",
]
flex_corr_matrix(
    df=circ_eda,
    # cols=mock_dataset.columns.to_list(),
    cols=feature_list,
    annot=True,
    cmap="viridis",
    figsize=(20, 20),
    # title="US Census Correlation Matrix",
    xlabel_alignment="right",
    image_path_png=image_path_png,
    image_path_svg=image_path_svg,
    save_plots=True,
    label_fontsize=20,
    tick_fontsize=20,
    xlabel_rot=30,
    ylabel_rot=0,
    text_wrap=30,
    vmin=-1,
    vmax=1,
    cbar_label="Correlation Index",
    # cbar_padding=0.8,  # Adjust spacing as needed
    # cbar_width_ratio=0.05,  # Adjust width as needed
    triangular=True,
    show_colorbar=True,
)

## Scatter Plots

### Preoperative vs. Intraoperative Characteristics (Scatterplots)

Examining the clinical relevance of the proposed correlations involves understanding the physiological interactions and the possible implications of these measurements on patient outcomes. Here's a brief overview of the clinical sensibility of each correlation:

1. Preoperative Heart Rate (BPM) vs. Preoperative Pulse Oximetry (SpO2):  

    Clinical Relevance: Moderately relevant. Heart rate and oxygen saturation can both be indicators of a patient's cardiorespiratory status. While there's no direct causal relationship, abnormalities in one might reflect or affect changes in the other, especially in the context of cardiorespiratory diseases.

2. Preoperative Heart Rate (BPM) vs. Intraoperative Blood Loss (ml):

    Clinical Relevance: Indirect relevance. Preoperative heart rate could reflect the patient's stress or anxiety level, potentially influencing blood pressure and vascular tone. However, the correlation with intraoperative blood loss is likely to be influenced by many other factors, making this relationship more complex and indirect.

3. Preoperative Heart Rate (BPM) vs. Preoperative Mean Arterial Pressure (MAP):

    Clinical Relevance: Highly relevant. There's a physiological interaction where the heart rate can influence and be influenced by arterial pressure due to cardiac output and vascular resistance factors. This relationship is fundamental in understanding the patient's hemodynamic status.

4. Preoperative Heart Rate (BPM) vs. Intraoperative Mean Arterial Pressure (MAP):

    Clinical Relevance: Moderately relevant. As with preoperative MAP, heart rate and arterial pressure are physiologically linked. Given the stress and potential complications of surgery, this correlation might offer insights into how preoperative conditions could affect or predict intraoperative hemodynamic stability.

5. Preoperative Pulse Ox vs. Intraoperative Blood Loss (ml):

    Clinical Relevance: Indirect relevance. While both metrics are important, the direct correlation between preoperative oxygen saturation and intraoperative blood loss is not straightforward. Other factors, such as the surgical site and technique, significantly influence blood loss.

6. Preoperative Pulse Ox vs. Preoperative MAP:

    Clinical Relevance: Indirect relevance. Both are vital signs but relate to different physiological aspects (cardiorespiratory efficiency vs. circulatory pressure). The relationship is more about how general health can impact these measurements rather than a direct correlation.

7. Preoperative Pulse Ox vs. Intraoperative MAP:

    Clinical Relevance: Indirect relevance. This relationship might be more about the underlying health status of the patient and how it could affect or be affected by intraoperative hemodynamic management rather than a direct correlation.

8. Intraoperative Blood Loss (ml) vs. Preoperative MAP:

    Clinical Relevance: Indirect relevance. Preoperative MAP might influence the body's response to blood loss (through compensatory mechanisms), but the amount of blood loss is more directly related to the surgical procedure and technique.

9. Intraoperative Blood Loss (ml) vs. Intraoperative MAP:

    Clinical Relevance: Highly relevant. Significant blood loss can lead to a decrease in MAP due to reduced circulating volume, making this correlation critical for monitoring and managing intraoperative hemodynamics.

10. Preoperative MAP vs. Intraoperative MAP:

    Clinical Relevance: Highly relevant. Understanding the changes from preoperative to intraoperative MAP can provide insights into the patient's hemodynamic response to surgery and anesthesia, helping to guide management to maintain stability.

For each of these correlations, it's important to consider the broader clinical context, including the type of surgery, patient health status, and other concurrent interventions. The significance of these correlations can vary based on specific patient populations and conditions.

In [None]:
preop_intraop_values = [
    "Preop_Heart_Rate_bpm",
    "Preop_Pulse_Ox_Percent",
    "Intraoperative_Blood_Loss_ml",
    "Preop_MAP",
    "Intraop_MAP",
]

custom_titles = {
    "Preop_Heart_Rate_bpm": "Preoperative Heart Rate (BPM)",
    "Preop_Pulse_Ox_Percent": "Preoperative Pulse Oximetry (SpO2)",
    "Intraoperative_Blood_Loss_ml": "Intraoperative Blood Loss (ML)",
    "Preop_MAP": "Preoperative Mean Arterial Pressure",
    "Intraop_MAP": "Intraoperative Mean Arterial Pressure",
}

# Define combinations to omit
combinations_to_omit = [
    (
        "Preop_Heart_Rate_bpm",
        "Preop_Pulse_Ox_Percent",
    ),  # Indirect physiological relationship
    (
        "Preop_Heart_Rate_bpm",
        "Intraoperative_Blood_Loss_ml",
    ),  # Indirect and complex relationship
    (
        "Preop_Pulse_Ox_Percent",
        "Intraoperative_Blood_Loss_ml",
    ),  # No direct physiological relationship
    (
        "Preop_Pulse_Ox_Percent",
        "Intraop_MAP",
    ),  # indirect relevance and differing physiological systems
    (
        "Intraoperative_Blood_Loss_ml",
        "Preop_MAP",
    ),  # complex and indirect influences on outcomes.
    # Any addtl' combos based on further clinical insights can be added here
]

scatter_fit_plot(
    df=circ_eda,
    all_vars=preop_intraop_values,
    label_names=custom_titles,
    show_legend=True,
    show_plot="grid",
    label_fontsize=14,
    exclude_combinations=combinations_to_omit,
    tick_fontsize=12,
    add_best_fit_line=True,
    scatter_color="#808080",
    show_correlation=True,
    text_wrap=40,
    image_path_png=image_path_png,
    image_path_svg=image_path_svg,
    save_plots="individual",
)

### Mean Arterial Pressure vs. BMI

In [None]:
scatter_fit_plot(
    df=circ_eda,
    x_vars=["Preop_MAP", "Intraop_MAP"],
    y_vars=["BMI"],
    label_names=custom_titles,
    show_legend=True,
    show_plot="grid",
    label_fontsize=14,
    tick_fontsize=12,
    add_best_fit_line=True,
    scatter_color="#808080",
    show_correlation=True,
    text_wrap=40,
    image_path_png=image_path_png,
    image_path_svg=image_path_svg,
    save_plots="individual",
)

### BMI by Geographical Origin

In [None]:
bmi_by_geog = (
    circ_eda.groupby("Geographical_Origin")["BMI"]
    .agg(["mean", "std", "min", "max"])
    .rename(
        columns={
            "mean": "Mean",
            "std": "Standard Deviation",
            "min": "Min",
            "max": "Max",
        },
    )
).replace(np.nan, "-")
bmi_by_geog

In [None]:
circ_eda.groupby("Geographical_Origin")["BMI"].agg("mean").plot(
    kind="barh",
    width=0.9,
    rot=0,
)

plt.title("BMI by Country of Origin")
plt.xlabel("Body Mass Index")
plt.ylabel("Country of Origin")
plt.savefig(
    os.path.join(image_path_png, "bmi_by_geog_origin.png"),
    bbox_inches="tight",
)
plt.savefig(
    os.path.join(image_path_svg, "bmi_by_geog_origin.svg"),
    bbox_inches="tight",
)
plt.show()

## Age-Related Distributions

In [None]:
kde_distributions(
    df=circ_eda,
    figsize=(10, 6),
    text_wrap=50,
    hist_color="brown",
    bbox_inches="tight",
    vars_of_interest=["Age_years"],
    y_axis_label="Density",
    bins=10,
    fill_alpha=0.40,
    plot_type="both",
    stat="Density",
    label_fontsize=16,  # Font size for axis labels
    tick_fontsize=14,  # Font size for tick labels
    plot_mean=True,
    plot_median=True,
    mean_color="blue",
    image_filename="age_distribution",
    image_path_svg=image_path_svg,
    custom_xlabels=None,  # New parameter to customize x-axis labels
    custom_titles={"Age_years": " "},  # New parameter to customize plot titles
    image_path_png=image_path_png,
    std_dev_levels=[
        1,
        2,
        3,
    ],
    std_color=[
        "purple",
        "green",
        "silver",
    ],
)

### Slide-Specific Implementation

In [None]:
# Ten equal-width age bins with edges at 0, 10, ..., 100
bins = np.arange(0, 101, 10)

# Summary statistics that are shown in the legend
age_description = circ_eda["Age_years"].describe()

# Density-normalised histogram of age
fig, ax = plt.subplots()
circ_eda["Age_years"].hist(
    bins=bins,
    grid=False,
    edgecolor="black",
    ax=ax,
    density=True,
)

# Gaussian KDE fitted on the non-missing ages, evaluated on 500 points
# between 0 and 100
kde = gaussian_kde(circ_eda["Age_years"].dropna())
age_range = np.linspace(0, 100, 500)
kde_values = kde(age_range)

# Overlay the KDE curve, semi-transparent
ax.plot(age_range, kde_values, color="red", alpha=0.5)

# One "Name: value" line per statistic; the count has no decimals, the rest two
label_text = "\n".join(
    f"{name}: {age_description[key]:{fmt}}"
    for name, key, fmt in (
        ("Count", "count", ".0f"),
        ("Mean", "mean", ".2f"),
        ("Std", "std", ".2f"),
        ("Min", "min", ".2f"),
        ("25%", "25%", ".2f"),
        ("50%", "50%", ".2f"),
        ("75%", "75%", ".2f"),
        ("Max", "max", ".2f"),
    )
)

# Empty, marker-less series whose only purpose is to carry the legend text
ax.plot([], [], " ", label=label_text)

# Title, axis labels and legend
ax.set(title="Age Distribution", xlabel="Age", ylabel="Density")
ax.legend(title="Summary Statistics")

plt.savefig(os.path.join(image_path_png, "age_hist.png"))
plt.savefig(os.path.join(image_path_svg, "age_hist.svg"))
plt.show()

## Comprehensive Age-Related Boxplots for Continuous Values

In [None]:
boxplot_metrics_list = [
    "BMI",
    "Preop_MAP",
    "Intraop_MAP",
    "Preop_Pulse_Ox_Percent",
    "Intraop_Mean_Pulse_Ox_Percent",
    "Preop_Heart_Rate_bpm",
    "Intraop_Mean_Heart_Rate_bpm",
    "Surgical_Time_min",
]
metrics_boxplot_comp = ["age_group"]
metrics_comp = ["age_group"]

box_violin_plot(
    df=circ_eda,
    metrics_list=boxplot_metrics_list,
    metrics_comp=metrics_comp,
    image_path_png=image_path_png,
    image_path_svg=image_path_svg,
    save_plots=True,
    show_plot="both",
    show_legend=False,
    plot_type="boxplot",
    xlabel_rot=90,
)

### Mean Arterial Pressure Averages by Age Group

In [None]:
# Mean arterial pressure (MAP) by age group: one bar chart and one summary
# table per MAP column
map_values = circ_eda[["Preop_MAP", "Intraop_MAP"]]

# Look the display titles up by column name instead of slicing the last two
# entries of `custom_titles`, so adding or reordering titles there cannot
# silently pair a column with the wrong label
custom_maps = {
    col: custom_titles[col] for col in map_values.columns if col in custom_titles
}

for col in map_values.columns:
    # Use the custom title from the dict. if available, else use the column name
    custom_map = custom_maps.get(col, col)

    # Group by age_group and calculate statistics (kept numeric for plotting)
    map_stats = (
        circ_eda.groupby("age_group", observed=True)[col]
        .agg(["mean", "std", "min", "max"])
        .rename(
            columns={
                "mean": "Mean",
                "std": "Standard Deviation",
                "min": "Min",
                "max": "Max",
            }
        )
    )

    # Plotting section
    map_stats["Mean"].plot(
        kind="bar",
        width=0.92,
        rot=0,
    )
    plt.title(f"{custom_map} Averages by Age Group")
    plt.xlabel("Age Group")
    plt.ylabel(f"{custom_map}")
    file_name = custom_map.replace(" ", "_").replace("/", "_or_")
    plt.savefig(os.path.join(image_path_png, f"{file_name}_by_age_group.png"))
    plt.savefig(os.path.join(image_path_svg, f"{file_name}_by_age_group.svg"))
    plt.show()

    # Replace NaN with "-" for display only. Doing this before plotting would
    # turn any column that contains NaN into a mixed string/number column,
    # which the bar plot cannot draw.
    map_age = map_stats.replace(np.nan, "-")

    # Display the table
    print(f"Table for {custom_map} by Age Group:")
    display(map_age)  # Use print(map_age) if display is not available

### BMI Averages by Age Group

In [None]:
# group age by BMI and plot average BMI per age
circ_eda.groupby("age_group", observed=True)["BMI"].agg("mean").plot(
    kind="bar",
    width=0.92,
    rot=0,
)
plt.title("BMI Averages by Age Group")
plt.xlabel("Age Group")
plt.ylabel("Body Mass Index")

# Adjust figure size for saving only
# plt.gcf().set_size_inches(12, 6)  # Example: Adjust to desired size for saving
plt.savefig(os.path.join(image_path_png, "bmi_by_age_group.png"))
plt.savefig(os.path.join(image_path_svg, "bmi_by_age_group.svg"))
plt.show()

## Functional Outcomes by Age

In [None]:
neg_outcomes = [
    "Functional_Outcomes_Pain",
    "Functional_Outcomes_Bleeding",
    "Functional_Outcomes_Infection",
]

# Define the legend_labels to use in the loop
neg_legend_labels = [
    ["No Pain", "Pain"],
    ["No Bleeding", "Bleeding"],
    ["No Infection", "Infection"],
]

# Define titles for the plots
neg_title = [
    "Pain",
    "Bleeding",
    "Infection",
]

In [None]:
stacked_crosstabs = stacked_crosstab_plot(
    df=circ_eda,
    col="age_group",
    func_col=neg_outcomes,
    legend_labels_list=neg_legend_labels,
    title=neg_title,
    kind="bar",
    width=0.8,
    rot=0,  # axis rotation angle
    custom_order=None,
    text_wrap=80,
    color=["#1f77b4", "#c8544c"],
    output="both",
    image_path_png=image_path_png,
    image_path_svg=image_path_svg,
    return_dict=True,
    save_formats=["png", "svg"],
    x=14,
    y=10,
    p=12,
    file_prefix="Stacked_Bar",
    logscale=False,
    plot_type="both",
    show_legend=True,
    label_fontsize=12,
    tick_fontsize=12,
)

In [None]:
## Save the crosstabs to csv on data_path
for key, value in stacked_crosstabs.items():
    # Save each DataFrame as a CSV file
    value.to_csv(os.path.join(data_path, f"{key}.csv"), index=True)

In [None]:
# Print the lower-cased crosstab names, followed by a blank line
for key in stacked_crosstabs:
    print(key.lower())
print()

# Bind every crosstab DataFrame to a notebook-level variable named after its
# lower-cased key, so it can be referenced directly in later cells
for key, value in stacked_crosstabs.items():
    key = key.lower()
    globals()[key] = value
    # print(f"{globals()[key]}\n")

In [None]:
functional_outcomes_pain

In [None]:
functional_outcomes_bleeding

In [None]:
functional_outcomes_infection

In [None]:
circ_eda["Functional_Outcomes_Cosmetic_Satisfaction"].value_counts()

In [None]:
# Columns plotted against age group in the stacked crosstab plot
pos_outcomes = [
    "Functional_Outcomes_Fast_Recovery",
    "Functional_Outcomes_Cosmetic_Satisfaction",
    "Comorbidity_Flag",
]

# Legend labels for each column, listed in ascending value order (0 first,
# then 1). The Comorbidity_Flag pair was reversed, which labelled the flag-0
# patients as "Comorbidities"; it now follows the same order as the other
# pairs and the 0 -> "No Comorbidities" mapping used for the prevalence chart.
pos_legend_labels = [
    ["Not Fast Recovery", "Fast Recovery"],
    ["Not Satisfied", "Satisfied"],
    ["No Comorbidities", "Comorbidities"],
]

# Plot titles, one per column above
pos_title = [
    "Recovery",
    "Cosmetic Satisfaction",
    "Comorbidities",
]

In [None]:
stacked_crosstabs = stacked_crosstab_plot(
    df=circ_eda,
    col="age_group",
    func_col=pos_outcomes,
    legend_labels_list=pos_legend_labels,
    title=pos_title,
    kind="bar",
    width=0.8,
    rot=0,  # axis rotation angle
    custom_order=None,
    text_wrap=80,
    color=["#c8544c", "#1f77b4"],
    output="both",
    image_path_png=image_path_png,
    image_path_svg=image_path_svg,
    return_dict=True,
    x=14,
    y=10,
    p=12,
    save_formats=["png", "svg"],
    file_prefix="Stacked_Bar",
    logscale=False,
    plot_type="both",
    show_legend=True,
    label_fontsize=12,
    tick_fontsize=12,
)

In [None]:
surgical_outcomes = ["Surgical_Technique"]
surgical_legend_labels = [["Laser", "Traditional"]]
surgical_title = ["Surgical Technique"]

In [None]:
stacked_crosstabs = stacked_crosstab_plot(
    df=circ_eda,
    col="age_group",
    func_col=surgical_outcomes,
    legend_labels_list=surgical_legend_labels,
    title=surgical_title,
    kind="bar",
    width=0.8,
    rot=0,  # axis rotation angle
    custom_order=None,
    text_wrap=80,
    color=["#1f77b4", "#203764"],
    output="both",
    image_path_png=image_path_png,
    image_path_svg=image_path_svg,
    return_dict=True,
    x=14,
    y=10,
    p=12,
    save_formats=["png", "svg"],
    file_prefix="Stacked_Bar",
    logscale=False,
    plot_type="both",
    show_legend=True,
    label_fontsize=12,
    tick_fontsize=12,
)

## Surgical Techniques

### Number of Procedures in each Surgical Category

In [None]:
surg_tech_color = ["#1f77b4", "#203764"]
surg_tech_values = circ_eda["Surgical_Technique"].value_counts(ascending=True)
ax = surg_tech_values.plot(
    kind="bar",
    rot=0,
    width=0.99,
    color=surg_tech_color,
)

for i, v in enumerate(surg_tech_values):
    ax.text(i, v - 40, str(v), ha="center", color="yellow")
ax.set_title("Number of Procedures in Each Surgical Category")
ax.set_xlabel("Surgical Technique")
ax.set_ylabel("Count")
plt.savefig(os.path.join(image_path_svg, "surgical_technique_by_count.svg"))
plt.savefig(os.path.join(image_path_png, "surgical_technique_by_count.png"))
plt.show()

### Surgical Technique by Mean Time (in Minutes)

In [None]:
# Mean surgical time (in minutes) for each surgical technique
surgical_techniques = circ_eda.groupby("Surgical_Technique")[
    "Surgical_Time_min"
].mean()

# One bar per technique, drawn from the series computed above
ax = surgical_techniques.plot(
    kind="bar",
    rot=0,
    width=0.99,
    color=surg_tech_color,
)

# Write each mean, to two decimals, just below the top of its bar
for i, v in enumerate(surgical_techniques):
    ax.text(i, v - 15, f"{v:.2f}", ha="center", color="yellow")

ax.set_title("Surgical Technique by Mean Time (in Minutes)")
ax.set_xlabel("Surgical Technique")
ax.set_ylabel("Surgical Time in Minutes")
plt.savefig(os.path.join(image_path_svg, "surgical_technique_by_mean_time.svg"))
plt.savefig(os.path.join(image_path_png, "surgical_technique_by_mean_time.png"))
plt.show()

### Antibiotics by Surgical Technique

In [None]:
antibiotic_by_surgical_technique = pd.crosstab(
    circ_eda["Preop_drugs_antibiotic"], circ_eda["Surgical_Technique"]
)

antibiotic_by_surgical_technique

In [None]:
antibiotic_by_surgical_technique.plot(
    kind="bar",
    rot=0,
    color=surg_tech_color,
)


plt.title("Type of Antibiotic by Surgical Technique")
plt.xlabel("Antibiotic")
plt.ylabel("Count")
plt.savefig(os.path.join(image_path_png, "antibiotic_by_surgical_technique.png"))
plt.savefig(os.path.join(image_path_svg, "antibiotic_by_surgical_technique.svg"))
plt.show()

### Anesthesia by Surgical Technique

In [None]:
anesthesia_by_surgical_technique = pd.crosstab(
    circ_eda["Anesthesia_Type"], circ_eda["Surgical_Technique"]
)

anesthesia_by_surgical_technique

In [None]:
anesthesia_by_surgical_technique.plot(
    kind="bar",
    rot=0,
    color=surg_tech_color,
)


plt.title("Type of Anesthesia by Surgical Technique")
plt.xlabel("Anesthesia Type")
plt.ylabel("Count")
plt.savefig(os.path.join(image_path_png, "anesthesia_by_surgical_technique.png"))
plt.savefig(os.path.join(image_path_svg, "anesthesia_by_surgical_technique.svg"))


plt.show()

### Box Plot of Surgical Time by Surgical Technique

In [None]:
# Box plot for Surgical_Time_min across different Surgical_Techniques

# Sort the technique values before pairing them with colours. unique() returns
# them in order of first appearance in the data, whereas the groupby/crosstab
# bar charts assign `surg_tech_color` in sorted order, so without sorting the
# same technique could be drawn in different colours across figures.
unique_techniques = np.sort(circ_eda["Surgical_Technique"].dropna().unique())
technique_colors = dict(zip(unique_techniques, surg_tech_color))
sns.boxplot(
    x="Surgical_Technique",
    y="Surgical_Time_min",
    data=circ_eda,
    hue="Surgical_Technique",
    palette=technique_colors,
    medianprops={"color": "yellow", "linewidth": 1},  # Setting median line props
)
plt.title("Box plot of Surgical Time by Surgical Technique")
plt.ylabel("Surgical Time (min)")
plt.xlabel("Surgical Technique")
plt.xticks(rotation=0)
plt.savefig(
    os.path.join(image_path_png, "surgical_time_by_technique_boxplot.png"),
)
plt.savefig(
    os.path.join(image_path_svg, "surgical_time_by_technique_boxplot.svg"),
)
plt.show()

### Box Plot of Intraoperative Blood Loss by Surgical Technique

In [None]:
# Box plot for Intraoperative_Blood_Loss_ml across different Surgical_Techniques
sns.boxplot(
    x="Surgical_Technique",
    y="Intraoperative_Blood_Loss_ml",
    data=circ_eda,
    hue="Surgical_Technique",
    palette=technique_colors,
    medianprops={"color": "yellow", "linewidth": 1},  # Setting median line props
)
plt.title("Box plot of Intraoperative Blood Loss by Surgical Technique")
plt.ylabel("Intraoperative Blood Loss (ml)")
plt.xlabel("Surgical Technique")
plt.xticks(rotation=0)
plt.savefig(
    os.path.join(image_path_png, "intraop_blood_loss_by_technique_boxplot.png"),
)
plt.savefig(
    os.path.join(image_path_svg, "intraop_blood_loss_by_technique_boxplot.svg"),
)
plt.show()

### Box Plot of Surgical Time by Anesthesia Type

In [None]:
# Box plot for Surgical_Time_min across different Anesthesia Types
# (the same column is used for x and hue, so each type gets its own colour)
sns.boxplot(
    x="Anesthesia_Type",
    y="Surgical_Time_min",
    hue="Anesthesia_Type",
    data=circ_eda,
)
plt.title("Box plot of Surgical Time by Anesthesia Type")
plt.ylabel("Surgical Time (min)")
plt.xlabel("Anesthesia Type")
plt.xticks(rotation=0)
# Save PNG and SVG copies before showing the figure
plt.savefig(os.path.join(image_path_png, "anesthesia_surgical_time_boxplot.png"))
plt.savefig(os.path.join(image_path_svg, "anesthesia_surgical_time_boxplot.svg"))
plt.show()

### Box Plot of Intraoperative Blood Loss by Anesthesia Type

In [None]:
# Box plot for Intraoperative_Blood_Loss_ml across different Anesthesia Types
sns.boxplot(
    x="Anesthesia_Type",
    y="Intraoperative_Blood_Loss_ml",
    hue="Anesthesia_Type",
    data=circ_eda,
)
plt.title("Box plot of Intraoperative Blood Loss by Anesthesia Type")
plt.ylabel("Intraoperative Blood Loss (ml)")
plt.xlabel("Anesthesia Type")
plt.xticks(rotation=0)
plt.savefig(os.path.join(image_path_png, "anesthesia_blood_loss_boxplot.png"))
plt.savefig(os.path.join(image_path_svg, "anesthesia_blood_loss_boxplot.svg"))
plt.show()

### Box Plot of Surgical Time (min) by Preoperative Antibiotic

In [None]:
# Box plot for Surgical_Time_min across different preoperative antibiotics
# (the same column is used for x and hue, so each antibiotic gets its own colour)
sns.boxplot(
    x="Preop_drugs_antibiotic",
    y="Surgical_Time_min",
    hue="Preop_drugs_antibiotic",
    data=circ_eda,
)
plt.title("Box plot of Surgical Time (min) by Preoperative Antibiotic")
plt.ylabel("Surgical Time (min)")
plt.xlabel("Antibiotic Type")
plt.xticks(rotation=0)
# Save PNG and SVG copies before showing the figure
plt.savefig(os.path.join(image_path_png, "antibiotic_surgical_time_boxplot.png"))
plt.savefig(os.path.join(image_path_svg, "antibiotic_surgical_time_boxplot.svg"))
plt.show()

### Box Plot of Intraoperative Blood Loss by Preoperative Antibiotic

In [None]:
# Box plot of Intraoperative_Blood_Loss_ml across the different preoperative
# antibiotics
ax = sns.boxplot(
    data=circ_eda,
    x="Preop_drugs_antibiotic",
    y="Intraoperative_Blood_Loss_ml",
    hue="Preop_drugs_antibiotic",
)
ax.set_title("Box plot of Intraoperative Blood Loss by Preoperative Antibiotic")
ax.set_xlabel("Antibiotic Type")
ax.set_ylabel("Intraoperative Blood Loss (ml)")
plt.xticks(rotation=0)

# Write the figure to the PNG folder first, then to the SVG folder
for save_dir, save_name in (
    (image_path_png, "antibiotic_blood_loss_boxplot.png"),
    (image_path_svg, "antibiotic_blood_loss_boxplot.svg"),
):
    plt.savefig(os.path.join(save_dir, save_name))
plt.show()

### Prevalence of Functional Outcomes by Surgical Technique

In [None]:
# Display titles, paired positionally (via zip) with the "Functional" columns
# collected below; zip stops at the shorter sequence, so surplus titles are
# simply not used.
functional_title_list = [
    "Pain",
    "Bleeding",
    "Edema",
    "Infection",
    "Recovery",
    "Satisfaction",
    "Comorbidities",
    "Surgical Technique",
]

# Every column whose name contains "Functional", in dataframe column order
functional_list = [col for col in circ_eda.columns if "Functional" in col]

# Readable category names for the 0/1 codes of each outcome column
functional_labels = {
    "Functional_Outcomes_Pain": {0: "No Pain", 1: "Pain"},
    "Functional_Outcomes_Bleeding": {0: "No Bleeding", 1: "Bleeding"},
    "Functional_Outcomes_Edema": {0: "No Edema", 1: "Edema"},
    "Functional_Outcomes_Infection": {0: "No Infection", 1: "Infection"},
    "Functional_Outcomes_Fast_Recovery": {0: "Not Fast Recovery", 1: "Fast Recovery"},
    "Functional_Outcomes_Cosmetic_Satisfaction": {
        0: "No Satisfaction",
        1: "Satisfaction",
    },
}

for item, title in zip(functional_list, functional_title_list):
    outcome_labels = functional_labels[item]

    # pd.crosstab sorts its row index alphabetically, so e.g. "Bleeding" would
    # come before "No Bleeding". Re-order the rows to the 0 -> 1 label order so
    # that the tick labels taken from the index always describe the bars they
    # sit under (previously the ticks were overwritten in dict order, which
    # swapped the labels for several outcomes). fill_value=0 keeps a category
    # on the axis even when it never occurs in the data.
    counts = pd.crosstab(
        circ_eda[item].map(outcome_labels),
        circ_eda["Surgical_Technique"],
    ).reindex(list(outcome_labels.values()), fill_value=0)

    ax = counts.plot(
        kind="bar",
        rot=0,
        color=technique_colors,
    )

    ax.set_ylabel("Count")
    ax.set_title(f"Prevalence of {title} by Surgical Technique")
    ax.set_xlabel(title)

    # NOTE: the "Prevalance" spelling is kept in the file names on purpose so
    # that existing references to the saved images keep working.
    plt.savefig(
        os.path.join(
            image_path_png, f"Prevalance_of_{title}_by_surgical_technique.png"
        ),
    )

    plt.savefig(
        os.path.join(
            image_path_svg, f"Prevalance_of_{title}_by_surgical_technique.svg"
        ),
    )


plt.show()

## Socioeconomic Impacts

### Religious Affiliation by Geographical Origin

In [None]:
# List the distinct values found in the Cultural_Religious_Affiliation column
circ_eda["Cultural_Religious_Affiliation"].unique().tolist()

In [None]:
# Contingency table: religious affiliation (rows) by geographical origin
# (columns), with row/column totals labelled "Total"
ct = pd.crosstab(
    index=circ_eda["Cultural_Religious_Affiliation"],
    columns=circ_eda["Geographical_Origin"],
    margins=True,
    margins_name="Total",
)

# Display the table with the "Total" column highlighted in brown
highlight_columns(ct, "Total", color="brown")

In [None]:
# Heatmap of the crosstab between geographical origin and religion.
# The crosstab has Geographical_Origin as rows and
# Cultural_Religious_Affiliation as columns; sns.heatmap draws the columns
# along the x-axis and the rows along the y-axis.
plt.figure(figsize=(9, 6))
sns.heatmap(
    pd.crosstab(
        circ_eda["Geographical_Origin"], circ_eda["Cultural_Religious_Affiliation"]
    ),
    annot=True,
    cmap="rocket_r",
    fmt="d",
)
plt.title("Cultural Religious Affiliation by Geographical Origin")
# The x-axis carries the religious affiliation categories (it was previously
# mislabelled "Geographical Origin"); label both axes explicitly.
plt.xlabel("Cultural Religious Affiliation")
plt.ylabel("Geographical Origin")
plt.savefig(
    os.path.join(image_path_png, "religion_by_geog_origin.png"),
    bbox_inches="tight",
)
plt.savefig(
    os.path.join(image_path_svg, "religion_by_geog_origin.svg"),
    bbox_inches="tight",
)
plt.show()

### Total Cost by Coverage Type

In [None]:
# Number of procedures per Cost_Type category, smallest count first
total_cost_ins_values = circ_eda["Cost_Type"].value_counts(ascending=True)
ins_bar_col = ["#1f77b4", "#c8544c", "#555555"]  # bar colours

ax = total_cost_ins_values.plot(
    kind="barh",
    rot=0,
    width=0.99,
    legend=False,
    color=ins_bar_col,
)

# Write each count inside its bar, right-aligned 5 units before the bar's end
for bar_pos, n_procedures in enumerate(total_cost_ins_values.values):
    ax.text(
        n_procedures - 5,
        bar_pos,
        str(n_procedures),
        ha="right",
        va="center",
        color="yellow",
    )

ax.set_title("Total Number of Procedures by Coverage Category")
ax.set_xlabel("Number of Procedures")
ax.set_ylabel("Cost Type")

# Write the figure to the SVG folder first, then to the PNG folder
for save_dir, save_name in (
    (image_path_svg, "total_number_by_coverage.svg"),
    (image_path_png, "total_number_by_coverage.png"),
):
    plt.savefig(os.path.join(save_dir, save_name))
plt.show()

In [None]:
# Summed procedure cost per Cost_Type, rounded to 2 decimals, as a one-column
# frame named "Total_Cost"
total_cost_by_ins = (
    circ_eda.groupby("Cost_Type")["Cost_of_Procedure_euros"]
    .sum()
    .to_frame()
    .round(2)
    .rename(columns={"Cost_of_Procedure_euros": "Total_Cost"})
)

ax = total_cost_by_ins.plot(
    kind="barh",
    rot=0,
    width=0.99,
    legend=False,
    color=ins_bar_col,
)

# Recolour the bars one by one, cycling through ins_bar_col
for patch_pos, bar_patch in enumerate(ax.patches):
    bar_patch.set_facecolor(ins_bar_col[patch_pos % len(ins_bar_col)])

# Annotate every bar that has a positive total
for bar_pos, (cost_type, record) in enumerate(total_cost_by_ins.iterrows()):
    if not record["Total_Cost"] > 0:
        continue
    # The label is left-aligned and starts 10000 units before the bar's end,
    # at the bar's vertical position
    ax.text(
        record["Total_Cost"] - 10000,
        bar_pos,
        f"{record['Total_Cost']} €",
        ha="left",
        va="center",
        color="yellow",
    )

ax.set_title("Total Cost of Procedure by Coverage Category")
ax.set_xlabel("Cost (in €)")
ax.set_ylabel("Cost Type")

# Write the figure to the SVG folder first, then to the PNG folder
for save_dir, save_name in (
    (image_path_svg, "total_cost_by_coverage.svg"),
    (image_path_png, "total_cost_by_coverage.png"),
):
    plt.savefig(os.path.join(save_dir, save_name))

plt.show()
total_cost_by_ins

### Average Cost by Coverage Type

In [None]:
# Mean procedure cost per Cost_Type, rounded to 2 decimals, as a one-column
# frame named "Average_Cost"
avg_cost_by_ins = (
    circ_eda.groupby("Cost_Type")["Cost_of_Procedure_euros"]
    .agg("mean")
    .to_frame()
    .round(2)
    .rename(columns={"Cost_of_Procedure_euros": "Average_Cost"})
)

ax = avg_cost_by_ins.plot(
    kind="barh",
    rot=0,
    width=0.99,
    legend=False,
)

# Category names (the frame's index) as a plain list
y_labels = avg_cost_by_ins.index.tolist()

# Recolour the bars one by one, cycling through ins_bar_col
for patch_pos, bar_patch in enumerate(ax.patches):
    bar_patch.set_facecolor(ins_bar_col[patch_pos % len(ins_bar_col)])

# Annotate every bar that has a positive average
for bar_pos, (cost_type, record) in enumerate(avg_cost_by_ins.iterrows()):
    if not record["Average_Cost"] > 0:
        continue
    # The label is left-aligned and starts 200 units before the bar's end,
    # at the bar's vertical position
    ax.text(
        record["Average_Cost"] - 200,
        bar_pos,
        f"{record['Average_Cost']} €",
        ha="left",
        va="center",
        color="yellow",
    )

ax.set_title("Average Cost of Procedure by Coverage Category")
ax.set_xlabel("Cost (in €)")
ax.set_ylabel("Cost Type")

# Write the figure to the SVG folder first, then to the PNG folder
for save_dir, save_name in (
    (image_path_svg, "avg_cost_by_coverage.svg"),
    (image_path_png, "avg_cost_by_coverage.png"),
):
    plt.savefig(os.path.join(save_dir, save_name))


plt.show()
avg_cost_by_ins

### Number of Patients by Country of Origin

In [None]:
# Horizontal bar chart of row counts per Geographical_Origin value,
# sorted ascending by count
ax = circ_eda["Geographical_Origin"].value_counts(ascending=True).plot(kind="barh")
ax.set_title("Number of Patients by Country of Origin")
ax.set_xlabel("Number of Patients")
ax.set_ylabel("Country of Origin")

# Write the figure (tight bounding box) to the SVG folder, then the PNG folder
for save_dir, save_name in (
    (image_path_svg, "number_patients_by_country.svg"),
    (image_path_png, "number_patients_by_country.png"),
):
    plt.savefig(os.path.join(save_dir, save_name), bbox_inches="tight")
plt.show()

### Average Cost of Procedure by Country of Origin

In [None]:
# Horizontal bar chart of the mean Cost_of_Procedure_euros per
# Geographical_Origin value, sorted ascending by mean
ax = (
    circ_eda.groupby("Geographical_Origin")["Cost_of_Procedure_euros"]
    .agg("mean")
    .sort_values()
    .plot(kind="barh")
)

ax.set_title("Average Cost of Procedure by Country of Origin")
ax.set_xlabel("Cost (in €)")
ax.set_ylabel("Country of Origin")

# Write the figure (tight bounding box) to the SVG folder, then the PNG folder
for save_dir, save_name in (
    (image_path_svg, "cost_by_country.svg"),
    (image_path_png, "cost_by_country.png"),
):
    plt.savefig(os.path.join(save_dir, save_name), bbox_inches="tight")
plt.show()