# Modeling

## Import Requisite Libraries

In [None]:
######################## Standard Library Imports ##############################
import pandas as pd
import numpy as np
import os
import sys

from eda_toolkit import ensure_directory, generate_table1

######################## Modeling Library Imports ##############################
import shap
from model_tuner.pickleObjects import loadObjects
import model_tuner
import eda_toolkit
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt


# Add the parent directory to sys.path so that modules located there can be
# imported (presumably including `constants`, imported just below).
sys.path.append(os.path.join(os.pardir))

from constants import patient_id

# Report the Python, model_tuner and eda_toolkit versions in use
print(
    f"This project uses: \n \n Python {sys.version.split()[0]} \n model_tuner "
    f"{model_tuner.__version__} \n eda_toolkit {eda_toolkit.__version__}"
)

## Set Paths & Read in the Data

In [None]:
# Define the base paths.
# `base_path` is the parent directory of the current working directory.
base_path = os.pardir

# Everything else hangs off `base_path`: the 'data' folder and the
# per-format image folders used for the modeling figures.
data_path = os.path.join(base_path, "data")
images_root = os.path.join(base_path, "images")
image_path_png = os.path.join(images_root, "png_images", "modeling")
image_path_svg = os.path.join(images_root, "svg_images", "modeling")

# Make sure the data and image directories exist
ensure_directory(data_path)
ensure_directory(image_path_png)
ensure_directory(image_path_svg)

In [None]:
# Re-point `data_path` at the processed-data folder (this overrides the
# `data_path` assigned earlier in the notebook) and set the folder that the
# saved model artifacts are loaded from.
data_path = "../data/processed/"
model_path = "../mlruns/models/"

In [None]:
# Load X.parquet from `data_path` (Parquet is a binary format, so there is no
# delimiter to configure) and preview the first rows.
df = pd.read_parquet(os.path.join(data_path, "X.parquet"))
df.head()

In [None]:
# Show the (rows, columns) dimensions of `df`
df.shape

In [None]:
# List every column name in `df`
df.columns.to_list()

In [None]:
# Count how many rows take each value of the "Diabetes" column
df["Diabetes"].value_counts()

In [None]:
# List every column name in `df`
# NOTE(review): this repeats an identical cell a few cells earlier and `df`
# has not changed in between — candidate for removal.
df.columns.to_list()

In [None]:
# Read X and the Bleeding/Edema outcome table from `data_path`, in that order
X, y = (
    pd.read_parquet(os.path.join(data_path, parquet_name))
    for parquet_name in ("X.parquet", "y_Bleeding_Edema_Outcome.parquet")
)

In [None]:
# Inner-join the outcome table `y` onto `df`: df's "patient_id" (column or
# index level) is matched against the index of `y`, and rows without a match
# on both sides are dropped.
df = df.join(y, how="inner", on="patient_id")

In [None]:
# svm_orig_training
# Build the artifact path first, then load the saved model object from it.
# NOTE(review): loadObjects presumably unpickles the file — only load
# artifacts from a trusted source; confirm against model_tuner's docs.
svm_model_file = os.path.join(
    model_path,
    "./452642104975561062/18dc58511b9e45ebaf55308026701c18/artifacts/svm_Bleeding_Edema_Outcome/model.pkl",
)
model_svm = loadObjects(svm_model_file)

In [None]:
# Models under evaluation and their display titles (same order in both lists)
pipelines_or_models = [model_svm]
model_titles = ["Support Vector Machines"]

# `model_svm.threshold` is a mapping; take the first value stored in it and
# key it by the model's display title.
svm_threshold = next(iter(model_svm.threshold.values()))
thresholds = {"Support Vector Machines": svm_threshold}

## Summarize Model Performance

In [None]:
# Display the `pipelines_or_models` list defined above
pipelines_or_models

In [None]:
from model_metrics import summarize_model_performance

# Summarise the loaded model(s) on X / y using their per-model thresholds;
# return_df=True asks for the summary back as a DataFrame.
table3 = summarize_model_performance(
    model=pipelines_or_models,
    model_title=model_titles,
    model_threshold=thresholds,
    X=X,
    y=y,
    return_df=True,
)

In [None]:
# Display the summary returned by summarize_model_performance
table3

In [None]:
# Re-read X and the Bleeding/Edema outcome table from `data_path`
features_file = os.path.join(data_path, "X.parquet")
outcome_file = os.path.join(data_path, "y_Bleeding_Edema_Outcome.parquet")
X = pd.read_parquet(features_file)
y = pd.read_parquet(outcome_file)

In [None]:
# Load the per-patient prediction table written to
# shap_predictions_Bleeding_Edema_Outcome.csv.
explainer = pd.read_csv(
    os.path.join(data_path, "shap_predictions_Bleeding_Edema_Outcome.csv")
)

# Index by patient_id FIRST, then cast that index to int. Casting before
# set_index only touches the default row index, which set_index then throws
# away, so the patient_id index itself would never be converted.
explainer = explainer.set_index("patient_id")
explainer.index = explainer.index.astype(int)

# Give `df` an int index as well so both frames use the same key dtype
# when they are joined.
df.index = df.index.astype(int)

In [None]:
# Display the table loaded from shap_predictions_Bleeding_Edema_Outcome.csv
explainer

In [None]:
# Inner-join `explainer` with `df`: the key named by the imported
# `patient_id` constant is looked up in `explainer` and matched against the
# index of `df`; only rows present in both are kept.
# NOTE(review): the earlier df/y join passes the literal "patient_id" while
# this one passes the constant from `constants` — presumably the same value;
# confirm and use one form consistently.
merged_explainer = explainer.join(df, how="inner", on=patient_id)

In [None]:
# Write the merged table to explainer_df.csv under `data_path`; the index is
# written as well because to_csv defaults to index=True.
merged_explainer.to_csv(os.path.join(data_path, "explainer_df.csv"))

In [None]:
# Display the merged explainer/df table
merged_explainer

## Bleeding, Edema, Pain Prevalence

In [None]:
# Outcome distribution: relative frequencies first, then raw counts
outcome = merged_explainer["Bleeding_Edema_Outcome"]
print(outcome.value_counts(normalize=True), "\n")
print(outcome.value_counts())

In [None]:
# Show the rows with Surgical_Technique == 0 whose Bleeding_Edema_Outcome is 1
# and whose `y_pred` value is above 0.24.
# NOTE(review): the two cells that follow filter on `y_pred_proba >= 0.24`,
# whereas this one uses `y_pred` with a strict `>` — confirm which column and
# operator are intended, and whether 0.24 should come from `thresholds`.
merged_explainer[
    (merged_explainer["Surgical_Technique"] == 0)
    & (merged_explainer["Bleeding_Edema_Outcome"] == 1)
    & (merged_explainer["y_pred"] > 0.24)
]

### Laser Circumcision W/ Predictions Over Threshold

In [None]:
# Outcome counts among Surgical_Technique == 1 rows whose predicted
# probability is at or above 0.24
laser_over_threshold = (merged_explainer["Surgical_Technique"] == 1) & (
    merged_explainer["y_pred_proba"] >= 0.24
)
merged_explainer.loc[laser_over_threshold, "Bleeding_Edema_Outcome"].value_counts()

### Traditional Circumcision W/ Predictions Over Threshold

In [None]:
# Outcome counts among Surgical_Technique == 0 rows whose predicted
# probability is at or above 0.24
traditional_over_threshold = (merged_explainer["Surgical_Technique"] == 0) & (
    merged_explainer["y_pred_proba"] >= 0.24
)
merged_explainer.loc[
    traditional_over_threshold, "Bleeding_Edema_Outcome"
].value_counts()