# Modeling

## Import Requisite Libraries

In [None]:
######################## Standard Library Imports ##############################
import pandas as pd
import numpy as np
import os
import sys

from eda_toolkit import ensure_directory, generate_table1

######################## Modeling Library Imports ##############################
import shap
from model_tuner.pickleObjects import loadObjects
import model_tuner
import eda_toolkit
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt


# Add the parent directory to sys.path so project modules located there
# (e.g. 'functions.py', 'constants.py') can be imported
sys.path.append(os.path.join(os.pardir))

from constants import patient_id

# Report the interpreter and key package versions used by this notebook.
version_banner = (
    f"This project uses: \n \n Python {sys.version.split()[0]} \n model_tuner "
    f"{model_tuner.__version__} \n eda_toolkit {eda_toolkit.__version__}"
)
print(version_banner)

## Set Paths & Read in the Data

In [None]:
# Every location is resolved relative to `base_path`, the parent directory of
# the current working directory (one level up from 'notebooks').
base_path = os.pardir

# Data folder and the per-format image folders for this notebook's figures
data_path = os.path.join(base_path, "data")
images_root = os.path.join(base_path, "images")
image_path_png = os.path.join(images_root, "png_images", "modeling")
image_path_svg = os.path.join(images_root, "svg_images", "modeling")

# Make sure the data and image directories exist before they are used
for required_dir in (data_path, image_path_png, image_path_svg):
    ensure_directory(required_dir)

In [None]:
# Relative locations used by the remaining cells. `data_path` is rebound here
# and from this point on refers to the processed-data folder.
model_path = "../mlruns/models/"
data_raw = "../data/"
data_path = "../data/processed/"

In [None]:
# Load the raw study workbook from the raw-data folder into `df`.
raw_workbook = os.path.join(data_raw, "Laser Circumcision Excel 31.03.2024.xlsx")
df = pd.read_excel(raw_workbook)

In [None]:
# Keep only patients aged 18 and older. The explicit copy makes `df` an
# independent frame, so columns added to it in later cells do not trigger
# pandas' SettingWithCopyWarning.
df = df[df["Age (y)"] >= 18].copy()

In [None]:
# Show the column names of the raw data as a plain list.
df.columns.tolist()

## Load Models

In [None]:
# Locations of the pickled model artifacts, relative to `model_path`. The
# trailing comment on each entry is the training run the artifact comes from.
model_artifacts = {
    "lr": "./452642104975561062/8eab72fdaa134c209521879f18f19d06/artifacts/lr_Bleeding_Edema_Outcome/model.pkl",  # lr_smote_training
    "rf": "./452642104975561062/d18ee7233d0f40ae968e57b596b75ac7/artifacts/rf_Bleeding_Edema_Outcome/model.pkl",  # rf_over_training
    "svm": "./452642104975561062/18dc58511b9e45ebaf55308026701c18/artifacts/svm_Bleeding_Edema_Outcome/model.pkl",  # svm_orig_training
}

# Load every artifact (in the order listed above) and expose each one under
# its usual name.
loaded_models = {
    key: loadObjects(os.path.join(model_path, rel_path))
    for key, rel_path in model_artifacts.items()
}
model_lr = loaded_models["lr"]
model_rf = loaded_models["rf"]
model_svm = loaded_models["svm"]

In [None]:
# Read the feature matrix and the Bleeding/Edema outcome table from the
# processed-data folder.
features_file = os.path.join(data_path, "X.parquet")
outcome_file = os.path.join(data_path, "y_Bleeding_Edema_Outcome.parquet")

X = pd.read_parquet(features_file)
y = pd.read_parquet(outcome_file)

In [None]:
# Copy the index of X into a "patient_id" column of df.
# NOTE(review): the values are assigned by position, so this assumes df has
# the same number of rows as X and in the same order -- confirm.
df["patient_id"] = X.index

In [None]:
# Index df by the column named in the `patient_id` constant (imported from
# `constants`); drop=False keeps that column in the frame as well.
df = df.set_index(patient_id, drop=False)

In [None]:
# Load the pickled `stacked_crosstabs` object from the raw-data folder.
stacked_crosstabs_file = os.path.join(data_raw, "stacked_crosstabs.pkl")
stacked_crosstabs = loadObjects(stacked_crosstabs_file)

In [None]:
# Load the pickled `stacked_cross_surg_tech` object from the raw-data folder.
surg_tech_file = os.path.join(data_raw, "stacked_cross_surg_tech.pkl")
stacked_cross_surg_tech = loadObjects(surg_tech_file)

In [None]:
# Merge both mappings into a new dict; on duplicate keys the entry from
# `stacked_cross_surg_tech` wins.
combined = dict(stacked_crosstabs)
combined.update(stacked_cross_surg_tech)

In [None]:
# Persist the merged dictionary as 'freq_cols.pkl' in the raw-data folder.
out_path = os.path.join(data_raw, "freq_cols.pkl")
pd.to_pickle(obj=combined, filepath_or_buffer=out_path)

In [None]:
import numpy as np
import pandas as pd
import os
from sklearn.model_selection import StratifiedKFold

# Build out-of-fold (OOF) predicted probabilities for the three loaded models
# with a 10-fold stratified split, then save them together with the true
# labels.

# 1) Extract the **test** pipelines (these don't resample)
# NOTE(review): `.fit` below refits these pipeline objects in place, so after
# this cell the loaded models hold the fit from the last fold.
pipe_lr = model_lr.test_model
pipe_rf = model_rf.test_model
pipe_svm = model_svm.test_model

# 2) Use a 1-D target. `y` is read from parquet as a DataFrame (presumably a
#    single outcome column); handing that column vector to `fit` makes
#    scikit-learn emit a DataConversionWarning.
y_1d = y.iloc[:, 0]

# 3) Prepare empty OOF arrays (one slot per sample)
n = len(y)
lr_oof = np.zeros(n)
rf_oof = np.zeros(n)
svm_oof = np.zeros(n)

# 4) 10-fold splitter
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=222)

# 5) Manual CV loop
for train_idx, test_idx in cv.split(X, y_1d):
    X_tr, X_te = X.iloc[train_idx], X.iloc[test_idx]
    y_tr = y_1d.iloc[train_idx]

    # fit on the training fold
    pipe_lr.fit(X_tr, y_tr)
    pipe_rf.fit(X_tr, y_tr)
    pipe_svm.fit(X_tr, y_tr)

    # positive-class probability on the held-out fold
    lr_oof[test_idx] = pipe_lr.predict_proba(X_te)[:, 1]
    rf_oof[test_idx] = pipe_rf.predict_proba(X_te)[:, 1]
    svm_oof[test_idx] = pipe_svm.predict_proba(X_te)[:, 1]

# 6) True labels as a flat integer array (the OOF arrays are already 1-D)
true_arr = y.astype(int).to_numpy().ravel()

# 7) Build the DataFrame, indexed like X (patient_id)
df_all = pd.DataFrame(
    {
        "model_lr": lr_oof,
        "model_rf": rf_oof,
        "model_svm": svm_oof,
        "y_val": true_arr,
    },
    index=X.index,  # patient_id
)

# 8) Save. `reset_index(drop=True)` discards the index, so the CSV holds only
#    the three probability columns and `y_val`.
# NOTE(review): if the patient ids are wanted in the file, use
# `reset_index()` without `drop=True` instead.
out_path = os.path.join(data_raw, "models.csv")
df_all.reset_index(drop=True).to_csv(out_path, index=False)

print("Saved k-fold predictions + true labels to:", out_path)

In [None]:
# Display the table of out-of-fold probabilities and true labels.
df_all

In [None]:
# Discretise the continuous age column into labelled bands so it is easier to
# use in visualisation and analysis.
bin_ages = [18, 30, 40, 50, 60, 70, 80, 90, 100]

# One label per pair of consecutive edges, e.g. (18, 30) -> "18-29".
label_ages = [f"{lo}-{hi - 1}" for lo, hi in zip(bin_ages, bin_ages[1:])]

# right=False makes every band left-closed / right-open: [18, 30), [30, 40), ...
age_bands = pd.cut(
    df["Age (y)"],
    bins=bin_ages,
    labels=label_ages,
    right=False,
    include_lowest=True,
)
df["age_group"] = age_bands

In [None]:
# Display the data frame, now including the "age_group" column.
df

In [None]:
# Columns carried over from `df` into the ROC table.
roc_columns = [
    "age_group",
    "Cultural / Religious affiliation",
    "Geographical Origin",
    "Preoperative drugs (antibiotic)",
]
df_roc = df[roc_columns]

In [None]:
# Attach the out-of-fold SVM probabilities as "Predictions". `df_roc` has no
# "model_svm" column of its own (that column lives in `df_all`), so renaming
# it on `df_roc` does nothing; the values are taken from `df_all` instead and
# aligned on the index values the two frames share.
df_roc = df_roc.assign(Predictions=df_all["model_svm"])

# Attach the outcome from `y`, matching the "patient_id" index level of
# `df_roc` against the index of `y`; only rows present in both are kept.
df_roc = df_roc.join(y, how="inner", on="patient_id")

In [None]:
# Write the ROC table to the raw-data folder, leaving the index out.
roc_csv_path = os.path.join(data_raw, "df_preds_roc.csv")
df_roc.to_csv(roc_csv_path, index=False)

In [None]:
# Load the pickled 'freq_cols_capstone.pkl' object from the raw-data folder.
capstone_file = os.path.join(data_raw, "freq_cols_capstone.pkl")
capstone = pd.read_pickle(capstone_file)

In [None]:
# Display the keys of the merged `combined` dictionary.
combined.keys()