In [57]:
%load_ext lab_black

The lab_black extension is already loaded. To reload it, use:
  %reload_ext lab_black


In [77]:
from pyprojroot import here
import os
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
import shap
import joblib

# Locate the data folder via pyprojroot's here() and make it the working
# directory. The relative CSV reads/writes in this notebook (read_csv below,
# the to_csv calls further down) resolve against this directory.
path_data = here("./data")
os.chdir(path_data)

# The column pandas labels "Unnamed: 0" is renamed to "Member ID".
# NOTE(review): presumably the CSV was saved with its index as an unnamed
# first column — confirm against the file.
data = pd.read_csv("data_er_visits.csv").rename(columns={"Unnamed: 0": "Member ID"})
data.head()

Unnamed: 0,Member ID,Hospital ID,Age 60+,High Cholesterol,Diabetes,Preventative Services,ER Visit
0,0,1,0,0,0,1,0
1,1,1,1,0,0,0,1
2,2,3,1,0,0,1,0
3,3,1,0,1,0,0,0
4,4,3,0,0,0,1,0


In [59]:
# Build the design matrix: the outcome and both ID columns are removed from
# the predictors, and the outcome column alone becomes the target.
X = data.drop(columns=["ER Visit", "Hospital ID", "Member ID"])
y = data["ER Visit"]

# Hold out 20% of the rows for testing (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Fit the XGBoost classifier on the training portion
model = xgb.XGBClassifier(eval_metric="logloss", use_label_encoder=False)
model.fit(X_train, y_train)

# Make predictions (optional, to evaluate the model)
predictions = model.predict(X_test)

In [80]:
# Save the fitted model into the production source folder.
# The destination is passed as a full path instead of temporarily changing
# the working directory: if the dump fails, the kernel is not left stranded
# in src/production, and the relative CSV paths used elsewhere in the
# notebook keep resolving against the data directory.
path_production = here("./src/production")
joblib.dump(model, os.path.join(path_production, "model_drivers.joblib"));

In [60]:
# Build a SHAP explainer for the fitted model, with the training features as
# background data and contributions reported on the probability scale.
# NOTE(review): shap.Explainer presumably dispatches to its tree explainer
# for an XGBoost model — confirm.
explainer = shap.Explainer(
    model, X_train, model_output="probability", feature_perturbation="interventional"
)

# Calculate SHAP values for every row - this might take some time for larger
# datasets. X is the feature matrix built for training (outcome and ID
# columns removed), so the explained columns cannot drift from the model's
# inputs.
shap_values = explainer(X)



In [61]:
# Positional index -> feature name, in the column order of the feature
# matrix X that the model was trained on.
column_mapping = dict(enumerate(X.columns))

# One row of SHAP values per row of `data`. Carrying over data's index keeps
# the rows aligned when the ID columns are concatenated back on, even if
# `data` does not have a default 0..n-1 index; a row-count mismatch raises
# here instead of silently misaligning later.
data_shap_pd = pd.DataFrame(shap_values.values, index=data.index).rename(
    columns=column_mapping
)

In [62]:
# Keep only the feature columns (outcome and both ID columns removed)
data_no_id_outcome = data.drop(["ER Visit", "Hospital ID", "Member ID"], axis=1)

# Replace every value with its percentile rank within its own column
data_percentile = data_no_id_outcome.rank(pct=True, method="average")

Step: Add back the hospital IDs and the unique member IDs

In [63]:
# The two identifier columns are shared by both outputs
id_columns = data[["Member ID", "Hospital ID"]]

# Percentile ranks with the identifiers attached (aligned on the row index)
data_percentile_id = pd.concat([id_columns, data_percentile], axis="columns")

# Per-member SHAP values with the identifiers attached (aligned on the row index)
data_shap_id = pd.concat([id_columns, data_shap_pd], axis="columns")

In [68]:
# Write the per-member SHAP values (with Member ID / Hospital ID) to a CSV in
# the current working directory. The DataFrame index is written as well,
# since to_csv is called with its defaults.
data_shap_id.to_csv("data_shap_ind.csv")

In [69]:
import pandas as pd


def median_shap_for_high_percentiles(percentile_df, shap_df, id_col):
    """Summarise SHAP values of the above-median rows, per group and feature.

    For every distinct value of ``id_col`` in ``percentile_df`` and every
    other column of ``percentile_df``, the rows of that group whose percentile
    is strictly greater than the group's median percentile are selected. The
    median of the same column in ``shap_df`` is then taken over those rows,
    matched by index label.

    Parameters
    ----------
    percentile_df : pandas.DataFrame
        Contains ``id_col`` plus one percentile column per feature.
    shap_df : pandas.DataFrame
        Contains ``id_col`` and a column for every feature column of
        ``percentile_df``; it must share index labels with ``percentile_df``.
    id_col : str
        Name of the grouping column present in both frames.

    Returns
    -------
    pandas.DataFrame
        Long-format frame with columns ``id_col``, ``"variable"`` and
        ``"median_shap"``: one row per (group, feature) pair. ``median_shap``
        is NaN when no row of the group lies strictly above its median.
    """
    # Every column other than the grouping key is treated as a feature
    feature_cols = [name for name in percentile_df.columns if name != id_col]

    summary = {id_col: [], "variable": [], "median_shap": []}

    for group_id in percentile_df[id_col].unique():
        # Restrict both frames to the current group
        group_percentiles = percentile_df[percentile_df[id_col] == group_id]
        group_shap = shap_df[shap_df[id_col] == group_id]

        for feature in feature_cols:
            ranks = group_percentiles[feature]

            # Index labels of the rows strictly above the group's median
            high_rows = ranks[ranks > ranks.median()].index

            summary[id_col].append(group_id)
            summary["variable"].append(feature)
            summary["median_shap"].append(group_shap.loc[high_rows, feature].median())

    return pd.DataFrame(summary)

In [70]:
# Example usage
result_df = (
    median_shap_for_high_percentiles(
        percentile_df=data_percentile_id.drop(columns=["Member ID"]),
        shap_df=data_shap_id,
        id_col="Hospital ID",
    )
    .fillna(-0.20)
    .round(2)
)

In [72]:
# Jitter each hospital-level median SHAP value by a small uniform offset in
# [-0.07, 0.07). A seeded generator makes the exported numbers identical on
# every run, and the jitter is applied through assign() on a copy, so
# re-running this cell does not stack additional noise onto result_df.
noise_rng = np.random.default_rng(42)
noise = noise_rng.uniform(-0.07, 0.07, result_df["median_shap"].shape)

# Rename to the export schema (Driver / Impact), round to 2 dp and drop any
# row describing the Member ID column.
data_shap_hospital = (
    result_df.assign(median_shap=result_df["median_shap"] + noise)
    .rename(columns={"variable": "Driver", "median_shap": "Impact"})
    .round(2)
    .query("Driver != 'Member ID' ")
    .reset_index(drop=True)
)

In [74]:
# Write the hospital-level Driver / Impact table to a CSV in the current
# working directory. The DataFrame index is written as well, since to_csv is
# called with its defaults.
data_shap_hospital.to_csv("data_shap_hospital.csv")