In [39]:
%load_ext lab_black

The lab_black extension is already loaded. To reload it, use:
  %reload_ext lab_black


In [40]:
from pyprojroot import here
import os
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
import shap
import joblib

# Make the project's data directory the working directory; the relative file
# names used for reading and writing in the cells below resolve against it.
path_data = here("./data")
os.chdir(path_data)

# Load the per-member-per-month cost table and preview its first rows.
data = pd.read_csv(filepath_or_buffer="data_pmpm.csv")
data.head(n=5)

Unnamed: 0,Hospital ID,High Blood Pressure,High Cholesterol,Diabetes,Preventative Services,Per Member Per Month Cost
0,1,0,0,0,1,3750.141373
1,1,1,0,0,0,3451.956282
2,3,1,0,0,1,4211.072976
3,1,0,1,0,0,2471.602577
4,3,0,0,0,1,4249.270849


In [41]:
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.metrics import median_absolute_error

# Prepare the data: every column except the target and the hospital
# identifier is used as a model feature.
X = data.drop(["Per Member Per Month Cost", "Hospital ID"], axis=1)
y = data["Per Member Per Month Cost"]

# Split the data into training (80%) and testing (20%) sets with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Train the XGBoost model for regression
model = xgb.XGBRegressor()
model.fit(X_train, y_train)

# Make predictions
predictions = model.predict(X_test)

# Evaluate the model using the median absolute error. The metric computed
# here has always been sklearn's median_absolute_error; the label now says so
# instead of calling it a "mean" error.
medae = median_absolute_error(y_test, predictions)
print("Median Absolute Error: ", medae)

pandas.Int64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.


Mean Absoulte Error:  1305.2200168336512


In [42]:
# Persist the fitted model twice: once next to the data and once in the
# production source tree. Both copies are written by explicit path rather
# than by changing the working directory, so a failed write can no longer
# leave the notebook in src/production and redirect the relative to_csv
# calls in later cells.
# `path_data` is the data directory resolved by `here("./data")` earlier in
# the notebook and is still the working directory at this point.
model_filename = "model_drivers.joblib"
path_production = here("./src/production")

joblib.dump(model, path_data / model_filename)
joblib.dump(model, path_production / model_filename)

In [43]:
# Build a SHAP explainer for the fitted model. X_train is passed as the
# second argument (background data), and the explainer is asked for raw model
# output with interventional feature perturbation.
# NOTE(review): shap.Explainer chooses the algorithm from the model type; a
# tree explainer is expected for an XGBoost model - confirm.
explainer = shap.Explainer(
    model,
    X_train,
    model_output="raw",
    feature_perturbation="interventional",
)

# Explain every row of the dataset (not just the training split), using the
# same feature columns the model was trained on. This might take some time
# for larger datasets.
features_all_rows = data.drop(columns=["Per Member Per Month Cost", "Hospital ID"])
shap_values = explainer(features_all_rows)



In [44]:
# The SHAP value matrix has positional (0, 1, 2, ...) columns. Map each
# position back to the feature name at the same position, assuming the SHAP
# values keep the column order of the feature frame they were computed from.
feature_names = data.drop(
    columns=["Per Member Per Month Cost", "Hospital ID"]
).columns
column_mapping = dict(enumerate(feature_names))

# Wrap the raw SHAP array in a DataFrame and apply the feature names
data_shap_pd = pd.DataFrame(shap_values.values).rename(columns=column_mapping)

In [45]:
# Keep only the driver columns (no identifier, no outcome), then replace each
# value with its percentile rank within its own column.
data_no_id_outcome = data.drop(["Hospital ID", "Per Member Per Month Cost"], axis=1)
data_percentile = data_no_id_outcome.rank(pct=True)

Step: Add back hospital IDs and add unique IDs

In [20]:
# Put the Hospital ID column back in front of both the percentile frame and
# the SHAP frame. pd.concat with axis=1 matches rows by index label, so this
# relies on all three frames sharing the same row index.
hospital_id_column = data[["Hospital ID"]]

data_percentile_id = pd.concat([hospital_id_column, data_percentile], axis=1)
data_shap_id = pd.concat([hospital_id_column, data_shap_pd], axis=1)

In [46]:
# Export the row-level SHAP values (with Hospital ID) to the working directory.
data_shap_id.to_csv(path_or_buf="data_shap_ind.csv")

In [47]:
import pandas as pd


def median_shap_for_high_percentiles(percentile_df, shap_df, id_col):
    """Summarise SHAP values over the above-median rows of each group.

    For every distinct value of ``id_col`` in ``percentile_df`` and every
    other column of ``percentile_df``, the rows of that group whose value is
    strictly greater than the group's median for the column are selected, and
    the median of the same column in ``shap_df`` is taken over those rows.

    Rows are matched between the two frames by index label, so both frames
    are expected to share an index, and ``shap_df`` must contain ``id_col``
    and every non-ID column of ``percentile_df``.

    Parameters
    ----------
    percentile_df : pandas.DataFrame
        Frame holding ``id_col`` plus one column per variable.
    shap_df : pandas.DataFrame
        Frame holding ``id_col`` plus the same variable columns.
    id_col : str
        Name of the grouping column present in both frames.

    Returns
    -------
    pandas.DataFrame
        One row per (group, variable) pair with the columns ``id_col``,
        ``"variable"`` and ``"median_shap"``. Groups appear in order of first
        occurrence and variables in column order. ``median_shap`` is NaN when
        no row of the group lies strictly above the group median (for example
        when every value in the group is tied).
    """
    # The set of variable columns does not change between groups.
    value_columns = [name for name in percentile_df.columns if name != id_col]

    group_ids, variables, medians = [], [], []

    for group_id in percentile_df[id_col].unique():
        # Restrict both frames to the rows belonging to this group.
        pct_group = percentile_df.loc[percentile_df[id_col] == group_id]
        shap_group = shap_df.loc[shap_df[id_col] == group_id]

        for column in value_columns:
            # Index labels of the rows strictly above the group's median.
            is_above_median = pct_group[column] > pct_group[column].median()
            high_rows = pct_group.loc[is_above_median].index

            group_ids.append(group_id)
            variables.append(column)
            medians.append(shap_group.loc[high_rows, column].median())

    return pd.DataFrame(
        {id_col: group_ids, "variable": variables, "median_shap": medians}
    )

In [49]:
# Median SHAP value per hospital and driver, taken over the rows whose
# percentile is above that hospital's median for the driver.
hospital_driver_medians = median_shap_for_high_percentiles(
    percentile_df=data_percentile_id,
    shap_df=data_shap_id,
    id_col="Hospital ID",
)

# Combinations with no above-median rows come back as NaN; they are replaced
# with a fixed value before rounding to two decimals.
# NOTE(review): -1200 is a hard-coded stand-in, not a computed value - confirm
# it is the intended fill.
missing_median_fill = -1200
result_df = hospital_driver_medians.fillna(missing_median_fill).round(2)

In [50]:
# Display the per-hospital, per-driver median SHAP table.
result_df

Unnamed: 0,Hospital ID,variable,median_shap
0,1,High Blood Pressure,1718.37
1,1,High Cholesterol,1020.47
2,1,Diabetes,1517.72
3,1,Preventative Services,-1200.0
4,3,High Blood Pressure,1718.37
5,3,High Cholesterol,1020.47
6,3,Diabetes,1517.72
7,3,Preventative Services,-1066.44
8,2,High Blood Pressure,1718.37
9,2,High Cholesterol,1020.47


In [51]:
# Jitter each hospital-level median SHAP value with uniform noise drawn from
# [-200, 200), then rename the columns for the exported table.
#
# The generator is seeded so the exported file is reproducible across runs,
# and the jittered values go into a new frame instead of being assigned back
# into result_df. That keeps the cell idempotent: re-running it no longer
# stacks fresh noise on top of already-jittered values, and result_df still
# matches what was displayed above.
noise_rng = np.random.default_rng(42)
noise = noise_rng.uniform(-200, 200, result_df["median_shap"].shape)

data_shap_hospital = (
    result_df.assign(median_shap=result_df["median_shap"] + noise)
    .rename(columns={"variable": "Driver", "median_shap": "Impact"})
    .round(2)
    .reset_index(drop=True)
)

In [52]:
# Display the hospital-level driver/impact table that is exported below.
data_shap_hospital

Unnamed: 0,Hospital ID,Driver,Impact
0,1,High Blood Pressure,1661.88
1,1,High Cholesterol,1095.58
2,1,Diabetes,1660.67
3,1,Preventative Services,-1212.19
4,3,High Blood Pressure,1548.51
5,3,High Cholesterol,1137.62
6,3,Diabetes,1564.09
7,3,Preventative Services,-907.4
8,2,High Blood Pressure,1872.72
9,2,High Cholesterol,840.72


In [53]:
# Export the hospital-level driver impacts to the working directory.
data_shap_hospital.to_csv(path_or_buf="data_shap_hospital.csv")