This notebook estimates conditional average treatment effects (CATE) of different surgeries on total charges (and ED revisits) for different subpopulations, using the [econml](https://econml.azurewebsites.net/spec/api.html#api-of-conditional-average-treatment-effect-package) library.

In [1]:
import warnings
warnings.filterwarnings('ignore') #done due to number of warnings from the econML API

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
from econml.drlearner import DRLearner, LinearDRLearner, ForestDRLearner
from econml.dml import ForestDML
from econml.cate_interpreter import SingleTreeCateInterpreter
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.linear_model import LassoCV, MultiTaskLassoCV
from utility_functions import load_file, pickle_file, starting_run, finished_run
from analysis_variables import de_col_name, de_col_values


Bad key "text.kerning_factor" on line 4 in
C:\Users\Michael\anaconda3\envs\HCUP-study\lib\site-packages\matplotlib\mpl-data\stylelib\_classic_test_patch.mplstyle.
You probably need to get an updated matplotlibrc file from
https://github.com/matplotlib/matplotlib/blob/v3.1.2/matplotlibrc.template
or from the matplotlib source distribution


In [3]:
# Log the start of the run, then load the two pickled inputs in a fixed order:
# first the summary dataset, then the category-status table.
starting_run()
dataset, category_status = map(
    load_file,
    ("summary_costs_enhanced.pickle", "category_status_filtered.pickle"),
)

Starting  21:01:08.352085


### Filter Dataset Columns and Convert Features

In [4]:
# Demographic columns that get one-hot encoded. Missing values are replaced
# with the string "-1" first, so "missing" becomes its own encoded category.
demographic_cols = ['marital_status', 'initial_discharge_quarter', 'gender', 'race', 'payer']
demographic_dataset = dataset[demographic_cols].fillna("-1")

# Dense (non-sparse) one-hot encoder fitted on the filled demographic columns.
enc = OneHotEncoder(sparse=False)
enc.fit(demographic_dataset)

# Feature matrix = one-hot demographics + age + category status.
# Both joins are outer joins, so rows present in only one of the inputs
# survive the join with NaNs; those rows are counted and then dropped below.
features = (
    pd.DataFrame(
        enc.transform(demographic_dataset),
        index=demographic_dataset.index,
        columns=enc.get_feature_names(demographic_dataset.columns),
    )
    .join(dataset["age"], how='outer')
    .join(category_status, how="outer")
)
print(f"Number of Null Rows: {features.isna().any(axis='columns').sum()}")
features = features.dropna()

# Treatment indicator: True where the `de_col_name` column equals the first
# entry of `de_col_values`. Restricted to the rows kept in `features`.
treatments = dataset.loc[features.index, de_col_name] == de_col_values[0]

# Outcome columns, aligned to the same rows as `features`.
outcomes = dataset.loc[features.index, ["ED_revisits", "Cost"]]

Number of Null Rows: 0


### Model Methods:

In [5]:
# Shared interpreter: a single tree of depth at most 3 with at least 10
# samples per leaf, configured to include model uncertainty.
intrp = SingleTreeCateInterpreter(
    max_depth=3,
    min_samples_leaf=10,
    include_model_uncertainty=True,
)


def plot_interpreter_fig(estimator, analysis_name):
    """Interpret a fitted estimator with the shared tree interpreter and save the plot.

    The interpreter is run against the notebook-level `features` frame, and
    the resulting tree is drawn on a new 70x30 inch figure and written to
    ``../figures/<analysis_name>_CATE_interpretation.png``.

    Parameters
    ----------
    estimator : fitted CATE estimator passed to ``intrp.interpret``.
    analysis_name : value used as the prefix of the output file name.
    """
    output_path = f"../figures/{analysis_name}_CATE_interpretation.png"

    intrp.interpret(estimator, X=features)

    # Very large canvas so the tree's node text remains legible.
    plt.figure(figsize=(70, 30))
    intrp.plot(fontsize=12, feature_names=features.columns)
    plt.savefig(output_path)

In [6]:
def run_model(outcome_var):
    """Fit a ForestDML estimator for one outcome and return row-level effects.

    Uses the notebook-level `outcomes`, `treatments` and `features` frames.
    After fitting, the tree interpretation figure for the estimator is saved
    via `plot_interpreter_fig`, using `outcome_var` as the analysis name.

    Parameters
    ----------
    outcome_var : column of `outcomes` to use as the outcome Y.

    Returns
    -------
    pandas.DataFrame indexed like `features` with columns ``effect``,
    ``lower_conf_interval`` and ``upper_conf_interval`` (interval computed
    with ``alpha=0.05``).
    """
    # NOTE(review): `treatments` is built elsewhere in this notebook as a
    # boolean Series but is modelled here with a LassoCV regressor; confirm
    # whether `discrete_treatment=True` with a classifier was intended.
    forest_model = ForestDML(
        model_y=LassoCV(),
        model_t=LassoCV(),
        random_state=42,
    )
    forest_model.fit(Y=outcomes[outcome_var], T=treatments, X=features)

    plot_interpreter_fig(forest_model, outcome_var)

    point_estimates = forest_model.effect(features)
    interval_low, interval_high = forest_model.effect_interval(features, alpha=0.05)

    result_columns = {
        "effect": point_estimates,
        "lower_conf_interval": interval_low,
        "upper_conf_interval": interval_high,
    }
    return pd.DataFrame(result_columns, index=features.index)

### Model Running:

In [7]:
# Fit one model per outcome and persist its row-level effect estimates.
# NOTE(review): the table is written as CSV into ../pickled_data with no file
# extension — confirm downstream readers expect exactly this path.
for outcome in ("Cost", "ED_revisits"):
    starting_run(outcome)
    effect_estimates = run_model(outcome)
    effect_estimates.to_csv(f"../pickled_data/{outcome} CATE Effect")
finished_run()

Starting Cost 19:36:38.462450
Starting ED_revisits 19:36:46.987911
Finished  19:36:55.991571


In [5]:
# Mean Cost for every combination of the inflammation indicator and the
# treatment flag (a raw, unadjusted comparison).
(
    features.loc[:, ["has biliary colic with inflammation"]]
    .join(treatments)
    .join(outcomes.loc[:, "Cost"])
    .groupby(["has biliary colic with inflammation", "surgery_type"])
    .agg("mean")
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Cost
has biliary colic with inflammation,surgery_type,Unnamed: 2_level_1
0,False,6013.168402
0,True,11982.029717
1,False,9244.339371
1,True,11010.043108


In [6]:
# Per treatment group: mean of the inflammation indicator and mean Cost
# (a raw, unadjusted comparison).
(
    features.loc[:, ["has biliary colic with inflammation"]]
    .join(treatments)
    .join(outcomes.loc[:, "Cost"])
    .groupby(["surgery_type"])
    .agg("mean")
)

Unnamed: 0_level_0,has biliary colic with inflammation,Cost
surgery_type,Unnamed: 1_level_1,Unnamed: 2_level_1
False,0.440828,7437.560338
True,0.946844,11061.71017
