### Imports

In [1]:
import pandas as pd
import numpy as np
from utility_functions import load_file, pickle_file, starting_run, finished_run
from analysis_variables import de_col_keys, de_col_values
from scipy.stats import f_oneway, sem, norm

full_dataset = load_file("summary_costs_enhanced.pickle")
filtered_dataset_codes = load_file("fully_filtered_codes.pickle")
code_category_dict = load_file("code-category-dict.pickle")

### Demographic Column Classification

In [2]:
full_dataset["Unmarried"] = full_dataset["marital_status"].ne('Married')
full_dataset["Married"] = full_dataset["marital_status"].eq('Married')
full_dataset["Uninsured"] = full_dataset["payer"].isin(['No charge', 'Self-pay'])
full_dataset["Medicare"] = full_dataset["payer"].eq('Medicare')
full_dataset["Medicaid"] = full_dataset["payer"].eq('Medicaid')
full_dataset["Private Insurance"] = full_dataset["payer"].eq('Private insurance')
full_dataset["White"] = full_dataset["race"].eq("White")
full_dataset["African American or Hispanic"] = full_dataset["race"].isin(['African-American', "Hispanic"])
full_dataset["Female"] = full_dataset["gender"].eq("Female")
full_dataset["Died"] = full_dataset["Died"].eq(1)
del full_dataset["Admitted"] #step needed in this analysis due to "Admitted" column in de_col_values
for key in de_col_keys:
    full_dataset = full_dataset.join([full_dataset[key].eq(val).rename(val).loc[full_dataset.index] for val in de_col_values[key]])

dem_dataset = full_dataset[[
    'median_zip_income', 
    'age',
    'Cost',
    "Unmarried", 
    "Married", 
    "Uninsured", 
    "Medicare",
    "Medicaid",
    "Private Insurance",
    "White",
    "African American or Hispanic",
    "Female",
    'ED Readmissions',
    'Inpatient Readmissions',
    'Died',
    *pd.core.common.flatten(de_col_values.values())
]].copy()

category_dict = {
    'Cost': 'Outcome',
    'ED Readmissions': 'Outcome',
    'Inpatient Readmissions': 'Outcome',
    'Died': 'Outcome',
    'admitted': 'Clinical Pathway',
    'alcohol-related disorders': 'Comorbidity',
    'aspirin': 'Comorbidity',
    'biliary colic with inflammation': 'Comorbidity',
    'diabetes mellitus': 'Comorbidity',
    'hyperlipidemia': 'Comorbidity',
    'ischemic heart disease': 'Comorbidity',
    'mood disorders': 'Comorbidity',
    'nicotine dependence': 'Comorbidity',
    'obesity': 'Comorbidity',
    'systemic hypertension': 'Comorbidity',
    'Medicaid': 'Insurance Status',
    'Medicare': 'Insurance Status',
    'Private Insurance': 'Insurance Status',
    'Uninsured': 'Insurance Status',
    'age': 'Demographic',
    'African American or Hispanic': 'Demographic',
    'Female': 'Demographic',
    'Married': 'Demographic',
    'Unmarried': 'Demographic',
    'White': 'Demographic',
    'median_zip_income': 'Demographic',
    **{value: 'Clinical Pathway' for value in pd.core.common.flatten(de_col_values.values())}
}

### ICD-10 Diagnosis Code Classification

In [3]:
category_list = []
visit_codes = filtered_dataset_codes.reset_index().\
                groupby("visit_link")["codes"].unique().apply(lambda x: [st.strip() for st in x])
for key, values in code_category_dict.items():
    category_list.append(visit_codes.transform(
        lambda x: any([any([code.startswith(value) for value in values]) for code in x])
    ).rename(key))
category_status = pd.DataFrame(category_list).astype("int").T
category_status.index = category_status.index.astype("int")
category_status = category_status.loc[dem_dataset.index, :] # to remove rows dropped in 2.4
pickle_file("category_status_filtered.pickle", category_status)

### Create Summary Table

In [22]:
def create_summary(groupby_col, filter_criteria="Cost >= 0"): #cost should always be positive, making this a universal filter
    num_full_dataset = dem_dataset.query(filter_criteria).join(category_status, how="inner")\
        .astype("int").join(full_dataset[groupby_col]).groupby(groupby_col)
    #         .join(full_dataset['Admitted'].map({True: 'Admitted', False: 'Not Admitted'}))\
    agg_table = num_full_dataset.agg(['mean', 'sum', 'sem']).set_index(
        num_full_dataset.count().iloc[:, 0].reset_index().apply(
            lambda row: f"{row[0]} (N={row[1]})",
            axis=1
        )
    ).T.sort_index()
    summary_table = agg_table.reset_index().set_index("level_1").groupby("level_0").aggregate(
       lambda col: (col.loc["mean"], col.loc["sum"], norm.interval(alpha=0.95,loc=0,scale=col.loc["sem"])[1]) if col.loc["mean"] != 0 else (0, col.loc["sum"], 0)
    ).applymap(
        lambda val: f"{round(val[1], 2)} ({round(val[0]*100,0)}%)" if val[0] <= 1 else f"{round(val[0], 2)} ± {round(val[2], 2)}"
    )
    summary_table["ANOVA P"] = ["p < 0.01" if f_oneway(*[x for _, x in num_full_dataset[col]]).pvalue < 0.01 else "p > 0.01" for col in agg_table.index.levels[0]]
    summary_table["category"] = summary_table.index.map(category_dict)
    summary_table.index = summary_table.index.astype("str") # just in case one of the de_col_values is a number or boolean
    summary_table = summary_table.reset_index().set_index(['category', 'level_0', 'ANOVA P']).sort_index()
    return summary_table

In [23]:
# create_summary(de_col_keys[0]).to_excel(f"../tables/Table 1 {de_col_keys[0]}.xlsx")
create_summary(de_col_keys[0])

  lower_bound = _a * scale + loc
  upper_bound = _b * scale + loc


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,+ Suboxone (N=21),- Suboxone (N=15904)
category,level_0,ANOVA P,Unnamed: 3_level_1,Unnamed: 4_level_1
Clinical Pathway,+ Suboxone,p > 0.01,21.0 (100.0%),0.0 (0%)
Clinical Pathway,- Suboxone,p > 0.01,0.0 (0%),15904.0 (100.0%)
Clinical Pathway,0,p > 0.01,0.0 (0%),15904.0 (100.0%)
Clinical Pathway,12,p > 0.01,5.0 (24.0%),0.0 (0%)
Clinical Pathway,2,p > 0.01,1.0 (5.0%),0.0 (0%)
Clinical Pathway,4,p > 0.01,0.0 (0%),0.0 (0%)
Clinical Pathway,8,p > 0.01,15.0 (71.0%),0.0 (0%)
Clinical Pathway,No Withdrawal,p < 0.01,21.0 (100.0%),15904.0 (100.0%)
Clinical Pathway,Not Overdose,p < 0.01,21.0 (100.0%),15904.0 (100.0%)
Clinical Pathway,Overdose,p < 0.01,0.0 (0%),0.0 (0%)


In [6]:
create_summary(de_col_keys[0], "Complicated == False").to_excel(f"../tables/Table 1 {de_col_keys[0]} Uncomplicated.xlsx")
create_summary(de_col_keys[0], "Complicated == False")

  lower_bound = _a * scale + loc
  upper_bound = _b * scale + loc


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Admitted (N=505),Not Admitted (N=285)
category,level_0,ANOVA P,Unnamed: 3_level_1,Unnamed: 4_level_1
Clinical Pathway,Admitted,p < 0.01,505.0 (100.0%),0.0 (0%)
Clinical Pathway,Complicated,p > 0.01,0.0 (0%),0.0 (0%)
Clinical Pathway,Delayed,p < 0.01,51.0 (10.0%),57.0 (20.0%)
Clinical Pathway,Did Not Obtain Surgery,p < 0.01,335.0 (66.0%),228.0 (80.0%)
Clinical Pathway,Immediate,p < 0.01,119.0 (24.0%),0.0 (0%)
Clinical Pathway,New Cholecystitis,p > 0.01,0.0 (0%),0.0 (0%)
Clinical Pathway,No New Cholecystitis,p > 0.01,505.0 (100.0%),285.0 (100.0%)
Clinical Pathway,No Surgery,p < 0.01,335.0 (66.0%),228.0 (80.0%)
Clinical Pathway,Not Admitted,p < 0.01,0.0 (0%),285.0 (100.0%)
Clinical Pathway,Obtained Surgery,p < 0.01,170.0 (34.0%),57.0 (20.0%)


In [7]:
create_summary(de_col_keys[2], "Complicated == True").to_excel(f"../tables/Table 1 {de_col_keys[0]} Complicated.xlsx")
create_summary(de_col_keys[2], "Complicated == True")

  lower_bound = _a * scale + loc
  upper_bound = _b * scale + loc


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Delayed (N=205),Immediate (N=3607),No Surgery (N=1014),Return Emergency (N=56)
category,level_0,ANOVA P,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Clinical Pathway,Admitted,p < 0.01,171.0 (83.0%),3607.0 (100.0%),881.0 (87.0%),33.0 (59.0%)
Clinical Pathway,Complicated,p > 0.01,205.0 (100.0%),3607.0 (100.0%),1014.0 (100.0%),56.0 (100.0%)
Clinical Pathway,Delayed,p < 0.01,205.0 (100.0%),0.0 (0%),0.0 (0%),0.0 (0%)
Clinical Pathway,Did Not Obtain Surgery,p < 0.01,0.0 (0%),0.0 (0%),1014.0 (100.0%),0.0 (0%)
Clinical Pathway,Immediate,p < 0.01,0.0 (0%),3607.0 (100.0%),0.0 (0%),0.0 (0%)
Clinical Pathway,New Cholecystitis,p > 0.01,0.0 (0%),0.0 (0%),0.0 (0%),0.0 (0%)
Clinical Pathway,No New Cholecystitis,p > 0.01,205.0 (100.0%),3607.0 (100.0%),1014.0 (100.0%),56.0 (100.0%)
Clinical Pathway,No Surgery,p < 0.01,0.0 (0%),0.0 (0%),1014.0 (100.0%),0.0 (0%)
Clinical Pathway,Not Admitted,p < 0.01,34.0 (17.0%),0.0 (0%),133.0 (13.0%),23.0 (41.0%)
Clinical Pathway,Obtained Surgery,p < 0.01,205.0 (100.0%),3607.0 (100.0%),0.0 (0%),56.0 (100.0%)


In [8]:
create_summary(de_col_keys[1]).to_excel(f"../tables/Table 1 {de_col_keys[1]}.xlsx")
create_summary(de_col_keys[1])

  lower_bound = _a * scale + loc
  upper_bound = _b * scale + loc


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Complicated (N=4882),Uncomplicated (N=790)
category,level_0,ANOVA P,Unnamed: 3_level_1,Unnamed: 4_level_1
Clinical Pathway,Admitted,p < 0.01,4692.0 (96.0%),505.0 (64.0%)
Clinical Pathway,Complicated,p < 0.01,4882.0 (100.0%),0.0 (0%)
Clinical Pathway,Delayed,p < 0.01,205.0 (4.0%),108.0 (14.0%)
Clinical Pathway,Did Not Obtain Surgery,p < 0.01,1014.0 (21.0%),563.0 (71.0%)
Clinical Pathway,Immediate,p < 0.01,3607.0 (74.0%),119.0 (15.0%)
Clinical Pathway,New Cholecystitis,p > 0.01,0.0 (0%),0.0 (0%)
Clinical Pathway,No New Cholecystitis,p > 0.01,4882.0 (100.0%),790.0 (100.0%)
Clinical Pathway,No Surgery,p < 0.01,1014.0 (21.0%),563.0 (71.0%)
Clinical Pathway,Not Admitted,p < 0.01,190.0 (4.0%),285.0 (36.0%)
Clinical Pathway,Obtained Surgery,p < 0.01,3868.0 (79.0%),227.0 (29.0%)


In [9]:
create_summary(de_col_keys[2]).to_excel(f"../tables/Table 1 {de_col_keys[2]}.xlsx")
create_summary(de_col_keys[2])

  lower_bound = _a * scale + loc
  upper_bound = _b * scale + loc


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Delayed (N=313),Immediate (N=3726),No Surgery (N=1577),Return Emergency (N=56)
category,level_0,ANOVA P,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Clinical Pathway,Admitted,p < 0.01,222.0 (71.0%),3726.0 (100.0%),1216.0 (77.0%),33.0 (59.0%)
Clinical Pathway,Complicated,p < 0.01,205.0 (65.0%),3607.0 (97.0%),1014.0 (64.0%),56.0 (100.0%)
Clinical Pathway,Delayed,p < 0.01,313.0 (100.0%),0.0 (0%),0.0 (0%),0.0 (0%)
Clinical Pathway,Did Not Obtain Surgery,p < 0.01,0.0 (0%),0.0 (0%),1577.0 (100.0%),0.0 (0%)
Clinical Pathway,Immediate,p < 0.01,0.0 (0%),3726.0 (100.0%),0.0 (0%),0.0 (0%)
Clinical Pathway,New Cholecystitis,p > 0.01,0.0 (0%),0.0 (0%),0.0 (0%),0.0 (0%)
Clinical Pathway,No New Cholecystitis,p > 0.01,313.0 (100.0%),3726.0 (100.0%),1577.0 (100.0%),56.0 (100.0%)
Clinical Pathway,No Surgery,p < 0.01,0.0 (0%),0.0 (0%),1577.0 (100.0%),0.0 (0%)
Clinical Pathway,Not Admitted,p < 0.01,91.0 (29.0%),0.0 (0%),361.0 (23.0%),23.0 (41.0%)
Clinical Pathway,Obtained Surgery,p < 0.01,313.0 (100.0%),3726.0 (100.0%),0.0 (0%),56.0 (100.0%)
