### Imports

In [1]:
import pandas as pd
import numpy as np
from utility_functions import load_file, pickle_file, starting_run, finished_run
from analysis_variables import de_col_name
from  scipy.stats import f_oneway, sem, norm

full_dataset = load_file("summary_costs_enhanced.pickle")
filtered_dataset_codes = load_file("fully_filtered_codes.pickle")
code_category_dict = load_file("code-category-dict.pickle")

### Demographic Column Classification

In [2]:
full_dataset["is_unmarried"] = full_dataset["marital_status"].ne('Married')
full_dataset["is_married"] = full_dataset["marital_status"].eq('Married')
full_dataset["is_uninsured"] = full_dataset["payer"].isin(['No charge', 'Self-pay'])
full_dataset["has_medicare"] = full_dataset["payer"].eq('Medicare')
full_dataset["has_medicaid"] = full_dataset["payer"].eq('Medicaid')
full_dataset["has_private_insurance"] = full_dataset["payer"].eq('Private insurance')
full_dataset["is_white"] = full_dataset["race"].eq("White")
full_dataset["is_african_american_or_hispanic"] = full_dataset["race"].isin(['African-American', "Hispanic"])
full_dataset["is_female"] = full_dataset["gender"].eq("Female")
full_dataset["died"] = full_dataset["died"].eq(1)

dem_dataset = full_dataset[[
    'median_zip_income', 
    'age',
    'Cost',
    'is_unmarried', 
    'is_married', 
    'is_uninsured', 
    'has_medicare',
    'has_medicaid',
    'has_private_insurance',
    'is_female',
    'is_white',
    'is_african_american_or_hispanic',
    'ED_revisits',
    'inpatient_visits',
    'died'
]].copy()

### ICD-10 Diagnosis Code Classification

In [3]:
category_list = []
visit_codes = filtered_dataset_codes.reset_index().\
                groupby("visit_link")["codes"].unique().apply(lambda x: [st.strip() for st in x])
for key, values in code_category_dict.items():
    category_list.append(visit_codes.transform(
        lambda x: any([any([code.startswith(value) for value in values]) for code in x])
    ).rename(f"has {key}"))
category_status = pd.DataFrame(category_list).astype("int").T
category_status.index = category_status.index.astype("int")
category_status = category_status.loc[dem_dataset.index, :] # to remove rows dropped in 2.4
pickle_file("category_status_filtered.pickle", category_status)

### Create Summary Table

In [4]:
num_full_dataset = dem_dataset.join(category_status, how="outer").astype("int")\
    .join(full_dataset[de_col_name])\
    .groupby([de_col_name])
summary_table = num_full_dataset.agg(['mean', 'sem']).set_index(
    num_full_dataset.count().iloc[:, 0].reset_index().apply(
        lambda row: f"{row[0]} (N={row[1]})",
        axis=1
    )
).T

In [6]:
#Create table of confidence intervals
ci_table = summary_table.reset_index().set_index("level_1").groupby("level_0").aggregate(
   lambda col: norm.interval(alpha=0.95,loc=col.loc["mean"],scale=col.loc["sem"]) if col.loc["mean"] != 0 else (np.nan, np.nan)
).applymap(
    lambda val: f"({round(val[0], 2)}-{round(val[1],2)})"
)
#Concat with existing table
ci_table["level_1"] = "CI"
summary_table = pd.concat([
    ci_table.reset_index().set_index(["level_0", "level_1"]),
    summary_table
], levels=["level_0", "level_1"]).sort_index()

# #Create ANOVA Column
# summary_table["ANOVA P"] = np.repeat([f_oneway(*[x for _, x in num_full_dataset[col]]).pvalue for col in summary_table.index.levels[0]], 3)

# #Reindex table
# summary_table = summary_table.reset_index().set_index(["level_0", "ANOVA P", "level_1"])

In [7]:
summary_table

Unnamed: 0_level_0,Unnamed: 1_level_0,No Surgery (N=706)
level_0,level_1,Unnamed: 2_level_1
Cost,CI,(601.47-1153.6)
Cost,CI,(601.47-1153.6)
Cost,mean,877.537
Cost,sem,140.852
ED_revisits,CI,(0.03-0.06)
...,...,...
is_white,sem,0.0174216
median_zip_income,CI,(2.54-2.71)
median_zip_income,CI,(2.54-2.71)
median_zip_income,mean,2.62748


In [9]:
summary_table.to_csv("../tables/cross_group_demographics_comparison.csv")

In [10]:
unstacked = summary_table.unstack("level_1")
unstacked["category"] = unstacked.index.get_level_values(0).map({
    'Cost': 'Outcome',
    'ED_revisits': 'Outcome',
    'inpatient_visits': 'Outcome',
    'died': 'Outcome',
    'has alcohol-related disorders': 'Comorbidity',
    'has aspirin': 'Comorbidity',
    'has biliary colic with inflammation': 'Comorbidity',
    'has diabetes mellitus': 'Comorbidity',
    'has hyperlipidemia': 'Comorbidity',
    'has ischemic heart disease': 'Comorbidity',
    'has mood disorders': 'Comorbidity',
    'has nicotine dependence': 'Comorbidity',
    'has obesity': 'Comorbidity',
    'has systemic hypertension': 'Comorbidity',
    'has_medicaid': 'Insurance Status',
    'has_medicare': 'Insurance Status',
    'has_private_insurance': 'Insurance Status',
    'is_uninsured': 'Insurance Status',
    'age': 'Demographic',
    'is_african_american_or_hispanic': 'Demographic',
    'is_female': 'Demographic',
    'is_married': 'Demographic',
    'is_unmarried': 'Demographic',
    'is_white': 'Demographic',
    'median_zip_income': 'Demographic'
})
unstacked = unstacked.reset_index().set_index(['category', 'level_0', 'ANOVA P']).sort_index()

ValueError: Index contains duplicate entries, cannot reshape

In [None]:
unstacked.stack('level_1').to_csv("../tables/cross_group_demographics_comparison.csv")