In [1]:
# Unlike the other notebooks, this one does not rely on analysis_variables for all of its codes and
# data-enrichment variables, because the statistical tests/questions we want are specific to the results
# of each analysis. (It still imports logreg_targets, de_col_keys and de_col_values from that module below.)

In [4]:
import pandas as pd
from scipy.stats import ttest_ind, f_oneway
from utility_functions import load_file, starting_run, finished_run
from analysis_variables import logreg_targets, de_col_keys, de_col_values

In [3]:
# Build the analysis table by joining the two pickled tables. DataFrame.join aligns rows on the index by
# default (assumes load_file returns pandas DataFrames that share an index — TODO confirm in utility_functions).
dataset = load_file("summary_costs_enhanced.pickle").join(load_file("category_status_filtered.pickle"))
# Disabled: logreg_targets is taken from the analysis_variables import above instead of being loaded from disk.
# logreg_targets = load_file("Full_logreg_targets.pickle")

In [7]:
def test(data, col_name_index, col_value_index):
    print(ttest_ind(
        data[data[de_col_keys[col_name_index]] == de_col_value[de_col_keys[col_name_index]][col_value_index]]["Cost"],
        data[data[de_col_keys[col_name_index]] != de_col_value[de_col_keys[col_name_index]][col_value_index]]["Cost"],
        equal_var = False,
#         alternative = "greater"
    ))

In [8]:
# Display the column names that test() selects from by position (col_name_index).
de_col_keys

['Admission Status', 'Biliary Colic Type', 'Cholecystectomy Type']

In [5]:
# NOTE(review): test() as defined in this notebook requires three positional arguments
# (data, col_name_index, col_value_index). This call passes only `data`, so re-running it
# against that definition raises TypeError. The recorded output presumably came from an
# earlier version of test() — TODO: supply the intended column/value indices.
test(
    dataset
)

Ttest_indResult(statistic=13.336646639661922, pvalue=7.939789679811304e-40)


In [12]:
# Mean and row count of Cost and ED Readmissions for each observed combination of the three category columns.
# NOTE(review): these three columns are absent from the dataset.columns listing shown later in this
# notebook — presumably this cell ran against a different version of `dataset`; confirm.
(
    dataset
    .groupby(['Biliary Colic Type', 'Admission Status', 'Cholecystectomy Type'])[['Cost', 'ED Readmissions']]
    .agg(['mean', 'count'])
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Cost,Cost,ED Readmissions,ED Readmissions
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,mean,count,mean,count
Biliary Colic Type,Admission Status,Cholecystectomy Type,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
Complicated,Admitted,Delayed,11029.254941,166,0.03012,166
Complicated,Admitted,Immediate,10754.346624,3607,0.011644,3607
Complicated,Admitted,No Surgery,9362.261106,876,0.035388,876
Complicated,Admitted,Return Emergency,23305.676826,26,1.038462,26
Complicated,Not Admitted,Delayed,4662.32395,32,0.03125,32
Complicated,Not Admitted,No Surgery,1308.014732,127,0.047244,127
Complicated,Not Admitted,Return Emergency,7763.618819,12,1.0,12
Uncomplicated,Admitted,Delayed,10767.530688,56,0.053571,56
Uncomplicated,Admitted,Immediate,11834.135527,119,0.0,119
Uncomplicated,Admitted,No Surgery,8876.249622,340,0.029412,340


In [6]:
# One-sided Welch t-test: is the mean "ED Readmissions" of the "Emergency Surgery" rows
# less than the mean of all other rows?
ttest_ind(
    dataset.loc[dataset["surgery_type"].eq("Emergency Surgery"), "ED Readmissions"],
    dataset.loc[dataset["surgery_type"].ne("Emergency Surgery"), "ED Readmissions"],
    equal_var=False,
    alternative="less",
)

Ttest_indResult(statistic=-6.123143831957243, pvalue=5.757754126114558e-10)

In [7]:
# Total, row count and mean of ED Readmissions within each surgery_type group.
(
    dataset
    .groupby("surgery_type")["ED Readmissions"]
    .agg(["sum", "count", "mean"])
)

Unnamed: 0_level_0,sum,count,mean
surgery_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Delayed Surgery,10.0,229,0.043668
ED Readmit Surgery,27.0,27,1.0
Emergency Surgery,27.0,2709,0.009967
No Surgery,31.0,1096,0.028285


In [8]:
def compare_cost(target):
    """Compare "Cost" between the two groups produced by a named target.

    ``logreg_targets[target]`` is called with ``dataset`` and its result is
    inner-joined onto ``dataset`` as a column named ``comp``. A Welch t-test
    (unequal variances) is run between the ``comp == False`` and
    ``comp == True`` rows, and its p-value is printed.

    Parameters
    ----------
    target
        Key into ``logreg_targets``.

    Returns
    -------
    Mean "Cost" for each value of ``comp``.
    """
    flags = logreg_targets[target](dataset).rename('comp')
    # The inner join keeps only rows present in both dataset and the flag series.
    labelled = dataset.join(flags, how="inner")

    is_false = labelled["comp"] == False
    is_true = labelled["comp"] == True
    result = ttest_ind(
        labelled.loc[is_false, "Cost"],
        labelled.loc[is_true, "Cost"],
        equal_var=False,
    )

    # Index 1 of the t-test result is the p-value.
    print (f"p = {result[1]}")
    return labelled.groupby('comp')['Cost'].mean()
    

In [9]:
# Print the Welch t-test p-value and show mean Cost per group for the "Admission vs Discharge" target.
compare_cost("Admission vs Discharge")

p = 0.0


comp
False     1434.876420
True     10737.786787
Name: Cost, dtype: float64

In [10]:
# Same comparison for the 'Delayed + Delayed Emergency vs No Surgery' target.
compare_cost('Delayed + Delayed Emergency vs No Surgery')

p = 0.0009684376664753035


comp
False    7046.435888
True     9139.202212
Name: Cost, dtype: float64

In [11]:
# Same comparison for the 'Given Admission - Immediate Surgery vs non-Surgical Management' target.
compare_cost('Given Admission - Immediate Surgery vs non-Surgical Management')

p = 3.993031089877197e-05


comp
False     9817.308391
True     11067.038749
Name: Cost, dtype: float64

In [12]:
# Mean Cost, its standard error, and the row count within each surgery_type group.
(
    dataset
    .groupby("surgery_type")["Cost"]
    .agg(["mean", "sem", "count"])
)

Unnamed: 0_level_0,mean,sem,count
surgery_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Delayed Surgery,8083.023816,489.415886,229
ED Readmit Surgery,18097.159722,3354.748917,27
Emergency Surgery,11067.038749,176.44312,2709
No Surgery,7046.435888,212.695959,1096


In [13]:
# List the columns available on the joined dataset.
dataset.columns

Index(['died', 'initial_record_id', 'initial_year', 'age', 'homeless', 'race',
       'median_zip_income', 'payer', 'initial_discharge_quarter', 'admitted',
       'max_year', 'ED Readmissions', 'Surgery Visits',
       'Inpatient Readmissions', 'surgery_type', 'gender', 'marital_status',
       'age_groups', 'CPT Costs', 'SID_costs', 'Cost', 'systemic hypertension',
       'obesity', 'aspirin', 'nicotine dependence', 'hyperlipidemia',
       'diabetes mellitus', 'ischemic heart disease',
       'alcohol-related disorders', 'biliary colic with inflammation',
       'mood disorders'],
      dtype='object')

In [15]:
# Mean Cost, its standard error, and the row count for every observed combination of
# surgery_type, ED Readmissions and Inpatient Readmissions (sem is NaN for single-row groups).
(
    dataset
    .groupby(["surgery_type", "ED Readmissions", "Inpatient Readmissions"])["Cost"]
    .agg(["mean", "sem", "count"])
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,mean,sem,count
surgery_type,ED Readmissions,Inpatient Readmissions,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Delayed Surgery,0.0,0.0,6269.188844,347.878255,195
Delayed Surgery,0.0,1.0,19898.439747,1802.993906,24
Delayed Surgery,0.0,2.0,42868.666025,,1
Delayed Surgery,1.0,0.0,1641.58176,181.077988,2
Delayed Surgery,1.0,1.0,14415.819707,3520.274472,5
Delayed Surgery,1.0,2.0,27830.844584,,1
Delayed Surgery,2.0,1.0,4896.30272,,1
ED Readmit Surgery,1.0,0.0,3109.285452,732.745867,3
ED Readmit Surgery,1.0,1.0,19970.644006,3596.288753,24
Emergency Surgery,0.0,0.0,10996.872343,176.139901,2680


In [17]:
# Load the pickled codes table; the next cell reads its "codes" column as strings
# (assumes load_file returns a pandas DataFrame — TODO confirm in utility_functions).
filtered_dataset_codes = load_file("fully_filtered_codes.pickle")

In [20]:
# Count how many rows have a "codes" string containing "K81" versus not
# (presumably the ICD-10 K81 code family — confirm against the coding scheme used upstream).
(
    filtered_dataset_codes["codes"]
    .str.contains("K81")
    .value_counts()
)

False    86106
True        16
Name: codes, dtype: int64