In [1]:
import pandas as pd
import numpy as np

In [2]:
from itertools import combinations
from numbers import Number

from scipy.stats import chi2_contingency
from scipy.stats import ttest_ind
from sklearn.linear_model import LinearRegression
from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm

In [3]:
# Column -> dtype mapping handed to pd.read_csv so every field is parsed with an
# explicit type instead of being inferred. Most columns are read as text; the
# exceptions are listed in `_non_text_dtypes`.
_non_text_dtypes = {
    'length_of_stay': int,
    'discharge_year': int,
    'year': int,
    'birth_weight': float,
    'total_charges': float,
    'total_costs': float,
    'apr_risk_of_mortality_code': float,
}

# Column names in the order the mapping is built.
_dtype_columns = [
    'health_service_area',
    'hospital_county',
    'operating_certificate_number',
    'facility_id',
    'facility_name',
    'age_group',
    'zip_code__3_digits',
    'gender',
    'race',
    'ethnicity',
    'length_of_stay',
    'type_of_admission',
    'patient_disposition',
    'discharge_year',
    'ccs_diagnosis_code',
    'ccs_diagnosis_description',
    'ccs_procedure_code',
    'ccs_procedure_description',
    'apr_drg_code',
    'apr_drg_description',
    'apr_mdc_code',
    'apr_mdc_description',
    'apr_severity_of_illness_code',
    'apr_severity_of_illness_description',
    'apr_risk_of_mortality',
    'apr_medical_surgical_description',
    'source_of_payment_1',
    'source_of_payment_2',
    'source_of_payment_3',
    'attending_provider_license_number',
    'operating_provider_license_number',
    'other_provider_license_number',
    'birth_weight',
    'abortion_edit_indicator',
    'emergency_department_indicator',
    'total_charges',
    'total_costs',
    'year',
    'hospital_service_area',
    'permanent_facility_id',
    'payment_typology_1',
    'payment_typology_2',
    'payment_typology_3',
    'diagnosis',
    'apr_risk_of_mortality_code',
]

# Anything without an explicit override is parsed as a string.
dtype_obj = {name: _non_text_dtypes.get(name, str) for name in _dtype_columns}

In [4]:
# Load the filtered records: the first CSV column becomes the row index and
# every column named in dtype_obj is parsed with its declared type.
f_data = pd.read_csv(
    "../data/enriched_data/CD_filtered.csv",
    index_col=0,
    dtype=dtype_obj,
)

In [5]:
# Display the dtype of every loaded column as a check on the dtype mapping
# passed to read_csv.
f_data.dtypes

hospital_county                         object
operating_certificate_number            object
facility_id                             object
facility_name                           object
age_group                               object
zip_code__3_digits                      object
gender                                  object
race                                    object
ethnicity                               object
length_of_stay                           int32
type_of_admission                       object
patient_disposition                     object
discharge_year                           int32
ccs_diagnosis_code                      object
ccs_diagnosis_description               object
ccs_procedure_code                      object
ccs_procedure_description               object
apr_drg_code                            object
apr_drg_description                     object
apr_mdc_code                            object
apr_mdc_description                     object
apr_severity_

In [6]:
# Build one two-column frame per unordered pair of columns, keeping only rows
# where both values are present. combinations() yields each pair exactly once,
# in column order, so no shrinking copy of the column list is needed.
cols = list(f_data)
pairwise_dfs = []
for col, col2 in combinations(cols, 2):
    pair_df = f_data[[col, col2]].dropna()
    # Pairs with no jointly observed rows cannot be tested, so they are skipped.
    if not pair_df.empty:
        pairwise_dfs.append(pair_df)
len(pairwise_dfs)

807

In [7]:
def _linear_regression_row(df, x_name, y_name):
    """Regress `y_name` on `x_name` and return a result row holding the fit's R^2."""
    x = df[x_name].to_numpy().reshape(-1, 1)  # single feature as a 2-D column
    y = df[y_name].to_numpy()
    model = LinearRegression().fit(x, y)
    return {'test': 'linear_regression', 'X': x_name, 'Y': y_name, 'R^2': model.score(x, y)}


def _chi_squared_row(df, x_name, y_name):
    """Chi-squared test on the contingency table of two columns; returns a row with p."""
    table = pd.crosstab(df[x_name], df[y_name])
    _, p, _, _ = chi2_contingency(table.values)
    return {'test': 'chi_squared', 'X': x_name, 'Y': y_name, 'p': p}


def _anova_row(df, response, factor, x_name, y_name):
    """One-way ANOVA of numeric `response` across the levels of `factor`.

    The row carries the factor's p-value and, under 'R^2', the factor's sum of
    squares divided by (factor + residual) sum of squares.
    `x_name`/`y_name` are recorded as given, independent of which is the response.
    """
    term = 'C(' + factor + ')'
    fitted = ols(response + ' ~ ' + term, data=df).fit()
    table = anova_lm(fitted, typ=2)
    ss_factor = table.at[term, 'sum_sq']
    ss_residual = table.at['Residual', 'sum_sq']
    return {
        'test': 'ANOVA',
        'X': x_name,
        'Y': y_name,
        'p': table.at[term, 'PR(>F)'],
        'R^2': ss_factor / (ss_factor + ss_residual),
    }


def _t_test_row(df, value_col, group_col, reference, x_name, y_name):
    """Unequal-variance t-test of `value_col` between rows whose `group_col`
    equals `reference` and all remaining rows; returns a row with p."""
    in_reference = df[df[group_col] == reference][value_col]
    others = df[df[group_col] != reference][value_col]
    _, p = ttest_ind(in_reference, others, equal_var=False)
    return {'test': 't_test', 'X': x_name, 'Y': y_name, 'p': p}


# Pick a test for every column pair from the Python type of the pair's first
# row, and collect one result record per pair. Records are accumulated in a list
# and turned into a DataFrame once: DataFrame.append was removed in pandas 2.0
# and copying the frame on every iteration is quadratic.
records = []
for df in pairwise_dfs:
    x_name, y_name = list(df)
    # Read just the two scalars of the first row instead of converting the
    # whole frame to an array.
    x_first, y_first = df.iloc[0, 0], df.iloc[0, 1]
    x_is_num, y_is_num = isinstance(x_first, Number), isinstance(y_first, Number)

    if x_is_num and y_is_num:
        records.append(_linear_regression_row(df, x_name, y_name))
    elif isinstance(x_first, str) and isinstance(y_first, str):
        records.append(_chi_squared_row(df, x_name, y_name))
    elif x_is_num and df[y_name].nunique() > 2:
        # numeric X against a non-numeric Y with more than two levels
        records.append(_anova_row(df, x_name, y_name, x_name, y_name))
    elif y_is_num and df[x_name].nunique() > 2:
        # numeric Y against a non-numeric X with more than two levels
        records.append(_anova_row(df, y_name, x_name, x_name, y_name))
    elif x_is_num:
        # at most two levels in Y: compare the first row's level with the rest
        records.append(_t_test_row(df, x_name, y_name, y_first, x_name, y_name))
    elif y_is_num:
        # at most two levels in X: compare the first row's level with the rest
        records.append(_t_test_row(df, y_name, x_name, x_first, x_name, y_name))
    else:
        # Neither value is numeric and they are not both strings.
        # NOTE(review): the original left a bare "histogram" placeholder here.
        records.append({'test': 'untested', 'X': x_name, 'Y': y_name})

# Fixed column order; statistics a test does not produce are left as NaN.
significantAttr = pd.DataFrame(records, columns=['test', 'X', 'Y', 'p', 'R^2'])
significantAttr.shape

  F /= J
  return (a < x) & (x < b)
  return (a < x) & (x < b)
  cond2 = cond0 & (x <= _a)


(807, 5)

In [8]:
# Summary statistics of the numeric result columns, per test type. Only the
# last expression of a cell is displayed, so a separate .count() call before
# this line had no visible effect; describe() already includes a count.
significantAttr.groupby('test').describe()

Unnamed: 0_level_0,p,p,p,p,p,p,p,p,R^2,R^2,R^2,R^2,R^2,R^2,R^2,R^2
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
test,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
ANOVA,159.0,0.071015,0.217124,0.0,0.0,1.583737e-116,2.16338e-09,1.0,159.0,0.132443,0.19374,3e-06,0.008449,0.039174,0.182212,0.8112
chi_squared,582.0,0.158779,0.354396,0.0,0.0,1.7123420000000002e-113,1.773849e-06,1.0,0.0,,,,,,,
linear_regression,0.0,,,,,,,,15.0,0.107441,0.265066,1e-05,0.000402,0.009374,0.028706,1.0
t_test,36.0,0.030044,0.105648,0.0,1.038475e-92,5.691434e-35,7.228425e-07,0.514841,0.0,,,,,,,


In [9]:
# Write the per-pair test results to CSV. No index argument is passed, so
# pandas also writes the row index as the first column.
significantAttr.to_csv("../data/enriched_data/CD_stat_tested.csv")