In [ ]:
# Removes lint errors from VS Code
from typing import Dict, TYPE_CHECKING, Tuple, List

if TYPE_CHECKING:
    import kedro
    catalog: kedro.io.data_catalog.DataCatalog
    session: kedro.framework.session.session.KedroSession
    catalog: kedro.io.data_catalog.DataCatalog
    pipelines: Dict[str, kedro.pipeline.pipeline.Pipeline]

import pandas as pd
patients_wrk:   pd.DataFrame = catalog.load('mimic_mm_core.wrk.patients')
admissions_wrk: pd.DataFrame = catalog.load('mimic_mm_core.wrk.admissions')
transfers_wrk:  pd.DataFrame = catalog.load('mimic_mm_core.wrk.transfers')

patients_ref:   pd.DataFrame = catalog.load('mimic_mm_core.ref.patients')
admissions_ref: pd.DataFrame = catalog.load('mimic_mm_core.ref.admissions')
transfers_ref:  pd.DataFrame = catalog.load('mimic_mm_core.ref.transfers')

metadata: Dict = catalog.load('params:mimic_mm_core.metadata')

2000-01-01 00:00:00,000 - kedro.io.data_catalog - INFO - Loading data from `mimic_mm_core.wrk.patients` (ParquetDataSet)...
2000-01-01 00:00:00,000 - kedro.io.data_catalog - INFO - Loading data from `mimic_mm_core.wrk.admissions` (ParquetDataSet)...
2000-01-01 00:00:00,000 - kedro.io.data_catalog - INFO - Loading data from `mimic_mm_core.wrk.transfers` (ParquetDataSet)...
2000-01-01 00:00:00,000 - kedro.io.data_catalog - INFO - Loading data from `mimic_mm_core.ref.patients` (ParquetDataSet)...
2000-01-01 00:00:00,000 - kedro.io.data_catalog - INFO - Loading data from `mimic_mm_core.ref.admissions` (ParquetDataSet)...
2000-01-01 00:00:00,000 - kedro.io.data_catalog - INFO - Loading data from `mimic_mm_core.ref.transfers` (ParquetDataSet)...
2000-01-01 00:00:00,000 - kedro.io.data_catalog - INFO - Loading data from `params:mimic_mm_core.metadata` (MemoryDataSet)...


In [ ]:
from scipy.stats import chisquare

tables = [
    ['patients', patients_wrk, patients_ref],
    ['admissions', admissions_wrk, admissions_ref],
    ['transfers', transfers_wrk, transfers_ref]
]

def gen_freq(a, b):
    a, b = a.value_counts(), b.value_counts()
    c = pd.DataFrame(a).join(b, rsuffix='_a', lsuffix='_b')
    c = c.fillna(value=1)
    c = c / c.sum()
    return c.iloc[:,0], c.iloc[:,1]

res=[]
for name, a, b in tables:
    for col in a.keys():
        if metadata['tables'][name]['fields'][col]['type'] == 'categorical':
            k, j = gen_freq(a[col], b[col])
            chi, p = chisquare(k, j)
            # print(f"{name:12}.{col:20}: X^2={chi:3.3f} p={100*p:7.3f}%")
            res.append([name, col, chi, p])

res = pd.DataFrame(res, columns=['table', 'col', 'X^2', 'p'])
res.set_index(keys=['table','col']).style.background_gradient(axis=0)

Unnamed: 0_level_0,Unnamed: 1_level_0,X^2,p
table,col,Unnamed: 2_level_1,Unnamed: 3_level_1
patients,gender,1.4e-05,0.997065
patients,anchor_year_group,4.7e-05,1.0
admissions,admission_type,7.1e-05,1.0
admissions,admission_location,0.000102,1.0
admissions,discharge_location,0.000225,1.0
admissions,insurance,0.000203,0.999899
admissions,language,2.7e-05,0.995881
admissions,marital_status,0.000221,0.999999
admissions,ethnicity,0.000156,1.0
admissions,hospital_expire_flag,2.8e-05,0.995768


In [ ]:
gen_freq(patients_wrk['gender']).head(30)

F    0.5232
M    0.4768
Name: gender, dtype: float64