In [None]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import matplotlib.pyplot as plt
import polars as pl
import seaborn as sns
from make_clinical_dataset.epic.util import plot_count_over_time
from make_clinical_dataset.shared.constants import ROOT_DIR
from ydata_profiling import ProfileReport

# Show up to 20 rows when a polars table is rendered in cell output.
pl.Config.set_tbl_rows(20)

# Raw

## Triage Assessment

In [None]:
from make_clinical_dataset.epic.preprocess.acu import process_triage_data

In [None]:
# Snapshot date of the processed ED extract explored in this section.
date = '2025-03-29'
# Glob covering every parquet file of that snapshot.
ed_parquet_glob = f'{ROOT_DIR}/data/processed/ED/ED_{date}/*.parquet'
# Scan lazily, then materialise the full frame in memory.
df = pl.scan_parquet(ed_parquet_glob).collect()

In [None]:
# Which observations are recorded under each procedure, and how often.
proc_obs_counts = df.group_by("proc_name", "obs_name").agg(pl.len())
# Both sort keys are descending: procedure name first, then row count.
proc_obs_counts.sort("proc_name", "len", descending=True)

In [None]:
# Procedure counts over time.
# Year of each record: use the effective datetime, falling back to the
# order's occurrence datetime when the former is null.
record_time = pl.coalesce(['effective_datetime', 'occurrence_datetime_from_order'])
df = df.with_columns(year=record_time.dt.year())  # `year` is reused by later cells

# Rows per (procedure, year), largest groups first, handed to the plotting helper as pandas.
counts = (
    df
    .group_by('proc_name', 'year')
    .agg(pl.len())
    .sort('len', descending=True)
    .to_pandas()
)
plot_count_over_time(counts, x='year', y='len', catcol='proc_name')

# Overall number of rows per procedure, most frequent first (cell output).
df['proc_name'].value_counts().sort('count', descending=True)

In [None]:
# Observation status over time, restricted to ER Triage Assessment records.
# Relies on the `year` column added to `df` by the previous cell.
is_triage = pl.col('proc_name') == 'ER Triage Assessment'
counts = (
    df.filter(is_triage)
    .group_by('obs_status', 'year')
    .agg(pl.len())
    .sort('year', 'len', descending=True)
    .to_pandas()
)
# One line per observation status.
ax = sns.lineplot(data=counts, x='year', y='len', hue='obs_status')
counts

# Processed

In [None]:
# Snapshot date of the final dataset build to inspect.
date = '2025-03-29'
acu_path = f'{ROOT_DIR}/data/final/data_{date}/interim/acute_care_use.parquet'
acu = pl.read_parquet(acu_path)

# Split by source: rows from "Discharge Summary" go to `hosp`, every other
# source goes to `emerg`.
# NOTE(review): rows with a null data_source would match neither filter — confirm none exist.
mask = pl.col('data_source').eq("Discharge Summary")
hosp = acu.filter(mask)
emerg = acu.filter(mask.not_())

In [None]:
from make_clinical_dataset.epic.combine import merge_closest_measurements

# Link each ED visit to a hospital admission.
# NOTE(review): presumably matches on patient and keeps the closest admission
# at or after ED arrival within a 0-2 day window — confirm against
# merge_closest_measurements.
ed_visits = emerg.select('mrn', 'ED_arrival_date', 'data_source')
df = merge_closest_measurements(
    ed_visits,
    hosp,
    'ED_arrival_date',
    'hosp_admission_date',
    direction='forward',
    time_window=(0,2),
    merge_individually=False,
)

# An ED visit counts as hospitalised when an admission date was attached to it.
mask = df['hosp_admission_date'].is_not_null()
n_admitted, n_visits, pct_admitted = mask.sum(), len(mask), mask.mean()*100
print(f"Number of ED visits that results in hospitalization within 2 days: {n_admitted}/{n_visits} ({pct_admitted:.1f}%)")

In [None]:
# Reduce both frames to the columns explored in the cells below.
# This cell overwrites `hosp` / `emerg` and discards the date columns it reads,
# so re-run the cell that loads `acu` before executing it a second time.
hosp = hosp.with_columns([
    pl.col('data_source').cast(pl.Categorical),
    pl.col('hosp_admission_date').dt.year().alias('year')
]).select('mrn', 'year', 'length_of_stay')

# Fix: this chain previously ended in `.drop('mrn', 'year', 'CTAS_score')`,
# which discarded the freshly derived `year` together with `mrn`. The later
# cells call `emerg.drop('mrn')` and group by `year` / `data_source`, so those
# columns have to be kept; select them explicitly, mirroring `hosp` above.
emerg = emerg.with_columns([
    pl.col('data_source').cast(pl.Categorical),
    pl.col('ED_arrival_date').dt.year().alias('year')
]).select('mrn', 'year', 'data_source', 'CTAS_score')

In [None]:
%%time
profile = ProfileReport(hosp.drop('mrn').to_pandas(), minimal=True)
profile.to_file("reports/hosp.html")
profile.to_notebook_iframe()

In [None]:
%%time
profile = ProfileReport(emerg.drop('mrn').to_pandas(), minimal=True)
profile.to_file("reports/emerg.html")
profile.to_notebook_iframe()

In [None]:
# Number of `emerg` records per year, broken down by data source.
# Keep only years strictly between 2003 and 2025 and groups with more than 100 rows.
in_year_range = (pl.col('year') > 2003) & (pl.col('year') < 2025)
has_enough_rows = pl.col('len') > 100
freq = (
    emerg
    .group_by(["year", "data_source"])
    .agg(pl.len())
    .sort('year', 'len', descending=True)
    .filter(in_year_range & has_enough_rows)
    .to_pandas()
)
# Convert the source column to plain strings before using it as the hue.
freq = freq.assign(data_source=freq['data_source'].astype(str))

fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(data=freq, x='year', y='len', hue='data_source', ax=ax)
freq