In [None]:
# Enable IPython's autoreload extension in mode 2 so that edited modules
# (e.g. the local make_clinical_dataset package) are re-imported before each
# cell runs, without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import matplotlib.pyplot as plt
import polars as pl
import seaborn as sns
from make_clinical_dataset.epic.util import plot_count_over_time
from make_clinical_dataset.shared.constants import ROOT_DIR

# Let polars render up to 600 rows of a table before truncating the display.
pl.Config.set_tbl_rows(600)

# Raw

## Triage Assessment

In [None]:
from make_clinical_dataset.epic.preprocess.acu import process_triage_data

In [None]:
# Snapshot date selecting which processed ED extract to load.
date = '2025-03-29'
# Lazily scan every parquet file of that snapshot, then materialise the result
# as a single in-memory DataFrame.
df = (
    pl.scan_parquet(f'{ROOT_DIR}/data/processed/ED/ED_{date}/*.parquet')
    .collect()
)

In [None]:
# Which observations are recorded under each procedure: row count per
# (proc_name, obs_name) pair, ordered by procedure name and then by count,
# both descending.
(
    df
    .group_by("proc_name", "obs_name")
    .agg(pl.len())
    .sort("proc_name", "len", descending=True)
)

In [None]:
# Run the project's triage preprocessing on the loaded extract.
# NOTE(review): `df` is rebound here, so re-running this cell feeds the already
# processed frame back into `process_triage_data` - reload the extract first.
df = process_triage_data(df)
# Add the calendar year of each record for the per-year summaries below.
df = df.with_columns(year=pl.col('datetime').dt.year())

In [None]:
# Rows per procedure and year (largest groups first), converted to pandas for
# the plotting helper.
counts = (
    df
    .group_by('proc_name', 'year')
    .agg(pl.len())
    .sort('len', descending=True)
    .to_pandas()
)
plot_count_over_time(counts, x='year', y='len', catcol='proc_name')
# Overall rows per procedure, most frequent first - shown as the cell output.
df.get_column('proc_name').value_counts().sort('count', descending=True)

In [None]:
# Keep only ER Triage Assessment rows and count them per observation status and
# year, ordered by year and then by count, both descending.
counts = (
    df
    .filter(pl.col('proc_name').eq('ER Triage Assessment'))
    .group_by('obs_status', 'year')
    .agg(pl.len())
    .sort('year', 'len', descending=True)
    .to_pandas()
)
# One line per observation status; the table itself is the cell output.
ax = sns.lineplot(data=counts, x='year', y='len', hue='obs_status')
counts

# Processed

In [None]:
# Snapshot date of the final dataset whose interim admission dates are loaded.
date = '2025-03-29'
admit_dates = pl.read_parquet(
    f'{ROOT_DIR}/data/final/data_{date}/interim/acute_care_admission_dates.parquet'
)

In [None]:
# Derive the admission year plus one boolean flag per source named in the
# list-typed `data_source` column.
admit_dates = admit_dates.with_columns(
    pl.col('admission_date').dt.year().alias('admission_year'),
    pl.col("data_source").list.contains("EPIC ED Admission Dates").alias("EPIC ED Admission Dates"),
    pl.col("data_source").list.contains("Discharge Summary").alias("Discharge Summary"),
)

# Collapse the two flags into a single categorical label.
# NOTE(review): the label replaces the original list-typed `data_source`
# column, so this cell cannot simply be re-run - the `.list.contains` calls
# above expect the original column; reload `admit_dates` first.
from_epic = pl.col("EPIC ED Admission Dates")
from_discharge = pl.col("Discharge Summary")
admit_dates = admit_dates.with_columns(
    pl.when(from_epic & ~from_discharge).then(pl.lit("EPIC only"))
    .when(~from_epic & from_discharge).then(pl.lit("Discharge only"))
    .when(from_epic & from_discharge).then(pl.lit("Both"))
    .otherwise(pl.lit("Neither"))
    .cast(pl.Categorical)
    .alias("data_source")
)

In [None]:
%%time
# Build a minimal ydata-profiling report on the two derived columns
# (source label and admission year) and embed it in the notebook output.
# `%%time` must stay on the first line of the cell; it reports the cell's run time.
from ydata_profiling import ProfileReport
profile = ProfileReport(admit_dates.select('data_source', 'admission_year').to_pandas(), minimal=True)
# Optional: uncomment to also write the report to an HTML file.
# profile.to_file("reports/acu.html")
profile.to_notebook_iframe()

In [None]:
# Number of admissions per year, broken down by data source.
# Only admission years strictly between 2003 and 2025 are kept. The filter runs
# before the aggregation so out-of-range rows are never grouped; the resulting
# table is the same as filtering afterwards because the filter is on a group key.
# NOTE(review): the reason for this year window is not recorded here -
# presumably years outside it are incomplete; confirm.
freq = (
    admit_dates
    .filter((pl.col('admission_year') > 2003) & (pl.col('admission_year') < 2025))
    .group_by(["admission_year", "data_source"])
    .agg(pl.len())
    .sort('admission_year')
    .to_pandas()
)
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(freq, x='admission_year', y='len', hue='data_source', ax=ax)
# Label the figure so it can be read on its own (the raw y column is just `len`).
ax.set(
    title='Acute care admissions per year by data source',
    xlabel='Admission year',
    ylabel='Number of admissions',
)
# Show the underlying table as the cell output, below the figure.
freq