In [None]:
import matplotlib.pyplot as plt
import polars as pl
import seaborn as sns
from make_clinical_dataset.shared.constants import ROOT_DIR
from ydata_profiling import ProfileReport

# Let polars print up to 200 rows of a table before truncating the display.
pl.Config.set_tbl_rows(200)

In [None]:
# Date tag embedded in the name of the data directory to read from.
DATE = '2025-03-29'
# Directory for that date tag, located under ROOT_DIR/data/final.
DATA_DIR = f"{ROOT_DIR}/data/final/data_{DATE}"

# Processed

In [None]:
# Load the acute care admission dates table from the `interim` folder of DATA_DIR.
# Later cells rely on its `admission_date` column and its list-typed `data_source` column.
admit_dates = pl.read_parquet(f'{DATA_DIR}/interim/acute_care_admission_dates.parquet')

In [None]:
# Membership flags: does the `data_source` list mention each upstream source?
in_epic = pl.col("data_source").list.contains("EPIC ED Admission Dates")
in_discharge = pl.col("data_source").list.contains("Discharge Summary")

# Collapse the two flags into a single categorical label; the first matching branch wins.
source_label = (
    pl.when(in_epic & ~in_discharge).then(pl.lit("EPIC only"))
    .when(~in_epic & in_discharge).then(pl.lit("Discharge only"))
    .when(in_epic & in_discharge).then(pl.lit("Both"))
    .otherwise(pl.lit("Neither"))
    .cast(pl.Categorical)
)

# Every expression below is evaluated against the incoming frame, so the flags
# still read the original list column even though `data_source` is replaced here.
# NOTE: because `data_source` changes from a list to a categorical, this cell is
# meant to run once per load of `admit_dates`.
admit_dates = admit_dates.with_columns(
    pl.col('admission_date').dt.year().alias('admission_year'),
    in_epic.alias("EPIC ED Admission Dates"),
    in_discharge.alias("Discharge Summary"),
    source_label.alias("data_source"),
)

In [None]:
%%time
from ydata_profiling import ProfileReport
profile = ProfileReport(admit_dates.select('data_source', 'admission_year').to_pandas(), minimal=True)
# profile.to_file("reports/acu.html")
profile.to_notebook_iframe()

In [None]:
# Number of admissions per year, split by data source.
# Only years strictly between 2003 and 2025 are kept; filtering before the
# aggregation means rows outside that window are never grouped or counted.
freq = (
    admit_dates
    .filter((pl.col('admission_year') > 2003) & (pl.col('admission_year') < 2025))
    .group_by(["admission_year", "data_source"])
    .agg(pl.len())
    # group_by does not guarantee the order of groups, so sort on both keys to
    # get the same row order on every run.
    .sort(['admission_year', 'data_source'])
    .to_pandas()
)
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(freq, x='admission_year', y='len', hue='data_source', ax=ax)
# Label the figure so it can be read on its own when the notebook is skimmed.
ax.set(
    title='Acute care admissions per year by data source',
    xlabel='Admission year',
    ylabel='Number of admissions',
)
freq