To load the entire dataset, it is recommended to request the following resources on SLURM:

`srun -p himem -c 8 --mem 64GB -t 0-08:00:00 --pty bash`

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd
import polars as pl

# Polars display: show up to 100 characters of each string value and up to 200 table rows.
pl.Config.set_fmt_str_lengths(100)
pl.Config.set_tbl_rows(200)

# Pandas display: show up to 100 rows and 100 columns before truncating.
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)

from make_clinical_dataset.constants import ROOT_DIR, INFO_DIR
from make_clinical_dataset.preprocess.epic.radiology import get_radiology_data, END_TEXT

In [None]:
# Date stamp interpolated into the radiology and final-data directory paths loaded below.
date = '2025-03-29'

# Raw

In [None]:
%%time
# Scan lazily instead of `read_parquet(...).lazy()`: nothing is read until a query is
# collected, so each query below only loads the columns it actually needs.
df = pl.scan_parquet(f'{ROOT_DIR}/data/processed/radiology/radiology_{date}/*.parquet')

## datetime

In [None]:
# Tally rows by which of the two candidate datetime columns are populated.
main_date_col, secondary_date_col = "effective_datetime", "occurrence_datetime_from_order"
has_main = pl.col(main_date_col).is_not_null()
has_secondary = pl.col(secondary_date_col).is_not_null()
# Labels are built from the column names so they cannot drift from (or misspell) them.
df.select(
    (has_main & ~has_secondary).sum().alias(f'Only {main_date_col}'),
    (~has_main & ~has_secondary).sum().alias('Neither'),
    (has_main & has_secondary).sum().alias('Both'),
    (~has_main & has_secondary).sum().alias(f'Only {secondary_date_col}'),
).collect()

## obs_name

In [None]:
# Row count per obs_name, most frequent first.
(
    df.group_by('obs_name')
    .agg(pl.len())
    .sort('len', descending=True)
    .collect()
)

# Reports Only

In [None]:
# Rebuild the frame lazily (scan instead of an eager read followed by `.lazy()`), then
# narrow it down to radiology report rows.
df = pl.scan_parquet(f'{ROOT_DIR}/data/processed/radiology/radiology_{date}/*.parquet')
# Prefer effective_datetime; fall back to occurrence_datetime_from_order when it is null.
df = df.with_columns(
    pl.coalesce(
        pl.col('effective_datetime'),
        pl.col('occurrence_datetime_from_order'),
    ).alias("epr_datetime")
)
df = df.with_columns(pl.col("epr_datetime").dt.date().alias("epr_date"))
# Keep only rows whose text begins with a newline followed by "REPORT".
df = df.filter(pl.col('obs_val_str').str.starts_with('\nREPORT'))
# This must remain a separate filter applied after the one above: uniqueness is evaluated
# over the rows that survived the report filter, not over the whole frame.
# NOTE(review): `is_unique` removes every copy of a duplicated
# (patient, epr_date, obs_val_str) combination, not just the extra copies — confirm this is
# intended rather than `unique(subset=...)`.
df = df.filter(pl.struct(['patient', 'epr_date', 'obs_val_str']).is_unique())

## datetime

In [None]:
# Tally report rows by which of the two candidate datetime columns are populated.
main_date_col, secondary_date_col = "effective_datetime", "occurrence_datetime_from_order"
has_main = pl.col(main_date_col).is_not_null()
has_secondary = pl.col(secondary_date_col).is_not_null()
# Labels are built from the column names so they cannot drift from (or misspell) them.
df.select(
    (has_main & ~has_secondary).sum().alias(f'Only {main_date_col}'),
    (~has_main & ~has_secondary).sum().alias('Neither'),
    (has_main & has_secondary).sum().alias('Both'),
    (~has_main & has_secondary).sum().alias(f'Only {secondary_date_col}'),
).collect()

In [None]:
# How often effective_datetime is earlier than occurrence_datetime_from_order,
# restricted to rows where both datetimes are present.
both = df.drop_nulls(subset=[main_date_col, secondary_date_col])
mask = pl.col(main_date_col) < pl.col(secondary_date_col)
(
    both.select(mask.value_counts().alias('earlier'))
    .collect()
)

# Processed

In [None]:
# Scan the processed reports lazily instead of reading them eagerly and calling `.lazy()`:
# the file is only read when a query is collected, limited to the columns that query uses.
df = pl.scan_parquet(f'{ROOT_DIR}/data/final/data_{date}/interim/reports.parquet')

## year distribution

In [None]:
# Number of rows per calendar year of `date`, in ascending year order.
(
    df.group_by(pl.col("date").dt.year().alias("year"))
    .len()
    .sort("year")
    .collect()
)

## proc_name

In [None]:
# Row count per proc_name, most frequent first.
(
    df.group_by('proc_name')
    .agg(pl.len())
    .sort('len', descending=True)
    .collect()
)

## END_TEXT

In [None]:
# Count how many report texts do / do not end with END_TEXT.
mask = pl.col('obs_val_str').str.ends_with(END_TEXT)
(
    df.select(mask.value_counts())
    .collect()
)

## string length

In [None]:
# Summary statistics of the report text length, measured in characters.
str_len = pl.col("obs_val_str").str.len_chars().alias("str_len")
df.select(str_len).describe()

## dates

In [None]:
# Rows where the date could not be extracted from the report:
# counts of initial_report_date being null (True) versus present (False).
(
    df.select(
        pl.col('initial_report_date').is_null().value_counts()
    )
    .collect()
)

In [None]:
# Rows that had addendums:
# counts of last_addendum_date being present (True) versus null (False).
(
    df.select(
        pl.col('last_addendum_date').is_not_null().value_counts()
    )
    .collect()
)

In [None]:
# Inspect rows where the last addendum date precedes the initial report date.
addendum_first = pl.col("last_addendum_date") < pl.col("initial_report_date")
df.filter(addendum_first).collect()