In [None]:
%%capture
# The %%capture cell magic above silences everything this cell would print.
# Change the working directory two levels up
# (presumably to the repository root so the project package and data paths resolve — confirm).
%cd ../../
# Enable the autoreload extension; mode 2 reloads all modules before each cell runs,
# so edits to the project source are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd
import polars as pl

from make_clinical_dataset.epic.combine import (
    merge_closest_measurements, 
    combine_chemo_to_main_data, 
    combine_demographic_to_main_data,
    combine_event_to_main_data,
    combine_radiation_to_main_data
)
from make_clinical_dataset.epic.label import get_acu_labels
from make_clinical_dataset.epic.preprocess.demographic import get_demographic_data
from make_clinical_dataset.shared.constants import INFO_DIR, ROOT_DIR

# Let polars print up to 100 rows when a table is displayed.
pl.Config.set_tbl_rows(100)

In [None]:
# Snapshot tag: selects which data_<DATE> directory under data/final is used.
DATE = '2025-03-29'
# Root of that snapshot; the cells below read from interim/ and write to processed/ beneath it.
DATA_DIR = f"{ROOT_DIR}/data/final/data_{DATE}"

# Combine the features & targets

In [None]:
# Read the interim feature/target tables for this snapshot.
# The paths are listed in the same order as the names they are unpacked into.
chemo, rad, lab, sym, acu = (
    pl.read_parquet(path)
    for path in (
        f'{DATA_DIR}/interim/chemo.parquet',
        f'{DATA_DIR}/interim/radiation.parquet',
        f'{DATA_DIR}/interim/lab.parquet',
        f'{DATA_DIR}/interim/symptom.parquet',
        f'{DATA_DIR}/interim/acute_care_use.parquet',
    )
)

# Demographics come from the project loader as a pandas frame and are converted to polars.
# TODO: EDA - show number of patients with multiple birth dates
demog = pl.from_pandas(get_demographic_data())

## Align on chemo sessions

In [None]:
%%time
# select anchor
main = (
    chemo
    .filter(pl.col('drug_type') == "direct")
    .select('mrn', 'treatment_date').unique()
    .rename({'treatment_date': 'assessment_date'})
    .sort('mrn', 'assessment_date')
)

# merge demographics
main = combine_demographic_to_main_data(main, demog, main_date_col="assessment_date")

# merge chemotherapy treatments
main = combine_chemo_to_main_data(main, chemo, main_date_col="assessment_date", time_window=(-28,0))

# merge radiation treatments
main = combine_radiation_to_main_data(main, rad, main_date_col="assessment_date", time_window=(-28,0))

# merge laboratory tests
lab = lab.with_columns(pl.col('mrn').cast(pl.Int64)) # TODO: do this in lab preprocessing
main = merge_closest_measurements(main, lab, "assessment_date", "obs_date", include_meas_date=True, time_window=(-5,0))

# merge symptom surveys
main = merge_closest_measurements(main, sym, "assessment_date", "obs_date", include_meas_date=True, time_window=(-30,0))

# merge acute care use
main = combine_event_to_main_data(main, acu, "assessment_date", "ED_visit", lookback_window=5)

# add lables
# 1) ED
main = get_acu_labels(main, acu, lookahead_window=[30, 60, 90])
# 2) Death

In [None]:
%%time
from make_clinical_dataset.epr.label import get_CTCAE_labels, get_symptom_labels
# add targets
# 3) CTCAE
main = main.to_pandas()
lab = lab.to_pandas()
main = get_CTCAE_labels(main, lab)
# 4) symptoms
# sym = sym.to_pandas()
# sym['survey_date'] = sym['obs_date']
# main = get_symptom_labels(main, sym)

main = pl.from_pandas(main)

In [None]:
# Split the final table into two files that share the `mrn` key:
#   - dates: mrn + every column whose name ends with 'date'
#   - data:  mrn, assessment_date, the string columns, then all remaining columns
date_cols = ['mrn', *(name for name in main.columns if name.endswith('date'))]
str_cols = ['cancer_type', 'primary_site_desc', 'intent', 'drug_name', 'postal_code']
feat_cols = [
    'mrn',
    'assessment_date',
    *str_cols,
    # everything not already routed to the date or string groups, in table order
    *(name for name in main.columns if name not in date_cols and name not in str_cols),
]

main_dates = main.select(date_cols)
main_data = main.select(feat_cols)

main_dates.write_parquet(f'{DATA_DIR}/processed/treatment_centered_dates.parquet')
main_data.write_parquet(f'{DATA_DIR}/processed/treatment_centered_data.parquet')