In [None]:
%%capture
%cd ../../
%load_ext autoreload
%autoreload 2

In [None]:
import datetime

import pandas as pd
import polars as pl

from make_clinical_dataset.constants import INFO_DIR, ROOT_DIR

pl.Config.set_tbl_rows(100)

In [None]:
# Date tag that identifies which dated dataset folder this notebook works on
DATE = '2025-03-29'
# Folder for that dataset; the cells below read from its `interim/` subfolder
# and write their outputs to its `processed/` subfolder
DATA_DIR = f"{ROOT_DIR}/data/final/data_{DATE}"

In [None]:
# --- features and targets (interim parquet outputs of the preprocessing pipeline) ---
chemo = pl.read_parquet(f'{DATA_DIR}/interim/chemo.parquet')
lab = pl.read_parquet(f'{DATA_DIR}/interim/lab.parquet')
sym = pl.read_parquet(f'{DATA_DIR}/interim/symptom.parquet')

# --- emergency-related clinical notes ---
# note types to keep as the emergency / discharge subset
procs = ["Unscheduled Discharge Summary", "ED Prov Note", "Disch Summ", "Discharge Summary"]
notes = pl.read_parquet(f'{ROOT_DIR}/data/processed/clinical_notes/data_pull_2025-01-08/merged_processed_cleaned_clinical_notes.parquet.gzip')
emerg = (
    notes
    .rename({"Observations.ProcName": "proc_name"})
    .filter(pl.col('proc_name').is_in(procs))
)
# the full notes table is no longer needed once the subset is extracted; release it
del notes

# --- demographics (raw csv files, loaded with pandas) ---
cancer_diag = pd.read_csv(f'{INFO_DIR}/cancer_diag.csv')
postal_codes = pd.read_csv(f'{INFO_DIR}/postal_codes.csv')

# Quick and dirty dataset

In [None]:
def merge_closest_measurements(
    main: pl.DataFrame | pl.LazyFrame, 
    meas: pl.DataFrame | pl.LazyFrame, 
    main_date_col: str,
    meas_date_col: str, 
    direction: str = 'backward',
    time_window: tuple[int, int] = (-5,0),
    merge_individually: bool = True,
    include_meas_date: bool = False
) -> pl.DataFrame | pl.LazyFrame:
    """Extract the closest measurements (lab tests, symptom scores, etc) prior to / after the main date 
    within a lookback / lookahead window and combine them to the main dataset

    Both main and meas should have an `mrn` column and a date column. The merge is an as-of join
    matched on `mrn`.

    `main` must already be sorted by `main_date_col` within each `mrn` (it is not re-sorted here so
    that the row order of the output always equals the row order of the input). `meas` is sorted
    inside this function.
    
    Args:
        main: The anchor rows; one output row is produced per row of `main`
        meas: The measurements to attach to `main`
        main_date_col: The column name of the main visit date
        meas_date_col: The column name of the measurement date
        time_window: The start and end of the window in terms of number of days after(+)/before(-) the main visit dates
        direction: specifies whether to merge measurements before or after the main date. Either 'backward' or 'forward'
        merge_individually: If True, merges each measurement column separately, skipping the rows where
            that column is null, so every column gets its own closest non-null value
        include_meas_date: If True, include the date of the closest measurement that was merged, as
            `{column}_{meas_date_col}`. Only takes effect when `merge_individually` is True; when
            `merge_individually` is False the measurement date column is neither renamed nor dropped here

    Returns:
        `main` with the measurement columns added

    Raises:
        ValueError: if `direction` is not 'backward' / 'forward', or if the window start is after the window end
    """
    lower_limit, upper_limit = time_window
    if lower_limit > upper_limit:
        raise ValueError(f"time_window start must not be after its end, got {time_window!r}")

    # The as-of search starts from one edge of the window and may reach at most the window's
    # width (the tolerance below) in the search direction:
    #   backward: start at main date + upper limit and look back
    #   forward:  start at main date + lower limit and look ahead
    if direction == 'backward':
        anchor_offset = upper_limit
    elif direction == 'forward':
        anchor_offset = lower_limit
    else:
        raise ValueError(f"direction must be 'backward' or 'forward', got {direction!r}")
    main = main.with_columns(
        (pl.col(main_date_col) + pl.duration(days=anchor_offset)).alias("main_date")
    )

    # ensure date types match (collect_schema works for both DataFrame and LazyFrame)
    anchor_dtype = main.collect_schema()["main_date"]
    meas = meas.with_columns(pl.col(meas_date_col).cast(anchor_dtype))

    # join_asof needs the join key sorted within each `by` group. The check is switched off below,
    # so unsorted measurements would silently give wrong matches. A stable sort leaves an
    # already-sorted frame untouched.
    meas = meas.sort(["mrn", meas_date_col], maintain_order=True)

    merge_kwargs = dict(
        left_on='main_date', right_on=meas_date_col, by='mrn', strategy=direction,
        tolerance=datetime.timedelta(days=upper_limit - lower_limit), check_sortedness=False
    )
    
    if merge_individually:
        # merge each measurement column individually
        value_cols = [col for col in meas.collect_schema().names() if col not in ("mrn", meas_date_col)]
        for col in value_cols:
            # only rows where this measurement was actually taken can be matched
            data_to_merge = meas.filter(pl.col(col).is_not_null()).select(["mrn", meas_date_col, col])

            # merges the closest row to main date while matching on mrn
            main = main.join_asof(data_to_merge, **merge_kwargs)

            if include_meas_date:
                main = main.rename({meas_date_col: f"{col}_{meas_date_col}"})
            else:
                main = main.drop(meas_date_col)

    else:
        main = main.join_asof(meas, **merge_kwargs)

    # remove the temporary shifted anchor date
    main = main.drop("main_date")
    return main

In [None]:
#TODO: rerun treatment preprocessing pipeline, dropping rows without treatment date
chemo = chemo.filter(pl.col('treatment_date').is_not_null())

chemo = chemo.filter(pl.col('drug_type') == "direct")
chemo = chemo.select(
    'mrn', 'treatment_date', 'drug_name', 'first_treatment_date', 'intent', 
    'cycle_number', 'body_surface_area', 'height', 'weight'
)
# one-hot encode drugs - TODO: retrieve the dosages instead of binary 0/1 for each drug
drugs = chemo['drug_name'].unique()
chemo = chemo.with_columns(pl.col('drug_name').alias('drug')) # keep the original string column
chemo = chemo.to_dummies(columns='drug')
# Take the indicator column names from the frame itself instead of rebuilding them from `drugs`:
# a missing drug name becomes the column `drug_null`, which f'drug_{None}' would not match
drug_cols = [col for col in chemo.columns if col.startswith('drug_') and col != 'drug_name']
# merge same-day rows
chemo = chemo.group_by('mrn', 'treatment_date').agg(
    pl.col('body_surface_area').mean(),
    pl.col('height').mean(),
    pl.col('weight').mean(),
    # if two treatments (the old regimen and new regimen) overlap on same day, use data associated with the most recent regimen 
    # NOTE: examples found thru df.group_by('mrn', 'treatment_date').agg(pl.col('first_treatment_date').n_unique() > 1)
    pl.col('cycle_number').min(),
    pl.col('first_treatment_date').max(),
    # TODO: come up with robust way to handle the following conflicts
    pl.col('intent').first(),
    # combine dosages together (a drug counts as given if any same-day row has it)
    *(pl.col(col).max() for col in drug_cols),
    # concat the drugs together into one comma-separated string per (mrn, treatment_date)
    # NOTE: pl.concat_str joins *columns* row by row; inside agg it would return a list of
    # the individual names instead of one joined string
    pl.col('drug_name').str.join(",")
)
# NOTE: group_by's maintain_order=True is not efficient, better to sort it again right after
chemo = chemo.sort('mrn', 'treatment_date')

In [None]:
%%time
# select anchor: one row per patient per treatment date
main = chemo.select('mrn', 'treatment_date').rename({'treatment_date': 'assessment_date'})

# merge demog
# TODO: EDA - show number of patients with multiple birth dates
df = pd.read_csv(f'{INFO_DIR}/cancer_diag.csv')
df.columns = df.columns.str.lower()
df = df.rename(columns={'type': 'cancer_type', 'medical_record_number': 'mrn', 'date_of_birth': 'birth_date'})
df['mrn'] = df['mrn'].astype(int)
for col in ['birth_date', 'diagnosis_date']: 
    df[col] = pd.to_datetime(df[col])
df = df.drop(columns=['morphology_desc']).drop_duplicates()
# The as-of join needs the diagnoses sorted by date within each patient, and the raw csv gives
# no such guarantee. The stable sort keeps same-day diagnoses in file order.
cancer_diag = pl.from_pandas(df).sort('mrn', 'diagnosis_date', maintain_order=True)
# attach the most recent diagnosis on or before the assessment date
# (the huge lookback makes the window effectively unbounded)
main = merge_closest_measurements(
    main, cancer_diag, main_date_col="assessment_date", meas_date_col="diagnosis_date", 
    merge_individually=False, time_window=(-100_000_000, 0)
)
# create age column (in years)
age = (pl.col("assessment_date") - pl.col("birth_date")).dt.total_days() / 365.25
main = main.with_columns(age.alias('age'))
# filter patients with missing age
main = main.filter(pl.col('age').is_not_null())

# merge chemo
# NOTE: by setting a tolerance (even if it's 0), all columns must be not null
main = merge_closest_measurements(
    main, chemo, main_date_col="assessment_date", meas_date_col="treatment_date", merge_individually=False
)

# merge radiation

# merge labs (closest non-null value of each lab test within the 5 days up to the assessment date)
lab = lab.with_columns(pl.col('mrn').cast(pl.Int64)) # TODO: do this in lab preprocessing
main = merge_closest_measurements(
    main, lab, main_date_col="assessment_date", meas_date_col="obs_date", include_meas_date=True, time_window=(-5,0)
)

# merge symptoms (closest non-null value of each symptom score within the 30 days up to the assessment date)
main = merge_closest_measurements(
    main, sym, main_date_col="assessment_date", meas_date_col="obs_date", include_meas_date=True, time_window=(-30,0)
)

# add targets
# 1) ED
# 2) Death
# 3) CTCAE

# add engineered features

In [None]:
# Split the merged table into two files that share the `mrn` key:
#   - the date columns (every column whose name ends with 'date')
#   - the features (identifiers first, then the string columns, then everything else)
date_cols = ['mrn', *(col for col in main.columns if col.endswith('date'))]
str_cols = ['cancer_type', 'primary_site_desc', 'intent', 'drug_name']
feat_cols = [
    'mrn', 'assessment_date', *str_cols,
    *(col for col in main.columns if col not in date_cols and col not in str_cols),
]
main_dates = main.select(date_cols)
main_dates.write_parquet(f'{DATA_DIR}/processed/treatment_centered_dates.parquet')
main_data = main.select(feat_cols)
main_data.write_parquet(f'{DATA_DIR}/processed/treatment_centered_data.parquet')

In [None]:
#TODO:
# check with Rob what to do with the EPIC drugs that are missing doses (ask Jeremy for the data guide)
# incorporate drug dose into the given dose (use only the ones where we were able to extract the mg)
# normalize regimens

# final process of treatment (how to feature engineer? just intent, cycle number, line of therapy, height, weight, bsa, and each drug as a feature)

# create subset for ED visits, for clinic visits

# get the sex

# combine (get a preliminary AIM2REDUCE dataset using just the labs and symptoms and treatments), anchor based on treatment date for now (clinic date next)

# combine ED, demog, last seen date, etc