In [1]:
%%capture
# Cell output is captured (suppressed) by the %%capture cell magic above.
# Move the working directory up one level (presumably to the project root — confirm).
# NOTE(review): `%cd ../` is relative, so re-running this cell in the same kernel
# moves up one more directory each time; run it only once per kernel session.
%cd ../
# Load the autoreload extension and set mode 2, so that edits to imported modules
# are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import yaml

from make_clinical_dataset import ROOT_DIR
from make_clinical_dataset.combine import (
    add_engineered_features,
    combine_demographic_to_main_data, 
    combine_event_to_main_data,
    combine_feat_to_main_data, 
    combine_perc_dose_to_main_data,
    combine_treatment_to_main_data
)
from make_clinical_dataset.preprocess.cancer_registry import get_demographic_data
from make_clinical_dataset.preprocess.dart import get_symptoms_data
from make_clinical_dataset.preprocess.emergency import get_emergency_room_data
from make_clinical_dataset.preprocess.lab import get_lab_data
from make_clinical_dataset.preprocess.opis import get_treatment_data
from make_clinical_dataset.util import load_included_drugs, load_included_regimens

# Raise the pandas display limits to 100 columns and 100 rows.
for display_limit in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_limit, 100)

In [3]:
def quick_summary(df):
    """Print a three-line overview of a treatment-session table.

    Reports the number of rows (one row is reported as one treatment session),
    the number of distinct values in the ``mrn`` column, and the earliest and
    latest calendar dates found in the ``treatment_date`` column.

    Args:
        df: DataFrame with an ``mrn`` column and a datetime-like
            ``treatment_date`` column.

    Returns:
        None. The summary is written to standard output.
    """
    session_count = len(df)
    print(f'Number of treatment sessions = {session_count}')

    patient_count = df["mrn"].nunique()
    print(f'Number of patients = {patient_count}')

    treatment_dates = df["treatment_date"]
    first_day = treatment_dates.min().date()
    last_day = treatment_dates.max().date()
    print(f'Cohort from {first_day} to {last_day}')

In [4]:
# Project configuration (parsed with the safe YAML loader)
with open(f'{ROOT_DIR}/config.yaml') as config_file:
    cfg = yaml.safe_load(config_file)

# External reference data: drug and regimen inclusion lists
included_drugs = load_included_drugs()
included_regimens = load_included_regimens()

# RESEARCH_ID -> PATIENT_MRN lookup dict built from the external mapping file
mrn_map = (
    pd.read_csv(f'{ROOT_DIR}/data/external/MRN_map.csv')
    .set_index('RESEARCH_ID')['PATIENT_MRN']
    .to_dict()
)

# Build the features

## DART

In [5]:
# Build the symptom table from the raw data directory and store it as an interim parquet file.
dart = get_symptoms_data(
    data_dir=f'{ROOT_DIR}/data/raw',
)
dart.to_parquet(
    f'{ROOT_DIR}/data/interim/symptom.parquet.gzip',
    index=False,
    compression='gzip',
)

04:29:57 INFO:Removing 1118 patients and 8940 sessions in which consent to research was declined
04:29:57 INFO:Removing 2 patients and 12 sessions in which sex is Unknown
04:29:57 INFO:Removing 174 patients and 2468 sessions without any symptom scores


## Cancer Registry

In [6]:
# Build the demographic table from the raw data directory and store it as an interim parquet file.
canc_reg = get_demographic_data(
    data_dir=f'{ROOT_DIR}/data/raw',
)
canc_reg.to_parquet(
    f'{ROOT_DIR}/data/interim/demographic.parquet.gzip',
    index=False,
    compression='gzip',
)

04:30:30 INFO:Removing 1 patients and 1 sessions with no MRN
04:30:30 INFO:Removing 2 patients and 2 sessions in which sex is other than Male/Female


## OPIS

In [5]:
# Build the treatment table using the included drug and regimen lists,
# store it as an interim parquet file, then report the cohort size.
opis = get_treatment_data(
    included_drugs,
    included_regimens,
    data_dir=f'{ROOT_DIR}/data/raw',
)
opis.to_parquet(
    f'{ROOT_DIR}/data/interim/treatment.parquet.gzip',
    index=False,
    compression='gzip',
)
quick_summary(opis)
print(f'Number of unique regimens: {opis["regimen"].nunique()}')

05:00:31 INFO:Removing 1 patients and 998 sessions with missing regimen info
05:00:31 INFO:Removing 708 patients and 74259 sessions not part of selected regimens
05:00:31 INFO:Removing 102 patients and 111404 sessions that received only trial, supportive, and/or non-aerodigestive drugs
05:00:31 INFO:Removing 0 patients and 137 sessions where dosage is not provided
05:00:31 INFO:Removing 0 patients and 1 sessions that are duplicate rows except for first_treatment_date
05:00:31 INFO:Removing 0 patients and 1 sessions that are duplicate rows except for cycle_number


Number of treatment sessions = 118597
Number of patients = 10272
Cohort from 2005-11-16 to 2021-02-01
Number of unique regimens: 134


## Laboratory Tests 
Hematology and Biochemistry

In [103]:
# Build the lab table (the MRN mapping is passed in) and store it as an interim parquet file.
lab = get_lab_data(
    mrn_map,
    data_dir=f'{ROOT_DIR}/data/raw',
)
lab.to_parquet(
    f'{ROOT_DIR}/data/interim/lab.parquet.gzip',
    index=False,
    compression='gzip',
)

{'sodium': 'mmol/L', 'chloride': 'mmol/L', 'potassium': 'mmol/L', 'hemoglobin': 'g/L', 'mean_corpuscular_volume': 'fL', 'mean_corpuscular_hemoglobin_concentration': 'g/L', 'platelet': 'x10e9/L', 'mean_corpuscular_hemoglobin': 'pg', 'mean_platelet_volume': 'fL', 'creatinine': 'umol/L', 'lymphocyte': 'x10e9/L', 'monocyte': 'x10e9/L', 'hematocrit': 'L/L', 'red_blood_cell': 'x10e12/L', 'white_blood_cell': 'x10e9/L', 'neutrophil': 'x10e9/L', 'glucose': 'mmol/L', 'magnesium': 'mmol/L', 'aspartate_aminotransferase': 'U/L', 'total_bilirubin': 'umol/L', 'alkaline_phosphatase': 'U/L', 'alanine_aminotransferase': 'U/L', 'eosinophil': 'x10e9/L', 'phosphate': 'mmol/L', 'bicarbonate': 'mmol/L', 'albumin': 'g/L', 'red_cell_distribution_width': '%CV', 'basophil': 'x10e9/L', 'lactate_dehydrogenase': 'U/L', 'activated_partial_thromboplastin_time': 's', 'carbohydrate_antigen_19-9': 'kU/L', 'carcinoembryonic_antigen': 'ug/L'}


## Emergency Room Visits

In [123]:
# Build the emergency room visit table and store it as an interim parquet file.
er_visit = get_emergency_room_data(
    data_dir=f'{ROOT_DIR}/data/raw',
)
er_visit.to_parquet(
    f'{ROOT_DIR}/data/interim/emergency_room_visit.parquet.gzip',
    index=False,
    compression='gzip',
)

02:56:46 INFO:Removing 0 patients and 7 sessions which are duplicate entries


# Combine the features

In [6]:
# Reload every interim table from disk so the combine steps below do not
# depend on the build cells above having been run in this kernel session.
trt = pd.read_parquet(f'{ROOT_DIR}/data/interim/treatment.parquet.gzip')
dmg = pd.read_parquet(f'{ROOT_DIR}/data/interim/demographic.parquet.gzip')
sym = pd.read_parquet(f'{ROOT_DIR}/data/interim/symptom.parquet.gzip')
lab = pd.read_parquet(f'{ROOT_DIR}/data/interim/lab.parquet.gzip')
erv = pd.read_parquet(f'{ROOT_DIR}/data/interim/emergency_room_visit.parquet.gzip')

## Align on treatment sessions

In [6]:
# Start the treatment-aligned dataset: attach demographics to each treatment session.
df = combine_demographic_to_main_data(
    main=trt,
    demographic=dmg,
    main_date_col='treatment_date',
)
quick_summary(df)

05:59:24 INFO:Removing 975 patients and 7379 sessions with missing birth date
05:59:25 INFO:Removing 0 patients and 6 sessions under 18 years of age


Number of treatment sessions = 111212
Number of patients = 9297
Cohort from 2005-11-16 to 2021-02-01


In [None]:
#TODO: Try polars/dask for performance comparison
# Attach symptom scores, using the configured symptom lookback window ending at the treatment date.
df = combine_feat_to_main_data(
    main=df,
    feat=sym,
    main_date_col='treatment_date',
    feat_date_col='survey_date',
    time_window=(-cfg['symp_lookback_window'], 0),
)
# Attach lab values, using the configured lab lookback window ending at the treatment date.
df = combine_feat_to_main_data(
    main=df,
    feat=lab,
    main_date_col='treatment_date',
    feat_date_col='obs_date',
    time_window=(-cfg['lab_lookback_window'], 0),
)

In [None]:
# Attach emergency department visit events relative to each treatment date.
df = combine_event_to_main_data(
    main=df,
    event=erv,
    main_date_col='treatment_date',
    event_date_col='event_date',
    event_name='ED_visit',
    lookback_window=cfg['ed_visit_lookback_window'],
)

In [None]:
# Add the per-drug dose information, then the engineered features keyed on the treatment date.
df = combine_perc_dose_to_main_data(
    main=df,
    included_drugs=included_drugs,
)
df = add_engineered_features(
    df,
    date_col='treatment_date',
)

In [17]:
# Persist the treatment-aligned dataset to the processed data directory.
df.to_parquet(
    f'{ROOT_DIR}/data/processed/treatment_centered_clinical_dataset.parquet.gzip',
    index=False,
    compression='gzip',
)

## Align on every Monday

In [31]:
# Build the assessment grid: one row per (patient, Monday) for every Monday in 2018.
mrns = trt['mrn'].unique()
dates = pd.date_range(start='2018-01-01', end='2018-12-31', freq='W-MON')
# pandas-native cartesian product. It yields the same mrn-major row order and the
# same two columns as DataFrame(itertools.product(mrns, dates)), but needs no
# mid-notebook import and does not materialise every (mrn, date) tuple in Python.
df = (
    pd.MultiIndex.from_product([mrns, dates], names=['mrn', 'assessment_date'])
    .to_frame(index=False)
)

In [32]:
# Build the Monday-aligned dataset. Every combine step is anchored on the
# weekly `assessment_date` rather than on a treatment date.

# Attach treatment information found within the configured lookback window before each assessment date.
df = combine_treatment_to_main_data(
    df,
    trt,
    main_date_col='assessment_date',
    time_window=(-cfg['trt_lookback_window'], 0),
)
df = combine_demographic_to_main_data(
    main=df,
    demographic=dmg,
    main_date_col='assessment_date',
)
df = combine_feat_to_main_data(
    main=df,
    feat=sym,
    main_date_col='assessment_date',
    feat_date_col='survey_date',
    time_window=(-cfg['symp_lookback_window'], 0),
)
df = combine_feat_to_main_data(
    main=df,
    feat=lab,
    main_date_col='assessment_date',
    feat_date_col='obs_date',
    time_window=(-cfg['lab_lookback_window'], 0),
)
# Fix: this call previously used main_date_col='treatment_date' (carried over from
# the treatment-aligned section). ED visits must be anchored on the assessment
# date, like every other step in this cell.
df = combine_event_to_main_data(
    main=df,
    event=erv,
    main_date_col='assessment_date',
    event_date_col='event_date',
    event_name='ED_visit',
    lookback_window=cfg['ed_visit_lookback_window'],
)
df = combine_perc_dose_to_main_data(main=df, included_drugs=included_drugs)
df = add_engineered_features(df, date_col='assessment_date')
# df.to_parquet(f'{ROOT_DIR}/data/processed/weekly_monday_clinical_dataset.parquet.gzip', compression='gzip', index=False)

04:43:59 INFO:Removing 975 patients and 51675 sessions with missing birth date
04:44:00 INFO:Removing 0 patients and 0 sessions under 18 years of age


In [33]:
#TODO: analyze how missingness differs for 5 day, 7 day, 10 day, 14 day baseline lab value
#TODO: set up DVC (data version control)