In [1]:
%%capture
%cd ../
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import yaml

from make_clinical_dataset import ROOT_DIR
from make_clinical_dataset.combine import (
    add_engineered_features,
    combine_demographic_to_main_data, 
    combine_event_to_main_data,
    combine_meas_to_main_data,
    combine_perc_dose_to_main_data,
    combine_treatment_to_main_data,
)
from make_clinical_dataset.label import get_CTCAE_labels, get_death_labels, get_ED_labels, get_symptom_labels
from make_clinical_dataset.preprocess.cancer_registry import get_demographic_data
from make_clinical_dataset.preprocess.clinic import get_clinical_notes_data, get_clinic_visits_during_treatment, backfill_treatment_info
from make_clinical_dataset.preprocess.dart import get_symptoms_data
from make_clinical_dataset.preprocess.emergency import get_emergency_room_data
from make_clinical_dataset.preprocess.lab import get_lab_data
from make_clinical_dataset.preprocess.opis import get_treatment_data
from make_clinical_dataset.preprocess.radiology import get_radiology_data
from make_clinical_dataset.preprocess.recist import get_recist_data
from make_clinical_dataset.util import load_included_drugs, load_included_regimens

from ml_common.anchor import merge_closest_measurements

pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)

In [3]:
def quick_summary(df):
    print(f'Number of sessions = {len(df)}')
    print(f'Number of patients = {df["mrn"].nunique()}')
    print(f'Cohort from {df["treatment_date"].min().date()} to {df["treatment_date"].max().date()}')

def check_overlap(main, feat, main_name, feat_name):
    mask = ~main['mrn'].isin(feat['mrn'])
    n_sessions = sum(mask)
    perc_sessions = (mask).mean()*100
    n_patients = main.loc[mask, 'mrn'].nunique()
    perc_patients = (n_patients / main['mrn'].nunique()) * 100
    print(f'{perc_sessions:.1f}% (N={n_sessions}) of sessions and {perc_patients:.1f}% (N={n_patients}) of patients '
          f'in the {main_name} do not have overlapping mrns with the {feat_name}')

In [4]:
# load config
with open(f'{ROOT_DIR}/config.yaml') as file:
    cfg = yaml.safe_load(file)

# load external data
included_drugs = load_included_drugs()
included_regimens = load_included_regimens()

mrn_map = pd.read_csv(f'{ROOT_DIR}/data/external/MRN_map.csv')
mrn_map = mrn_map.set_index('RESEARCH_ID')['PATIENT_MRN'].to_dict()

# Build the features & targets

## DART

In [39]:
dart = get_symptoms_data(data_dir=f'{ROOT_DIR}/data/raw')
dart.to_parquet(f'{ROOT_DIR}/data/interim/symptom.parquet.gzip', compression='gzip', index=False)

04:22:52 INFO:Removing 1118 patients and 8940 sessions in which consent to research was declined
04:22:52 INFO:Removing 2 patients and 12 sessions in which sex is Unknown
04:22:52 INFO:Removing 174 patients and 2468 sessions without any symptom scores


## Cancer Registry

In [40]:
canc_reg = get_demographic_data(data_dir=f'{ROOT_DIR}/data/raw')
canc_reg.to_parquet(f'{ROOT_DIR}/data/interim/demographic.parquet.gzip', compression='gzip', index=False)

04:22:53 INFO:Removing 1 patients and 1 sessions with no MRN
04:22:53 INFO:Removing 2 patients and 2 sessions in which sex is other than Male/Female


## OPIS

In [42]:
opis = get_treatment_data(included_drugs, included_regimens, data_dir=f'{ROOT_DIR}/data/raw')
opis.to_parquet(f'{ROOT_DIR}/data/interim/treatment.parquet.gzip', compression='gzip', index=False)
quick_summary(opis)
print(f'Number of unique regimens: {opis["regimen"].nunique()}')

04:30:48 INFO:Removing 1 patients and 998 sessions with missing regimen info
04:30:48 INFO:Removing 708 patients and 74259 sessions not part of selected regimens
04:30:48 INFO:Removing 102 patients and 111404 sessions that received only trial, supportive, and/or non-aerodigestive drugs
04:30:48 INFO:Removing 0 patients and 137 sessions where dosage is not provided
04:30:48 INFO:Removing 0 patients and 1 sessions that are duplicate rows except for first_treatment_date
04:30:49 INFO:Removing 0 patients and 1 sessions that are duplicate rows except for cycle_number


Number of sessions = 118597
Number of patients = 10272
Cohort from 2005-11-16 to 2021-02-01
Number of unique regimens: 134


## Laboratory Tests 
Hematology and Biochemistry

In [None]:
lab = get_lab_data(mrn_map, data_dir=f'{ROOT_DIR}/data/raw')
lab.to_parquet(f'{ROOT_DIR}/data/interim/lab.parquet.gzip', compression='gzip', index=False)

{'sodium': 'mmol/L', 'chloride': 'mmol/L', 'potassium': 'mmol/L', 'hemoglobin': 'g/L', 'mean_corpuscular_volume': 'fL', 'mean_corpuscular_hemoglobin_concentration': 'g/L', 'platelet': 'x10e9/L', 'mean_corpuscular_hemoglobin': 'pg', 'mean_platelet_volume': 'fL', 'creatinine': 'umol/L', 'lymphocyte': 'x10e9/L', 'monocyte': 'x10e9/L', 'hematocrit': 'L/L', 'red_blood_cell': 'x10e12/L', 'white_blood_cell': 'x10e9/L', 'neutrophil': 'x10e9/L', 'glucose': 'mmol/L', 'magnesium': 'mmol/L', 'aspartate_aminotransferase': 'U/L', 'total_bilirubin': 'umol/L', 'alkaline_phosphatase': 'U/L', 'alanine_aminotransferase': 'U/L', 'eosinophil': 'x10e9/L', 'phosphate': 'mmol/L', 'bicarbonate': 'mmol/L', 'albumin': 'g/L', 'red_cell_distribution_width': '%CV', 'basophil': 'x10e9/L', 'lactate_dehydrogenase': 'U/L', 'activated_partial_thromboplastin_time': 's', 'carbohydrate_antigen_19-9': 'kU/L', 'carcinoembryonic_antigen': 'ug/L'}


## Emergency Room Visits

In [None]:
er_visit = get_emergency_room_data(data_dir=f'{ROOT_DIR}/data/raw')
er_visit.to_parquet(f'{ROOT_DIR}/data/interim/emergency_room_visit.parquet.gzip', compression='gzip', index=False)

02:56:46 INFO:Removing 0 patients and 7 sessions which are duplicate entries


## Radiology Reports

In [None]:
reports = get_radiology_data(mrn_map, data_dir=f'{ROOT_DIR}/data/raw')
reports.to_parquet(f'{ROOT_DIR}/data/interim/reports.parquet.gzip', compression='gzip', index=False)

## Clinical Notes

In [None]:
clinical_notes = get_clinical_notes_data(data_dir=f'{ROOT_DIR}/data/raw')
clinical_notes.to_parquet(f'{ROOT_DIR}/data/interim/clinical_notes.parquet.gzip', compression='gzip', index=False)

Removing 105 visits that "occured before" 2006-01-05 00:00:00


## RECIST - COMPASS

In [None]:
recist = get_recist_data(data_dir=f'{ROOT_DIR}/data/external')
recist.to_parquet(f'{ROOT_DIR}/data/interim/recist.parquet.gzip', compression='gzip', index=False)

# Combine the features & targets

In [3]:
lab = pd.read_parquet(f'{ROOT_DIR}/data/interim/lab.parquet.gzip')
trt = pd.read_parquet(f'{ROOT_DIR}/data/interim/treatment.parquet.gzip')
dmg = pd.read_parquet(f'{ROOT_DIR}/data/interim/demographic.parquet.gzip')
sym = pd.read_parquet(f'{ROOT_DIR}/data/interim/symptom.parquet.gzip')
erv = pd.read_parquet(f'{ROOT_DIR}/data/interim/emergency_room_visit.parquet.gzip')
last_seen = pd.read_parquet(f'{ROOT_DIR}/data/interim/last_seen_dates.parquet.gzip')

In [None]:
check_overlap(trt, lab, 'treatment database', 'laboratory database')
check_overlap(trt, sym, 'treatment database', 'symptoms database')

26.6% (N=31561) of sessions and 25.3% (N=2598) of patients in the treatment database do not have overlapping mrns with the laboratory database
25.4% (N=30100) of sessions and 30.3% (N=3117) of patients in the treatment database do not have overlapping mrns with the symptoms database


## Align on treatment sessions

In [6]:
df = combine_demographic_to_main_data(trt, dmg, 'treatment_date')
df['last_seen_date'] = df['mrn'].map(last_seen['last_seen_date'])
df['assessment_date'] = df['treatment_date']
quick_summary(df)

09:22:48 INFO:Removing 975 patients and 7379 sessions with missing birth date
09:22:48 INFO:Removing 0 patients and 6 sessions under 18 years of age


Number of sessions = 111212
Number of patients = 9297
Cohort from 2005-11-16 to 2021-02-01


In [7]:
# Extract features
# df = combine_meas_to_main_data(df, sym, 'treatment_date', 'survey_date', time_window=cfg['symp_lookback_window'], stats=['last'])
# df = combine_meas_to_main_data(df, lab, 'treatment_date', 'obs_date', time_window=cfg['lab_lookback_window'], stats=['last'])
# df.columns = df.columns.str.replace('_LAST', '')
df = merge_closest_measurements(df, sym, 'treatment_date', 'survey_date', time_window=cfg['symp_lookback_window'])
df = merge_closest_measurements(df, lab, 'treatment_date', 'obs_date', time_window=cfg['lab_lookback_window'])
df = combine_event_to_main_data(df, erv, 'treatment_date', 'event_date', event_name='ED_visit', lookback_window=cfg['ed_visit_lookback_window'])
df = combine_perc_dose_to_main_data(df, included_drugs)
df = add_engineered_features(df, 'treatment_date')

In [8]:
# Extract targets
df = get_death_labels(df, lookahead_window=[30, 365])
df = get_ED_labels(df, erv[['mrn', 'event_date']].copy(), lookahead_window=30) #, 'CTAS_score', 'CEDIS_complaint']
df = get_symptom_labels(df, sym, lookahead_window=30)
df = get_CTCAE_labels(df, lab, lookahead_window=30)

In [9]:
df.to_parquet(f'{ROOT_DIR}/data/processed/treatment_centered_dataset.parquet.gzip', compression='gzip', index=False)

## Align on clinic visits

In [6]:
clinic = pd.read_parquet(f'{ROOT_DIR}/data/interim/clinical_notes.parquet.gzip')
check_overlap(trt, clinic, 'treatment database', 'clinic database')
clinic = get_clinic_visits_during_treatment(clinic, trt)

0.4% (N=527) of sessions and 0.9% (N=90) of patients in the treatment database do not have overlapping mrns with the clinic database


In [7]:
# Extract features
df = combine_treatment_to_main_data(clinic, trt, 'clinic_date', time_window=cfg['trt_lookback_window'])
df['last_seen_date'] = df['mrn'].map(last_seen['last_seen_date'])
df['assessment_date'] = df['clinic_date']
df = backfill_treatment_info(df)
quick_summary(df)
df = combine_demographic_to_main_data(df, dmg, 'clinic_date')
df = merge_closest_measurements(df, sym, 'clinic_date', 'survey_date', time_window=cfg['symp_lookback_window'])
df = merge_closest_measurements(df, lab, 'clinic_date', 'obs_date', time_window=cfg['lab_lookback_window'])
df = combine_event_to_main_data(df, erv, 'clinic_date', 'event_date', event_name='ED_visit', lookback_window=cfg['ed_visit_lookback_window'])
df = combine_perc_dose_to_main_data(df, included_drugs)
df = add_engineered_features(df, 'clinic_date')
# Extract targets
df = get_death_labels(df, lookahead_window=[30, 365])
df = get_ED_labels(df, erv[['mrn', 'event_date']].copy(), lookahead_window=30)
df = get_symptom_labels(df, sym, lookahead_window=30)
df = get_CTCAE_labels(df, lab, lookahead_window=30)
df.to_parquet(f'{ROOT_DIR}/data/processed/clinic_centered_dataset.parquet.gzip', compression='gzip', index=False)

09:42:22 INFO:Removing 903 patients and 4299 sessions with missing birth date
09:42:22 INFO:Removing 0 patients and 0 sessions under 18 years of age


Number of sessions = 45611
Number of patients = 8776
Cohort from 2006-02-28 to 2021-01-21


## Align on every Mondays

In [None]:
from itertools import product
mrns = trt['mrn'].unique()
dates = pd.date_range(start='2018-01-01', end='2018-12-31', freq='W-MON')
df = pd.DataFrame(product(mrns, dates), columns=['mrn', 'assessment_date'])
df['last_seen_date'] = df['mrn'].map(last_seen['last_seen_date'])

In [None]:
# Extract features
df = combine_treatment_to_main_data(df, trt, 'assessment_date', time_window=cfg['trt_lookback_window'])
df = combine_demographic_to_main_data(df, dmg, 'assessment_date')
df = merge_closest_measurements(df, sym, 'assessment_date', 'survey_date', time_window=cfg['symp_lookback_window'])
df = merge_closest_measurements(df, lab, 'assessment_date', 'obs_date', time_window=cfg['lab_lookback_window'])
df = combine_event_to_main_data(df, erv, 'assessment_date', 'event_date', event_name='ED_visit', lookback_window=cfg['ed_visit_lookback_window'])
df = combine_perc_dose_to_main_data(df, included_drugs)
df = add_engineered_features(df, 'assessment_date')
# Extract targets
df = get_death_labels(df, lookahead_window=[30, 365])
df = get_ED_labels(df, erv[['mrn', 'event_date']].copy(), lookahead_window=30)
df = get_symptom_labels(df, sym, lookahead_window=30)
df = get_CTCAE_labels(df, lab, lookahead_window=30)
# df.to_parquet(f'{ROOT_DIR}/data/processed/weekly_monday_clinical_dataset.parquet.gzip', compression='gzip', index=False)

04:43:59 INFO:Removing 975 patients and 51675 sessions with missing birth date
04:44:00 INFO:Removing 0 patients and 0 sessions under 18 years of age
