In [1]:
%%capture
# Move one level up from the notebook's working directory before anything is imported;
# the `from src import ...` lines further down presumably need this to be the
# repository root — TODO confirm.
# NOTE(review): the path is relative, so re-running this cell without restarting the
# kernel climbs one more directory each time.
%cd ../
# Enable autoreload so that edits to imported modules are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from itertools import product

import pandas as pd
import yaml

from src import ROOT_DIR
from src.combine import (
    add_engineered_features,
    combine_demographic_to_main_data,
    combine_event_to_main_data,
    combine_feat_to_main_data,
    combine_perc_dose_to_main_data,
    combine_treatment_to_main_data
)
from src.preprocess.cancer_registry import get_demographic_data
from src.preprocess.dart import get_symptoms_data
from src.preprocess.emergency import get_emergency_room_data
from src.preprocess.lab import get_lab_data
from src.preprocess.opis import get_treatment_data
from src.util import load_included_drugs, load_included_regimens

# Let wide / long frames render up to 100 columns and 100 rows before pandas truncates them.
pd.options.display.max_columns = 100
pd.options.display.max_rows = 100

In [3]:
def quick_summary(df, date_col='treatment_date'):
    """Print a three-line overview of a cohort table.

    Reports the number of rows (sessions), the number of distinct patients
    (unique ``mrn`` values) and the first / last calendar day found in
    `date_col`.

    Args:
        df: Table with an ``mrn`` column and a datetime column named `date_col`.
        date_col: Name of the datetime column that defines the cohort period.
            Defaults to ``'treatment_date'`` (the previously hard-coded column),
            so existing calls behave exactly as before; pass another column
            name (e.g. ``'assessment_date'``) for tables aligned on a
            different date.
    """
    dates = df[date_col]
    print(f'Number of treatment sessions = {len(df)}')
    print(f'Number of patients = {df["mrn"].nunique()}')
    print(f'Cohort from {dates.min().date()} to {dates.max().date()}')

In [4]:
# Project settings: parsed from config.yaml at the repository root. The look-back
# windows used when combining features further down are read from `cfg`.
with open(f'{ROOT_DIR}/config.yaml') as file:
    cfg = yaml.safe_load(file)

# External reference data: the drugs and regimens that are in scope.
included_drugs, included_regimens = load_included_drugs(), load_included_regimens()

# Lookup dict RESEARCH_ID -> PATIENT_MRN, built straight from the mapping CSV.
mrn_map = (
    pd.read_csv(f'{ROOT_DIR}/data/external/MRN_map.csv')
    .set_index('RESEARCH_ID')['PATIENT_MRN']
    .to_dict()
)

# Build the features

## DART

In [5]:
# Build the symptom table from the raw data folder, then cache it under data/interim
# so the "Combine the features" section can reload it from disk.
dart = get_symptoms_data(data_dir=f'{ROOT_DIR}/data/raw')
dart.to_parquet(
    f'{ROOT_DIR}/data/interim/symptom.parquet.gzip',
    index=False,
    compression='gzip',
)

04:29:57 INFO:Removing 1118 patients and 8940 sessions in which consent to research was declined
04:29:57 INFO:Removing 2 patients and 12 sessions in which sex is Unknown
04:29:57 INFO:Removing 174 patients and 2468 sessions without any symptom scores


## Cancer Registry

In [6]:
# Build the demographic table from the raw data folder and cache it under data/interim.
canc_reg = get_demographic_data(data_dir=f'{ROOT_DIR}/data/raw')
canc_reg.to_parquet(
    f'{ROOT_DIR}/data/interim/demographic.parquet.gzip',
    index=False,
    compression='gzip',
)

04:30:30 INFO:Removing 1 patients and 1 sessions with no MRN
04:30:30 INFO:Removing 2 patients and 2 sessions in which sex is other than Male/Female


## OPIS

In [11]:
# Build the treatment table, restricted by the included drug / regimen lists,
# cache it under data/interim, then print the size and date span of the result.
opis = get_treatment_data(
    included_drugs,
    included_regimens,
    data_dir=f'{ROOT_DIR}/data/raw',
)
opis.to_parquet(
    f'{ROOT_DIR}/data/interim/treatment.parquet.gzip',
    index=False,
    compression='gzip',
)
quick_summary(opis)

03:33:59 INFO:Removing 1 patients and 998 sessions with missing regimen info
03:34:01 INFO:Removing 708 patients and 74259 sessions not part of selected regimens
03:34:01 INFO:Removing 102 patients and 111404 sessions that received only trial, supportive, and/or non-aerodigestive drugs
03:34:01 INFO:Removing 0 patients and 137 sessions where dosage is not provided
03:34:01 INFO:Removing 0 patients and 1 sessions that are duplicate rows except for first_treatment_date
03:34:02 INFO:Removing 0 patients and 1 sessions that are duplicate rows except for cycle_number


Number of treatment sessions = 118597
Number of patients = 10272
Cohort from 2005-11-16 to 2021-02-01


In [8]:
# How many distinct regimens remain, and how many sessions each one accounts for.
regimen_col = opis['regimen']
print(f'Number of unique regimens: {regimen_col.nunique()}')
dict(regimen_col.value_counts())

Number of unique regimens: 136


{'TRIAL': 9242,
 'GI-FOLFIRI+BEVACIZUMAB': 8356,
 'GI-GEM D1,8,15': 7223,
 'LU-ETOPCISP-RT': 5654,
 'GI-FOLFOX-6 MOD': 5473,
 'GI-GEMCISP (BILIARY)': 4300,
 'LU-ETOPCISP 3 DAY': 3477,
 'LU-PEMETREXED (NSCLC)': 3378,
 'LU-ETOPCARBO': 3297,
 'GI-GEM+ABRAXANE': 3284,
 'LU-VINOCISP': 3129,
 'GI-MITOFU': 3128,
 'GI-FOLFOX-4 MOD': 2807,
 'GI-FOLFIRINOX': 2527,
 'GI-ECF': 2414,
 'GI-FOLFIRI': 2353,
 'GI-FUFA-5 DAYS': 2078,
 'LU-PEMBROLIZUMAB': 1839,
 'GI-GEM 40MG/M2 2X/WK': 1753,
 'GI-GEM 7-WEEKLY': 1697,
 'LU-ETOPCARBO-RT': 1690,
 'GI-GEMCAP': 1688,
 'HN-CISPLATIN WEEKLY': 1682,
 'GI-CISPFU ESOPHAGEAL': 1487,
 'LU-PEMETREXED-CARBO': 1467,
 'GI-FOLFIRINOX-MOD': 1463,
 'LU-GEMCISP': 1438,
 'GI-FU CIV + RT': 1354,
 'LU-VINOCARBO': 1236,
 'LU-GEMCARBO': 1163,
 'GI-ECX': 1098,
 'HN-GEM D1,8': 1078,
 'LU-DOCEQ3W': 1068,
 'GI-PANITUMUMAB': 983,
 'LU-PACLICARBO': 857,
 'LU-NIVOLUMAB (CCO)': 849,
 'GI-FUFA C1,4,5 GASTRIC': 804,
 'LU-NIVO Q4WEEKS (CCO)': 795,
 'GI-FOLFOX (GASTRIC)': 756,
 'LU-ETOPCISP

In [9]:
# Show every column name of the treatment table as a plain Python list.
opis.columns.to_list()

['mrn',
 'treatment_date',
 'regimen',
 'height',
 'weight',
 'body_surface_area',
 'cycle_number',
 'first_treatment_date',
 'intent',
 'drug_ATEZOLIZUMAB COMPASSIONATE SUP_given_dose',
 'drug_ATEZOLIZUMAB STUDY SUPPLY_given_dose',
 'drug_BEVACIZUMAB (AVASTIN)_given_dose',
 'drug_BEVACIZUMAB TRIAL SUPPLY_given_dose',
 'drug_CAPECITABINE_given_dose',
 'drug_CAPECITABINE STUDY SUPPLY_given_dose',
 'drug_CARBOPLATIN_given_dose',
 'drug_CARBOPLATIN STUDY SUPPLY_given_dose',
 'drug_CETUXIMAB_given_dose',
 'drug_CETUXIMAB TRIAL SUPPLY_given_dose',
 'drug_CISPLATIN_given_dose',
 'drug_CISPLATIN STUDY SUPPLY_given_dose',
 'drug_CYCLOPHOSPHAMIDE_given_dose',
 'drug_DOCETAXEL_given_dose',
 'drug_DOCETAXEL - PAID_given_dose',
 'drug_DOCETAXEL STUDY SUPPLY_given_dose',
 'drug_DOCETAXEL TRIAL SUPPLY_given_dose',
 'drug_DOXORUBICIN HCL_given_dose',
 'drug_DOXORUBICIN STUDY SUPPLY_given_dose',
 'drug_DURVALUMAB_given_dose',
 'drug_DURVALUMAB COMPASSIONATE SUPPL_given_dose',
 'drug_DURVALUMAB STUDY S

## Laboratory Tests 
Hematology and Biochemistry

In [103]:
# Build the laboratory table (the helper is given the RESEARCH_ID -> MRN lookup)
# and cache it under data/interim.
lab = get_lab_data(mrn_map, data_dir=f'{ROOT_DIR}/data/raw')
lab.to_parquet(
    f'{ROOT_DIR}/data/interim/lab.parquet.gzip',
    index=False,
    compression='gzip',
)

{'sodium': 'mmol/L', 'chloride': 'mmol/L', 'potassium': 'mmol/L', 'hemoglobin': 'g/L', 'mean_corpuscular_volume': 'fL', 'mean_corpuscular_hemoglobin_concentration': 'g/L', 'platelet': 'x10e9/L', 'mean_corpuscular_hemoglobin': 'pg', 'mean_platelet_volume': 'fL', 'creatinine': 'umol/L', 'lymphocyte': 'x10e9/L', 'monocyte': 'x10e9/L', 'hematocrit': 'L/L', 'red_blood_cell': 'x10e12/L', 'white_blood_cell': 'x10e9/L', 'neutrophil': 'x10e9/L', 'glucose': 'mmol/L', 'magnesium': 'mmol/L', 'aspartate_aminotransferase': 'U/L', 'total_bilirubin': 'umol/L', 'alkaline_phosphatase': 'U/L', 'alanine_aminotransferase': 'U/L', 'eosinophil': 'x10e9/L', 'phosphate': 'mmol/L', 'bicarbonate': 'mmol/L', 'albumin': 'g/L', 'red_cell_distribution_width': '%CV', 'basophil': 'x10e9/L', 'lactate_dehydrogenase': 'U/L', 'activated_partial_thromboplastin_time': 's', 'carbohydrate_antigen_19-9': 'kU/L', 'carcinoembryonic_antigen': 'ug/L'}


## Emergency Room Visits

In [123]:
# Build the emergency-room visit table from the raw data folder and cache it under data/interim.
er_visit = get_emergency_room_data(data_dir=f'{ROOT_DIR}/data/raw')
er_visit.to_parquet(
    f'{ROOT_DIR}/data/interim/emergency_room_visit.parquet.gzip',
    index=False,
    compression='gzip',
)

02:56:46 INFO:Removing 0 patients and 7 sessions which are duplicate entries


# Combine the features

In [5]:
# Reload the five interim tables written by the "Build the features" section, so this
# section can run from the cached files without rebuilding them.
lab, trt, dmg, sym, erv = map(pd.read_parquet, (
    f'{ROOT_DIR}/data/interim/lab.parquet.gzip',
    f'{ROOT_DIR}/data/interim/treatment.parquet.gzip',
    f'{ROOT_DIR}/data/interim/demographic.parquet.gzip',
    f'{ROOT_DIR}/data/interim/symptom.parquet.gzip',
    f'{ROOT_DIR}/data/interim/emergency_room_visit.parquet.gzip',
))

## Align on treatment sessions

In [6]:
# Start the treatment-centred dataset: treatment sessions are the main table and
# demographics are attached relative to each session's treatment date.
df = combine_demographic_to_main_data(
    main=trt,
    demographic=dmg,
    main_date_col='treatment_date',
)
quick_summary(df)

05:59:24 INFO:Removing 975 patients and 7379 sessions with missing birth date
05:59:25 INFO:Removing 0 patients and 6 sessions under 18 years of age


Number of treatment sessions = 111212
Number of patients = 9297
Cohort from 2005-11-16 to 2021-02-01


In [7]:
#TODO: Try polars/dask for performance comparison
# Attach symptom scores first, then lab results. Each source has its own date column
# and its own look-back length (read from cfg); the window is passed as (-lookback, 0).
feature_sources = (
    (sym, 'survey_date', 'symp_lookback_window'),
    (lab, 'obs_date', 'lab_lookback_window'),
)
for feat_table, feat_date_col, window_key in feature_sources:
    df = combine_feat_to_main_data(
        main=df,
        feat=feat_table,
        main_date_col='treatment_date',
        feat_date_col=feat_date_col,
        time_window=(-cfg[window_key], 0),
    )

100%|██████████| 1604/1604 [00:04<00:00, 373.46it/s]
100%|██████████| 1604/1604 [00:04<00:00, 338.31it/s]
100%|██████████| 1604/1604 [00:04<00:00, 357.08it/s]
100%|██████████| 1604/1604 [00:04<00:00, 340.28it/s]
100%|██████████| 1831/1831 [00:06<00:00, 268.11it/s]
100%|██████████| 1830/1830 [00:06<00:00, 277.45it/s]
100%|██████████| 1830/1830 [00:06<00:00, 273.44it/s]
100%|██████████| 1830/1830 [00:06<00:00, 263.85it/s]


In [8]:
# Attach emergency-department visits to each treatment session, using the
# look-back length configured under `ed_visit_lookback_window`.
df = combine_event_to_main_data(
    main=df,
    event=erv,
    main_date_col='treatment_date',
    event_date_col='event_date',
    event_name='ED_visit',
    lookback_window=cfg['ed_visit_lookback_window'],
)

05:59:40 INFO:NumExpr defaulting to 8 threads.
05:59:40 INFO:NumExpr defaulting to 8 threads.
05:59:40 INFO:NumExpr defaulting to 8 threads.
  0%|          | 0/1100 [00:00<?, ?it/s]05:59:40 INFO:NumExpr defaulting to 8 threads.
100%|██████████| 1100/1100 [00:04<00:00, 230.08it/s]
100%|██████████| 1100/1100 [00:04<00:00, 229.07it/s]
100%|██████████| 1099/1099 [00:04<00:00, 228.83it/s]
100%|██████████| 1099/1099 [00:04<00:00, 230.73it/s]


In [9]:
# Add the dose features for the included drugs, then derive the engineered
# features relative to the treatment date.
df = add_engineered_features(
    combine_perc_dose_to_main_data(main=df, included_drugs=included_drugs),
    date_col='treatment_date',
)

  return (140 - df['age']) * df['weight'] * 1.23 * df['female'].replace({True: 0.85, False: 1}) / df['creatinine']


In [10]:
# Persist the finished treatment-centred dataset under data/processed.
df.to_parquet(
    f'{ROOT_DIR}/data/processed/treatment_centered_clinical_dataset.parquet.gzip',
    index=False,
    compression='gzip',
)

## Align on every Monday

In [31]:
# Build the skeleton of the weekly dataset: one row for every combination of a patient
# in the treatment table and a weekly (W-MON) date in the hard-coded 2018 range.
# `product` comes from itertools and is imported in the notebook's top import cell.
mrns = trt['mrn'].unique()
dates = pd.date_range(start='2018-01-01', end='2018-12-31', freq='W-MON')
df = pd.DataFrame(product(mrns, dates), columns=['mrn', 'assessment_date'])

In [32]:
# Build the weekly dataset. Every step is anchored on `assessment_date`, the date
# column of the (patient, Monday) grid created in the previous cell.
df = combine_treatment_to_main_data(df, trt, main_date_col='assessment_date', time_window=(-7,0))
df = combine_demographic_to_main_data(main=df, demographic=dmg, main_date_col='assessment_date')
df = combine_feat_to_main_data(
    main=df, feat=sym, main_date_col='assessment_date', feat_date_col='survey_date', time_window=(-cfg['symp_lookback_window'],0)
)
df = combine_feat_to_main_data(
    main=df, feat=lab, main_date_col='assessment_date', feat_date_col='obs_date', time_window=(-cfg['lab_lookback_window'],0)
)
# ED visits are matched against the assessment date, like every other step in this
# cell. This call previously passed 'treatment_date', which looks like a leftover
# from the treatment-centred pipeline — NOTE(review): confirm that was not intended.
df = combine_event_to_main_data(
    main=df, event=erv, main_date_col='assessment_date', event_date_col='event_date', event_name='ED_visit',
    lookback_window=cfg['ed_visit_lookback_window']
)
df = combine_perc_dose_to_main_data(main=df, included_drugs=included_drugs)
df = add_engineered_features(df, date_col='assessment_date')
# Saving the weekly dataset is currently disabled:
# df.to_parquet(f'{ROOT_DIR}/data/processed/weekly_monday_clinical_dataset.parquet.gzip', compression='gzip', index=False)

04:43:59 INFO:Removing 975 patients and 51675 sessions with missing birth date
04:44:00 INFO:Removing 0 patients and 0 sessions under 18 years of age


In [33]:
#TODO: analyze how missingness differs for 5-, 7-, 10-, and 14-day baseline lab values
#TODO: set up DVC (data version control)