In [2]:
# Move the working directory up one level (presumably to the project root, so that the
# `src` package imported in the next cell resolves — confirm).
# NOTE(review): the path is relative, so re-running this cell without restarting the kernel
# moves up one more directory each time.
%cd ../
# Load the autoreload extension; mode 2 reloads imported modules before each cell executes,
# so edits to the project's .py files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

/Users/khe/Documents/Projects/PMClinicalDatasetMaker


In [3]:
import os
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)
import yaml

from src import ROOT_DIR
from src.combine import (
    add_engineered_features,
    combine_demographic_to_main_data, 
    combine_feat_to_main_data, 
    combine_perc_dose_to_main_data,
    combine_treatment_to_main_data
)
from src.preprocess.cancer_registry import get_demographic_data
from src.preprocess.dart import get_symptoms_data
from src.preprocess.lab import get_lab_data
from src.preprocess.opis import get_treatment_data
from src.util import load_included_drugs

In [3]:
def quick_summary(df, date_col='treatment_date'):
    """Print the row count, number of unique patients, and date span of a dataset.

    Args:
        df: DataFrame with an ``mrn`` column and a datetime column named ``date_col``.
        date_col: Name of the datetime column used for the cohort date range.
            Defaults to ``'treatment_date'``; pass e.g. ``'assessment_date'`` for
            data aligned on a different date column.

    Returns:
        None. The summary is written to stdout.
    """
    print(f'Number of treatment sessions = {len(df)}')
    print(f'Number of patients = {df["mrn"].nunique()}')

    # min()/max() already skip missing dates, but when no valid date exists they
    # return NaT, and NaT.date() raises; report that case instead of crashing.
    dates = df[date_col].dropna()
    if dates.empty:
        print('Cohort date range unavailable (no valid dates)')
        return
    print(f'Cohort from {dates.min().date()} to {dates.max().date()}')

In [4]:
# Project settings (e.g. the symptom / lab look-back windows used further down)
with open(f'{ROOT_DIR}/config.yaml') as file:
    cfg = yaml.safe_load(file)

# External reference data: the drugs to include, and the RESEARCH_ID -> PATIENT_MRN lookup
included_drugs = load_included_drugs()

mrn_map = (
    pd.read_csv(f'{ROOT_DIR}/data/external/MRN_map.csv')
    .set_index('RESEARCH_ID')['PATIENT_MRN']
    .to_dict()
)

# Build the features

## DART

In [9]:
# Parse the raw DART extract. `dart` is saved as the interim symptom table;
# `dart_demog` is passed on to the cancer-registry step in the next section.
dart, dart_demog = get_symptoms_data(data_dir=f'{ROOT_DIR}/data/raw')
dart.to_parquet(
    f'{ROOT_DIR}/data/interim/symptom.parquet.gzip',
    compression='gzip',
    index=False,
)

08:17:56 INFO:Removing 3 patients whose sex is Unknown
08:17:56 INFO:Removing 161 patients without any symptom scores


## Cancer Registry

In [10]:
# Build the demographic table from the raw data, with the DART demographics
# supplied as `external_data`, then save it as an interim file.
canc_reg = get_demographic_data(
    data_dir=f'{ROOT_DIR}/data/raw',
    external_data=dart_demog,
)
canc_reg.to_parquet(
    f'{ROOT_DIR}/data/interim/demographic.parquet.gzip',
    compression='gzip',
    index=False,
)

08:17:58 INFO:Removing 1 patients with no MRN
08:17:58 INFO:Removing 2 patients whose sex is other than Male/Female
08:17:59 INFO:Number of patients in cancer registry = 55928. Adding an additional 3560 patients from DART.


## OPIS

In [11]:
# Build the treatment table restricted to `included_drugs`, save it as an
# interim file, and print a quick cohort summary.
opis = get_treatment_data(
    included_drugs,
    data_dir=f'{ROOT_DIR}/data/raw',
)
opis.to_parquet(
    f'{ROOT_DIR}/data/interim/treatment.parquet.gzip',
    compression='gzip',
    index=False,
)
quick_summary(opis)

08:18:06 INFO:Removing 1 patients with missing regimen info
08:18:07 INFO:Removing 128 patients who received only trial, supportive, and/or non-aerodigestive drugs
08:18:07 INFO:Removing 0 patients where dosage is not provided


Number of treatment sessions = 119830
Number of patients = 10315
Cohort from 2005-11-16 to 2021-02-01


## Laboratory Tests 
Hematology and Biochemistry

In [12]:
# Build the lab table (mrn_map is the RESEARCH_ID -> PATIENT_MRN lookup loaded above)
# and save it as an interim file.
lab = get_lab_data(
    mrn_map,
    data_dir=f'{ROOT_DIR}/data/raw',
)
lab.to_parquet(
    f'{ROOT_DIR}/data/interim/lab.parquet.gzip',
    compression='gzip',
    index=False,
)

{'sodium': 'mmol/L', 'chloride': 'mmol/L', 'potassium': 'mmol/L', 'hemoglobin': 'g/L', 'mean_corpuscular_volume': 'fL', 'mean_corpuscular_hemoglobin_concentration': 'g/L', 'platelet': 'x10e9/L', 'mean_corpuscular_hemoglobin': 'pg', 'mean_platelet_volume': 'fL', 'creatinine': 'umol/L', 'lymphocyte': 'x10e9/L', 'monocyte': 'x10e9/L', 'hematocrit': 'L/L', 'red_blood_cell': 'x10e12/L', 'white_blood_cell': 'x10e9/L', 'neutrophil': 'x10e9/L', 'glucose': 'mmol/L', 'magnesium': 'mmol/L', 'aspartate_aminotransferase': 'U/L', 'total_bilirubin': 'umol/L', 'alkaline_phosphatase': 'U/L', 'alanine_aminotransferase': 'U/L', 'eosinophil': 'x10e9/L', 'phosphate': 'mmol/L', 'bicarbonate': 'mmol/L', 'albumin': 'g/L', 'red_cell_distribution_width': '%CV', 'basophil': 'x10e9/L', 'lactate_dehydrogenase': 'U/L', 'activated_partial_thromboplastin_time': 's', 'carbohydrate_antigen_19-9': 'kU/L', 'carcinoembryonic_antigen': 'ug/L'}


# Combine the features

In [55]:
# Reload the four interim tables written by the "Build the features" section,
# so this section can be run without rebuilding them.
interim_dir = f'{ROOT_DIR}/data/interim'
lab, trt, dmg, sym = (
    pd.read_parquet(f'{interim_dir}/{stem}.parquet.gzip')
    for stem in ('lab', 'treatment', 'demographic', 'symptom')
)

## Align on treatment sessions

In [14]:
# Start the treatment-centred dataset: attach demographics to every treatment session,
# then print the resulting cohort size.
df = combine_demographic_to_main_data(
    main=trt,
    demographic=dmg,
    main_date_col='treatment_date',
)
quick_summary(df)

08:25:11 INFO:Removing 221 patients with missing birth date
08:25:11 INFO:Removing 0 patients under 18 years of age


Number of treatment sessions = 118442
Number of patients = 10094
Cohort from 2005-11-16 to 2021-02-01


In [15]:
# TODO: Try polars/dask for performance comparison

# Look-back windows come from config.yaml.
# NOTE(review): presumably expressed in days relative to the treatment date — confirm.
symptom_window = (-cfg['symp_days'], 0)
lab_window = (-cfg['lab_days'], 0)

# Symptom surveys first, then lab observations
df = combine_feat_to_main_data(
    main=df,
    feat=sym,
    main_date_col='treatment_date',
    feat_date_col='survey_date',
    time_window=symptom_window,
)
df = combine_feat_to_main_data(
    main=df,
    feat=lab,
    main_date_col='treatment_date',
    feat_date_col='obs_date',
    time_window=lab_window,
)

100%|██████████| 1847/1847 [00:22<00:00, 81.37it/s] 
100%|██████████| 1847/1847 [00:23<00:00, 78.36it/s]
100%|██████████| 1848/1848 [00:24<00:00, 76.84it/s]
100%|██████████| 1847/1847 [00:26<00:00, 70.92it/s] 
100%|██████████| 1907/1907 [00:28<00:00, 66.66it/s] 
100%|██████████| 1907/1907 [00:29<00:00, 65.04it/s]
100%|██████████| 1907/1907 [00:29<00:00, 64.40it/s]
100%|██████████| 1907/1907 [00:31<00:00, 61.22it/s]


In [17]:
# Final two steps of the treatment-centred pipeline: per-drug dose features,
# then the engineered features keyed on the treatment date.
df = combine_perc_dose_to_main_data(
    main=df,
    included_drugs=included_drugs,
)
df = add_engineered_features(
    df,
    date_col='treatment_date',
)

In [18]:
# Persist the treatment-centred dataset
df.to_parquet(
    f'{ROOT_DIR}/data/processed/treatment_centered_clinical_dataset.parquet.gzip',
    compression='gzip',
    index=False,
)

## Align on every Monday

In [75]:
# Weekly assessment grid: one row per (patient, Monday) over the assessment period.
# NOTE(review): the period is hard-coded to calendar year 2018 — confirm, and consider
# moving it to config.yaml alongside the other settings.
ASSESSMENT_START = '2018-01-01'
ASSESSMENT_END = '2018-12-31'

mrns = trt['mrn'].unique()
dates = pd.date_range(start=ASSESSMENT_START, end=ASSESSMENT_END, freq='W-MON')

# Cartesian product of patients x Mondays, built by pandas so that no additional
# import is needed in the middle of the notebook. Row order matches a nested loop
# with patients outermost and dates innermost.
df = pd.MultiIndex.from_product(
    [mrns, dates],
    names=['mrn', 'assessment_date'],
).to_frame(index=False)

In [76]:
# NOTE(review): (-7, 0) presumably means the 7 days up to and including each assessment
# date — confirm, and consider moving it to config.yaml like the lab window.
treatment_window = (-7, 0)
lab_window = (-cfg['lab_days'], 0)

# Same pipeline as the treatment-centred dataset, but keyed on `assessment_date`.
# NOTE(review): symptom data (`sym`) is not merged in this variant — confirm this is intended.
df = combine_treatment_to_main_data(
    df,
    trt,
    main_date_col='assessment_date',
    time_window=treatment_window,
)
df = combine_demographic_to_main_data(
    main=df,
    demographic=dmg,
    main_date_col='assessment_date',
)
df = combine_feat_to_main_data(
    main=df,
    feat=lab,
    main_date_col='assessment_date',
    feat_date_col='obs_date',
    time_window=lab_window,
)
df = combine_perc_dose_to_main_data(
    main=df,
    included_drugs=included_drugs,
)
df = add_engineered_features(
    df,
    date_col='assessment_date',
)
# TODO: saving is currently disabled; re-enable once the weekly dataset is ready to persist.
# df.to_parquet(f'{ROOT_DIR}/data/processed/weekly_monday_clinical_dataset.parquet.gzip', compression='gzip', index=False)

100%|██████████| 2579/2579 [00:58<00:00, 43.86it/s]
100%|██████████| 2579/2579 [00:59<00:00, 43.51it/s]
100%|██████████| 2579/2579 [01:00<00:00, 42.92it/s]
100%|██████████| 2578/2578 [01:02<00:00, 40.97it/s]
100%|██████████| 2579/2579 [00:52<00:00, 48.75it/s]
100%|██████████| 2579/2579 [00:53<00:00, 48.58it/s]
100%|██████████| 2579/2579 [00:54<00:00, 47.75it/s]
100%|██████████| 2578/2578 [00:56<00:00, 46.01it/s]
10:00:39 INFO:Removing 221 patients with missing birth date
10:00:41 INFO:Removing 0 patients under 18 years of age
100%|██████████| 1907/1907 [00:35<00:00, 54.39it/s]
100%|██████████| 1907/1907 [00:35<00:00, 54.34it/s]
100%|██████████| 1907/1907 [00:35<00:00, 53.37it/s]
100%|██████████| 1907/1907 [00:37<00:00, 51.07it/s]


In [None]:
# TODO: analyze how missingness differs for 5-day, 7-day, 10-day, and 14-day baseline lab values
# TODO: set up DVC (data version control), set up GitHub (private or public?), decide where to place the data (on the H4H cluster?)