In [1]:
# Move up one directory so that the `src` package and the relative data paths
# used by the cells below resolve.
%cd ../
# Reload modules automatically before each cell runs, so edits to project
# code are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

c:\Users\Kevin He\Documents\kh\Job\UHN\OICR_TRANSFER\TRANSFER\Projects\aim2reduce


In [2]:
from tqdm import tqdm
import pandas as pd
pd.set_option('display.max_rows', 150)

from src import logger
from src.util import get_nunique_categories, get_nmissing
from src.constants import symp_cols
from src.label import convert_to_binary_symptom_labels, get_symptom_labels, get_label_distribution
from src.summarize import feature_summary
from src.prepare.prep import PrepData, fill_missing_data
from src.prepare.pipeline import symptom_prep_pipeline
from src.prepare.filter import drop_highly_missing_features, drop_samples_outside_study_date, drop_samples_with_no_targets
from src.prepare.engineer import collapse_rare_categories, get_change_since_prev_session, get_missingness_features

In [3]:
# Note to self: why are morphology and cancer site one-hot encoded but regimen is not?
# Because patients can have multiple diagnoses at different dates.
# See make-clinical-dataset/preprocess/cancer_registry for more info.
df = pd.read_parquet('data/treatment_centered_clinical_dataset.parquet.gzip')

# Show every column name so the schema is visible at a glance
list(df.columns)

['mrn',
 'treatment_date',
 'regimen',
 'height',
 'weight',
 'body_surface_area',
 'cycle_number',
 'first_treatment_date',
 'intent',
 'date_of_birth',
 'female',
 'cancer_site_C00',
 'cancer_site_C01',
 'cancer_site_C02',
 'cancer_site_C03',
 'cancer_site_C04',
 'cancer_site_C05',
 'cancer_site_C06',
 'cancer_site_C07',
 'cancer_site_C08',
 'cancer_site_C09',
 'cancer_site_C10',
 'cancer_site_C11',
 'cancer_site_C12',
 'cancer_site_C13',
 'cancer_site_C14',
 'cancer_site_C15',
 'cancer_site_C16',
 'cancer_site_C17',
 'cancer_site_C18',
 'cancer_site_C19',
 'cancer_site_C20',
 'cancer_site_C21',
 'cancer_site_C22',
 'cancer_site_C23',
 'cancer_site_C24',
 'cancer_site_C25',
 'cancer_site_C26',
 'cancer_site_C30',
 'cancer_site_C31',
 'cancer_site_C32',
 'cancer_site_C34',
 'cancer_site_C37',
 'cancer_site_C38',
 'cancer_site_C48',
 'cancer_site_C62',
 'cancer_site_C76',
 'morphology_800',
 'morphology_801',
 'morphology_802',
 'morphology_803',
 'morphology_804',
 'morphology_805',
 

# Prep Data - Part 1

In [4]:
# Add features describing the change since the previous session
# (presumably the `*_change` columns seen later - confirm in src.prepare.engineer).
# NOTE: this rebinds `df`, so re-running the cell applies the transform again.
df = get_change_since_prev_session(df)

100%|██████████| 9297/9297 [00:11<00:00, 821.21it/s]


In [5]:
# Read the externally stored symptom data, attach the symptom labels to the
# sessions, then convert those labels to binary form (see src.label).
symp = pd.read_parquet('./data/external/symptom.parquet.gzip')
df = convert_to_binary_symptom_labels(get_symptom_labels(df, symp))

In [6]:
# Sessions that have no label for any target (missing is encoded as -1) are removed.
# The target columns follow the pattern "target_<symptom>_change".
target_cols = pd.Index([f'target_{col}_change' for col in symp_cols])
df = drop_samples_with_no_targets(df, target_cols, missing_val=-1)

01:29:31 INFO:Removing 5069 patients and 76538 sessions with no targets


In [7]:
# Keep only sessions inside the study window. With the function's defaults the
# log reports removal of sessions before 2014-01-01 and after 2019-12-31.
df = drop_samples_outside_study_date(df)

01:29:31 INFO:Removing 997 patients and 8141 sessions before 2014-01-01 and after 2019-12-31


# Describe Data - Part 1

In [39]:
# Number of unique categories per reported column (regimen and intent in the output below)
get_nunique_categories(df)

Unnamed: 0,regimen,intent
Number of Unique Categories,107,4


In [40]:
# Per-column missingness, reported as a count and as a percentage
get_nmissing(df)

Unnamed: 0,Missing (N),Missing (%)
esas_pain,29,0.109
esas_tiredness,41,0.155
target_esas_pain,49,0.185
esas_drowsiness,52,0.196
target_esas_tiredness,53,0.2
esas_appetite,54,0.204
esas_depression,57,0.215
esas_anxiety,58,0.219
esas_shortness_of_breath,59,0.222
target_esas_shortness_of_breath,76,0.286


# Prep Data - Part 2

In [8]:
# NOTE: the steps below run in sequence and each one rebinds `df`; re-running
# this cell re-applies them to the already transformed frame.

# fill missing data that can be filled heuristically
df = fill_missing_data(df)

# drop features whose missingness exceeds the threshold (the log reports "over 80%")
df = drop_highly_missing_features(df, missing_thresh=80)

# create missingness features
df = get_missingness_features(df)

# collapse rare morphology and cancer sites into 'Other' category
# (the log reports indicators with less than 6 patients being reassigned)
df = collapse_rare_categories(df, catcols=['cancer_site', 'morphology'])

01:29:31 INFO:Dropping the following 10 features for missingness over 80%: ['basophil', 'bicarbonate_change', 'basophil_change', 'carbohydrate_antigen_19-9', 'prothrombin_time_international_normalized_ratio', 'activated_partial_thromboplastin_time', 'carcinoembryonic_antigen', 'esas_diarrhea', 'esas_vomiting', 'esas_constipation']
01:29:31 INFO:Reassigning the following 6 indicators with less than 6 patients as other: ['cancer_site_C00', 'cancer_site_C14', 'cancer_site_C26', 'cancer_site_C48', 'cancer_site_C62', 'cancer_site_C76']
01:29:31 INFO:Reassigning the following 63 indicators with less than 6 patients as other: ['morphology_800', 'morphology_803', 'morphology_805', 'morphology_809', 'morphology_812', 'morphology_815', 'morphology_818', 'morphology_820', 'morphology_822', 'morphology_829', 'morphology_831', 'morphology_832', 'morphology_833', 'morphology_836', 'morphology_840', 'morphology_843', 'morphology_844', 'morphology_845', 'morphology_847', 'morphology_851', 'morphology_

In [9]:
# Run the full preparation pipeline: returns the feature table, the label table
# and the per-row bookkeeping table used later for splitting.
X, Y, metainfo = symptom_prep_pipeline(df)

# Clean up Y: keep only the "...change" targets. The explicit copy makes sure
# the column rename below acts on an independent frame rather than on a slice.
Y = Y[[col for col in Y.columns if col.endswith('change')]].copy()

# Shorten the target names (e.g. "target_esas_pain_change" -> "pain").
# regex=False pins literal matching, so the result does not depend on the
# default of `str.replace`, which differs between pandas versions.
for substr in ['target_', 'esas_', '_change']:
    Y.columns = Y.columns.str.replace(substr, '', regex=False)

01:29:31 INFO:Development Cohort: NSessions=21358. NPatients=2437. Contains all patients whose first visit was on or before 2018-02-01
01:29:31 INFO:Test Cohort: NSessions=5175. NPatients=794. Contains all patients whose first visit was after 2018-02-01
01:29:32 INFO:One-hot encoding training data
01:29:32 INFO:Separated and dropped 0 treatment set indicator columns, and added 0 new treatment indicator columns


Reassigning the following indicators with less than 6 patients as other: ['regimen_GI-CISPFU + TRAS(LOAD)', 'regimen_GI-CISPFU + TRAS(MAIN)', 'regimen_GI-CISPFU ANAL', 'regimen_GI-DOCEQ3W', 'regimen_GI-DOXO', 'regimen_GI-EOX', 'regimen_GI-FLOT (GASTRIC)', 'regimen_GI-FOLFNALIRI', 'regimen_GI-FUFA WEEKLY', 'regimen_GI-FUFA-5 DAYS', 'regimen_GI-GEMCAP', 'regimen_GI-GEMFU (BILIARY)', 'regimen_GI-IRINO Q3W', 'regimen_GI-PACLI WEEKLY', 'regimen_GI-PACLITAXEL', 'regimen_HN-DOCE/CISP Q3W', 'regimen_HN-DOCETAXEL WEEKLY', 'regimen_HN-ETOPCISP 3 DAY', 'regimen_HN-GEM/CIS + APREP', 'regimen_HN-NIVO Q4WEEKS (CCO)', 'regimen_HN-NIVOLUMAB', 'regimen_LU-DOCECARBO', 'regimen_LU-DOCECISP', 'regimen_LU-DURVALUMAB (COMP)', 'regimen_LU-ETOPCARBO-NO RT', 'regimen_LU-GEM D1,8,15', 'regimen_LU-GEMCISP +APREPITANT', 'regimen_LU-IRINOCARBO NO RT', 'regimen_LU-IRINOCISP NO RT', 'regimen_LU-PACLI/CARBO WEEKX5', 'regimen_LU-PACLI/CARBO WEEKX6', 'regimen_LU-RALTICARBO', 'regimen_LU-RALTICISP', 'regimen_LU-TOPOTECA

01:29:32 INFO:One-hot encoding validation data
01:29:32 INFO:Separated and dropped 0 treatment set indicator columns, and added 0 new treatment indicator columns
01:29:32 INFO:Reassigning the following regimen indicator columns that did not exist in train set as other:
regimen_GI-CISPFU + TRAS(MAIN)     2
regimen_GI-CISPFU ANAL            16
regimen_GI-DOCEQ3W                 1
regimen_GI-FLOT (GASTRIC)          9
regimen_GI-IRINO Q3W               2
regimen_GI-PACLI WEEKLY            5
regimen_GI-PACLITAXEL              5
regimen_HN-DOCETAXEL WEEKLY        3
regimen_HN-ETOPCISP 3 DAY         12
regimen_HN-GEM/CIS + APREP         5
regimen_LU-ETOPCARBO-NO RT        10
regimen_LU-GEMCISP +APREPITANT     2
regimen_LU-VINO D1,8               2
dtype: int64
01:29:32 INFO:One-hot encoding testing data
01:29:32 INFO:Separated and dropped 1 treatment set indicator columns, and added 0 new treatment indicator columns
01:29:32 INFO:Reassigning the following regimen indicator columns that did no

In [10]:
# Boolean row masks, one per split recorded in metainfo['split']
train_mask = metainfo['split'] == 'Train'
valid_mask = metainfo['split'] == 'Valid'
test_mask = metainfo['split'] == 'Test'

# Slice the features and the labels with the same masks so they stay aligned
X_train, Y_train = X[train_mask], Y[train_mask]
X_valid, Y_valid = X[valid_mask], Y[valid_mask]
X_test, Y_test = X[test_mask], Y[test_mask]

In [58]:
# new_data = pd.concat([df.loc[metainfo.index], metainfo[['cohort', 'split']]], axis=1)
# new_data.to_parquet('./data/debug/new_data.parquet.gzip', compression='gzip', index=False)

# Describe Data - Part 2

In [25]:
# Cohort sizes per split: a session is one row of `metainfo`, a patient is one
# unique `mrn`. `groupby(...).size()` counts the rows of each group directly and
# replaces `apply(len)`, which is slower and triggers a deprecation warning in
# recent pandas because `apply` also operates on the grouping column.
count = pd.DataFrame({
    'Number of sessions': metainfo.groupby('split').size(),
    'Number of patients': metainfo.groupby('split')['mrn'].nunique(),
}).T
# NOTE(review): the patient total is a plain sum over splits, so it is only a
# unique-patient count if no patient appears in more than one split - confirm.
count['Total'] = count.sum(axis=1)
logger.info(f'\n{count.to_string()}')

01:33:02 INFO:
split               Test  Train  Valid  Total
Number of sessions  5175  16826   4532  26533
Number of patients   794   1949    488   3231


In [26]:
# Sanity check: the prepared feature table must not contain any missing value
assert not X.isna().to_numpy().any()

In [27]:
# Label counts (-1 / 0 / 1) for every target, per split and in total, counted over sessions
get_label_distribution(Y, metainfo, with_respect_to='sessions')

  dists = {split: group.apply(pd.value_counts)


Unnamed: 0_level_0,Test,Test,Test,Train,Train,Train,Valid,Valid,Valid,Total,Total,Total
Unnamed: 0_level_1,-1,0,1,-1,0,1,-1,0,1,-1,0,1
pain,301,4489,385,1023,14521,1282,291,3911,330,1615,22921,1997
tiredness,687,4047,441,2029,13443,1354,579,3563,390,3295,21053,2185
nausea,154,4681,340,446,15194,1186,152,4116,264,752,23991,1790
depression,222,4751,202,834,15221,771,283,4111,138,1339,24083,1111
anxiety,346,4597,232,1167,14887,772,346,4016,170,1859,23500,1174
drowsiness,365,4320,490,1215,14155,1456,323,3784,425,1903,22259,2371
appetite,444,4253,478,1245,13951,1630,351,3773,408,2040,21977,2516
well_being,405,4396,374,1504,14140,1182,421,3839,272,2330,22375,1828
shortness_of_breath,367,4544,264,808,15276,742,295,4056,181,1470,23876,1187
patient_ecog,37,3955,1183,1428,11856,3542,392,3248,892,1857,19059,5617


In [28]:
# Same table counted over patients; the output only has the 1 / 0 labels.
# NOTE(review): how a patient's sessions are aggregated into one label is
# decided inside get_label_distribution - confirm in src.label.
get_label_distribution(Y, metainfo, with_respect_to='patients')

Unnamed: 0_level_0,Test,Test,Train,Train,Valid,Valid,Total,Total
Unnamed: 0_level_1,1,0,1,0,1,0,1,0
pain,162,632,510,1439,137,351,809,2422
tiredness,205,589,579,1370,162,326,946,2285
nausea,144,650,465,1484,113,375,722,2509
depression,102,692,351,1598,77,411,530,2701
anxiety,116,678,345,1604,81,407,542,2689
drowsiness,234,560,613,1336,171,317,1018,2213
appetite,213,581,650,1299,180,308,1043,2188
well_being,170,624,496,1453,135,353,801,2430
shortness_of_breath,126,668,355,1594,91,397,572,2659
patient_ecog,428,366,1136,813,291,197,1855,1376


In [30]:
# Feature characteristics of the training rows
prep = PrepData()

# One-hot encode a copy of the original (non-normalized, non-imputed) data
x = prep.ohe.encode(df.copy(), verbose=False)

# Keep feature columns only: leave out everything that also appears in
# metainfo as well as every target column
cols = [
    col for col in x.columns
    if col not in metainfo.columns and not col.startswith('target')
]

# Restrict to the rows of the training split, then summarize and save the table
x = x.loc[X_train.index, cols]
feature_summary(x, save_dir='result/tables').head(100)

Reassigning the following indicators with less than 6 patients as other: ['regimen_GI-CISPFU + TRAS(LOAD)', 'regimen_GI-DOCEQ3W', 'regimen_GI-DOXO', 'regimen_GI-EOX', 'regimen_GI-FOLFIRI+PANITUMUMAB', 'regimen_GI-FOLFNALIRI', 'regimen_GI-FUFA WEEKLY', 'regimen_GI-FUFA-5 DAYS', 'regimen_GI-GEM+OXALI (BILIARY)', 'regimen_GI-GEMFU (BILIARY)', 'regimen_GI-IRINO Q3W', 'regimen_GI-PACLITAXEL', 'regimen_HN-DOCE/CISP Q3W', 'regimen_HN-DOCETAXEL WEEKLY', 'regimen_HN-ETOPCISP 3 DAY', 'regimen_LU-DOCECARBO', 'regimen_LU-DOCECISP', 'regimen_LU-GEM D1,8,15', 'regimen_LU-PACLI/CARBO WEEKX5', 'regimen_LU-PEME-CARBO-PEMBRO', 'regimen_LU-PEME-CISP-PEMBRO', 'regimen_LU-PEME-PEMBRO MAINT', 'regimen_LU-RALTICARBO', 'regimen_LU-RALTICISP', 'regimen_LU-TOPOTECAN', 'regimen_LU-VINO D1,8']


Unnamed: 0,Train (N=16826) - Missingness Count,Train - Mean,Train - SD,Missingness (%),Mean (SD)
height,0.0,167.851004,9.806501,0.0,167.851 (9.807)
weight,0.0,71.673981,17.069065,0.0,71.674 (17.069)
body_surface_area,0.0,1.812991,0.241979,0.0,1.813 (0.242)
cycle_number,0.0,5.487935,7.599766,0.0,5.488 (7.6)
female,0.0,0.446511,0.497146,0.0,0.447 (0.497)
cancer_site_C01,0.0,0.00832,0.090839,0.0,0.008 (0.091)
cancer_site_C02,0.0,0.004755,0.068791,0.0,0.005 (0.069)
cancer_site_C03,0.0,0.001129,0.033586,0.0,0.001 (0.034)
cancer_site_C04,0.0,0.00422,0.064824,0.0,0.004 (0.065)
cancer_site_C05,0.0,0.001426,0.037741,0.0,0.001 (0.038)


# Train Model

In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import average_precision_score, roc_auc_score
from xgboost import XGBClassifier

from sklearn.exceptions import ConvergenceWarning
import warnings
warnings.filterwarnings("ignore", category=ConvergenceWarning)

In [12]:
# Fit one logistic regression and one XGBoost classifier per target
targets = Y.columns

# Logistic regression settings (the 'saga' solver option is left disabled)
LR_params = dict(
    C=0.3,
    penalty='l2',
    class_weight='balanced',
    max_iter=2000,
    random_state=42,
)
LR_model = {target: LogisticRegression(**LR_params) for target in targets}

# XGBoost settings
XGB_params = {
    'n_estimators': 100,
    'max_depth': 6,
    'learning_rate': 0.01,
    'min_child_weight': 6,
    'random_state': 42,
}
XGB_model = {target: XGBClassifier(**XGB_params) for target in targets}

for target in tqdm(targets):
    # rows labelled -1 have no label for this target and are left out of training
    mask = Y_train[target].ne(-1)
    LR_model[target].fit(X_train.loc[mask], Y_train.loc[mask, target])
    XGB_model[target].fit(X_train.loc[mask], Y_train.loc[mask, target])

In [13]:
def evaluate(model, X, Y):
    """Score one fitted classifier per target with AUPRC and AUROC.

    Args:
        model: Mapping from target name to a fitted classifier that provides
            ``predict_proba`` (and normally ``classes_``).
        X: Feature table whose rows are aligned with ``Y``.
        Y: Label table with one column per target. Rows labelled -1 are
            excluded from the evaluation of that target.

    Returns:
        DataFrame with the rows ``AUPRC`` and ``AUROC`` and one column per target.

    Raises:
        ValueError: If a classifier exposes ``classes_`` and the positive
            label 1 is not among them.
    """
    result = {}
    for target, label in Y.items():
        mask = label != -1
        clf = model[target]

        # Find the probability column of the positive label from `classes_`
        # instead of assuming it is column 1; the two only coincide when the
        # classifier was trained on exactly the labels {0, 1}.
        classes = getattr(clf, 'classes_', None)
        if classes is None:
            # no class information available: keep the historical assumption
            pos_idx = 1
        else:
            classes = list(classes)
            if 1 not in classes:
                raise ValueError(
                    f'Model for target {target!r} was not trained with a positive label 1; '
                    f'classes_={classes}'
                )
            pos_idx = classes.index(1)

        pred = clf.predict_proba(X[mask])[:, pos_idx]
        auprc = average_precision_score(label[mask], pred)
        auroc = roc_auc_score(label[mask], pred)
        result[target] = {'AUPRC': auprc, 'AUROC': auroc}
    return pd.DataFrame(result)

In [14]:
# Logistic regression, validation split
evaluate(LR_model, X_valid, Y_valid)

Unnamed: 0,pain,tiredness,nausea,depression,anxiety,drowsiness,appetite,well_being,shortness_of_breath,patient_ecog
AUPRC,0.232587,0.218736,0.17117,0.0811,0.112212,0.28295,0.277058,0.193337,0.06887,0.414887
AUROC,0.745248,0.695083,0.730217,0.679376,0.695537,0.750055,0.728963,0.708432,0.639634,0.703843


In [15]:
# Logistic regression, test split
evaluate(LR_model, X_test, Y_test)

Unnamed: 0,pain,tiredness,nausea,depression,anxiety,drowsiness,appetite,well_being,shortness_of_breath,patient_ecog
AUPRC,0.218995,0.25685,0.180209,0.093458,0.089398,0.237833,0.269563,0.200071,0.105457,0.455162
AUROC,0.74019,0.733819,0.730242,0.643385,0.639175,0.737455,0.726553,0.733307,0.631385,0.721226


In [16]:
# XGBoost, validation split
evaluate(XGB_model, X_valid, Y_valid)

Unnamed: 0,pain,tiredness,nausea,depression,anxiety,drowsiness,appetite,well_being,shortness_of_breath,patient_ecog
AUPRC,0.214121,0.262379,0.116096,0.099198,0.100782,0.239924,0.219536,0.163745,0.070814,0.434081
AUROC,0.746686,0.71763,0.699561,0.724275,0.683311,0.74017,0.723843,0.725819,0.655408,0.74039


In [17]:
# XGBoost, test split
evaluate(XGB_model, X_test, Y_test)

Unnamed: 0,pain,tiredness,nausea,depression,anxiety,drowsiness,appetite,well_being,shortness_of_breath,patient_ecog
AUPRC,0.203083,0.23342,0.154838,0.084092,0.093751,0.237669,0.243493,0.206317,0.101599,0.459708
AUROC,0.72553,0.731785,0.730314,0.67114,0.671367,0.729126,0.735006,0.744444,0.695988,0.726885
