In [1]:
%%capture
%cd ../
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
pd.set_option('display.max_rows', 150)

from src import logger
from src.label import get_event_labels, get_label_distribution
from src.prepare.filter import drop_highly_missing_features, drop_samples_outside_study_date, drop_unused_drug_features
from src.prepare.engineer import collapse_rare_categories, get_change_since_prev_session, get_missingness_features
from src.prepare.pipeline import acu_prep_pipeline
from src.prepare.prep import PrepData, fill_missing_data
from src.summarize import feature_summary
from src.util import get_excluded_numbers

In [3]:
# Load data
df = pd.read_parquet('./data/treatment_centered_clinical_dataset.parquet.gzip')
emerg = pd.read_parquet('./data/external/emergency_room_visit.parquet.gzip')

# Prepare Data

In [4]:
# get the change in measurement since previous assessment
df = get_change_since_prev_session(df)
# extract labels
df = get_event_labels(df, emerg, event_name='ED_visit', extra_cols=['CTAS_score', 'CEDIS_complaint'])
# filter out dates before 2014 and after 2020
df = drop_samples_outside_study_date(df)
# drop drug features that were never used
df = drop_unused_drug_features(df)
# fill missing data that can be filled heuristically
df = fill_missing_data(df)
# drop features with high missingness
keep_cols = df.columns[df.columns.str.contains('target_')]
df = drop_highly_missing_features(df, missing_thresh=80, keep_cols=keep_cols)
# create missingness features
df = get_missingness_features(df)
# collapse rare morphology and cancer sites into 'Other' category
df = collapse_rare_categories(df, catcols=['cancer_site', 'morphology'])

100%|██████████| 9297/9297 [00:11<00:00, 786.43it/s]
05:41:01 INFO:Removing 4909 patients and 61780 sessions before 2014-01-01 and after 2019-12-31
05:41:02 INFO:Removing the following features for drugs given less than 10 times: ['%_ideal_dose_given_DURVALUMAB', '%_ideal_dose_given_IPILIMUMAB', '%_ideal_dose_given_CAPECITABINE', '%_ideal_dose_given_ERLOTINIB']
05:41:02 INFO:Dropping the following 10 features for missingness over 80%: ['basophil', 'bicarbonate_change', 'basophil_change', 'carbohydrate_antigen_19-9', 'prothrombin_time_international_normalized_ratio', 'activated_partial_thromboplastin_time', 'carcinoembryonic_antigen', 'esas_diarrhea', 'esas_vomiting', 'esas_constipation']
05:41:02 INFO:Reassigning the following 6 indicators with less than 6 patients as other: ['cancer_site_C00', 'cancer_site_C14', 'cancer_site_C26', 'cancer_site_C48', 'cancer_site_C62', 'cancer_site_C76']
05:41:02 INFO:Reassigning the following 59 indicators with less than 6 patients as other: ['morphol

In [5]:
# To align with EPIC system for silent deployment
# 1. remove drug and morphology features
# 2. restrict to GI patients
# This will be temporary
cols = df.columns
cols = cols[~cols.str.contains('morphology|%_ideal_dose')]
df = df[cols]

mask = df['regimen'].str.startswith('GI-')
get_excluded_numbers(df, mask, context=' not from GI department')
df = df[mask]

11:53:55 INFO:Removing 2387 patients and 24980 sessions not from GI department


In [6]:
X, Y, metainfo = acu_prep_pipeline(df, event_name='ED_visit')
# clean up Y
for col in ['target_CEDIS_complaint', 'target_CTAS_score']:
    metainfo[col] = Y.pop(col)
Y.columns = Y.columns.str.replace('target_', '')

11:53:56 INFO:Development Cohort: NSessions=19413. NPatients=1510. Contains all patients whose first visit was on or before 2018-02-01
11:53:56 INFO:Test Cohort: NSessions=5039. NPatients=491. Contains all patients whose first visit was after 2018-02-01
11:53:56 INFO:Removing 3 patients and 76 sessions in which patient had a target event in less than 2 days.
11:53:56 INFO:Removing 1 patients and 26 sessions in which patient had a target event in less than 2 days.
11:53:56 INFO:One-hot encoding training data
11:53:56 INFO:Separated and dropped 0 treatment set indicator columns, and added 0 new treatment indicator columns


Reassigning the following indicators with less than 6 patients as other: ['regimen_GI-CISPFU + TRAS(LOAD)', 'regimen_GI-CISPFU + TRAS(MAIN)', 'regimen_GI-DOCEQ3W', 'regimen_GI-DOXO', 'regimen_GI-EOX', 'regimen_GI-FOLFIRI+PANITUMUMAB', 'regimen_GI-FOLFNALIRI', 'regimen_GI-FU/FA/CISP BILIARY', 'regimen_GI-GEMCAP', 'regimen_GI-GEMFU (BILIARY)', 'regimen_GI-IRINO 4-WEEKLY', 'regimen_GI-IRINO Q3W', 'regimen_GI-PACLITAXEL', 'regimen_GI-XELIRI ELDERLY']


11:53:56 INFO:One-hot encoding validation data
11:53:56 INFO:Separated and dropped 0 treatment set indicator columns, and added 0 new treatment indicator columns
11:53:56 INFO:Reassigning the following regimen indicator columns that did not exist in train set as other:
regimen_GI-CISPFU + TRAS(LOAD)     8
regimen_GI-CISPFU + TRAS(MAIN)    45
regimen_GI-DOXO                    2
regimen_GI-EOX                     4
regimen_GI-FOLFNALIRI              5
regimen_GI-FUFA-5 DAYS            10
regimen_GI-GEMCAP                 29
regimen_GI-IRINO Q3W               4
regimen_GI-PACLITAXEL              3
regimen_GI-XELIRI ELDERLY          1
dtype: int64
11:53:56 INFO:One-hot encoding testing data
11:53:56 INFO:Separated and dropped 1 treatment set indicator columns, and added 1 new treatment indicator columns
11:53:56 INFO:Reassigning the following regimen indicator columns that did not exist in train set as other:
regimen_GI-CISPFU + TRAS(LOAD)    11
regimen_GI-CISPFU + TRAS(MAIN)    87
regime

In [7]:
train_mask, valid_mask, test_mask = metainfo['split'] == 'Train', metainfo['split'] == 'Valid', metainfo['split'] == 'Test'
X_train, X_valid, X_test = X[train_mask], X[valid_mask], X[test_mask]
Y_train, Y_valid, Y_test = Y[train_mask], Y[valid_mask], Y[test_mask]

# Describe Data

In [20]:
count = pd.DataFrame({
    'Number of sessions': metainfo.groupby('split').apply(len), 
    'Number of patients': metainfo.groupby('split')['mrn'].nunique()}
).T
count['Total'] = count.sum(axis=1)
logger.info(f'\n{count.to_string()}')

04:11:45 INFO:
split               Test  Train  Valid  Total
Number of sessions  5039  15484   3827  24350
Number of patients   491   1205    301   1997


In [21]:
get_label_distribution(Y, metainfo, with_respect_to='sessions')

  dists = {split: group.apply(pd.value_counts)


Unnamed: 0_level_0,Test,Test,Train,Train,Valid,Valid,Total,Total
ED_visit,False,True,False,True,False,True,False,True
ED_visit,4515,524,14136,1348,3480,347,22131,2219


In [22]:
get_label_distribution(Y, metainfo, with_respect_to='patients')

Unnamed: 0_level_0,Test,Test,Train,Train,Valid,Valid,Total,Total
Unnamed: 0_level_1,1,0,1,0,1,0,1,0
ED_visit,183,308,446,759,112,189,741,1256


In [10]:
# Feature Characteristics
prep = PrepData()
x = prep.ohe.encode(df.loc[X_train.index].copy(), verbose=False) # get original (non-normalized, non-imputed) data one-hot encoded
x = x[[col for col in x.columns if not (col in metainfo.columns or col.startswith('target'))]]
feature_summary(x, save_path='result/tables/feature_summary_ED.csv').head(100)

Reassigning the following indicators with less than 6 patients as other: ['regimen_GI-CISPFU + TRAS(LOAD)', 'regimen_GI-CISPFU + TRAS(MAIN)', 'regimen_GI-DOCEQ3W', 'regimen_GI-DOXO', 'regimen_GI-EOX', 'regimen_GI-FOLFIRI+PANITUMUMAB', 'regimen_GI-FOLFNALIRI', 'regimen_GI-FU/FA/CISP BILIARY', 'regimen_GI-GEMCAP', 'regimen_GI-GEMFU (BILIARY)', 'regimen_GI-IRINO 4-WEEKLY', 'regimen_GI-IRINO Q3W', 'regimen_GI-PACLITAXEL', 'regimen_GI-XELIRI ELDERLY']


Unnamed: 0,Features,Group,Mean (SD),Missingness (%)
75,Days Since Previous ED Visit,Acute care use,1079.544 (819.072),0.0
74,Number of Prior ED Visits Within 5 Years,Acute care use,1.441 (2.797),0.0
5,"Topography ICD-0-3 C01, Base of tongue",Cancer,0.002 (0.045),0.0
6,"Topography ICD-0-3 C02, Other and unspecified ...",Cancer,0.000 (0.016),0.0
7,"Topography ICD-0-3 C03, Gum",Cancer,0.000 (0.000),0.0
8,"Topography ICD-0-3 C04, Floor of mouth",Cancer,0.002 (0.045),0.0
9,"Topography ICD-0-3 C05, Palate",Cancer,0.001 (0.025),0.0
10,"Topography ICD-0-3 C06, Other and unspecified ...",Cancer,0.001 (0.032),0.0
11,"Topography ICD-0-3 C07, Parotid gland",Cancer,0.000 (0.000),0.0
12,"Topography ICD-0-3 C08, Other and unspecified ...",Cancer,0.000 (0.000),0.0


# Train Model

In [8]:
from collections import defaultdict
from functools import partial

from bayes_opt import BayesianOptimization
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import average_precision_score, roc_auc_score
from xgboost import XGBClassifier

from src.util import load_pickle, save_pickle

from sklearn.exceptions import ConvergenceWarning
import warnings
warnings.filterwarnings("ignore", category=ConvergenceWarning)

In [9]:
targets = Y.columns

# LGBM does not like non alphanumeric characters (except for _)
for char in ['(', ')', '+', '-', '/', ',']: 
    X_train.columns = X_train.columns.str.replace(char, '_')
    X_valid.columns = X_valid.columns.str.replace(char, '_')
    X_test.columns = X_test.columns.str.replace(char, '_')

In [41]:
# hyperparameter tuning
algs = {
    'LR': LogisticRegression,
    'XGB': XGBClassifier,
    'LGBM': LGBMClassifier
}
bayesopt_param = {
    'LR': {'init_points': 2, 'n_iter': 10}, 
    'XGB': {'init_points': 15, 'n_iter': 100},
    'LGBM': {'init_points': 20, 'n_iter': 200},
}
model_static_param = {
    'LR': {
        'penalty': 'l2', 
        'class_weight': 'balanced', 
        'max_iter': 2000,
        'random_state': 42
    },
    'XGB': {
        'random_state': 42
    },
    'LGBM': {
        'random_state': 42,
        'verbosity': -1
    }
}
model_tuning_param = {
    'LR': {
        'C': (0.0001, 1)
    },
    'XGB': {
        'n_estimators': (50, 200),
        'max_depth': (3, 7),
        'learning_rate': (0.01, 0.3),
        'min_split_loss': (0, 0.5),
        'min_child_weight': (6, 100),
        'reg_lambda': (0, 1),
        'reg_alpha': (0, 1000)
    },
    'LGBM': {
        'n_estimators': (50, 200),
        'max_depth': (3, 7),
        'learning_rate': (0.01, 0.3),
        'num_leaves': (20, 40),
        'min_data_in_leaf': (6, 30),
        'feature_fraction': (0.5, 1),
        'bagging_fraction': (0.5, 1),
        'bagging_freq': (0, 10),
        'reg_lambda': (0, 1),
        'reg_alpha': (0, 1000)
    }
}
def convert_params(params):
    # convert necessary hyperparams to integers
    for param in ['n_estimators', 'max_depth', 'num_leaves', 'min_data_in_leaf', 'min_child_weight', 'bagging_freq']:
        if param in params: params[param] = int(params[param])
    return params

def eval_func(alg, data, **kwargs):
    train_X, train_Y, valid_X, valid_Y = data
    kwargs = convert_params(kwargs)
    model = algs[alg](**kwargs, **model_static_param[alg])
    model.fit(train_X, train_Y)
    assert model.classes_[1] == 1 # positive class is at index 1
    pred = model.predict_proba(valid_X)[: ,1]
    return roc_auc_score(valid_Y, pred)

best_params = {}
for target in targets:
    for alg, optim_config in bayesopt_param.items():
        hyperparam_config = model_tuning_param[alg]
        data = (X_train, Y_train[target], X_valid, Y_valid[target])
        bo = BayesianOptimization(
            f=partial(eval_func, alg=alg, data=data),
            pbounds=hyperparam_config,
            verbose=2,
            random_state=42
        )
        bo.maximize(**optim_config)
        best_param = bo.max['params']
        best_param = convert_params(best_param)
        best_params[f'{alg}_{target}'] = best_param
save_pickle(best_params, save_dir='./models', filename='best_params')

  0%|          | 0/1 [00:00<?, ?it/s]

|   iter    |  target   |     C     |
-------------------------------------
| [0m1        [0m | [0m0.7518   [0m | [0m0.3746   [0m |
| [95m2        [0m | [95m0.7532   [0m | [95m0.9507   [0m |
| [0m3        [0m | [0m0.7504   [0m | [0m0.9523   [0m |
| [0m4        [0m | [0m0.747    [0m | [0m0.9507   [0m |
| [0m5        [0m | [0m0.7462   [0m | [0m0.9508   [0m |
| [0m6        [0m | [0m0.753    [0m | [0m0.3746   [0m |
| [0m7        [0m | [0m0.7509   [0m | [0m0.9507   [0m |
| [0m8        [0m | [0m0.7531   [0m | [0m0.3745   [0m |
| [0m9        [0m | [0m0.7515   [0m | [0m0.3747   [0m |
| [0m10       [0m | [0m0.7527   [0m | [0m0.3747   [0m |
| [0m11       [0m | [0m0.7513   [0m | [0m0.3745   [0m |
| [0m12       [0m | [0m0.7494   [0m | [0m0.6526   [0m |
|   iter    |  target   | learni... | max_depth | min_ch... | min_sp... | n_esti... | reg_alpha | reg_la... |
-------------------------------------------------------------------

100%|██████████| 1/1 [09:34<00:00, 574.84s/it]

| [0m220      [0m | [0m0.7462   [0m | [0m1.0      [0m | [0m0.0      [0m | [0m0.5      [0m | [0m0.3      [0m | [0m7.0      [0m | [0m6.0      [0m | [0m173.1    [0m | [0m40.0     [0m | [0m232.2    [0m | [0m1.0      [0m |





In [47]:
best_params = load_pickle('./models', 'best_params')
models = defaultdict(dict)
for target in targets:
    for alg in algs:
        model = algs[alg](**best_params[f'{alg}_{target}'], **model_static_param[alg])
        model.fit(X_train, Y_train[target])
        models[alg][target] = model

In [48]:
def evaluate(model, X, Y):
    result = {}
    for target, label in Y.items():
        # check model.classes_ to confirm prediction of positive label is at index 1
        pred = model[target].predict_proba(X)[: ,1]
        auprc = average_precision_score(label, pred)
        auroc = roc_auc_score(label, pred)
        result[target] = {'AUPRC': auprc, 'AUROC': auroc}
    return pd.DataFrame(result)

In [49]:
pd.concat([evaluate(model, X_valid, Y_valid) for alg, model in models.items()], keys=models.keys()).T

Unnamed: 0_level_0,LR,LR,XGB,XGB,LGBM,LGBM
Unnamed: 0_level_1,AUPRC,AUROC,AUPRC,AUROC,AUPRC,AUROC
ED_visit,0.245577,0.753183,0.246744,0.780887,0.251515,0.783063


In [50]:
pd.concat([evaluate(model, X_test, Y_test) for alg, model in models.items()], keys=models.keys()).T

Unnamed: 0_level_0,LR,LR,XGB,XGB,LGBM,LGBM
Unnamed: 0_level_1,AUPRC,AUROC,AUPRC,AUROC,AUPRC,AUROC
ED_visit,0.174511,0.637605,0.186467,0.677193,0.189465,0.668737


In [53]:
save_pickle(models['XGB'][target], './models', 'XGB_ED_visit')

# Scratch Notes

### Results prior to removing the drug, morphology features and restricting to GI patients only

In [24]:
evaluate(models['XGB'][target], X_valid, Y_valid)

Unnamed: 0,ED_visit
AUPRC,0.209723
AUROC,0.749193


In [25]:
evaluate(models['XGB'][target], X_test, Y_test)

Unnamed: 0,ED_visit
AUPRC,0.189172
AUROC,0.698418
