In [1]:
%%capture
# Move the working directory up one level; later cells use paths such as
# 'data/...' and import from 'src', which are relative to that directory.
%cd ../
# autoreload mode 2: re-import modules before each cell runs so edits to the
# project code are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from datetime import datetime
import logging

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import average_precision_score, roc_auc_score
from tqdm import tqdm
from xgboost import XGBClassifier
import pandas as pd
pd.set_option('display.max_rows', 150)

from src.constants import symp_cols
from src.label import convert_to_binary_symptom_labels, get_symptom_labels, get_label_distribution
from src.prepare.filter import (
    drop_highly_missing_features, 
    drop_samples_outside_study_date, 
    drop_samples_with_no_targets,
    drop_unused_drug_features
)
from src.prepare.engineer import collapse_rare_categories, get_change_since_prev_session, get_missingness_features
from src.prepare.pipeline import PrepSympData
from src.prepare.prep import fill_missing_data
from src.summarize import feature_summary
from src.util import get_nunique_categories, get_nmissing, initialize_folders

from sklearn.exceptions import ConvergenceWarning
import warnings
warnings.filterwarnings("ignore", category=ConvergenceWarning)

# Create the project folders before configuring logging
# (presumably this includes ./logs, which the log file below is written to — TODO confirm)
initialize_folders()

# One log file per notebook run, named by the start timestamp
log_file = f"./logs/{datetime.now().strftime('%Y-%m-%d %H.%M.%S')}_symptom_target.log"
logging.basicConfig(
    datefmt='%Y-%m-%d %H:%M:%S',
    format='%(asctime)s %(levelname)s:%(message)s',
    level=logging.INFO,
    filename=log_file,
)

In [3]:
# Load data
# Path is relative to the working directory set by the %cd magic in the first cell.
df = pd.read_parquet('data/treatment_centered_clinical_dataset.parquet.gzip')

In [4]:
# scoring increase thresholds for determining symptom deterioration
# (the labelling loop below calls convert_to_binary_symptom_labels once per threshold)
target_pt_increases = [1, 3]

# Prep Data

In [5]:
# compute the change in each measurement since the previous assessment
df = get_change_since_prev_session(df)

# extract labels from the external symptom table
symp = pd.read_parquet('./data/external/symptom.parquet.gzip')
df = get_symptom_labels(df, symp)

# convert to binary labels, once per point-increase threshold
# (patient_ecog is left out of the scoring map)
for pt_increase in target_pt_increases:
    scoring_map = {col: pt_increase for col in symp_cols if col != 'patient_ecog'}
    df = convert_to_binary_symptom_labels(df, scoring_map=scoring_map)

target_cols = 'target_' + pd.Index(symp_cols) + '_change'
df = (
    df
    # filter out sessions without any labels
    .pipe(drop_samples_with_no_targets, target_cols)
    # filter out dates before 2014 and after 2020
    .pipe(drop_samples_outside_study_date)
    # drop drug features that were never used
    .pipe(drop_unused_drug_features)
)

100%|██████████| 9297/9297 [00:11<00:00, 788.71it/s]


In [6]:
# Show the number of unique categories per categorical column,
# inspected before rare categories are collapsed further below
get_nunique_categories(df)

Unnamed: 0,regimen,intent
Number of Unique Categories,107,4


In [7]:
# Missing-value counts and percentages per column
nmissing = get_nmissing(df)
# Leave out the *_date columns and display the last 20 remaining rows
nmissing.loc[~nmissing.index.str.endswith('_date')].tail(20)

Unnamed: 0,Missing (N),Missing (%)
aspartate_aminotransferase_change,16677,62.849
alanine_aminotransferase_change,16863,63.55
alkaline_phosphatase_change,16899,63.686
total_bilirubin_change,16961,63.919
glucose_change,17094,64.421
lactate_dehydrogenase_change,17477,65.864
phosphate_change,17833,67.206
albumin_change,18096,68.197
eosinophil_change,18965,71.472
bicarbonate,21059,79.363


In [8]:
# fill missing data that can be filled heuristically
df = fill_missing_data(df)

# target columns are passed as keep_cols to the high-missingness drop
keep_cols = df.columns[df.columns.str.contains('target_')]
df = (
    df
    # drop features with high missingness
    .pipe(drop_highly_missing_features, missing_thresh=75, keep_cols=keep_cols)
    # create missingness features
    .pipe(get_missingness_features)
    # collapse rare morphology and cancer sites into 'Other' category
    .pipe(collapse_rare_categories, catcols=['cancer_site', 'morphology'])
)

In [9]:
prep = PrepSympData()
X, Y, metainfo = prep.run_pipeline(
    df,
    split_date='2017-10-01',
    target_pt_increases=target_pt_increases,
)

# clean up Y: keep only the '*pt_change' targets ...
Y = Y[[name for name in Y.columns if name.endswith('pt_change')]]
# ... and strip the 'target_' / 'esas_' substrings from their names
for substr in ['target_', 'esas_']:
    Y.columns = Y.columns.str.replace(substr, '')

In [10]:
# Boolean row masks for each split recorded in metainfo
train_mask = metainfo['split'].eq('Train')
valid_mask = metainfo['split'].eq('Valid')
test_mask = metainfo['split'].eq('Test')

# Features and labels for each split
X_train, Y_train = X[train_mask], Y[train_mask]
X_valid, Y_valid = X[valid_mask], Y[valid_mask]
X_test, Y_test = X[test_mask], Y[test_mask]

# Describe Data

In [11]:
# Number of sessions (rows) and of distinct patients (mrn) per data split.
# GroupBy.size() counts rows per group directly; groupby(...).apply(len) gives
# the same numbers but applies a Python function to every group and triggers a
# DeprecationWarning on recent pandas because it operates on the grouping column.
count = pd.DataFrame({
    'Number of sessions': metainfo.groupby('split').size(),
    'Number of patients': metainfo.groupby('split')['mrn'].nunique()}
).T
# NOTE(review): the patient total is a plain sum over splits, so it is only a
# count of distinct patients if no patient appears in more than one split — confirm.
count['Total'] = count.sum(axis=1)
print(f'\n{count.to_string()}')


split               Test  Train  Valid  Total
Number of sessions  6477  16145   3913  26535
Number of patients   960   1816    455   3231


In [12]:
# Label distribution per target and split, counted with respect to sessions
get_label_distribution(Y, metainfo, with_respect_to='sessions').sort_index()

Unnamed: 0_level_0,Test,Test,Test,Train,Train,Train,Valid,Valid,Valid,Total,Total,Total
Unnamed: 0_level_1,0,1,-1,0,1,-1,0,1,-1,0,1,-1
anxiety_1pt_change,4420,1999,58,11259,4643,243,2678,1167,68,18357,7809,369
anxiety_3pt_change,5686,539,252,14145,1298,702,3469,289,155,23300,2126,1109
appetite_1pt_change,4092,2286,99,10212,5681,252,2462,1386,65,16766,9353,416
appetite_3pt_change,5162,982,333,13138,2276,731,3133,580,200,21433,3838,1264
depression_1pt_change,4550,1881,46,11372,4568,205,2750,1107,56,18672,7556,307
depression_3pt_change,5809,495,173,14393,1262,490,3519,280,114,23721,2037,777
drowsiness_1pt_change,3822,2620,35,9782,6121,242,2393,1479,41,15997,10220,318
drowsiness_3pt_change,5223,996,258,13258,2233,654,3251,548,114,21732,3777,1026
nausea_1pt_change,4539,1891,47,11117,4803,225,2716,1165,32,18372,7859,304
nausea_3pt_change,5680,688,109,14099,1712,334,3447,413,53,23226,2813,496


In [13]:
# Label distribution per target and split, counted with respect to patients
get_label_distribution(Y, metainfo, with_respect_to='patients').sort_index()

Unnamed: 0_level_0,Test,Test,Train,Train,Valid,Valid,Total,Total
Unnamed: 0_level_1,1,0,1,0,1,0,1,0
anxiety_1pt_change,615,345,1230,586,314,141,2159,1072
anxiety_3pt_change,235,725,530,1286,142,313,907,2324
appetite_1pt_change,639,321,1379,437,337,118,2355,876
appetite_3pt_change,388,572,816,1000,207,248,1411,1820
depression_1pt_change,571,389,1197,619,302,153,2070,1161
depression_3pt_change,227,733,522,1294,134,321,883,2348
drowsiness_1pt_change,726,234,1440,376,361,94,2527,704
drowsiness_3pt_change,405,555,830,986,211,244,1446,1785
nausea_1pt_change,556,404,1201,615,301,154,2058,1173
nausea_3pt_change,278,682,636,1180,147,308,1061,2170


In [14]:
# Feature Characteristics
# One-hot encode the original (non-normalized, non-imputed) rows of the training split
x = prep.ohe.encode(df.loc[X_train.index].copy(), verbose=False)
# Keep feature columns only: drop metainfo columns and anything starting with 'target'
x = x.loc[:, [col for col in x.columns if col not in metainfo.columns and not col.startswith('target')]]
# Save the summary table and preview a reproducible random sample of 10 rows
feature_summary(x, save_path='result/tables/feature_summary.csv').sample(10, random_state=42)

Unnamed: 0,Features,Group,Mean (SD),Missingness (%)
149,Eastern Cooperative Oncology Group (ECOG) Perf...,Symptoms,0.022 (0.455),17.9
106,Percentage of Ideal Dose Given MITOMYCIN,Treatment,0.977 (0.065),0.0
83,Monocyte (x10e9/L),Laboratory,0.590 (0.326),35.1
178,Regimen GI-PANITUMUMAB,Treatment,0.013 (0.115),0.0
48,"Morphology ICD-0-3 848, Cystic, mucinous, and ...",Cancer,0.037 (0.188),0.0
62,ESAS Appetite Score,Symptoms,1.880 (2.396),0.2
170,Regimen GI-GEM 7-WEEKLY,Treatment,0.004 (0.061),0.0
207,Regimen LU-NIVOLUMAB (SAP),Treatment,0.004 (0.066),0.0
74,Hematocrit (L/L),Laboratory,0.349 (0.050),34.6
104,Percentage of Ideal Dose Given CETUXIMAB,Treatment,0.997 (0.020),0.0


# Train Model

In [20]:
# Train one Logistic Regression and one XGBoost classifier per target
targets = Y.columns

# Logistic Regression: L2-regularised, class-weight balanced
LR_params = {'C': 0.3, 'penalty': 'l2', 'class_weight': 'balanced', 'max_iter': 2000, 'random_state': 42}
LR_model = {target: LogisticRegression(**LR_params) for target in targets}

# XGBoost
XGB_params = dict(n_estimators=100, max_depth=6, learning_rate=0.01, min_child_weight=6, random_state=42)
XGB_model = {target: XGBClassifier(**XGB_params) for target in targets}

for target in tqdm(targets):
    # rows whose label for this target is -1 are excluded from fitting
    mask = Y_train[target] != -1
    # filter once and reuse for both models instead of re-indexing per fit
    X_fit, y_fit = X_train[mask], Y_train.loc[mask, target]
    LR_model[target].fit(X_fit, y_fit)
    XGB_model[target].fit(X_fit, y_fit)

100%|██████████| 18/18 [02:56<00:00,  9.79s/it]


In [21]:
def evaluate(model, X, Y):
    """Compute AUPRC and AUROC for every target.

    Args:
        model: Mapping from target name to a fitted classifier that exposes
            ``predict_proba`` and ``classes_``.
        X: Feature table whose rows align with ``Y``.
        Y: Label table with one column per target. Rows where a target's
            label is -1 are excluded from that target's evaluation.

    Returns:
        DataFrame indexed by target name with columns 'AUPRC' and 'AUROC'.

    Raises:
        ValueError: If a classifier's ``classes_`` does not contain the
            positive label 1.
    """
    result = {}
    for target, label in Y.items():
        mask = label != -1
        clf = model[target]
        # Look up which predict_proba column belongs to the positive label
        # rather than assuming it is always column 1.
        classes = list(clf.classes_)
        if 1 not in classes:
            raise ValueError(
                f'Model for {target!r} has no positive class 1 (classes_={classes})'
            )
        pos_idx = classes.index(1)
        pred = clf.predict_proba(X[mask])[:, pos_idx]
        y_true = label[mask]
        result[target] = {
            'AUPRC': average_precision_score(y_true, pred),
            'AUROC': roc_auc_score(y_true, pred),
        }
    return pd.DataFrame(result).T

In [22]:
# Logistic Regression: AUPRC / AUROC per target on the validation split
evaluate(LR_model, X_valid, Y_valid)

Unnamed: 0,AUPRC,AUROC
pain_1pt_change,0.556448,0.684272
tiredness_1pt_change,0.656066,0.699594
nausea_1pt_change,0.485958,0.683714
depression_1pt_change,0.390373,0.63438
anxiety_1pt_change,0.409052,0.602399
drowsiness_1pt_change,0.583045,0.698058
appetite_1pt_change,0.567018,0.690192
well_being_1pt_change,0.610164,0.694019
shortness_of_breath_1pt_change,0.415077,0.652561
pain_3pt_change,0.334447,0.743771


In [23]:
# Logistic Regression: AUPRC / AUROC per target on the test split
evaluate(LR_model, X_test, Y_test)

Unnamed: 0,AUPRC,AUROC
pain_1pt_change,0.505272,0.662678
tiredness_1pt_change,0.655866,0.692875
nausea_1pt_change,0.468333,0.670905
depression_1pt_change,0.378636,0.597825
anxiety_1pt_change,0.393109,0.58365
drowsiness_1pt_change,0.593446,0.685212
appetite_1pt_change,0.54052,0.669799
well_being_1pt_change,0.594569,0.674719
shortness_of_breath_1pt_change,0.390876,0.610694
pain_3pt_change,0.251134,0.692169


In [24]:
# XGBoost: AUPRC / AUROC per target on the validation split
evaluate(XGB_model, X_valid, Y_valid)

Unnamed: 0,AUPRC,AUROC
pain_1pt_change,0.544146,0.712755
tiredness_1pt_change,0.649622,0.700841
nausea_1pt_change,0.496063,0.704249
depression_1pt_change,0.418522,0.681586
anxiety_1pt_change,0.431,0.651861
drowsiness_1pt_change,0.614952,0.71619
appetite_1pt_change,0.589528,0.711749
well_being_1pt_change,0.596041,0.697586
shortness_of_breath_1pt_change,0.443365,0.680528
pain_3pt_change,0.308291,0.741611


In [25]:
# XGBoost: AUPRC / AUROC per target on the test split
evaluate(XGB_model, X_test, Y_test)

Unnamed: 0,AUPRC,AUROC
pain_1pt_change,0.526741,0.683103
tiredness_1pt_change,0.67078,0.690211
nausea_1pt_change,0.486248,0.688041
depression_1pt_change,0.420131,0.662109
anxiety_1pt_change,0.424399,0.639286
drowsiness_1pt_change,0.621907,0.692188
appetite_1pt_change,0.543971,0.682613
well_being_1pt_change,0.588564,0.674237
shortness_of_breath_1pt_change,0.422715,0.646291
pain_3pt_change,0.241068,0.704686
