© 2018 Institute for Clinical Evaluative Sciences. All rights reserved.

TERMS OF USE:
##Not for distribution.## This code and data is provided to the user solely for its own non-commercial use by individuals and/or not-for-profit corporations. User shall not distribute without express written permission from the Institute for Clinical Evaluative Sciences.

##Not-for-profit.## This code and data may not be used in connection with profit generating activities.

##No liability.## The Institute for Clinical Evaluative Sciences makes no warranty or representation regarding the fitness, quality or reliability of this code and data.

##No Support.## The Institute for Clinical Evaluative Sciences will not provide any technological, educational or informational support in connection with the use of this code and data.

##Warning.## By receiving this code and data, user accepts these terms, and uses the code and data, solely at its own risk.

In [None]:
%cd ../../
%load_ext autoreload
%autoreload 2

In [None]:
import os
import shutil
import subprocess
import logging
from collections import defaultdict

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt  # was misspelt as "maptlotlib", which raises ModuleNotFoundError
import scipy.stats as st
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf

from scripts.feat_imp import FeatImportance
from src import logger
from src.config import root_path, can_folder, split_date, cancer_code_mapping
from src.evaluate import EvaluateClf, EvaluateReg, EvaluateBaselineModel
from src.prep_data import PrepDataCAN
from src.train import Ensembler, Trainer
from src.utility import (
    initialize_folders, load_pickle, get_clean_variable_names, get_units
)
from src.visualize import importance_plot, tile_plot

# only show warnings and above from the project logger
logger.setLevel(logging.WARNING)

# pandas exposes `set_option` (singular); `set_options` does not exist
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)

In [None]:
# Project folder under root_path; models, figures and experiment outputs below are addressed relative to it.
main_dir = f'{root_path}/projects/{can_folder}'

In [None]:
# Load the CKD data, build the model inputs and fetch the stored SPLINE predictions
# that the rest of the notebook relies on.
# NOTE(review): prep.event_dates is read only after prep.get_data(...) has run, so `prep`
# looks stateful — keep the statement order.
prep = PrepDataCAN(adverse_event='ckd', target_keyword='SCr|dialysis|next')

# all treatments
orig_data = prep.load_data()
orig_data = prep.get_creatinine_data(orig_data)

# first treatments
df = prep.get_data(missing_thresh=80, include_comorbidity=True, verbose=True)
train_df, test_df = prep.split_data(df, split_date=split_date)

# model input features
X, Y, tag = prep.split_and_transform_data(df, split_date=split_date)
# rows of df restricted to the samples that are present in X
model_data = df.loc[X.index]

# TODO: change the definition to measurement date - last cisplatin date
# you will need to modify Build Non-Cisplatin Cohort, but logic remains the same
# (this variable is reassigned in the "Follow Up Length" section using last_cisplatin_date)
followup_time = (prep.event_dates['next_SCr_obs_date'] - prep.event_dates['visit_date']).dt.days

# SPLINE predictions
spline_preds = load_pickle(f'{main_dir}/models/CKD/preds', 'SPLINE_preds')
# concatenate the entries stored under 'SPLINE' and drop the outer index level added by concat
spline_preds = pd.concat(spline_preds['SPLINE']).droplevel(level=0)
spline_preds = spline_preds.loc[X.index]

# labels: one single-column ('CKD') frame per value of tag['split']
labels = {split: g[['CKD']] for split, g in Y.groupby(tag['split'])}

# Final Figures

In [None]:
# Inputs shared by the Figure 1 cells: boolean mask of the Test split and the stored SPLINE predictions.
test_mask = tag['split'] == 'Test'
# note: the trailing slash yields a double slash in the paths built from output_path below
output_path = f'{main_dir}/models/CKD/'
preds = load_pickle(f'{output_path}/preds', 'SPLINE_preds')

In [None]:
# Figure 1a: SPLINE output plotted via EvaluateBaselineModel for every target in Y (Test split)
pred_ci = load_pickle(f'{output_path}/preds', 'SPLINE_preds_ci')
test_baseline_egfr = model_data['baseline_eGFR'][test_mask]
baseline_evaluator = EvaluateBaselineModel(
    test_baseline_egfr, preds, labels, output_path, pred_ci=pred_ci
)
fig, ax = plt.subplots(figsize=(6, 6))
shared_plot_args = dict(alg='SPLINE', split='Test', open_top_right=False)
for target_event in Y.columns:
    baseline_evaluator.pred_pred_vs_base(ax, target_event=target_event, **shared_plot_args)
plt.savefig(f'{main_dir}/figures/Figure1a.svg', format='svg', dpi=300, bbox_inches='tight')

In [None]:
# Figure 1b: AUC curves of the SPLINE model on the Test split, one curve per target in Y
alg, split = 'SPLINE', 'Test'
fig, ax = plt.subplots(figsize=(6, 6))
evaluator = EvaluateClf(output_path, preds, labels)
test_labels = Y[test_mask]
test_preds = preds[alg][split]
for target_event in Y.columns:
    Y_true = test_labels[target_event]
    Y_pred_prob = test_preds[target_event]
    curve_args = dict(
        label_prefix=f'{target_event}\n',
        ci_name=f'{alg}_{split}_{target_event}',
    )
    evaluator.plot_auc_curve(ax, Y_true, Y_pred_prob, **curve_args)
plt.savefig(f'{main_dir}/figures/Figure1b.svg', format='svg', dpi=300, bbox_inches='tight')

# Follow Up Length

In [None]:
# Follow-up length in days: next SCr observation date minus last cisplatin date.
# This replaces the visit_date-based definition assigned in the data-loading cell.
followup_time = (prep.event_dates['next_SCr_obs_date'] - prep.event_dates['last_cisplatin_date']).dt.days

In [None]:
# take the minimum of the maximum follow up times of each cohort
# (i.e. the longest follow-up that is observed in every cohort of tag['cohort'])
max_followup_time = followup_time.groupby(tag['cohort']).max().min()

In [None]:
# compare follow up times - no significant difference in follow up times between the two cohorts
# (rank-sum test between the Development and Test cohorts)
cohort_tag = tag['cohort']
dev_followup = followup_time[cohort_tag == 'Development']
test_followup = followup_time[cohort_tag == 'Test']
t_stat, p_val = st.ranksums(dev_followup, test_followup)
t_stat, p_val

In [None]:
# Proportion of post-treatment CKD per follow-up bin, one panel per cohort in tag['cohort']
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12,6))
axes = axes.flatten()
for idx, (cohort, time) in enumerate(followup_time.groupby(tag['cohort'])):
    tmp = pd.DataFrame()
    tmp['CKD'] = Y.loc[time.index, 'CKD']
    tmp['followup_time'] = time
    # bin edges in days; the tick labels below express the same bins in months
    tmp['followup_time_bins'] = pd.cut(time, bins=[90, 180, 270, 365, np.inf], include_lowest=True)
    counts = tmp.groupby('followup_time_bins').apply(len)

    sns.barplot(data=tmp, x='followup_time_bins', y='CKD', ax=axes[idx])
    axes[idx].set(ylabel='Proportion of Post-Treatment CKD', xlabel='Followup Time (Months)', xticklabels=['3-6', '6-9', '9-12', '12+'])
    axes[idx].set(ylim=(0, 0.56))
    # annotate each bar with its sample size; the keyword is `va` (vertical alignment),
    # the previous `v1` is not a Text property and raised an AttributeError
    for i, count in enumerate(counts):
        axes[idx].text(i, 0.01, f'N={count}', ha='center', va='bottom', fontsize=10)
plt.savefig(f'{root_path}/data/output/CKD_results/figures/followup_time.jpg', dpi=300, bbox_inches='tight')

In [None]:
# box plot of the eGFR change per follow-up month (3 to 12+), one panel per cohort in tag['cohort']
# A fresh two-panel figure is created here. Previously a single `ax` was created while the loop
# drew on the stale `axes` of the preceding cell, so the saved figure was empty.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12,6))
axes = axes.flatten()
for idx, (cohort, time) in enumerate(followup_time.groupby(tag['cohort'])):
    tmp = pd.DataFrame()
    # positive values mean that the eGFR dropped after treatment
    tmp['eGFR_change'] = model_data.loc[time.index, 'baseline_eGFR'] - model_data.loc[time.index, 'next_eGFR']
    # 30-day wide bins from day 90 up to day 360, then one open-ended bin (10 bins, 10 tick labels)
    tmp['followup_time_bins'] = pd.cut(time, bins=list(range(90, 365, 30)) + [np.inf], include_lowest=True)

    sns.boxplot(data=tmp, x='followup_time_bins', y='eGFR_change', fliersize=0, ax=axes[idx])
    axes[idx].set(ylabel='eGFR Change from Pre-treatment to Post-treatment', xlabel='Followup Time (Months)',
                  xticklabels=list(range(3, 12)) + ['12+'], ylim=(-65, 105))
plt.savefig(f'{root_path}/data/output/eGFR_results/figures/followup_time.jpg', dpi=300, bbox_inches='tight')

# More Models

In [None]:
# Output location of the extra linear-model experiment (rebinds the notebook-wide `output_path`).
output_path = f'{main_dir}/experiment/more_linear_models'
initialize_folders(f'{output_path}/CKD')
initialize_folders(f'{output_path}/eGFR')

In [None]:
# Boolean masks of the three splits and the matching feature / label partitions
split_tag = tag['split']
train_mask = split_tag == 'Train'
valid_mask = split_tag == 'Valid'
test_mask = split_tag == 'Test'
X_train, Y_train = X[train_mask], Y[train_mask]
X_valid, Y_valid = X[valid_mask], Y[valid_mask]
X_test, Y_test = X[test_mask], Y[test_mask]

In [None]:
# Multivariable logistic regression of CKD on the training split, reported as odds ratios
tmp = X_train.astype(float).copy()

# statsmodels does not like these characters in the column names
col_map = {col: col.replace('(', '_').replace(')', '').replace('-', '_') for col in tmp.columns}
rev_col_map = {clean: raw for raw, clean in col_map.items()}
rev_col_map['Intercept'] = rev_col_map['const'] = 'Intercept'
tmp.columns = tmp.columns.map(col_map)

# take only top 10 regimens as covariates
is_regimen_col = tmp.columns.str.contains('regimen')
counts = tmp.loc[:, is_regimen_col].sum().sort_values(ascending=False)
top_n, total_n = counts[:10].sum(), counts.sum()
print(f"Top 10 regimen make up {top_n:.0f}/{total_n:.0f} ({top_n/total_n*100:.1f}%) of the samples")
top_regimens = counts.index[:10].tolist()

# take only top 10 cancers as covariates
is_cancer_col = tmp.columns.str.contains('cancer_topog')
counts = tmp.loc[:, is_cancer_col].sum().sort_values(ascending=False)
top_n, total_n = counts[:10].sum(), counts.sum()
print(f"Top 10 cancers make up {top_n:.0f}/{total_n:.0f} ({top_n/total_n*100:.1f}%) of the samples")
top_cancers = counts.index[:10].tolist()

covars = ['baseline_eGFR', 'age', 'body_surface_area', 'sex', 'diabetes', 'hypertension', 'cisplatin_dosage'] + top_regimens + top_cancers
design_matrix = sm.add_constant(tmp[covars])
res = sm.Logit(Y_train['CKD'], design_matrix).fit()

# exponentiate the coefficients and their confidence bounds to get odds ratios
conf_bounds = res.conf_int()
odds_ratio = np.exp(pd.DataFrame({"OR": res.params, "Lower CI": conf_bounds[0], "Upper CI": conf_bounds[1]}))
odds_ratio['P value'] = res.pvalues
odds_ratio = odds_ratio.sort_values(by='P value')

# rename the indices: original column names, then units, then cleaned display names
odds_ratio.index = odds_ratio.index.map(rev_col_map)
rename_map = {name: f'{name} ({unit})' for name, unit in get_units().items()}
odds_ratio = odds_ratio.rename(index=rename_map)
odds_ratio.index = get_clean_variable_names(odds_ratio.index)
odds_ratio.round(3)

In [None]:
# Evaluate the fitted logistic regression on the Valid and Test splits.
# map the sanitized covariate names back to the original column names of X
cols = [rev_col_map.get(col, col) for col in covars]
test_pred = res.predict(sm.add_constant(X_test[cols].astype(float)))
valid_pred = res.predict(sm.add_constant(X_valid[cols].astype(float)))
pred = {'LR': {'Test': pd.DataFrame(test_pred, columns=['CKD']), 'Valid': pd.DataFrame(valid_pred, columns=['CKD'])}}
# pass the LR predictions built above; previously the global SPLINE `preds` were passed,
# so the logistic regression was never actually scored
# NOTE(review): `pred` only holds the Valid and Test splits — confirm EvaluateClf accepts that
evaluator = EvaluateClf(None, pred, labels)
evaluator.get_evaluation_scores(display_ci=True, save_ci=False, save_score=False)

# Split Distribution

In [None]:
# Histogram of visits per calendar month (saved to disk)
fig, ax = plt.subplots(figsize=(12, 6))
visit_month = prep.event_dates['visit_date'].dt.to_period('M').astype(str)
ax = sns.histplot(x=visit_month.sort_values())
ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
plt.savefig(f'{root_path}/data/output/CKD_results/figures/split_dist.jpg', dpi=300, bbox_inches='tight')

In [None]:
# Same monthly visit histogram, expressed as percentages
fig, ax = plt.subplots(figsize=(12, 6))
visit_month = prep.event_dates['visit_date'].dt.to_period('M').astype(str)
ax = sns.histplot(x=visit_month.sort_values(), stat='percent')
ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
plt.show()

In [None]:
# Bar plot of baseline eGFR per visit month
visit_month = prep.event_dates['visit_date'].dt.to_period('M').astype(str)
tmp = df[['baseline_eGFR']].assign(month_year=visit_month).sort_values(by='month_year')
fig, ax = plt.subplots(figsize=(12, 6))
ax = sns.barplot(data=tmp, x='month_year', y='baseline_eGFR')
ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
plt.show()

# eGFR vs Age

In [None]:
# eGFR and age columns of the cohort, each with a binned companion column
data = df[['baseline_eGFR', 'next_eGFR', 'age']].copy()
egfr_edges = [0, 30, 45, 60, np.inf]
data['baseline_eGFR_bin'] = pd.cut(data['baseline_eGFR'], bins=egfr_edges)
# age groups are closed on both ends: 18-39, 40-64, 65+
age_bins = [(18, 39), (40, 64), (65, np.inf)]
age_intervals = pd.IntervalIndex.from_tuples(age_bins, closed='both')
data['age_bin'] = pd.cut(data['age'], bins=age_intervals)

In [None]:
# number of rows per (age bin, baseline eGFR bin); eGFR bins become the columns
data.groupby(['baseline_eGFR_bin', 'age_bin']).apply(len).unstack(level='baseline_eGFR_bin')

In [None]:
# Mean eGFR drop (baseline minus next) with a normal-approximation 95% interval
# for every combination of age bin and baseline eGFR bin
res = defaultdict(dict)
for bin_pair, group in data.groupby(['age_bin', 'baseline_eGFR_bin']):
    age, baseline_eGFR = bin_pair
    drop = group['baseline_eGFR'] - group['next_eGFR']
    mean = drop.mean()
    lower, upper = st.norm.interval(0.95, loc=mean, scale=drop.sem())
    res[age][baseline_eGFR] = f'{mean:.3f} ({lower:.3f}-{upper:.3f})'
res = pd.DataFrame(res).sort_index().T
res

In [None]:
# Tile plot of age against pre-treatment eGFR, saved under the eGFR results figures
tile_plot(
    x=data['age'], y=data['baseline_eGFR'],
    xlabel='Age', ylabel='Pre-treatment eGFR',
    clip=True, equal_axis=False, drop_marginal_plots=True, discrete_colorbar=True, axis_limit=None
)
plt.savefig(f'{root_path}/data/output/eGFR_results/figures/age_vs_eGFR.jpg', dpi=300, bbox_inches='tight')

In [None]:
# check how SPLINE performs in each age bin (Test split only)
tmp = pd.DataFrame()
tmp['pred'] = spline_preds['CKD']
tmp['label'] = Y['CKD']
tmp['split'] = tag['split']
tmp['age_bin'] = data['age_bin']
for age_bin, group in tmp.groupby('age_bin'):
    ckd_rate = group['label'].mean()
    print(f"Age={age_bin}. N={len(group)}. CKD Rate={ckd_rate:.3f}")

    # rebuild the {split: frame} structures expected by EvaluateClf, with a single 'CKD' column
    by_split = group.groupby('split')
    label = {split: g[['label']].rename(columns={'label': 'CKD'}) for split, g in by_split}
    pred = {'SPLINE': {split: g[['pred']].rename(columns={'pred': 'CKD'}) for split, g in by_split}}

    evaluator = EvaluateClf(None, pred, label)
    scores = evaluator.get_evaluation_scores(display_ci=True, load_ci=False, save_ci=False, save_score=False, splits=['Test'])
    print(scores)

In [None]:
# check how SPLINE trained on age performs
# NOTE: this cell trains and saves a model, and rebinds the notebook-wide `output_path` and `preds`
from src.train import PolynomialModelTrainer
output_path = f'{main_dir}/experiment/ckd_age_spline'
initialize_folders(output_path)
# base_col='age': age is passed as the base column in place of the default
trainer = PolynomialModelTrainer(X, Y, tag, output_path, base_col='age', alg='SPLINE')
model = trainer.run(bayesopt=False, train=True, save=True)
# reload the predictions from the experiment folder (same location as output_path)
preds = load_pickle(f'{main_dir}/experiment/ckd_age_spline/preds', 'SPLINE_preds')
evaluator = EvaluateClf(output_path, preds, labels)
scores = evaluator.get_evaluation_scores(display_ci=False)
scores

In [None]:
# NOTE: despite the variable name this is an ordinary least squares fit, not a logistic regression
log_reg = smf.ols("next_eGFR ~ baseline_eGFR + age", data=data).fit()
log_reg.summary() # -0.22 is the estimated change in next_eGFR for every 1-unit increase in age, holding others constant

In [None]:
# Same OLS fit with a baseline_eGFR x age interaction (in a formula `a * b` already expands to a + b + a:b)
log_reg = smf.ols("next_eGFR ~ baseline_eGFR + age + baseline_eGFR * age", data=data).fit()
log_reg.summary() # NOTE(review): with the interaction term the age coefficient is the age effect at baseline_eGFR = 0; the "-0.22 per 1-unit increase in age" reading belongs to the model without interaction

## Extra Info

In [None]:
# Pre- and post-treatment eGFR against age on the training data, drawn on the same axes
ax = sns.lineplot(data=train_df, x='age', y='baseline_eGFR', label='Pre-treatment eGFR')
sns.lineplot(data=train_df, x='age', y='next_eGFR', label='Post-treatment eGFR', ax=ax)
ax.set_ylabel('eGFR')
ax.set_xlabel('Age')
ax.legend(frameon=False)

In [None]:
# age distribution of the training data
sns.histplot(train_df, x='age', bins=75)

In [None]:
# Pre-treatment eGFR per age group on the training data
age_groups = pd.cut(train_df['age'], bins=[18,40,65, np.inf])
ax = sns.boxplot(x=age_groups, y=train_df['baseline_eGFR'])
ax.set(ylabel='Pre-treatment eGFR', xlabel='Age')

# eGFR vs Dose

In [None]:
# Columns needed for the dose analysis (rebinds `data` from the previous section)
data = df[['baseline_eGFR', 'cisplatin_dosage', 'regimen', 'cancer_topog_cd', 'intent_of_systemic_treatment', 'body_surface_area']].copy()
# readable cancer name looked up from the topography code
data['cancer_type'] = data['cancer_topog_cd'].map(cancer_code_mapping)

In [None]:
# radiation therapy regimens and cancer type combinations
# (selects every regimen whose name contains the substring 'rt')
data.loc[data['regimen'].str.contains('rt'), ['regimen', 'cancer_type']].value_counts().head(100)

## Collect Standard Cisplatin Dosage

In [None]:
# Standard cisplatin dose per regimen.
# Outer key: regimen name. Inner key: (intent, cancer code) pair, where ("default", "default")
# is the regimen-wide dose and any other pair overrides it for that combination.
# Values are exported below under the column 'Standard Dose (mg/m^2)'.
cisplatin_regimen_dose_map = {
    'cisppeme': {
        ("default", "default"): 75,
    },
    'cispetop(rt)': {
        ("default", "default"): 50,
    },
    'cispetop(3d)': {
        ("default", "default"): 25,
        ("C", "C62"): 50,
    },
    'cisp(rt-w)': {
        ("default", "default"): 30,
        ("P", "C25"): 25,
        ("A", "C22"): 25,
        ("P", "C22"): 25,
        ("N", "C67"): 35,
        ("A", "C67"): 35,
        ("P", "C67"): 35,
    },
    'cispvino(w)': {
        ("default", "default"): 50,
    },
    'cisp(rt)': {
        ("default", "default"): 100,
        ("A", "C54"): 50
    },
    'ecx': {
        ("default", "default"): 60,
    },
    'cispvino': {
        ("default", "default"): 75,
    },
    'cisppacl': {
        ("default", "default"): 75,
        ("P", "C53"): 50,
    },
    'cispgemc': {
        ("default", "default"): 70, # 70-80
    },
    'capecisp+tras': {
        ("default", "default"): 80,
    },
    'cispvino(rt)': {
        ("default", "default"): 80,
    },
    'cisp': {
        ("default", "default"): 50, # 50-75
    },
    'cispfu+tras': {
        ("default", "default"): 80
    }
}

In [None]:
# Flatten the nested dose map into a long table (one row per regimen / intent / cancer code) and save it
dose_col = 'Standard Dose (mg/m^2)'
x = (
    pd.DataFrame(cisplatin_regimen_dose_map)
    .melt(ignore_index=False, var_name='Regimen', value_name=dose_col)
    # combinations that are not defined for a regimen come out of the melt as NaN
    .dropna(subset=[dose_col])
    .reset_index(names=["Intent", "Cancer Code"])
    .assign(Drug="Cisplatin")
)
x = x[['Drug', 'Regimen', 'Intent', 'Cancer Code', dose_col]]
x["Cancer Type"] = x["Cancer Code"].map(cancer_code_mapping)
x.to_csv(f"{root_path}/data/regimen_dose.csv", index=False)
x

## Compute Percentage of Ideal Dose

In [None]:
# Build two lookup tables from the saved dose table:
#   default_dose_map: regimen -> dose of the ("default", "default") row
#   custom_dose_map:  regimen + intent + cancer code (concatenated) -> overriding dose
regimen_dose = pd.read_csv(f"{root_path}/data/regimen_dose.csv")
regimen_dose = regimen_dose[regimen_dose["Drug"] == 'Cisplatin']
dose_col = "Standard Dose (mg/m^2)"
default_key = regimen_dose["Regimen"]
custom_key = regimen_dose[["Regimen", "Intent", "Cancer Code"]].agg("".join, axis=1)
# positional masks, since the frames below are re-indexed by the lookup keys
is_default_row = (regimen_dose["Intent"] == 'default').to_numpy()
default_dose_map = dict(regimen_dose.set_index(default_key).loc[is_default_row, dose_col])
custom_dose_map = dict(regimen_dose.set_index(custom_key).loc[~is_default_row, dose_col])

In [None]:
# only keep the regimens which I have the standard dosing for
mask = data['regimen'].isin(regimen_dose['Regimen'])
print(f"Removed {sum(~mask)} ({(~mask).mean()*100:.2f}%) rows")
# explicit copy: new columns are added below, and assigning on a filtered slice
# triggers pandas' SettingWithCopyWarning
data = data[mask].copy()

# map the standard doses: regimen-wide default first, then the (intent, cancer code) overrides
data['dosage_guide'] = data['regimen'].map(default_dose_map)
match_key = data['regimen'] + data['intent_of_systemic_treatment'] + data['cancer_topog_cd']
mask = match_key.isin(custom_dose_map)
data.loc[mask, 'dosage_guide'] = match_key[mask].map(custom_dose_map)

# compute percent ideal dose (stored as a fraction: 1.0 means 100% of the standard dose)
# I suspect the cisplatin dosage was provided as mg/m^2, NOT in mg. So basically I don't need to convert. I should ask Sho.
data["perc_ideal_dose"] = data['cisplatin_dosage'] / data['dosage_guide']
g = sns.FacetGrid(data, col='regimen', col_wrap=3, sharex=False, sharey=False)
g.map_dataframe(sns.histplot, x='perc_ideal_dose', bins=10)

In [None]:
# exclude patients with less than 10% ideal dose
mask = data['perc_ideal_dose'] < 0.1
print(f"Removed {mask.sum()} ({mask.mean()*100:.2f}%) rows")
data = data[~mask]

In [None]:
# exclude patients with more than 110% ideal dose
mask = data['perc_ideal_dose'] > 1.1
print(f"Removed {mask.sum()} ({mask.mean()*100:.2f}%) rows")
data = data[~mask]

## Bar Plots

In [None]:
# Left-closed bins ([0, 30), [30, 45), [45, 60), [60, 90), [90, inf)) so that they agree with the
# tick labels 0-29 / 30-44 / 45-59 / 60-89 / 90+ used in the bar plot and with the `< 60` cut-off
# used for pre-treatment CKD. With the default right-closed bins an eGFR of exactly 60 was put
# into the "45-59" group.
data['baseline_eGFR_bin'] = pd.cut(data['baseline_eGFR'], bins=[0, 30, 45, 60, 90, np.inf], right=False)
# flag patients who received less than 90% of the standard dose
data['cisplatin_reduced_by_over_10%'] = data['perc_ideal_dose'] < 0.9

In [None]:
# number of patients in each baseline eGFR bin, in bin order
counts = data['baseline_eGFR_bin'].value_counts().sort_index()
counts

In [None]:
# count and proportion of dose-reduced patients, split by baseline eGFR < 60 (True) vs >= 60 (False)
mask = data['baseline_eGFR'] < 60
# a list keeps the output columns in a fixed order; the previous set literal had no defined order
data.groupby(mask)['cisplatin_reduced_by_over_10%'].agg(['sum', 'mean'])

In [None]:
# Proportion of dose-reduced patients per baseline eGFR bin
ax = sns.barplot(data=data, x='baseline_eGFR_bin', y='cisplatin_reduced_by_over_10%')
ax.set_ylabel('Proportion of Patients Given <90% of the Ideal Dose')
ax.set_xlabel('Baseline eGFR (mL/min/1.73m²)')
ax.set_xticklabels(['0-29', '30-44', '45-59', '60-89', '90+'])
plt.savefig(f'{root_path}/data/output/CKD_results/figures/eGFR_vs_dose.jpg', dpi=300, bbox_inches='tight')

# Homogenous Cancer Cohort

In [None]:
def pipeline(model_data, X, Y, tag, output_path, random_state=42):
    """Train, ensemble, evaluate and explain the CKD models for one cohort.

    Steps that already left their output file under ``output_path`` (best
    parameters, base-model predictions, feature importance) are not repeated.
    Relies on the notebook globals ``main_dir`` and ``prep``.

    Args:
        model_data: frame handed to the feature-importance step, after the
            columns matching 'SCr|dialysis|next' are removed.
        X, Y, tag: features, labels and tags passed to the trainers.
        output_path: folder that receives every artifact of this run.
        random_state: stored in the feature-importance parameters.
    """
    initialize_folders(output_path)

    # start from the best_params folder of the main CKD models
    params_file = f'{output_path}/best_params/ENS_params.pkl'
    if not os.path.exists(params_file):
        shutil.copytree(f'{main_dir}/models/CKD/best_params', f'{output_path}/best_params', dirs_exist_ok=True)

    print(prep.get_label_distribution(Y, tag, with_respect_to='sessions'))

    # base models: only trained when no predictions have been stored yet
    if not os.path.exists(f'{output_path}/preds/all_preds.pkl'):
        Trainer(X, Y, tag, output_path).run(
            bayesopt=False, train=True, save_preds=True, algs=['LR', 'RF', 'XGB', 'NN']
        )

    # ensemble on top of the stored base-model predictions
    base_preds = load_pickle(f'{output_path}/preds', 'all_preds')
    ensembler = Ensembler(X, Y, tag, output_path, base_preds)
    ensembler.run(bayesopt=True, calibrate=True)

    evaluator = EvaluateClf(output_path, ensembler.preds, ensembler.labels)
    scores = evaluator.get_evaluation_scores(display_ci=True, load_ci=True, save_ci=True)
    print(scores)

    # feature importance of the ensemble: only computed when the result file is missing
    if not os.path.exists(f'{output_path}/feat_importance/ENS_feature_importance.csv'):
        class CKDFeatImportance(FeatImportance):
            def get_data(self):
                # serve the data of this cohort, without the target-related columns
                is_target_col = model_data.columns.str.contains('SCr|dialysis|next')
                return X, Y, tag, model_data.loc[:, ~is_target_col], None

        fi = CKDFeatImportance(output_path)
        fi.params['random_state'] = random_state
        fi.get_feature_importance('ENS')

    importance_plot('ENS', evaluator.target_events, output_path, figsize=(6,5), top=10, importance_by='feature', padding={'pad_x0': 3.1})


In [None]:
# Train for each top 12 cancer type x regimen
top_cancer_regimens = df[['cancer_topog_cd', 'regimen']].value_counts().index[0:12]
banner = '#' * 100
for i, (cancer, regimen) in enumerate(top_cancer_regimens):
    # NOTE(review): the 7th and 10th combinations use a different seed; the reason is not recorded here
    random_state = 42 if i in [6, 9] else 43
    print(banner)
    print(cancer_code_mapping[cancer], regimen)
    print(banner)

    # only keep the selected cancer cohort (both indicator columns must take exactly two values)
    cancer_col, regimen_col = f'cancer_topog_cd_{cancer}', f'regimen_{regimen}'
    assert X[cancer_col].nunique() == 2
    assert X[regimen_col].nunique() == 2
    mask = X[cancer_col].astype(bool) & X[regimen_col].astype(bool)
    m, x, y, t = model_data[mask], X[mask], Y[mask], tag[mask].copy()

    # remove the other targets
    y = y[['CKD']]

    # if not enough items, skip (every split needs at least two positive examples)
    positives_per_split = y.groupby(t['split'])['CKD'].sum()
    if (positives_per_split < 2).any():
        print(f"Not enough positive examples, skipping {cancer} {regimen}...")
        continue

    # remove the cancer regimen columns
    is_cancer_or_regimen = x.columns.str.startswith('cancer') | x.columns.str.startswith('regimen')
    x = x.drop(columns=x.columns[is_cancer_or_regimen])
    m = m.drop(columns=['cancer_topog_cd', 'cancer_morph_cd', 'regimen'])

    cohort_output_path = f'{main_dir}/experiment/ckd_homo_cancer/{cancer}_{regimen}'
    pipeline(m, x, y, t, cohort_output_path, random_state=random_state)

In [None]:
# save all the results to a single folder
save_path = f'{root_path}/data/output/ckd_homo_cancer_feat_imp'
os.makedirs(save_path, exist_ok=True)
experiment_dir = f'{main_dir}/experiment/ckd_homo_cancer'
for folder in os.listdir(experiment_dir):
    # skip hidden entries
    if folder.startswith('.'):
        continue
    source_file = f'{experiment_dir}/{folder}/figures/important_features/ENS_CKD.jpg'
    target_file = f'{save_path}/{folder}_ENS_CKD_feat_imp.jpg'
    shutil.copy(source_file, target_file)

In [None]:
# check how the global SPLINE performs on each of the top 12 cancer type x regimen cohorts
top_cancer_regimens = df[['cancer_topog_cd', 'regimen']].value_counts().index[0:12]
for i, (cancer, regimen) in enumerate(top_cancer_regimens):
    print(cancer_code_mapping[cancer], regimen)

    # only keep the selected cancer cohort
    in_cancer = X[f'cancer_topog_cd_{cancer}'].astype(bool)
    in_regimen = X[f'regimen_{regimen}'].astype(bool)
    mask = in_cancer & in_regimen
    pred, label, t = spline_preds[mask], Y[mask], tag[mask]

    # remove the other targets
    pred = pred[['CKD']]
    label = label[['CKD']]

    # if not enough items, skip (every split needs at least two positive examples)
    positives_per_split = label.groupby(t['split'])['CKD'].sum()
    if (positives_per_split < 2).any():
        print(f"Not enough positive examples, skipping {cancer} {regimen}...")
        continue

    # regroup by split into the structures expected by EvaluateClf
    label = dict(iter(label.groupby(t['split'])))
    pred = {'SPLINE': dict(iter(pred.groupby(t['split'])))}

    evaluator = EvaluateClf(None, pred, label)
    scores = evaluator.get_evaluation_scores(display_ci=True, load_ci=False, save_ci=False)
    print(scores)

# Non-Cisplatin Matched Cohort

In [None]:
# Output location of the non-cisplatin matched cohort experiment (rebinds the notebook-wide `output_path`);
# an additional 'data' folder is requested for the intermediate parquet files written below.
output_path = f'{main_dir}/experiment/non_cisplatin_cancer'
initialize_folders(output_path, extra_folders=['data'])

## Build Non-Cisplatin Cohort

In [None]:
# Select the first systemic-treatment visit of patients on non-cisplatin regimens,
# store it, and preprocess the laboratory data of those patients.
from src.preprocess import Laboratory, clean_up_systemic
from src.config import systemic_cols
from src.utility import load_included_regimens, numpy_ffill, get_eGFR

cisplatin_regimens = load_included_regimens(criteria='cisplatin_containing')['regimen']

systemic = pd.read_parquet(f'{root_path}/data/systemic.parquet.gzip')
systemic = clean_up_systemic(systemic)
systemic = systemic[systemic_cols].drop_duplicates()
systemic = systemic[~systemic['regimen'].isin(cisplatin_regimens)] # non-cisplatin regimens
# NOTE(review): GroupBy.nth changed in pandas 2.0 (it now keeps the original index and the 'ikn' column),
# so reset_index() then adds an 'index' column — confirm which pandas version this was written for.
# "First" is the first row per ikn in the current row order — presumably sorted by visit date; verify.
systemic = systemic.groupby('ikn').nth(0).reset_index() # take the very first visit
systemic = systemic[systemic['inpatient_flag'] == 'N'] # remove inpatients
systemic.to_parquet(f'{output_path}/data/systemic.parquet.gzip', compression='gzip', index=False)

# NOTE: need to restart the kernel after this point to avoid annoying IOStream.flush timed out messages
labr = Laboratory(f'{output_path}/data')
labr.preprocess(set(systemic['ikn'])) # takes 11min

In [None]:
# reload the stored first-visit table (entry point after the kernel restart mentioned above)
systemic = pd.read_parquet(f'{output_path}/data/systemic.parquet.gzip')

In [None]:
# Extract the laboratory and symptom data for the non-cisplatin visits and store both as parquet
from src.preprocess import Laboratory, Symptoms
labr = Laboratory(f'{output_path}/data', processes=64)
# time_window=(-30, 28): presumably days relative to the visit date — verify against Laboratory.run
lab_df = labr.run(systemic, time_window=(-30, 28))
lab_df.to_parquet(f'{output_path}/data/lab.parquet.gzip', compression='gzip', index=False)

symp = Symptoms(processes=64)
symp_df = symp.run(systemic)
symp_df.to_parquet(f'{output_path}/data/symptoms.parquet.gzip', compression='gzip', index=False)

In [None]:
# Combine systemic, laboratory, symptom and demographic data into the non-cisplatin cohort,
# add the baseline creatinine value and store the cohort.
from src.config import DATE, OBS_CODE, OBS_DATE
from src.preprocess import Demographic, combine_demographic_data, combine_lab_data, combine_symptom_data, process_dialysis_data
lab = pd.read_parquet(f'{output_path}/data/lab.parquet.gzip')
# note: `symp` is rebound here from the Symptoms extractor to the symptom DataFrame
symp = pd.read_parquet(f'{output_path}/data/symptoms.parquet.gzip')
demographic = Demographic().run(exclude_blood_cancer=False)
cohort, lab_map, _ = combine_lab_data(systemic, lab)
cohort = combine_symptom_data(cohort, symp)
cohort = combine_demographic_data(cohort, demographic)
cohort = cohort.drop(columns=['inpatient_flag', 'ethnic', 'country_birth', 'official_language', 'nat_language'])
cohort = process_dialysis_data(cohort)

# observation code '14682-9' is the one used for creatinine throughout this notebook
scr = lab_map['14682-9'].loc[cohort.index]
# column labels are cast to integers so that the range(-30, 1) selection below works
scr.columns = scr.columns.astype(int)
# baseline creatinine from the columns -30..0 (presumably the latest available value; see numpy_ffill)
cohort['baseline_creatinine_value'] = numpy_ffill(scr[range(-30, 1)])

cohort.to_parquet(f'{output_path}/data/non_cisplatin_cohort.parquet.gzip', compression='gzip', index=False)

In [None]:
# Collect the follow-up creatinine measurements of the non-cisplatin cohort
olis = pd.read_parquet(f'{output_path}/data/olis')
scr = olis.query(f"{OBS_CODE} == '14682-9'").copy()
scr[OBS_DATE] = pd.to_datetime(scr[OBS_DATE])
# only keep measurements after the first trt visit
scr['first_trt_date'] = scr['ikn'].map(cohort.set_index('ikn')['visit_date'])
# use the OBS_DATE constant like the rest of this cell instead of a hard-coded column name;
# rows of patients outside the cohort have no first_trt_date and drop out here as well
scr = scr[scr[OBS_DATE] >= scr['first_trt_date']].copy()
# only keep measurements between the minimum and maximum followup times in the cisplatin cohort
scr['followup_time'] = (scr[OBS_DATE] - scr['first_trt_date']).dt.days
scr = scr[scr['followup_time'].between(followup_time.min(), followup_time.max())]
scr.to_parquet(f'{output_path}/data/followup_creatinine.parquet.gzip', compression='gzip', index=False)

In [None]:
# merge followup creatinine and the cohort
cohort = pd.read_parquet(f'{output_path}/data/non_cisplatin_cohort.parquet.gzip')
scr = pd.read_parquet(f'{output_path}/data/followup_creatinine.parquet.gzip')
scr = scr[['ikn', 'value', OBS_DATE, 'followup_time']]
scr = scr.rename(columns={'value': 'next_SCr_value', OBS_DATE: 'next_SCr_obs_date'})
# inner merge: one row per (patient, follow-up measurement); patients without a measurement drop out
cohort = pd.merge(cohort, scr, on='ikn')
# keyword was misspelt as `compressiopn`, which is passed through to the parquet writer and rejected
cohort.to_parquet(f'{output_path}/data/final_data.parquet.gzip', compression='gzip', index=False)

## Match with Cisplatin Cohort

In [None]:
# Matching variables of the cisplatin cohort; followup_time is aligned on the index of df
cisplatin_cohort = df[['ikn', 'age', 'sex', 'baseline_eGFR']].copy()
cisplatin_cohort['followup_time'] = followup_time

In [None]:
# Load the non-cisplatin cohort and derive baseline / next eGFR from the creatinine columns
non_cisplatin_cohort = pd.read_parquet(f'{output_path}/data/final_data.parquet.gzip')
non_cisplatin_cohort = get_eGFR(non_cisplatin_cohort, col='baseline_creatinine_value', prefix='baseline_')
non_cisplatin_cohort = get_eGFR(non_cisplatin_cohort, col='next_SCr_value', prefix='next_')
# rows without a baseline eGFR cannot be matched
non_cisplatin_cohort = non_cisplatin_cohort[non_cisplatin_cohort['baseline_eGFR'].notna()]

In [None]:
def _egfr_bin_edges(step):
    """Return eGFR bin edges: 0, then 24 up to (excluding) 142 in `step` increments, then infinity."""
    inner_edges = np.arange(24, 142, step)
    return [0, *inner_edges, np.inf]


# candidate eGFR bin widths used when matching the two cohorts
egfr_bins_1pt = _egfr_bin_edges(1)
egfr_bins_2pt = _egfr_bin_edges(2)
egfr_bins_3pt = _egfr_bin_edges(3)
egfr_bins_5pt = _egfr_bin_edges(5)

In [None]:
def _followup_bin_edges(step):
    """Return follow-up bin edges in days: 0, then 90 up to the longest cisplatin follow-up in `step` increments, then infinity."""
    inner_edges = np.arange(90, followup_time.max(), step)
    return [0, *inner_edges, np.inf]


# candidate follow-up bin widths used when matching the two cohorts
followup_bins_30d = _followup_bin_edges(30)
followup_bins_60d = _followup_bin_edges(60)
followup_bins_90d = _followup_bin_edges(90)
followup_bins_120d = _followup_bin_edges(120)

In [None]:
def get_matched(cisplatin_cohort, non_cisplatin_cohort, egfr_bins=egfr_bins_1pt, followup_bins=followup_bins_30d):
    """Pair cisplatin and non-cisplatin rows that share age, sex, eGFR bin and follow-up bin.

    A 'match_key' column is written to BOTH input frames (they are modified in place).

    Args:
        cisplatin_cohort: frame with 'age', 'sex', 'baseline_eGFR' and 'followup_time'.
        non_cisplatin_cohort: frame with the same four columns.
        egfr_bins: bin edges applied to 'baseline_eGFR'.
        followup_bins: bin edges applied to 'followup_time'.

    Returns:
        Inner merge of the two frames on 'match_key'; overlapping columns of the
        cisplatin cohort carry the suffix '_cisplatin_cohort'.
    """
    def build_key(frame):
        # exact age and sex, followed by the interval labels of the two binned variables
        egfr_bin = pd.cut(frame['baseline_eGFR'], bins=egfr_bins).astype(str)
        followup_bin = pd.cut(frame['followup_time'], bins=followup_bins).astype(str)
        return frame['age'].astype(str) + frame['sex'] + egfr_bin + followup_bin

    cisplatin_cohort['match_key'] = build_key(cisplatin_cohort)
    non_cisplatin_cohort['match_key'] = build_key(non_cisplatin_cohort)
    return pd.merge(
        cisplatin_cohort, non_cisplatin_cohort,
        on='match_key', suffixes=('_cisplatin_cohort', ''), how='inner'
    )

In [None]:
# Number (and share) of cisplatin patients left without any match, for every combination of bin widths
egfr_bin_options = {'1pt': egfr_bins_1pt, '2pt': egfr_bins_2pt, '3pt': egfr_bins_3pt, '5pt': egfr_bins_5pt}
followup_bin_options = {'30d': followup_bins_30d, '60d': followup_bins_60d, '90d': followup_bins_90d, '120d': followup_bins_120d}
res = defaultdict(dict)
for pt, egfr_bins in egfr_bin_options.items():
    for day, followup_bins in followup_bin_options.items():
        matched = get_matched(cisplatin_cohort, non_cisplatin_cohort, egfr_bins=egfr_bins, followup_bins=followup_bins)
        has_no_match = ~cisplatin_cohort['ikn'].isin(matched['ikn_cisplatin_cohort'])
        N = sum(has_no_match)
        share = N/len(cisplatin_cohort)*100
        res[f"{pt} eGFR bins"][f"{day} followup bins"] = f"{N} ({share:.2f}%)"
print("Number of patients unable to be matched")
pd.DataFrame(res)


In [None]:
# final matching: 1-point eGFR bins and 30-day follow-up bins
matched = get_matched(cisplatin_cohort, non_cisplatin_cohort, egfr_bins=egfr_bins_1pt, followup_bins=followup_bins_30d)

In [None]:
# distribution (count, mean, quartiles, ...) of the number of matched rows per key
matched.groupby('match_key').apply(len).describe()

In [None]:
# number of cisplatin patients with at least one match
matched['ikn_cisplatin_cohort'].nunique()

In [None]:
%%time
# restrict number of matches to maximum 1 non-cisplatin patient per cisplatin patient
# Keys are processed from rarest to most common, and a non-cisplatin patient is used at most once.
# The result depends on this order and on the fixed random_state.
from tqdm import tqdm

res = {}
used_ikns = []

# loop through the keys starting from the least common key
key_freq = matched['match_key'].value_counts(ascending=True)
for key in tqdm(key_freq.index):

    # get all the samples that match the key, make sure not to use patients who've already been selected
    group = matched.query('match_key == @key and ikn not in @used_ikns')
    if group.empty:
        continue

    # count the number of unique cisplatin patients in this group
    cisplatin_cohort_ikns = group['ikn_cisplatin_cohort'].unique()
    n = len(cisplatin_cohort_ikns)

    # only keep 1 row per non-cisplatin patient
    # a non-cisplatin patient could have matched with multiple cisplatin patients and/or have multiple measurements in the same followup bins
    group = group.groupby('ikn', group_keys=False).apply(lambda x: x.sample(n=1, random_state=42)) # randomly select one instead of most recent measurement, minimum measurement, etc, for now

    # sample 1 non-cisplatin patient per cisplatin patient
    # a cisplatin patient could have matched with multiple non-cisplatin patients
    group = group.sample(n=min(n, len(group)), random_state=42)

    # keep track of which cisplatin patients were matched with which non-cisplatin patients
    # (when fewer non-cisplatin patients are available, only the first len(group) cisplatin patients get a match)
    group['ikn_cisplatin_cohort'] = list(cisplatin_cohort_ikns)[0:len(group)]

    # keep track of used patients
    used_ikns += group['ikn'].tolist()

    res[key] = group
# stack the per-key groups into one frame (the key becomes the outer index level)
res = pd.concat(res)

In [None]:
# number of cisplatin patients after the matching
res['ikn_cisplatin_cohort'].nunique()

In [None]:
# number of non-cisplatin patients after the matching
res['ikn'].nunique()

In [None]:
# store the 1:1 matched rows (the index, i.e. the match key level, is not written)
res.to_parquet(f'{output_path}/data/matched_data.parquet.gzip', compression='gzip', index=False)

## Analysis

In [None]:
# reload the matched non-cisplatin rows and drop the columns that only served the matching
matched = pd.read_parquet(f'{output_path}/data/matched_data.parquet.gzip')
matched = matched.drop(columns=['ikn_cisplatin_cohort', 'age_cisplatin_cohort', 'sex_cisplatin_cohort', 'baseline_eGFR_cisplatin_cohort', 'match_key'])

In [None]:
# earliest and latest visit date in the matched cohort
matched['visit_date'].min(), matched['visit_date'].max()

In [None]:
# Prepare the matched cohort with the same settings as the cisplatin cohort
# (a separate PrepDataCAN instance is used, so `prep` is left untouched)
prep_m = PrepDataCAN(adverse_event='ckd', target_keyword='SCr|dialysis|next')
matched = prep_m.prepare_features(matched, missing_thresh=80, include_comorbidity=True, verbose=True)
X_m, Y_m, tag_m = prep_m.split_and_transform_data(matched, split_date=split_date)
# keep only the rows that are present in X_m
matched = matched.loc[X_m.index]

In [None]:
# Descriptive summary table of the matched cohort, written to the experiment's tables folder
from src.summarize import data_description_summary
subgroups = [
    'sex', 'immigration', 'birth_region', 'language', 'income', 'area_density',
    'regimen', 'cancer_type', 'cancer_location', 'target', 'comorbidity', 'dialysis', 'ckd'
]
data_description_summary(
    matched, Y_m, tag_m, save_dir=f'{output_path}/tables', partition_method='cohort', target_event='CKD',
    subgroups=subgroups
)

In [None]:
# eGFR decrease
# differences are next minus baseline, so a negative mean corresponds to a decrease
case_diff = model_data['next_eGFR'] - model_data['baseline_eGFR'] # cisplatin cohort
cont_diff = matched['next_eGFR'] - matched['baseline_eGFR'] # matched non-cisplatin cohort

cohort_diffs = [
    ('Cisplatin Cohort', case_diff, model_data['baseline_eGFR'] >= 60),
    ('Matched Cohort', cont_diff, matched['baseline_eGFR'] >= 60),
]
# the loop variable is named `cohort_name` so that it does not overwrite the `cohort` DataFrame
# built in the "Build Non-Cisplatin Cohort" section
for cohort_name, diff, mask in cohort_diffs:
    # mean with a normal-approximation 95% interval based on the standard error
    mean, sigma = np.mean(diff), np.std(diff)/np.sqrt(len(diff))
    lower, upper = st.norm.interval(0.95, mean, sigma)
    print(f"{cohort_name}: ")
    print(f"\teGFR decreased on average by {mean:.3f} ({lower:.3f}, {upper:.3f})")

    # same statistic restricted to patients with a baseline eGFR of at least 60
    mean, sigma = np.mean(diff[mask]), np.std(diff[mask])/np.sqrt(len(diff[mask]))
    lower, upper = st.norm.interval(0.95, mean, sigma)
    print(f"\tFor patients without pre-treatment CKD, eGFR decreased on average by {mean:.3f} ({lower:.3f}, {upper:.3f})")

# Welch's t-test to compare the means in two independent sample sets
stat, pval = st.ttest_ind(case_diff, cont_diff, equal_var=False)
stat, pval

In [None]:
# odds ratio of CKD vs cisplatin: one univariable logistic regression per target
# stack the labels of both cohorts; the outer index level flags the cisplatin cohort
tmp = pd.concat([Y, Y_m], keys=[True, False], names=['cisplatin'])
tmp = tmp.reset_index(level=0).astype(int)
tmp.columns = ['cisplatin', 'CKD', 'CKD_3b', 'CKD_4']
res = {}
for target in ['CKD', 'CKD_3b', 'CKD_4']:
    log_reg = smf.logit(f"{target} ~ cisplatin", data=tmp).fit()
    conf_bounds = log_reg.conf_int()
    # exponentiate coefficients and confidence bounds to obtain odds ratios
    odds_ratio = np.exp(pd.DataFrame({
        "OR": log_reg.params,
        "Lower CI": conf_bounds[0],
        "Upper CI": conf_bounds[1],
    }))
    odds_ratio['P value'] = log_reg.pvalues
    res[target] = odds_ratio
pd.concat(res)