© 2023 Institute for Clinical Evaluative Sciences. All rights reserved.

TERMS OF USE:
##Not for distribution.## This code and data is provided to the user solely for its own non-commercial use by individuals and/or not-for-profit corporations. User shall not distribute without express written permission from the Institute for Clinical Evaluative Sciences.

##Not-for-profit.## This code and data may not be used in connection with profit generating activities.

##No liability.## The Institute for Clinical Evaluative Sciences makes no warranty or representation regarding the fitness, quality or reliability of this code and data.

##No Support.## The Institute for Clinical Evaluative Sciences will not provide any technological, educational or informational support in connection with the use of this code and data.

##Warning.## By receiving this code and data, user accepts these terms, and uses the code and data, solely at its own risk.

In [None]:
# Change the working directory to two levels above the current one.
# NOTE(review): presumably this is the project root, so that the `src` package imports below resolve - confirm
%cd ../../
# Enable IPython's autoreload extension
%load_ext autoreload
# Mode 2: reload all modules before executing each cell (see the IPython autoreload docs)
%autoreload 2

In [None]:
import copy

from scipy.stats import norm
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import shap

from src.config import root_path, can_folder, split_date, SCr_rise_threshold
from src.evaluate import EvaluateReg, EvaluateBaselineModel
from src.prep_data import PrepDataCAN
from src.train import Ensembler, Trainer, PolynomialModelTrainer
from src.utility import initialize_folders, load_pickle, save_pickle, get_hyperparameters
from src.visualize import shap_plot

pd.set_option('display.max_columns', 150)
pd.set_option('display.max_rows', 150)

In [None]:
# Notebook-wide settings
# NOTE(review): `processes` is not referenced anywhere else in the visible notebook - confirm it is still needed
processes = 64
# '|'-separated keywords handed to PrepDataCAN as `target_keyword`
# NOTE(review): presumably used as a regex to pick out the target columns - confirm in src.prep_data
target_keyword = 'SCr|dialysis|next'
main_dir = f'{root_path}/projects/{can_folder}'
# Every model, prediction, and figure produced below is read from / written under this folder
output_path = f'{main_dir}/models/eGFR'
initialize_folders(output_path)

# Prepare Data for Model Training

In [None]:
# Build the data preparer for the 'ckd' adverse event and load the modelling table
prep = PrepDataCAN(adverse_event='ckd', target_keyword=target_keyword)
model_data = prep.get_data(
    missing_thresh=80,
    include_comorbidity=True,
    verbose=True,
)

# Histogram of the eGFR change (next - baseline), drawn before any sessions are filtered out below
egfr_delta = model_data['next_eGFR'] - model_data['baseline_eGFR']
egfr_delta.hist(bins=100)

# Split / transform the features; the second return value is not needed here
X, _, tag = prep.split_and_transform_data(model_data, split_date=split_date)

# Keep model_data aligned with tag: drop the sessions that split_and_transform_data excluded
model_data = model_data.loc[tag.index]

In [None]:
def _mean_with_ci(delta, level=0.95):
    """Return (mean, standard error, normal-approximation confidence interval) of `delta`."""
    centre = np.mean(delta)
    std_err = np.std(delta) / np.sqrt(len(delta))
    return centre, std_err, norm.interval(level, loc=centre, scale=std_err)


# Whole cohort: change in eGFR (next - baseline)
diff = model_data['next_eGFR'] - model_data['baseline_eGFR']
mean, sigma, conf_int = _mean_with_ci(diff)
print(f"eGFR decreased on average by {mean:.3f}, 95% CI: {conf_int}")

# Sessions whose baseline eGFR is not below 60, i.e. without pre-treatment CKD
mask = model_data['baseline_eGFR'] < 60
no_ckd = model_data.loc[~mask]
diff = no_ckd['next_eGFR'] - no_ckd['baseline_eGFR']
mean, sigma, conf_int = _mean_with_ci(diff)
print(f"For patients without pre-treatment CKD, eGFR decreased on average by {mean:.3f}, 95% CI {conf_int}")

In [None]:
# Convenience variables
train_mask, valid_mask, test_mask = tag['split'] == 'Train', tag['split'] == 'Valid', tag['split'] == 'Test'
X_train, X_valid, X_test = X[train_mask], X[valid_mask], X[test_mask]

In [None]:
# setup regression label
Y = pd.DataFrame()
Y['next_eGFR'] = model_data['next_eGFR']
Y['eGFR_change'] = model_data['next_eGFR'] - model_data['baseline_eGFR']
# scale the target
scaler = StandardScaler()
Y[train_mask] = Y_train = scaler.fit_transform(Y[train_mask])
Y[valid_mask] = Y_valid = scaler.transform(Y[valid_mask])
Y[test_mask] = Y_test = scaler.transform(Y[test_mask])

# Train Models

## Spline Baseline Model

In [None]:
# Baseline model: the 'SPLINE' algorithm with `baseline_eGFR` as the base column, regression task ('R')
spline_settings = dict(base_col='baseline_eGFR', alg='SPLINE', task_type='R')
trainer = PolynomialModelTrainer(X, Y, tag, output_path, **spline_settings)
trainer.run(
    bayesopt=True,
    train=True,
    save=True,
)

In [None]:
# save the model as a table
df = trainer.model_to_table(
    model=load_pickle(output_path, 'SPLINE'),
    base_vals=model_data['baseline_eGFR'],
    extra_info=model_data[['baseline_creatinine_value', 'next_eGFR']].rename(columns={'next_eGFR': 'true_next_eGFR'})
)
df[Y.columns] = scaler.inverse_transform(df[Y.columns])
df.to_csv(f'{output_path}/SPLINE_model.csv')
df

## Main Models

In [None]:
# Train the main regression models ('R' task type) for each listed algorithm
algs = ['LR', 'RF', 'XGB', 'NN']
trainer = Trainer(X, Y, tag, output_path, task_type='R')
trainer.run(
    bayesopt=True,
    train=True,
    save_preds=True,
    algs=algs,
    allow_duplicate_points=True,
)

## ENS Model 
Find Optimal Ensemble Weights


In [None]:
# Load the saved predictions (file 'all_preds' under the preds folder)
# NOTE(review): presumably written by the Trainer run with save_preds=True - confirm
preds = load_pickle(f'{output_path}/preds', 'all_preds')
# Build the ensemble on top of those predictions (regression task)
ensembler = Ensembler(X, Y, tag, output_path, preds, task_type='R')
# Run with bayesopt enabled and calibration disabled
ensembler.run(bayesopt=True, calibrate=False)

# Evaluate Models

In [None]:
# Work on deep copies so that the in-place inverse scaling done later does not touch the ensembler's own data
preds = copy.deepcopy(ensembler.preds)
labels = copy.deepcopy(ensembler.labels)

# Include the baseline models
spline_preds = load_pickle(f'{output_path}/preds', 'SPLINE_preds')
preds.update(spline_preds)

In [None]:
# Bring labels and predictions back to their original units, overwriting the values in place
for split in labels:
    # labels of this split
    labels[split][:] = scaler.inverse_transform(labels[split])
    # predictions of every algorithm for this split
    for alg_preds in preds.values():
        alg_preds[split][:] = scaler.inverse_transform(alg_preds[split])

In [None]:
# Evaluate the predictions against the labels (both inverse-scaled by the cell above)
evaluator = EvaluateReg(output_path, preds, labels)
# The result of this call is the cell's displayed output.
# NOTE(review): the flags suggest confidence intervals are recomputed (load_ci=False), displayed and saved - confirm in EvaluateReg
evaluator.get_evaluation_scores(display_ci=True, load_ci=False, save_ci=True)

In [None]:
# Error distribution plots of the ensemble (ENS) model, one call per regression target
evaluator.plot_err_dist(alg='ENS', target_event='next_eGFR')
evaluator.plot_err_dist(alg='ENS', target_event='eGFR_change')

## Most Important Features

In [None]:
# Rows to explain (test cohort) and rows used as the background distribution (validation cohort)
data, bg_dist = X_test.astype(float), X_valid.astype(float)

# Ensemble weights keyed by algorithm name, plus the fitted model of every algorithm listed there
ensemble_weights = load_pickle(f'{output_path}/best_params', 'ENS_params')
models = {}
for alg in ensemble_weights:
    models[alg] = load_pickle(output_path, alg)
def predict(X):
    """Return the ensemble prediction for X, inverse-scaled with the target scaler.

    Each algorithm listed in `ensemble_weights` predicts on X; the predictions
    are averaged using those weights and the average is passed through
    `scaler.inverse_transform`.
    """
    member_outputs = []
    for name in ensemble_weights:
        output = models[name].predict(X)
        if name == 'NN':
            # presumably a torch tensor: move to CPU and convert to a numpy array
            output = output.cpu().detach().numpy()
        member_outputs.append(output)
    # weights are taken in the same (dict) order as the predictions were collected
    blended = np.average(member_outputs, axis=0, weights=list(ensemble_weights.values()))
    return scaler.inverse_transform(blended)

In [None]:
%%time
# compute shap values for the ENS model: explain `data` using `bg_dist` as the background distribution
# NOTE: the explainer will loop through each sample row, and create multiple versions of the sample row
# with different feature permutations, where the values are replaced with the background distribution values
explainer = shap.Explainer(predict, bg_dist, seed=42)
# NOTE(review): max_evals presumably caps the number of `predict` evaluations per explained row - confirm in the shap docs
shap_values = explainer(data, max_evals=800)
# store the result under the feat_importance folder; the next cell reloads it from there
save_pickle(shap_values, f'{output_path}/feat_importance', 'ENS_shap_values')

In [None]:
# Reload the stored SHAP values
shap_values = load_pickle(f'{output_path}/feat_importance', 'ENS_shap_values')

# Display version of the data: undo the feature normalisation on the columns the feature scaler was fitted on
data = X_test.astype(float)
norm_cols = prep.scaler.feature_names_in_
unscaled = prep.scaler.inverse_transform(data[norm_cols])
data[norm_cols] = unscaled
shap_values.data = data.to_numpy()

# Separate the two shap values along the last axis: index 0 is next_eGFR, index 1 is eGFR_change
next_eGFR_shap_values, eGFR_change_shap_values = (shap_values[:, :, idx] for idx in range(2))

In [None]:
# SHAP plot for the next_eGFR target of the ENS model; `prefix` is passed on to shap_plot together with output_path
shap_plot(next_eGFR_shap_values, output_path, prefix='ENS_next_eGFR_')

In [None]:
# SHAP plot for the eGFR_change target of the ENS model; `prefix` is passed on to shap_plot together with output_path
shap_plot(eGFR_change_shap_values, output_path, prefix='ENS_eGFR_change_')

## Prediction vs Baseline Plots

In [None]:
# Reload the spline predictions and their confidence intervals (both still in scaled units)
preds = load_pickle(f'{output_path}/preds', 'SPLINE_preds')
preds_ci = load_pickle(f'{output_path}/preds', 'SPLINE_preds_ci')

# inverse scale the predictions
# Every (algorithm, split) combination must be visited. Zipping the split names with the algorithm
# names would only pair the i-th split with the i-th algorithm and stop at the shorter of the two,
# leaving the remaining splits (possibly including the plotted 'Test' split) in scaled units.
for alg in preds:
    for split in labels:
        preds[alg][split][:] = scaler.inverse_transform(preds[alg][split])
        # lower / upper bounds of the confidence interval are rescaled the same way
        preds_min, preds_max = preds_ci[alg][split]
        preds_min[:], preds_max[:] = scaler.inverse_transform(preds_min), scaler.inverse_transform(preds_max)
        preds_ci[alg][split] = (preds_min, preds_max)

# Baseline eGFR of the test cohort
base_vals = model_data['baseline_eGFR'][test_mask]
baseline_evaluator = EvaluateBaselineModel(base_vals, preds, labels, output_path, preds_ci=preds_ci)

# One panel per regression target, side by side
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12, 6))
baseline_evaluator.plot_pred_vs_base(axes[0], alg='SPLINE', target_event='next_eGFR', split='Test', show_diagonal=True, use_legend=False, axis_limit=(-5, 145))
baseline_evaluator.plot_pred_vs_base(axes[1], alg='SPLINE', target_event='eGFR_change', split='Test', use_legend=False)
plt.savefig(f'{output_path}/figures/baseline/SPLINE_pred_vs_baseline.jpg', bbox_inches='tight', dpi=300)

## Prediction vs Label Plots

In [None]:
# Label vs prediction plots of the ensemble (ENS) model on the Test split, one call per target.
# next_eGFR uses equal axes with a fixed (-5, 145) limit; eGFR_change uses neither.
evaluator.plot_label_vs_pred(alg='ENS', target_event='next_eGFR', split='Test', equal_axis=True, axis_limit=(-5, 145))
evaluator.plot_label_vs_pred(alg='ENS', target_event='eGFR_change', split='Test', equal_axis=False)

In [None]:
# Label vs prediction plots of the SPLINE baseline on the Test split, with the same settings as the ENS plots
evaluator.plot_label_vs_pred(alg='SPLINE', target_event='next_eGFR', split='Test', equal_axis=True, axis_limit=(-5, 145))
evaluator.plot_label_vs_pred(alg='SPLINE', target_event='eGFR_change', split='Test', equal_axis=False)