# Predicting emergency department visits anchored on clinic dates
---
## Background
Previously, we built a model to predict emergency department (ED) visits anchored on treatment dates.

The problem with that approach is that primary physicians do not interact with their patients during treatment sessions; they only meet during clinic visits. A clinic visit is therefore the best time for the model to nudge the physician toward an intervention. Thus, we now want to build a model that predicts a patient's risk of an ED visit prior to the clinic date instead of prior to the treatment session.

---

In [None]:
%%capture
# The %%capture cell magic above suppresses this cell's output; it must remain
# the first line of the cell.
# Move the working directory two levels up -- presumably to the repository
# root, so the relative ./data, ./models and ./result paths used below resolve.
# NOTE: re-running this cell moves up another two levels.
%cd ../../
# Enable autoreload in mode 2: imported modules are reloaded before each
# execution, so edits to the project packages are picked up without a restart.
%load_ext autoreload
%autoreload 2

In [None]:
import logging

import pandas as pd
from datetime import datetime

from ml_common.summary import get_label_distribution
from ml_common.util import load_pickle, save_pickle

from preduce.acu.eval import evaluate_valid, evaluate_test, predict
from preduce.acu.pipeline import PrepACUData
from preduce.acu.train import train_models, tune_params
from preduce.summarize import feature_summary
from preduce.util import compute_threshold

# Widen pandas' display limits so large summary tables are shown in full
pd.options.display.max_rows = 150
pd.options.display.max_columns = 100

# Emit INFO-and-above log records as "<LEVEL>:<message>"
logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.INFO)

## Load feature data

## Prepare Data

In [None]:
# Read the clinic-centered feature dataset (path is relative to the working
# directory set in the setup cell)
df = pd.read_parquet(
    path='./data/processed/clinic_centered_feature_dataset.parquet.gzip',
)

In [None]:
# Run the ACU data preparer: preprocess the raw features, then build the
# model inputs (X), targets (Y) and per-row metadata (metainfo)
prep = PrepACUData()
df = prep.preprocess(df)
X, Y, metainfo = prep.prepare(
    df,
    n_folds=5,  # n_folds=3
)
# Restrict the preprocessed frame to the rows present in X
# (presumably prepare() can drop rows -- verify)
df = df.loc[X.index]

In [None]:
# Boolean row masks taken from the 'split' column of the metadata
train_mask = metainfo['split'].eq('Train')
test_mask = metainfo['split'].eq('Test')

# Development (train) partition
X_train, Y_train, metainfo_train = X[train_mask], Y[train_mask], metainfo[train_mask]
# Held-out (test) partition
X_test, Y_test, metainfo_test = X[test_mask], Y[test_mask], metainfo[test_mask]

In [None]:
# Persist the data preparer object for silent deployment, so that new incoming
# data is transformed with the original preparer used in this notebook rather
# than a freshly constructed one.
# Arguments are (object, directory, file name) as used throughout this notebook.
save_pickle(prep, './result', 'prep_ED_visit_clinic_anchored')

## Describe Data

In [None]:
# Per-split counts of sessions (rows of metainfo) and of unique patients (MRNs).
# GroupBy.size() is the direct row-count-per-group operation; it replaces
# apply(len, include_groups=False), whose keyword only exists in recent pandas.
sessions_per_split = metainfo.groupby('split').size()
patients_per_split = metainfo.groupby('split')['mrn'].nunique()
count = pd.DataFrame({
    'Number of sessions': sessions_per_split,
    'Number of patients': patients_per_split,
}).T
# Row-wise sum across splits. For patients this equals the overall number of
# unique patients only if no patient appears in more than one split
# (presumably guaranteed by the split assignment -- verify).
count['Total'] = count.sum(axis=1)
print(f'\n{count.to_string()}')

In [None]:
# Rows with no recorded treatment date are treated as first visits
no_trts_prior = df['treatment_date'].isnull()

# (targets, metadata) pairs for first visits, subsequent visits and everything
session_subsets = [
    (Y[no_trts_prior], metainfo[no_trts_prior]),
    (Y[~no_trts_prior], metainfo[~no_trts_prior]),
    (Y, metainfo),
]
# Label distribution counted per session, one column group per subset
pd.concat(
    [get_label_distribution(y, meta, with_respect_to='sessions') for y, meta in session_subsets],
    keys=['First Visit', 'Subsequent Visit', 'All'],
    axis=1,
)

In [None]:
# Same breakdown as the per-session table, but counted per patient.
# Relies on `no_trts_prior` defined in the previous cell.
patient_subsets = [
    (Y[no_trts_prior], metainfo[no_trts_prior]),
    (Y[~no_trts_prior], metainfo[~no_trts_prior]),
    (Y, metainfo),
]
pd.concat(
    [get_label_distribution(y, meta, with_respect_to='patients') for y, meta in patient_subsets],
    keys=['First Visit', 'Subsequent Visit', 'All'],
    axis=1,
)

In [None]:
# Feature characteristics of the training partition.
# One-hot encode the original (non-normalized, non-imputed) training rows.
raw_train = df.loc[X_train.index].copy()
x = prep.ohe.encode(raw_train, verbose=False)
# Keep feature columns only: drop metadata columns and target columns
feature_cols = [
    col for col in x.columns
    if col not in metainfo.columns and not col.startswith('target')
]
x = x[feature_cols]
# Write the full summary to CSV and display a reproducible 10-row sample
feature_summary(
    x, save_path='result/tables/feature_summary_ED_clinic_anchored.csv'
).sample(10, random_state=42)

# Traditional Training Pipeline

## Train Models

In [None]:
# LGBM does not like non alphanumeric characters (except for _), so each of
# ( ) + - / , in the feature names is mapped to an underscore in a single pass
underscore_table = str.maketrans('()+-/,', '______')
X_train.columns = X_train.columns.str.translate(underscore_table)
X_test.columns = X_test.columns.str.translate(underscore_table)

In [None]:
%%capture
# Hyperparameter tuning: search the best parameters for each algorithm on the
# ED_visit target using the training partition.
# TODO: try greater kappa for greater exploration
algs = ['LASSO', 'RF', 'Ridge', 'XGB', 'LGBM']
best_params = {}
for alg in algs:
    # Filled incrementally so finished algorithms stay available in
    # `best_params` if a later one fails
    best_params[alg] = tune_params(alg, X_train, Y_train['ED_visit'], metainfo_train)

# Canonical copy: this is the file the training cell loads
save_pickle(best_params, './models', 'best_params_clinic_anchored')
# Timestamped archive copy. A compact strftime stamp is used because
# str(datetime.now()) contains spaces and colons, which make awkward file
# names and are invalid on Windows file systems.
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
save_pickle(best_params, './models', f'best_params_clinic_anchored-{timestamp}')

In [None]:
# Reload the tuned hyperparameters from disk (the canonical file written by the
# tuning cell), so this cell can be run without repeating the search, then
# train the models with them.
# NOTE(review): the fold count in the trailing comment below conflicts with
# n_folds=5 passed to prep.prepare in the data preparation cell -- confirm
# which value is current.
best_params = load_pickle('./models', 'best_params_clinic_anchored')
models = train_models(X_train, Y_train, metainfo_train, best_params) # NOTE: Number of CV folds = 3

## Model Selection
Select the final model based on the average performance across the validation folds

In [None]:
# Evaluate every trained model on the training partition's validation folds;
# the result is the cell output used for model selection
# (presumably a per-algorithm score table -- verify against evaluate_valid).
evaluate_valid(models, X_train, Y_train, metainfo_train)

## Evaluate Model

In [None]:
# Score every trained algorithm on the held-out test set and show the results
# side by side (one column group per algorithm after the transpose)
test_scores = [evaluate_test(fitted, X_test, Y_test) for fitted in models.values()]
pd.concat(test_scores, keys=models.keys()).T

In [None]:
# Selected final model: XGB
model = models['XGB']
# Test rows with no recorded treatment date are treated as first visits
mask = metainfo_test['treatment_date'].isnull()
# Test performance split into first visits and subsequent visits
pd.concat(
    [evaluate_test(model, X_test[subset], Y_test[subset]) for subset in (mask, ~mask)],
    keys=['First Visit', 'Subsequent Visits'],
).T

In [None]:
# Find the prediction thresholds that achieve 10% and 20% alarm rates
# on the test-set predictions of the selected ED_visit model
pred = predict(model['ED_visit'], X_test)
res = [compute_threshold(pred, rate) for rate in (0.1, 0.2)]
pd.DataFrame(res, columns=['Prediction Threshold', 'Alarm Rate'])

In [None]:
# Persist the ED_visit estimator of the selected model (`model` is
# models['XGB'], assigned in the evaluation cell above) under ./models
save_pickle(model['ED_visit'], './models', 'XGB_ED_visit_clinic_anchored')

# Autogluon Training Pipeline

In [None]:
from autogluon.tabular import TabularPredictor
from preduce.ag.eval import evaluate
from preduce.ag.train import train_models

## Train Models

In [None]:
# NOTE: `train_models` here is the AutoGluon variant imported from
# preduce.ag.train in the cell above; it shadows the preduce.acu.train function
# of the same name used by the traditional pipeline, so re-running the earlier
# training cell after that import would call this one instead.
# This also rebinds `models`, replacing the traditional pipeline's models.
models = train_models(X_train, Y_train, metainfo_train, presets='medium_quality', eval_metric='roc_auc')

## Evaluate Model

In [None]:
# Quickly check the validation scores.
# Loads a previously saved predictor from a fixed run directory, which replaces
# whatever `models` held from the training cell.
predictor = TabularPredictor.load(
    './AutogluonModels/20250203_184425-ED_visit-medium-roc_auc',
    verbosity=0,
)
models = {'ED_visit': predictor}
# Top five leaderboard rows, model name and validation score only
leaderboard = predictor.leaderboard()
print(leaderboard.head(n=5)[['model', 'score_val']])

In [None]:
# Report the model AutoGluon selected as best for each target, then evaluate
# the predictors on the held-out test set.
# The loop variable is named `predictor` rather than `model` so that this cell
# does not overwrite the notebook-level `model` (models['XGB'] from the
# traditional pipeline), which the threshold and save cells read.
for target, predictor in models.items():
    print(f'Best model for {target}: {predictor.model_best}')
evaluate(models, X_test, Y_test)

In [None]:
# Clone the ED_visit predictor without all the extra files, writing the
# deployment copy to ./AutogluonModels/WE_ED_visit_clinic_anchored
models['ED_visit'].clone_for_deployment('./AutogluonModels/WE_ED_visit_clinic_anchored')