<a href="https://colab.research.google.com/github/philipp-lampert/mymandible/blob/main/data_science/05_model_training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Model training

## Import

### Libraries

In [1]:
import warnings
import numpy as np
import pandas as pd
import copy
from statistics import mean, stdev
from sklearn.preprocessing import QuantileTransformer
from sklearn.metrics import make_scorer, matthews_corrcoef, f1_score, accuracy_score, average_precision_score, roc_auc_score, brier_score_loss
from sklearn.model_selection import cross_validate, StratifiedKFold, RepeatedStratifiedKFold
from sklearn.linear_model import LogisticRegression 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
import optuna
from optuna.samplers import TPESampler
import shap
import statsmodels.formula.api as smf
import seaborn as sns
import matplotlib.pyplot as plt

### Data

In [2]:
# Load the two imputed feature tables used throughout the notebook.
# Presumably "dropped_first" is the dummy-encoded variant with the first level dropped and
# "all_levels" keeps every level — TODO confirm against the preprocessing notebook.
# NOTE(review): these are absolute paths on one developer's machine; the notebook cannot be
# re-run elsewhere (including via the Colab badge) until they are made relative/configurable.
df_dropped_first_imp = pd.read_parquet('/Users/philipp.lampert/repositories/mymandible/data/dropped_first_imputed.parquet')
df_all_levels_imp = pd.read_parquet('/Users/philipp.lampert/repositories/mymandible/data/all_levels_imputed.parquet')

## Pipeline

### Preprocessing

In [2]:
from modules.functions import preprocessing as prp

### Scoring metrics

In [5]:
from modules.functions import threshold_optimized_metrics as tom

In [6]:
# Scorer objects for model selection. All are built with needs_proba=True, so each metric
# receives predicted probabilities rather than hard class labels.
# The tom.optimized_* metrics come from modules.functions.threshold_optimized_metrics
# (presumably they choose the decision threshold themselves — verify in that module).
# NOTE(review): `needs_proba` is deprecated in recent scikit-learn releases in favour of
# `response_method="predict_proba"`; confirm against the scikit-learn version in use.
acc_scorer = make_scorer(tom.optimized_accuracy, needs_proba=True)
f1_scorer = make_scorer(tom.optimized_f1, needs_proba=True)
mcc_scorer = make_scorer(tom.optimized_mcc, needs_proba=True)
pr_auc_scorer = make_scorer(average_precision_score, needs_proba=True)

### Nested Cross-Validation

In [6]:
def objective(trial, classifier, x, y, n, scorer):
    """Optuna objective: mean inner-CV score of the estimator proposed for this trial.

    Args:
        trial: Optuna trial used to sample hyperparameters.
        classifier: Callable taking the trial and returning an unfitted estimator.
        x: Feature matrix of the current outer-training fold.
        y: Target of the current outer-training fold.
        n: Number of stratified inner folds.
        scorer: Scorer passed to ``cross_validate`` as ``scoring``.

    Returns:
        The mean of the per-fold test scores.
    """
    candidate = classifier(trial)
    # Fixed seed so every trial is evaluated on identical inner splits.
    folds = StratifiedKFold(n_splits=n, shuffle=True, random_state=0)
    cv_results = cross_validate(
        estimator=candidate,
        X=x,
        y=y,
        cv=folds,
        scoring=scorer,
        n_jobs=-1,
    )
    fold_scores = cv_results['test_score']
    return fold_scores.mean()

In [7]:
def nested_cv_optuna(outcome, 
                     model, 
                     min_follow_up_days, 
                     scaler, 
                     df, 
                     classifier, 
                     drop_cols, 
                     n=5,
                     scorer=None,
                     n_trials=500):
    """Nested cross-validation with an Optuna hyperparameter search in the inner loop.

    For every outer fold an Optuna study tunes the estimator returned by
    ``classifier(trial)`` using ``objective`` (inner stratified CV). A deep copy of
    ``model`` is then configured with the best parameters, fitted on the outer-training
    data and evaluated on the outer-test data. Mean ± SD of the outer-fold metrics are printed.

    Args:
        outcome: Name of the outcome column, forwarded to ``prp.get_x_y``.
        model: Unfitted estimator used as template for the outer-fold refit.
        min_follow_up_days: Forwarded to ``prp.get_x_y``.
        scaler: Forwarded to ``prp.get_x_y``.
        df: Source DataFrame, forwarded to ``prp.get_x_y``.
        classifier: Callable ``trial -> estimator`` defining the search space.
        drop_cols: Columns to exclude, forwarded to ``prp.get_x_y``.
        n: Number of folds for both the outer and the inner stratified CV.
            NOTE(review): the default of 5 was added because the calls in this notebook
            omit the argument; confirm it matches the intended setup.
        scorer: Scorer optimised in the inner loop. ``None`` selects the notebook-level
            ``mcc_scorer``. NOTE(review): assumed default — confirm the intended metric.
        n_trials: Number of Optuna trials per outer fold.

    Returns:
        Tuple ``(best_params, x, y)``: list with the best parameter dict of every outer
        fold, plus the feature matrix and target returned by ``prp.get_x_y``.
    """
    if scorer is None:
        scorer = mcc_scorer

    outer_cv = StratifiedKFold(n_splits=n, shuffle=True, random_state=0)
    x, y = prp.get_x_y(df=df, outcome=outcome, min_follow_up_days=min_follow_up_days, scaler=scaler, drop_cols=drop_cols)
    
    outer_scores = {
        'mcc': [],
        'brier': [],
        'f1': [],
        'acc': [],
        'pr_auc': [],
        'roc_auc': []
    }
    
    optuna.logging.set_verbosity(optuna.logging.WARNING)  
    # One seeded sampler shared by all studies keeps the whole run reproducible.
    sampler = TPESampler(seed=0)
    
    studies = {}
    best_params = []
    
    for i, (outer_train_index, outer_test_index) in enumerate(outer_cv.split(x, y)):
        
        x_train_outer, x_test_outer = x.iloc[outer_train_index], x.iloc[outer_test_index]
        y_train_outer, y_test_outer = y.iloc[outer_train_index], y.iloc[outer_test_index]
        
        # study_name is documented as a string, so the fold index is converted.
        studies[i] = optuna.create_study(direction='maximize', study_name=str(i), sampler=sampler)
        studies[i].optimize(
            # Bind the fold data as defaults so the callback cannot see a later fold's data.
            lambda trial, x_tr=x_train_outer, y_tr=y_train_outer: objective(trial, classifier, x_tr, y_tr, n, scorer),
            n_trials=n_trials
        )

        best_params.append(studies[i].best_params)
        # Deep copy so the caller's template estimator is never mutated or fitted.
        current_model = copy.deepcopy(model)
        current_model.set_params(**best_params[-1])
        
        current_model.fit(x_train_outer, y_train_outer)
        # Column 1 holds the probability of the positive class.
        y_pred_outer = current_model.predict_proba(x_test_outer)[:, 1]
        y_test_outer = y_test_outer.astype('int')
        
        # The threshold-optimised metrics live in the imported `tom` module.
        outer_scores['mcc'].append(tom.optimized_mcc(y_test_outer, y_pred_outer))
        outer_scores['pr_auc'].append(average_precision_score(y_test_outer, y_pred_outer))
        outer_scores['f1'].append(tom.optimized_f1(y_test_outer, y_pred_outer))
        outer_scores['acc'].append(tom.optimized_accuracy(y_test_outer, y_pred_outer))
        outer_scores['roc_auc'].append(roc_auc_score(y_test_outer, y_pred_outer))
        outer_scores['brier'].append(brier_score_loss(y_test_outer, y_pred_outer))
        
        print(f"Optimized {i+1} out of {n} models")
    
    print("")
    print("Mean MCC: "f"{mean(outer_scores['mcc']):.3f} ± {stdev(outer_scores['mcc']):.3f}")
    print("Mean PR AUC: "f"{mean(outer_scores['pr_auc']):.3f} ± {stdev(outer_scores['pr_auc']):.3f}")
    print("")    
    print("Mean F1: "f"{mean(outer_scores['f1']):.3f} ± {stdev(outer_scores['f1']):.3f}")              
    print("Mean ROC AUC: "f"{mean(outer_scores['roc_auc']):.3f} ± {stdev(outer_scores['roc_auc']):.3f}")
    print("Mean Accuracy: "f"{mean(outer_scores['acc']):.3f} ± {stdev(outer_scores['acc']):.3f}")
    print("Mean Brier: "f"{mean(outer_scores['brier']):.3f} ± {stdev(outer_scores['brier']):.3f}")
        
    return best_params, x, y

### SHAP-Values

In [11]:
def get_shap_rf(best_params, x, y):
    """Fit a random forest per parameter set and draw SHAP summary plots for the positive class.

    Args:
        best_params: Iterable of parameter dicts for ``RandomForestClassifier``.
        x: Feature matrix used both for fitting and for the SHAP explanation.
        y: Target used for fitting.

    Returns:
        None. Two plots (beeswarm and bar) are drawn per parameter set.
    """
    for params in best_params:
        # The tuned dicts only contain searched parameters; restore the fixed settings used
        # during tuning (500 trees, seed 0) unless the caller overrides them explicitly.
        model = RandomForestClassifier(**{'n_estimators': 500, 'random_state': 0, **params})
        model.fit(x, y)
        exp = shap.TreeExplainer(model)
        sv = exp.shap_values(x)
        # Older SHAP releases return a list with one array per class; newer ones return a
        # single (samples, features, classes) array. Select the positive class in both cases.
        if isinstance(sv, list):
            sv_positive = sv[1]
        else:
            sv = np.asarray(sv)
            sv_positive = sv[..., 1] if sv.ndim == 3 else sv
        shap.summary_plot(sv_positive, x, max_display=11)
        shap.summary_plot(sv_positive, x, max_display=11, plot_type='bar')

In [12]:
def get_shap_xgb(best_params, x, y):    
    """Fit an XGBoost classifier per parameter set and draw SHAP beeswarm and bar plots.

    Args:
        best_params: Iterable of parameter dicts for ``XGBClassifier``.
        x: Feature matrix used both for fitting and for the SHAP explanation.
        y: Target used for fitting.

    Returns:
        The SHAP explanation of the last parameter set, or ``None`` if ``best_params``
        is empty.
    """
    sv = None  # stays None when there is nothing to explain
    for params in best_params:
        # Restore the fixed settings used during tuning (seed 0, silent) unless overridden.
        model = XGBClassifier(**{'random_state': 0, 'verbosity': 0, **params})
        model.fit(x, y)
        exp = shap.TreeExplainer(model)
        sv = exp(x)
        shap.plots.beeswarm(sv, max_display=11)
        shap.plots.bar(sv, max_display=11)
    return sv

In [13]:
def get_avg_params(param_list):
    """Collapse several parameter dicts into one "average" parameter dict.

    Numeric parameters are aggregated with the median, all other parameters with the
    most frequent value.

    Args:
        param_list: List of parameter dicts (e.g. the per-fold best parameters).

    Returns:
        Dict mapping each parameter name to its aggregated value. Parameters that were
        integers in every input (and ``max_depth``) are returned as ``int``.
    """
    from collections import Counter  # local import: only needed by this helper

    df = pd.DataFrame(param_list)
    numeric = df.select_dtypes(include=['number'])
    avg_params = {}

    for col in numeric.columns:
        value = numeric[col].median()
        # The median of integer parameters is a float; estimators expect ints again.
        if col == 'max_depth' or pd.api.types.is_integer_dtype(numeric[col].dtype):
            value = int(value)
        avg_params[col] = value

    for col in df.columns:
        if col in avg_params:
            continue
        # None is a legitimate parameter value (e.g. class_weight=None), so it is counted
        # like any other value instead of being dropped as missing.
        values = [None if pd.isna(v) else v for v in df[col].tolist()]
        if not values:
            continue
        counts = Counter(values)
        top = max(counts.values())
        tied = [v for v, c in counts.items() if c == top and v is not None]
        if not tied:
            avg_params[col] = None
            continue
        try:
            # Ties are resolved towards the smallest value.
            avg_params[col] = min(tied)
        except TypeError:
            avg_params[col] = tied[0]

    return avg_params

In [14]:
def get_base_values(selected_sv):
    """Return the first base value of every explanation in ``selected_sv``.

    Args:
        selected_sv: Iterable of objects exposing an indexable ``base_values`` attribute.

    Returns:
        List with ``base_values[0]`` of each element, in input order.
    """
    return [explanation.base_values[0] for explanation in selected_sv]

## Model configuration

### Logistic Regression

In [15]:
def lr_newton_classifier(trial):
    """Build a Newton-solver logistic regression from hyperparameters sampled by Optuna.

    Search space: solver (newton-cg / newton-cholesky), C (log-uniform 1e-10..1e10),
    class_weight (balanced / None) and penalty (l2 only).

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        An unfitted ``LogisticRegression``.
    """
    solver_chosen = trial.suggest_categorical('solver', ['newton-cg', 'newton-cholesky'])
    C_chosen = trial.suggest_float('C', 1e-10, 1e10, log=True)
    class_weight_chosen = trial.suggest_categorical('class_weight', ['balanced', None])
    penalty_chosen = trial.suggest_categorical('penalty', ['l2'])
    
    classifier_obj = LogisticRegression(
        solver=solver_chosen, 
        C=C_chosen, 
        penalty=penalty_chosen, 
        class_weight=class_weight_chosen,
        # Same fixed settings as the LogisticRegression(max_iter=10000, random_state=0)
        # template refitted after tuning, so the inner CV scores the estimator that is
        # actually evaluated afterwards.
        max_iter=10000,
        random_state=0
    )
        
    return classifier_obj

In [16]:
def lr_liblinear_classifier(trial):
    """Build a liblinear logistic regression from hyperparameters sampled by Optuna.

    Search space: solver (liblinear only), C (log-uniform 1e-10..1e10),
    class_weight (balanced / None) and penalty (l1 / l2).

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        An unfitted ``LogisticRegression``.
    """
    solver_chosen = trial.suggest_categorical('solver', ['liblinear'])
    C_chosen = trial.suggest_float('C', 1e-10, 1e10, log=True)
    class_weight_chosen = trial.suggest_categorical('class_weight', ['balanced', None])
    penalty_chosen = trial.suggest_categorical('penalty', ['l1', 'l2'])
    
    classifier_obj = LogisticRegression(
        solver=solver_chosen, 
        C=C_chosen, 
        penalty=penalty_chosen, 
        class_weight=class_weight_chosen,
        # Same fixed settings as the LogisticRegression(max_iter=10000, random_state=0)
        # template refitted after tuning; the seed also makes the inner CV reproducible.
        max_iter=10000,
        random_state=0
    )
          
    return classifier_obj

In [17]:
def statsmodel(outcome, min_follow_up_days, scaler, df, drop_cols, cv=5):
    """Fit a statsmodels logit on all features, optionally with stratified cross-validation.

    When cross-validation is requested, a logit model is fitted per fold and the
    mean ± SD of the fold metrics is printed. Afterwards a final model is fitted on the
    complete data and its summary is printed.

    Args:
        outcome: Name of the outcome column; also the left-hand side of the formula.
        min_follow_up_days: Forwarded to ``prp.get_x_y``.
        scaler: Forwarded to ``prp.get_x_y``.
        df: Source DataFrame, forwarded to ``prp.get_x_y``.
        drop_cols: Columns to exclude, forwarded to ``prp.get_x_y``.
        cv: Number of stratified folds. ``True`` means 5 folds; ``False``, ``None`` or
            ``0`` skips cross-validation.

    Returns:
        None. Results are printed.
    """
    x, y = prp.get_x_y(df=df, outcome=outcome, min_follow_up_days=min_follow_up_days, scaler=scaler, drop_cols=drop_cols)
    # Cast booleans to int and everything numeric to float64 before building the formula.
    boolean_columns = x.select_dtypes(include=bool).columns
    x[boolean_columns] = x[boolean_columns].astype('int')
    numeric_columns = x.select_dtypes(include='number').columns
    x[numeric_columns] = x[numeric_columns].astype('float64')
    y = y.astype('int')    
    x_columns = x.columns
    all_columns = "+".join(x_columns)
    formula = outcome +  '~' + all_columns
    
    # Resolve the fold count; `cv` itself is left untouched so it is never confused
    # with the splitter object.
    if cv is True:
        n_splits = 5
    elif cv is None or cv is False:
        n_splits = 0
    else:
        n_splits = int(cv)

    if n_splits:
        splitter = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=0)
        scores = {
            'mcc': [],
            'brier': [],
            'f1': [],
            'acc': [],
            'pr_auc': [],
            'roc_auc': []
        }   

        for train_index, test_index in splitter.split(x, y):
            x_train, x_test = x.iloc[train_index], x.iloc[test_index]
            y_train, y_test = y.iloc[train_index], y.iloc[test_index]
            data_train = pd.concat([x_train, y_train], axis=1)

            model = smf.logit(formula=formula, data=data_train).fit() 
            y_pred = model.predict(x_test)

            # The threshold-optimised metrics live in the imported `tom` module.
            scores['mcc'].append(tom.optimized_mcc(y_test, y_pred))
            scores['f1'].append(tom.optimized_f1(y_test, y_pred))
            scores['acc'].append(tom.optimized_accuracy(y_test, y_pred))
            scores['pr_auc'].append(average_precision_score(y_test, y_pred))
            scores['roc_auc'].append(roc_auc_score(y_test, y_pred))
            scores['brier'].append(brier_score_loss(y_test, y_pred))

        print("")
        print("Mean MCC: "f"{mean(scores['mcc']):.3f} ± {stdev(scores['mcc']):.3f}")
        print("Mean PR AUC: "f"{mean(scores['pr_auc']):.3f} ± {stdev(scores['pr_auc']):.3f}")
        print("")
        print("Mean F1: "f"{mean(scores['f1']):.3f} ± {stdev(scores['f1']):.3f}")     
        print("Mean Accuracy: "f"{mean(scores['acc']):.3f} ± {stdev(scores['acc']):.3f}")   
        print("Mean ROC AUC: "f"{mean(scores['roc_auc']):.3f} ± {stdev(scores['roc_auc']):.3f}")
        print("Mean Brier: "f"{mean(scores['brier']):.3f} ± {stdev(scores['brier']):.3f}")
        print("")
    
    data = pd.concat([x, y], axis=1)
    final_model = smf.logit(formula, data).fit()
    print(final_model.summary())

### kNN

In [18]:
def knn_classifier(trial):
    """Build a k-nearest-neighbours classifier from hyperparameters sampled by Optuna.

    Search space: n_neighbors (2..25), weights, algorithm, leaf_size (log-uniform
    integer 1..1e6) and the Minkowski exponent p (log-uniform 1..1e6).

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        An unfitted ``KNeighborsClassifier``.
    """
    n_neighbors_chosen = trial.suggest_int('n_neighbors', 2, 25)
    weights_chosen = trial.suggest_categorical('weights', ['uniform', 'distance'])
    algorithm_chosen = trial.suggest_categorical('algorithm', ['ball_tree', 'kd_tree', 'brute'])
    # suggest_int is documented with integer bounds, so pass an int rather than the float 1e+6.
    leaf_size_chosen = trial.suggest_int('leaf_size', 1, 10**6, log=True)
    p_chosen = trial.suggest_float('p', 1, 1e+6, log=True)
    
    classifier_obj = KNeighborsClassifier(
        n_neighbors=n_neighbors_chosen, 
        weights=weights_chosen, 
        algorithm=algorithm_chosen, 
        leaf_size=leaf_size_chosen, 
        p=p_chosen
    )
        
    return classifier_obj

### Random Forest

In [19]:
def rf_classifier(trial):
    """Build a random forest from hyperparameters sampled by Optuna.

    The number of trees is fixed at 500 and the seed at 0; every other listed
    parameter is sampled from the trial.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        An unfitted ``RandomForestClassifier``.
    """
    # The dict is evaluated top to bottom, so the sampling order is fixed.
    sampled = {
        'criterion': trial.suggest_categorical('criterion', ['gini', 'entropy', 'log_loss']),
        'max_depth': trial.suggest_int('max_depth', 1, 10),
        'min_samples_split': trial.suggest_float('min_samples_split', 1e-6, 1),
        'min_samples_leaf': trial.suggest_float('min_samples_leaf', 1e-6, 1),
        'min_weight_fraction_leaf': trial.suggest_float('min_weight_fraction_leaf', 0, 0.5),
        'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2', None]),
        'bootstrap': trial.suggest_categorical('bootstrap', [True, False]),
        'class_weight': trial.suggest_categorical('class_weight', ['balanced', 'balanced_subsample', None]),
    }

    return RandomForestClassifier(n_estimators=500, random_state=0, **sampled)

### XGBoost

In [20]:
def xgb_classifier(trial):
    """Build an XGBoost classifier from hyperparameters sampled by Optuna.

    The seed is fixed at 0 and verbosity at 0; every other listed parameter is
    sampled from the trial.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        An unfitted ``XGBClassifier``.
    """
    # The dict is evaluated top to bottom, so the sampling order is fixed.
    sampled = {
        'eta': trial.suggest_float('eta', 1e-6, 1, log=True),
        'gamma': trial.suggest_float('gamma', 1e-6, 1e+4, log=True),
        'max_depth': trial.suggest_int('max_depth', 1, 10),
        'min_child_weight': trial.suggest_float('min_child_weight', 1e-6, 1e+4, log=True),
        'max_delta_step': trial.suggest_float('max_delta_step', 0, 10),
        'subsample': trial.suggest_float('subsample', 0, 1),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0, 1),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-6, 1e+4, log=True),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-6, 1e+4, log=True),
        'tree_method': trial.suggest_categorical('tree_method', ['exact', 'approx', 'hist']),
        'scale_pos_weight': trial.suggest_float('scale_pos_weight', 1, 9),
    }

    return XGBClassifier(random_state=0, verbosity=0, **sampled)

## Results

#### Any complication

##### Configuration

In [21]:
# Minimum follow-up for the composite "any complication" outcome: the mean of the
# per-complication median time-to-event columns listed below.
_any_cx_time_columns = [
    'days_to_whd_recipient_site',
    'days_to_partial_necrosis',
    'days_to_bone_exposure',
    'days_to_plate_exposure',
    'days_to_wound_infection',
    'days_to_flap_loss',
    'days_to_whd_donor_site',
    'days_to_osteoradionecrosis',
    'days_to_plate_removal',
    'days_to_plate_loosening',
    'days_to_fracture',
    'days_to_dislocation',
]
follow_up_any_cx = np.mean(
    [df_all_levels_imp[column].median() for column in _any_cx_time_columns]
)

# No feature is excluded for this outcome.
drop_cols_any_cx = []

##### Logistic Regression
Mean MCC: 0.281, Mean PR AUC: 0.328

###### Statsmodel
Mean MCC: 0.281, Mean PR AUC: 0.328

In [30]:
# 3 segments only
def lr_any_cx_no_cv():
    """Fit and print a statsmodels logit for `any_complication` without cross-validation.

    Uses `df_dropped_first_3` with a reduced feature set (the base drop list plus the
    columns listed below).
    NOTE(review): `df_dropped_first_3` is not created anywhere in the visible part of
    this notebook — presumably the 3-segment subset of `df_dropped_first_imp`; define it
    before running this cell.
    """
    drop_cols_any_cx_3 = drop_cols_any_cx.copy()
    drop_cols_any_cx_3.extend([
        'comorbidity___autoimmune_disease', 
        'comorbidity___hypothyroidism', 
        'comorbidity___copd', 
        'radiotherapy___pre_surgery',
        'chemotherapy___pre_surgery',
        'comorbidity___hypertension',
        'comorbidity___chronic_kidney_disease',
        'prior_flap___non_bony', 
        'prior_flap___bony',
        'venous_anastomosis_type___end_end',
        'venous_anastomosis_type___end_side',
        'skin_transplanted',
        'surgery_duration_min',
        'urkens_classification___c'
    ])
    
    # get_x_y is only available through the imported preprocessing module (`prp`).
    x, y = prp.get_x_y(
        df=df_dropped_first_3, 
        outcome='any_complication', 
        min_follow_up_days=follow_up_any_cx, 
        scaler='None', 
        drop_cols=drop_cols_any_cx_3
    )

    # Cast booleans to int and everything numeric to float64 before building the formula.
    boolean_columns = x.select_dtypes(include=bool).columns
    x[boolean_columns] = x[boolean_columns].astype('int')
    numeric_columns = x.select_dtypes(include='number').columns
    x[numeric_columns] = x[numeric_columns].astype('float64')
    y = y.astype('int')

    x_columns = x.columns
    all_columns = "+".join(x_columns)
    formula = 'any_complication' +  '~' + all_columns

    data = pd.concat([x, y], axis=1)
    final_model = smf.logit(formula, data).fit()
    print(final_model.summary())
    
lr_any_cx_no_cv()

         Current function value: 0.244284
         Iterations: 35
                           Logit Regression Results                           
Dep. Variable:       any_complication   No. Observations:                   62
Model:                          Logit   Df Residuals:                       47
Method:                           MLE   Df Model:                           14
Date:                Wed, 14 Feb 2024   Pseudo R-squ.:                  0.4774
Time:                        16:09:42   Log-Likelihood:                -15.146
converged:                      False   LL-Null:                       -28.982
Covariance Type:            nonrobust   LLR p-value:                   0.01572
                                            coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------------------------------
Intercept                               -21.4872      7.797     -2.756      0.006     -36.

The outcome variable has been inversed due to the positive values being the majority class. This may lead to misinterpretation of coefficients and/or feature importances but keeps metrics such as F1-scores comparable.
Maximum Likelihood optimization failed to converge. Check mle_retvals


In [36]:
# Statsmodels logistic regression for any complication (5-fold CV requested, then final fit).
# The function defined in this notebook is `statsmodel`; `lr_statsmodels` does not exist.
statsmodel(
    outcome='any_complication', 
    min_follow_up_days=follow_up_any_cx, 
    scaler='None',
    df=df_dropped_first_imp, 
    drop_cols=drop_cols_any_cx,
    cv=5
)

The outcome variable has been inversed due to the positive values being the majority class. This may lead to misinterpretation of coefficients and/or feature importances but keeps metrics such as F1-scores comparable.


Optimization terminated successfully.
         Current function value: 0.420781
         Iterations 7
         Current function value: 0.387633
         Iterations: 35


Maximum Likelihood optimization failed to converge. Check mle_retvals
Maximum Likelihood optimization failed to converge. Check mle_retvals


         Current function value: 0.449589
         Iterations: 35
Optimization terminated successfully.
         Current function value: 0.426781
         Iterations 8
Optimization terminated successfully.
         Current function value: 0.405215
         Iterations 8

Mean MCC: 0.281 ± 0.138
Mean PR AUC: 0.328 ± 0.081

Mean F1: 0.389 ± 0.074
Mean Accuracy: 0.793 ± 0.025
Mean ROC AUC: 0.501 ± 0.097

Optimization terminated successfully.
         Current function value: 0.445292
         Iterations 7
                           Logit Regression Results                           
Dep. Variable:       any_complication   No. Observations:                  237
Model:                          Logit   Df Residuals:                      204
Method:                           MLE   Df Model:                           32
Date:                Wed, 14 Feb 2024   Pseudo R-squ.:                  0.1621
Time:                        16:12:35   Log-Likelihood:                -105.53
converged:          

###### Newton
Mean MCC: 0.235, Mean PR AUC: 0.331

In [None]:
# Nested CV + Optuna tuning of logistic regression (newton-cholesky / newton-cg search space)
# for any complication, unscaled, on the dropped-first data.
# Keeps the per-fold best parameters and the x / y returned by nested_cv_optuna.
params_newton_any_cx, x_newton_any_cx, y_newton_any_cx = nested_cv_optuna(
    outcome='any_complication', 
    model=LogisticRegression(max_iter=10000, random_state=0),
    min_follow_up_days=follow_up_any_cx, 
    scaler='None',
    df=df_dropped_first_imp, 
    classifier=lr_newton_classifier,
    drop_cols=drop_cols_any_cx
)

###### Liblinear
Mean MCC: 0.203, Mean PR AUC: 0.306

In [None]:
# Nested CV + Optuna tuning of logistic regression (liblinear search space, l1 / l2)
# for any complication, unscaled, on the dropped-first data.
params_liblinear_any_cx, x_liblinear_any_cx, y_liblinear_any_cx = nested_cv_optuna(
    outcome='any_complication', 
    model=LogisticRegression(max_iter=10000, random_state=0),
    min_follow_up_days=follow_up_any_cx, 
    scaler='None', 
    df=df_dropped_first_imp, 
    classifier=lr_liblinear_classifier,
    drop_cols=drop_cols_any_cx
)

##### kNN
Mean MCC: 0.257, Mean PR AUC: 0.343

In [None]:
# Does not converge
#params_knn_any_cx, x_knn_any_cx, y_knn_any_cx = nested_cv_optuna(
#    outcome='any_complication', 
#    model=KNeighborsClassifier(),
#    min_follow_up_days=follow_up_any_cx, 
#    scaler=QuantileTransformer(n_quantiles=200, random_state=0), 
#    df=df_all_levels_imp, 
#    classifier=knn_classifier,
#    drop_cols=drop_cols_any_cx
#)

##### Random Forest
Mean MCC: 0.262, Mean PR AUC: 0.362

In [None]:
# Nested CV + Optuna tuning of a random forest for any complication,
# unscaled, on the all-levels data.
params_rf_any_cx, x_rf_any_cx, y_rf_any_cx = nested_cv_optuna(
    outcome='any_complication', 
    model=RandomForestClassifier(n_estimators=500, random_state=0),
    min_follow_up_days=follow_up_any_cx, 
    scaler='None', 
    df=df_all_levels_imp, 
    classifier=rf_classifier,
    drop_cols=drop_cols_any_cx
)

In [None]:
# Aggregate the per-fold best parameters into one set and plot SHAP values for a
# random forest refitted with them on the full x / y.
avg_params_rf_any_cx = get_avg_params(params_rf_any_cx)
get_shap_rf([avg_params_rf_any_cx], x_rf_any_cx, y_rf_any_cx)

##### XGBoost
Mean MCC: 0.183, Mean PR AUC: 0.288

In [None]:
# Nested CV + Optuna tuning of XGBoost for any complication, unscaled, on the all-levels data.
# NOTE(review): unlike the other XGBoost cells, the template here is XGBClassifier() without
# random_state=0 / verbosity=0 — confirm whether that is intentional.
params_xgb_any_cx, x_xgb_any_cx, y_xgb_any_cx = nested_cv_optuna(
    outcome='any_complication', 
    model=XGBClassifier(),
    min_follow_up_days=follow_up_any_cx, 
    scaler='None', 
    df=df_all_levels_imp, 
    classifier=xgb_classifier,
    drop_cols=drop_cols_any_cx
)

In [None]:
# Aggregate the per-fold best parameters into one set and plot SHAP values for an
# XGBoost model refitted with them on the full x / y.
avg_params_xgb_any_cx = get_avg_params(params_xgb_any_cx)
get_shap_xgb([avg_params_xgb_any_cx], x_xgb_any_cx, y_xgb_any_cx)

#### Plate exposure

##### Configuration

In [32]:
# Minimum follow-up for the plate-exposure outcome: median of the days-to-plate-exposure column.
follow_up_plate_exp = df_all_levels_imp['days_to_plate_exposure'].median()
# Feature columns excluded from every plate-exposure model (passed as drop_cols).
drop_cols_plate_exp = ['venous_anastomosis_type___end_side', 'venous_anastomosis_type___end_end']

##### Logistic Regression
Mean MCC: 0.360, Mean PR AUC: 0.371

###### Statsmodel
Mean MCC: 0.362, Mean PR AUC: 0.384

In [None]:
def lr_plate_exp_no_cv():
    """Fit and print a statsmodels logit for plate exposure without cross-validation.

    Uses `df_dropped_first_3` with a reduced feature set (the base drop list plus the
    columns listed below).
    NOTE(review): `df_dropped_first_3` is not created anywhere in the visible part of
    this notebook — presumably the 3-segment subset of `df_dropped_first_imp`; define it
    before running this cell.
    """
    drop_cols_plate_exp_3 = drop_cols_plate_exp.copy()
    drop_cols_plate_exp_3.extend([
        'comorbidity___autoimmune_disease', 
        'comorbidity___hypothyroidism', 
        'comorbidity___copd', 
        'comorbidity___hyperlipidemia',
        'radiotherapy___pre_surgery',
        'chemotherapy___pre_surgery',
        'comorbidity___hypertension',
        'comorbidity___chronic_kidney_disease',
        'prior_flap___non_bony', 
        'prior_flap___bony'
    ])
    
    # get_x_y is only available through the imported preprocessing module (`prp`).
    x, y = prp.get_x_y(
        df=df_dropped_first_3, 
        outcome='complication_plate___exposure', 
        min_follow_up_days=follow_up_plate_exp, 
        scaler='None', 
        drop_cols=drop_cols_plate_exp_3
    )

    # Cast booleans to int and everything numeric to float64 before building the formula.
    boolean_columns = x.select_dtypes(include=bool).columns
    x[boolean_columns] = x[boolean_columns].astype('int')
    numeric_columns = x.select_dtypes(include='number').columns
    x[numeric_columns] = x[numeric_columns].astype('float64')
    y = y.astype('int')

    x_columns = x.columns
    all_columns = "+".join(x_columns)
    formula = 'complication_plate___exposure' +  '~' + all_columns

    data = pd.concat([x, y], axis=1)
    final_model = smf.logit(formula, data).fit()
    print(final_model.summary())
    
lr_plate_exp_no_cv()

In [None]:
# Statsmodels logistic regression for plate exposure (5-fold CV requested, then final fit).
# The function defined in this notebook is `statsmodel`; `lr_statsmodels` does not exist.
statsmodel(
    outcome='complication_plate___exposure', 
    min_follow_up_days=follow_up_plate_exp, 
    scaler='None',
    df=df_dropped_first_imp, 
    drop_cols=drop_cols_plate_exp,
    cv=5
)

###### Newton
Mean MCC: 0.352, Mean PR AUC: 0.384

In [None]:
# Nested CV + Optuna tuning of logistic regression (newton-cholesky / newton-cg search space)
# for plate exposure on the dropped-first data. This run uses a QuantileTransformer as scaler;
# the unscaled alternative is kept commented out below.
params_newton_plate_exp, x_newton_plate_exp, y_newton_plate_exp = nested_cv_optuna(
    outcome='complication_plate___exposure', 
    model=LogisticRegression(max_iter=10000, random_state=0),
    min_follow_up_days=follow_up_plate_exp, 
    #scaler='None',
    scaler=QuantileTransformer(random_state=0),
    df=df_dropped_first_imp, 
    classifier=lr_newton_classifier,
    drop_cols=drop_cols_plate_exp
)

###### Liblinear
Mean MCC: 0.321, Mean PR AUC: 0.335

In [None]:
# Nested CV + Optuna tuning of logistic regression (liblinear search space, l1 / l2)
# for plate exposure, unscaled, on the dropped-first data.
params_liblinear_plate_exp, x_liblinear_plate_exp, y_liblinear_plate_exp = nested_cv_optuna(
    outcome='complication_plate___exposure', 
    model=LogisticRegression(max_iter=10000, random_state=0),
    min_follow_up_days=follow_up_plate_exp, 
    scaler='None',
    df=df_dropped_first_imp, 
    classifier=lr_liblinear_classifier,
    drop_cols=drop_cols_plate_exp
)

##### kNN
Mean MCC: 0.326, Mean PR AUC: 0.379

In [None]:
# Nested CV + Optuna tuning of kNN for plate exposure on the all-levels data,
# with features passed through a QuantileTransformer (200 quantiles).
params_knn_plate_exp, x_knn_plate_exp, y_knn_plate_exp = nested_cv_optuna(
    outcome='complication_plate___exposure', 
    model=KNeighborsClassifier(),
    min_follow_up_days=follow_up_plate_exp, 
    scaler=QuantileTransformer(n_quantiles=200, random_state=0), 
    df=df_all_levels_imp, 
    classifier=knn_classifier,
    drop_cols=drop_cols_plate_exp
)

##### Random Forest
Mean MCC: 0.315, Mean PR AUC: 0.390

In [None]:
# Nested CV + Optuna tuning of a random forest for plate exposure,
# unscaled, on the all-levels data.
params_rf_plate_exp, x_rf_plate_exp, y_rf_plate_exp = nested_cv_optuna(
    outcome='complication_plate___exposure', 
    model=RandomForestClassifier(n_estimators=500, random_state=0),
    min_follow_up_days=follow_up_plate_exp, 
    scaler='None', 
    df=df_all_levels_imp, 
    classifier=rf_classifier,
    drop_cols=drop_cols_plate_exp
)

In [None]:
# Aggregate the per-fold best parameters into one set and plot SHAP values for a
# random forest refitted with them on the full x / y.
avg_params_rf_plate_exp = get_avg_params(params_rf_plate_exp)
get_shap_rf([avg_params_rf_plate_exp], x_rf_plate_exp, y_rf_plate_exp)

##### XGBoost

Mean MCC: 0.348, Mean PR AUC: 0.394

In [None]:
# Nested CV + Optuna tuning of XGBoost for plate exposure, unscaled, on the all-levels data.
params_xgb_plate_exp, x_xgb_plate_exp, y_xgb_plate_exp = nested_cv_optuna(
    outcome='complication_plate___exposure', 
    model=XGBClassifier(random_state=0, verbosity=0),
    min_follow_up_days=follow_up_plate_exp, 
    scaler='None',
    df=df_all_levels_imp, 
    classifier=xgb_classifier,
    drop_cols=drop_cols_plate_exp
)

In [None]:
# Aggregate the per-fold best parameters, refit XGBoost on the full x / y, plot SHAP values
# and keep the returned SHAP explanation for later use.
avg_params_xgb_plate_exp = get_avg_params(params_xgb_plate_exp)
sv_xgb_plate_exp = get_shap_xgb([avg_params_xgb_plate_exp], x_xgb_plate_exp, y_xgb_plate_exp)

#### Bone exposure

##### Configuration

In [None]:
# Minimum follow-up for the bone-exposure outcome: median of the days-to-bone-exposure column.
follow_up_bone_exp = df_all_levels_imp['days_to_bone_exposure'].median()
# Feature columns excluded from every bone-exposure model (passed as drop_cols).
drop_cols_bone_exp = ['venous_anastomosis_type___end_side', 'venous_anastomosis_type___end_end']

##### Logistic Regression

###### Statsmodels

In [None]:
# Statsmodels logistic regression for bone exposure (5-fold CV requested, then final fit).
# The function defined in this notebook is `statsmodel`; `lr_statsmodels` does not exist.
statsmodel(
    outcome='complication___bone_exposure', 
    min_follow_up_days=follow_up_bone_exp, 
    scaler='None',
    df=df_dropped_first_imp, 
    drop_cols=drop_cols_bone_exp,
    cv=5
)

###### Newton

In [None]:
# Nested CV + Optuna tuning of logistic regression (newton-cholesky / newton-cg search space)
# for bone exposure, unscaled, on the dropped-first data.
params_newton_bone_exp, x_newton_bone_exp, y_newton_bone_exp = nested_cv_optuna(
    outcome='complication___bone_exposure', 
    model=LogisticRegression(max_iter=10000, random_state=0),
    min_follow_up_days=follow_up_bone_exp, 
    scaler='None',
    df=df_dropped_first_imp, 
    classifier=lr_newton_classifier,
    drop_cols=drop_cols_bone_exp
)

###### Liblinear

In [None]:
# Nested CV + Optuna tuning of logistic regression (liblinear search space, l1 / l2)
# for bone exposure, unscaled, on the dropped-first data.
params_liblinear_bone_exp, x_liblinear_bone_exp, y_liblinear_bone_exp = nested_cv_optuna(
    outcome='complication___bone_exposure', 
    model=LogisticRegression(max_iter=10000, random_state=0),
    min_follow_up_days=follow_up_bone_exp, 
    scaler='None',
    df=df_dropped_first_imp, 
    classifier=lr_liblinear_classifier,
    drop_cols=drop_cols_bone_exp
)

##### kNN

In [None]:
# Nested CV + Optuna tuning of kNN for bone exposure on the all-levels data,
# with features passed through a QuantileTransformer (200 quantiles).
params_knn_bone_exp, x_knn_bone_exp, y_knn_bone_exp = nested_cv_optuna(
    outcome='complication___bone_exposure', 
    model=KNeighborsClassifier(),
    min_follow_up_days=follow_up_bone_exp, 
    scaler=QuantileTransformer(n_quantiles=200, random_state=0), 
    df=df_all_levels_imp, 
    classifier=knn_classifier,
    drop_cols=drop_cols_bone_exp
)

##### Random Forest

In [None]:
# Nested CV + Optuna tuning of a random forest for bone exposure,
# unscaled, on the all-levels data.
params_rf_bone_exp, x_rf_bone_exp, y_rf_bone_exp = nested_cv_optuna(
    outcome='complication___bone_exposure', 
    model=RandomForestClassifier(n_estimators=500, random_state=0),
    min_follow_up_days=follow_up_bone_exp, 
    scaler='None', 
    df=df_all_levels_imp, 
    classifier=rf_classifier,
    drop_cols=drop_cols_bone_exp
)

##### XGBoost

In [None]:
# Nested CV + Optuna tuning of XGBoost for bone exposure, unscaled, on the all-levels data.
params_xgb_bone_exp, x_xgb_bone_exp, y_xgb_bone_exp = nested_cv_optuna(
    outcome='complication___bone_exposure', 
    model=XGBClassifier(random_state=0, verbosity=0),
    min_follow_up_days=follow_up_bone_exp, 
    scaler='None',
    df=df_all_levels_imp, 
    classifier=xgb_classifier,
    drop_cols=drop_cols_bone_exp
)

#### ORN redefined

##### Configuration

For this analysis, ORN is redefined as either documented osteoradionecrosis or bone exposure in a patient who received post-operative radiotherapy (RT).

In [None]:
# Minimum follow-up for the ORN outcome: median of the days-to-osteoradionecrosis column.
follow_up_orn = df_all_levels_imp['days_to_osteoradionecrosis'].median()
# Feature columns excluded from every ORN model (passed as drop_cols).
drop_cols_orn = ['venous_anastomosis_type___end_side', 'venous_anastomosis_type___end_end']

##### Logistic Regression

###### Statsmodel

In [None]:
# 3 segments only
def lr_orn_no_cv_3():
    """Fit and print a statsmodels logit for osteoradionecrosis without cross-validation.

    Uses `df_dropped_first_3` with a reduced feature set (the base drop list plus the
    columns listed below). The outcome here is the original
    `complication___osteoradionecrosis` column, not `orn_redefined`.
    NOTE(review): `df_dropped_first_3` is not created anywhere in the visible part of
    this notebook — presumably the 3-segment subset of `df_dropped_first_imp`; define it
    before running this cell.
    """
    drop_cols_orn_3 = drop_cols_orn.copy()
    drop_cols_orn_3.extend([
        'comorbidity___autoimmune_disease', 
        'comorbidity___hypothyroidism', 
        'comorbidity___copd', 
        'comorbidity___hyperlipidemia',
        'radiotherapy___pre_surgery',
        'chemotherapy___pre_surgery',
        'comorbidity___hypertension',
        'comorbidity___chronic_kidney_disease',
        'prior_flap___non_bony', 
        'prior_flap___bony'
    ])
    
    # get_x_y is only available through the imported preprocessing module (`prp`).
    x, y = prp.get_x_y(
        df=df_dropped_first_3, 
        outcome='complication___osteoradionecrosis', 
        min_follow_up_days=follow_up_orn, 
        scaler='None', 
        drop_cols=drop_cols_orn_3
    )

    # Cast booleans to int and everything numeric to float64 before building the formula.
    boolean_columns = x.select_dtypes(include=bool).columns
    x[boolean_columns] = x[boolean_columns].astype('int')
    numeric_columns = x.select_dtypes(include='number').columns
    x[numeric_columns] = x[numeric_columns].astype('float64')
    y = y.astype('int')

    x_columns = x.columns
    all_columns = "+".join(x_columns)
    formula = 'complication___osteoradionecrosis' +  '~' + all_columns

    data = pd.concat([x, y], axis=1)
    final_model = smf.logit(formula, data).fit()
    print(final_model.summary())
    
lr_orn_no_cv_3()

In [None]:
#lr_statsmodels(
#    outcome='orn_redefined', 
#    min_follow_up_days=follow_up_orn, 
#    scaler='None',
#    df=df_dropped_first_imp, 
#    drop_cols=drop_cols_orn
#)

# DID NOT CONVERGE

In [None]:
# Class balance of the plate-fracture complication column.
# NOTE(review): this sits in the ORN section but inspects plate fracture — confirm it belongs here.
df_all_levels_imp['complication_plate___fracture'].value_counts()

In [None]:
def lr_orn_no_cv():
    """Fit and print a statsmodels logit for `orn_redefined` without cross-validation.

    Uses the complete dropped-first data with the ORN drop list.
    """
    # get_x_y is only available through the imported preprocessing module (`prp`).
    x, y = prp.get_x_y(
        df=df_dropped_first_imp, 
        outcome='orn_redefined', 
        min_follow_up_days=follow_up_orn, 
        scaler='None', 
        drop_cols=drop_cols_orn
    )

    # Cast booleans to int and everything numeric to float64 before building the formula.
    boolean_columns = x.select_dtypes(include=bool).columns
    x[boolean_columns] = x[boolean_columns].astype('int')
    numeric_columns = x.select_dtypes(include='number').columns
    x[numeric_columns] = x[numeric_columns].astype('float64')
    y = y.astype('int')

    x_columns = x.columns
    all_columns = "+".join(x_columns)
    formula = 'orn_redefined' +  '~' + all_columns

    data = pd.concat([x, y], axis=1)
    final_model = smf.logit(formula, data).fit()
    print(final_model.summary())
    
lr_orn_no_cv()

###### Newton

In [None]:
# Nested CV + Optuna tuning of logistic regression (newton-cholesky / newton-cg search space)
# for the redefined ORN outcome on the dropped-first data, scaled with a QuantileTransformer.
params_newton_orn, x_newton_orn, y_newton_orn = nested_cv_optuna(
    outcome='orn_redefined', 
    model=LogisticRegression(max_iter=10000, random_state=0),
    min_follow_up_days=follow_up_orn, 
    scaler=QuantileTransformer(random_state=0),
    df=df_dropped_first_imp, 
    classifier=lr_newton_classifier,
    drop_cols=drop_cols_orn
)

###### Liblinear

In [None]:
# Nested CV + Optuna tuning of logistic regression (liblinear search space, l1 / l2)
# for the redefined ORN outcome on the dropped-first data, scaled with a QuantileTransformer.
params_liblinear_orn, x_liblinear_orn, y_liblinear_orn = nested_cv_optuna(
    outcome='orn_redefined', 
    model=LogisticRegression(max_iter=10000, random_state=0),
    min_follow_up_days=follow_up_orn, 
    scaler=QuantileTransformer(random_state=0),
    df=df_dropped_first_imp, 
    classifier=lr_liblinear_classifier,
    drop_cols=drop_cols_orn
)

##### kNN

In [None]:
# Nested CV + Optuna tuning of kNN for the redefined ORN outcome on the all-levels data,
# with features passed through a QuantileTransformer (200 quantiles).
params_knn_orn, x_knn_orn, y_knn_orn = nested_cv_optuna(
    outcome='orn_redefined', 
    model=KNeighborsClassifier(),
    min_follow_up_days=follow_up_orn, 
    scaler=QuantileTransformer(n_quantiles=200, random_state=0), 
    df=df_all_levels_imp, 
    classifier=knn_classifier,
    drop_cols=drop_cols_orn
)

##### Random Forest

In [None]:
# Nested CV + Optuna tuning of a random forest for the redefined ORN outcome,
# unscaled, on the all-levels data.
params_rf_orn, x_rf_orn, y_rf_orn = nested_cv_optuna(
    outcome='orn_redefined', 
    model=RandomForestClassifier(n_estimators=500, random_state=0),
    min_follow_up_days=follow_up_orn, 
    scaler='None', 
    df=df_all_levels_imp, 
    classifier=rf_classifier,
    drop_cols=drop_cols_orn
)

In [None]:
# Aggregate the per-fold best parameters into one set and plot SHAP values for a
# random forest refitted with them on the full x / y.
avg_params_rf_orn = get_avg_params(params_rf_orn)
get_shap_rf([avg_params_rf_orn], x_rf_orn, y_rf_orn)

##### XGBoost

In [None]:
# Nested CV + Optuna tuning of XGBoost for the redefined ORN outcome,
# unscaled, on the all-levels data.
params_xgb_orn, x_xgb_orn, y_xgb_orn = nested_cv_optuna(
    outcome='orn_redefined', 
    model=XGBClassifier(random_state=0, verbosity=0),
    min_follow_up_days=follow_up_orn, 
    scaler='None',
    df=df_all_levels_imp, 
    classifier=xgb_classifier,
    drop_cols=drop_cols_orn
)

In [None]:
# Aggregate the per-fold best parameters, refit XGBoost on the full x / y, plot SHAP values
# and keep the returned SHAP explanation for later use.
avg_params_xgb_orn = get_avg_params(params_xgb_orn)
sv_xgb_orn = get_shap_xgb([avg_params_xgb_orn], x_xgb_orn, y_xgb_orn)

#### Nonunion

##### Configuration

In [None]:
# Nonunion configuration: minimum follow-up (passed as min_follow_up_days) and the feature
# column excluded from every nonunion model (passed as drop_cols).
follow_up_nonunion = 0 
drop_cols_nonunion = ['skin_transplanted']

# No minimum follow-up is applied: patients without imaging data have NaN in the
# nonunion column and are therefore excluded anyway.

##### Logistic Regression
Mean MCC: 0.329, Mean PR AUC: 0.590

###### Statsmodel
Mean MCC: 0.294, Mean PR AUC: 0.566

In [None]:
# Keep only rows whose indication is neither osteoradionecrosis nor secondary reconstruction,
# drop those two (now constant) indicator columns, and show the nonunion class balance.
# NOTE(review): `df_dropped_first_3` is not created anywhere in the visible part of this
# notebook; it must be defined before this cell can run.
df_dropped_first_tumor_3 = df_dropped_first_3[(df_dropped_first_3['indication___osteoradionecrosis'] == False) & (df_dropped_first_3['indication___secondary_reconstruction'] == False)]
df_dropped_first_tumor_3 = df_dropped_first_tumor_3.drop(['indication___osteoradionecrosis', 'indication___secondary_reconstruction'], axis=1)
df_dropped_first_tumor_3['nonunion'].value_counts()

In [None]:
def lr_nonunion_no_cv():
    """Fit a statsmodels logit for `nonunion` on `df_dropped_first_tumor_3` and print its summary.

    No cross-validation is performed. The predictors are those returned by
    `get_x_y` after removing `drop_cols_nonunion` plus the extra columns listed
    below. Reads the notebook globals `drop_cols_nonunion`,
    `df_dropped_first_tumor_3` and `follow_up_nonunion`; returns nothing.
    """
    # Columns excluded in addition to the shared nonunion drop list.
    extra_drop_cols = [
        'comorbidity___autoimmune_disease',
        'comorbidity___hypothyroidism',
        'comorbidity___copd',
        'radiotherapy___pre_surgery',
        'chemotherapy___pre_surgery',
        'prior_flap___non_bony',
        'prior_flap___bony',
        'comorbidity___hyperlipidemia',
        'venous_anastomosis_type___end_side',
        'venous_anastomosis_type___end_end',
        'comorbidity___hypertension',
        'comorbidity___chronic_kidney_disease',
        'comorbidity___atherosclerosis',
    ]
    # Concatenation builds a new list, so the global drop list is left untouched.
    drop_cols_nonunion_3 = drop_cols_nonunion + extra_drop_cols

    x, y = get_x_y(
        df=df_dropped_first_tumor_3,
        outcome='nonunion',
        min_follow_up_days=follow_up_nonunion,
        scaler='None',
        drop_cols=drop_cols_nonunion_3,
    )

    # Cast booleans to int first, then every numeric column (including those) to float64.
    bool_cols = x.select_dtypes(include=bool).columns
    x[bool_cols] = x[bool_cols].astype('int')
    num_cols = x.select_dtypes(include='number').columns
    x[num_cols] = x[num_cols].astype('float64')
    y = y.astype('int')

    # Formula of the form "nonunion~col_a+col_b+...".
    formula = 'nonunion' + '~' + "+".join(x.columns)

    design = pd.concat([x, y], axis=1)
    fitted = smf.logit(formula, design).fit()
    print(fitted.summary())


lr_nonunion_no_cv()

In [None]:
# statsmodels logistic regression for `nonunion` on `df_dropped_first_imp`, no scaler.
lr_statsmodels(
    df=df_dropped_first_imp,
    outcome='nonunion',
    min_follow_up_days=follow_up_nonunion,
    drop_cols=drop_cols_nonunion,
    scaler='None',
)

###### Newton (Scaled)
Mean MCC: 0.329, Mean PR AUC: 0.590

In [None]:
# Nested-CV search for logistic regression on `nonunion` via lr_newton_classifier
# (lbfgs, newton-cholesky, newton-cg); no scaler is passed.
params_newton_nonunion, x_newton_nonunion, y_newton_nonunion = nested_cv_optuna(
    df=df_dropped_first_imp,
    outcome='nonunion',
    min_follow_up_days=follow_up_nonunion,
    drop_cols=drop_cols_nonunion,
    scaler='None',
    model=LogisticRegression(max_iter=10000, random_state=0),
    classifier=lr_newton_classifier,
)

###### Liblinear (Scaled)
Mean MCC: 0.312, Mean PR AUC: 0.586

In [None]:
# Liblinear Logistic Regression
# The estimator is seeded (random_state=0) like the wound-infection and fistula
# liblinear cells: scikit-learn uses random_state to shuffle the data for the
# liblinear solver, so an unseeded model may not reproduce the reported metrics.
params_liblinear_nonunion, x_liblinear_nonunion, y_liblinear_nonunion = nested_cv_optuna(
    outcome='nonunion',
    model=LogisticRegression(max_iter=1000, random_state=0),
    min_follow_up_days=follow_up_nonunion,
    scaler='None',
    df=df_dropped_first_imp,
    classifier=lr_liblinear_classifier,
    drop_cols=drop_cols_nonunion
)

##### kNN
Mean MCC: 0.211, Mean PR AUC: 0.5

In [None]:
# Nested-CV hyperparameter search for kNN on `nonunion`;
# a QuantileTransformer (150 quantiles) is passed as the scaler.
params_knn_nonunion, x_knn_nonunion, y_knn_nonunion = nested_cv_optuna(
    df=df_all_levels_imp,
    outcome='nonunion',
    min_follow_up_days=follow_up_nonunion,
    drop_cols=drop_cols_nonunion,
    scaler=QuantileTransformer(n_quantiles=150, random_state=0),
    model=KNeighborsClassifier(),
    classifier=knn_classifier,
)

##### Random Forest
Mean MCC: 0.337, Mean PR AUC: 0.583

In [None]:
# Nested-CV hyperparameter search for a seeded random forest on `nonunion` (no scaler).
params_rf_nonunion, x_rf_nonunion, y_rf_nonunion = nested_cv_optuna(
    df=df_all_levels_imp,
    outcome='nonunion',
    min_follow_up_days=follow_up_nonunion,
    drop_cols=drop_cols_nonunion,
    scaler='None',
    model=RandomForestClassifier(random_state=0),
    classifier=rf_classifier,
)

In [None]:
# Reduce the nested-CV parameter sets with get_avg_params (presumably an average
# over the outer folds — confirm), then pass them to get_shap_rf as a one-element list.
avg_params_rf_nonunion = get_avg_params(params_rf_nonunion)
get_shap_rf([avg_params_rf_nonunion], x_rf_nonunion, y_rf_nonunion)

##### XGBoost
Mean MCC: 0.278, Mean PR AUC: 0.578

In [None]:
# Nested-CV hyperparameter search for XGBoost on `nonunion` (no scaler).
# The estimator is seeded and silenced (random_state=0, verbosity=0) like the
# ORN, wound-infection, fistula and flap-loss XGBoost cells, so reruns are
# reproducible and the cell output is not flooded with XGBoost messages.
params_xgb_nu, x_xgb_nu, y_xgb_nu = nested_cv_optuna(
    outcome='nonunion',
    model=XGBClassifier(random_state=0, verbosity=0),
    min_follow_up_days=follow_up_nonunion,
    scaler='None',
    df=df_all_levels_imp,
    classifier=xgb_classifier,
    drop_cols=drop_cols_nonunion
)

In [None]:
# Reduce the nested-CV parameter sets with get_avg_params (presumably an average
# over the outer folds — confirm), then pass them to get_shap_xgb as a one-element list.
avg_params_xgb_nu = get_avg_params(params_xgb_nu)
get_shap_xgb([avg_params_xgb_nu], x_xgb_nu, y_xgb_nu)

#### Soft tissue complication

##### Configuration

In [None]:
# Minimum follow-up for soft tissue complications: the mean of the per-column
# medians of the five `days_to_*` columns listed below.
follow_up_stx = np.mean([
    df_all_levels_imp[days_col].median()
    for days_col in (
        'days_to_whd_recipient_site',
        'days_to_partial_necrosis',
        'days_to_bone_exposure',
        'days_to_plate_exposure',
        'days_to_wound_infection',
    )
])

# No additional columns are dropped for this outcome.
drop_cols_stx = []

##### Logistic Regression
Mean MCC: 0.458, Mean PR AUC: 0.691

###### Statsmodel
Mean MCC: 0.383, Mean PR AUC: 0.648

In [None]:
# 3 SEGMENTS ONLY

def lr_stx_no_cv():
    """Fit a statsmodels logit for `soft_tissue_complication` on `df_dropped_first_tumor_3` and print its summary.

    No cross-validation is performed. The predictors are those returned by
    `get_x_y` after removing `drop_cols_stx` plus the extra columns listed
    below. Reads the notebook globals `drop_cols_stx`,
    `df_dropped_first_tumor_3` and `follow_up_stx`; returns nothing.
    """
    # Columns excluded in addition to the shared soft-tissue drop list.
    extra_drop_cols = [
        'comorbidity___autoimmune_disease',
        'comorbidity___hypothyroidism',
        'comorbidity___copd',
        'radiotherapy___pre_surgery',
        'chemotherapy___pre_surgery',
        'prior_flap___non_bony',
        'prior_flap___bony',
        'comorbidity___hyperlipidemia',
        'venous_anastomosis_type___end_side',
        'venous_anastomosis_type___end_end',
        'comorbidity___hypertension',
        'comorbidity___chronic_kidney_disease',
        'comorbidity___atherosclerosis',
    ]
    # Concatenation builds a new list, so the global drop list is left untouched.
    drop_cols_stx_3 = drop_cols_stx + extra_drop_cols

    x, y = get_x_y(
        df=df_dropped_first_tumor_3,
        outcome='soft_tissue_complication',
        min_follow_up_days=follow_up_stx,
        scaler='None',
        drop_cols=drop_cols_stx_3,
    )

    # Cast booleans to int first, then every numeric column (including those) to float64.
    bool_cols = x.select_dtypes(include=bool).columns
    x[bool_cols] = x[bool_cols].astype('int')
    num_cols = x.select_dtypes(include='number').columns
    x[num_cols] = x[num_cols].astype('float64')
    y = y.astype('int')

    # Formula of the form "soft_tissue_complication~col_a+col_b+...".
    formula = 'soft_tissue_complication' + '~' + "+".join(x.columns)

    design = pd.concat([x, y], axis=1)
    fitted = smf.logit(formula, design).fit()
    print(fitted.summary())


lr_stx_no_cv()

In [None]:
# statsmodels logistic regression for `soft_tissue_complication` on `df_dropped_first_imp`, no scaler.
lr_statsmodels(
    df=df_dropped_first_imp,
    outcome='soft_tissue_complication',
    min_follow_up_days=follow_up_stx,
    drop_cols=drop_cols_stx,
    scaler='None',
)

In [None]:
# Frequency table of the `flap_segment_count` column in df_all_levels_imp.
df_all_levels_imp['flap_segment_count'].value_counts()

###### Newton (Scaled)
Mean MCC: 0.458, Mean PR AUC: 0.691

In [None]:
# Nested-CV search for logistic regression on `soft_tissue_complication` via
# lr_newton_classifier (lbfgs, newton-cholesky, newton-cg); no scaler is passed.
params_newton_stx, x_newton_stx, y_newton_stx = nested_cv_optuna(
    df=df_dropped_first_imp,
    outcome='soft_tissue_complication',
    min_follow_up_days=follow_up_stx,
    drop_cols=drop_cols_stx,
    scaler='None',
    model=LogisticRegression(max_iter=10000, random_state=0),
    classifier=lr_newton_classifier,
)

###### Liblinear (Scaled)
Mean MCC: 0.416, Mean PR AUC: 0.696

In [None]:
# Liblinear logistic regression for `soft_tissue_complication`.
# The estimator is seeded (random_state=0) like the wound-infection and fistula
# liblinear cells: scikit-learn uses random_state to shuffle the data for the
# liblinear solver, so an unseeded model may not reproduce the reported metrics.
params_liblinear_stx, x_liblinear_stx, y_liblinear_stx = nested_cv_optuna(
    outcome='soft_tissue_complication',
    model=LogisticRegression(max_iter=1000, random_state=0),
    min_follow_up_days=follow_up_stx,
    scaler='None',
    df=df_dropped_first_imp,
    classifier=lr_liblinear_classifier,
    drop_cols=drop_cols_stx
)

##### kNN
Mean MCC: 0.31, Mean PR AUC: 0.598

In [None]:
# Nested-CV hyperparameter search for kNN on `soft_tissue_complication`;
# a QuantileTransformer (200 quantiles) is passed as the scaler.
params_knn_stx, x_knn_stx, y_knn_stx = nested_cv_optuna(
    df=df_all_levels_imp,
    outcome='soft_tissue_complication',
    min_follow_up_days=follow_up_stx,
    drop_cols=drop_cols_stx,
    scaler=QuantileTransformer(n_quantiles=200, random_state=0),
    model=KNeighborsClassifier(),
    classifier=knn_classifier,
)

##### Random Forest
Mean MCC: 0.375, Mean PR AUC: 0.685

In [None]:
# Nested-CV hyperparameter search for a seeded random forest on `soft_tissue_complication` (no scaler).
params_rf_stx, x_rf_stx, y_rf_stx = nested_cv_optuna(
    df=df_all_levels_imp,
    outcome='soft_tissue_complication',
    min_follow_up_days=follow_up_stx,
    drop_cols=drop_cols_stx,
    scaler='None',
    model=RandomForestClassifier(random_state=0),
    classifier=rf_classifier,
)

In [None]:
# Reduce the nested-CV parameter sets with get_avg_params (presumably an average
# over the outer folds — confirm), then pass them to get_shap_rf as a one-element list.
avg_params_rf_stx = get_avg_params(params_rf_stx)
get_shap_rf([avg_params_rf_stx], x_rf_stx, y_rf_stx)

##### XGBoost
Mean MCC: 0.325, Mean PR AUC: 0.659

In [None]:
# Nested-CV hyperparameter search for XGBoost on `soft_tissue_complication` (no scaler).
# The estimator is seeded and silenced (random_state=0, verbosity=0) like the
# ORN, wound-infection, fistula and flap-loss XGBoost cells, so reruns are
# reproducible and the cell output is not flooded with XGBoost messages.
params_xgb_stx, x_xgb_stx, y_xgb_stx = nested_cv_optuna(
    outcome='soft_tissue_complication',
    model=XGBClassifier(random_state=0, verbosity=0),
    min_follow_up_days=follow_up_stx,
    scaler='None',
    df=df_all_levels_imp,
    classifier=xgb_classifier,
    drop_cols=drop_cols_stx
)

In [None]:
# Reduce the nested-CV parameter sets with get_avg_params (presumably an average
# over the outer folds — confirm), then pass them to get_shap_xgb as a one-element list.
avg_params_xgb_stx = get_avg_params(params_xgb_stx)
get_shap_xgb([avg_params_xgb_stx], x_xgb_stx, y_xgb_stx)

#### Wound infection

In [None]:
# Wound infection configuration: minimum follow-up is the median of
# `days_to_wound_infection`; no additional columns are dropped.
follow_up_wi = df_all_levels_imp['days_to_wound_infection'].median()
drop_cols_wi = []

##### Logistic Regression

###### Statsmodel

In [None]:
# statsmodels logistic regression for `wound_infection` on `df_dropped_first_imp`, no scaler.
lr_statsmodels(
    df=df_dropped_first_imp,
    outcome='wound_infection',
    min_follow_up_days=follow_up_wi,
    drop_cols=drop_cols_wi,
    scaler='None',
)

###### Newton

In [None]:
# Logistic Regression (lbfgs, newton-cholesky, newton-cg)
# NOTE(review): this cell passes `lr_lnn_classifier`, whereas the nonunion,
# soft-tissue and fistula Newton cells pass `lr_newton_classifier` — confirm that
# `lr_lnn_classifier` is still defined, otherwise this fails on a fresh kernel.
params_newton_wi, x_newton_wi, y_newton_wi = nested_cv_optuna(
    outcome='wound_infection', 
    model=LogisticRegression(max_iter=10000, random_state=0),
    min_follow_up_days=follow_up_wi, 
    scaler='None', 
    df=df_dropped_first_imp, 
    classifier=lr_lnn_classifier,
    drop_cols=drop_cols_wi
)

###### Liblinear

In [None]:
# Nested-CV search for liblinear logistic regression on `wound_infection` (seeded, no scaler).
params_liblinear_wi, x_liblinear_wi, y_liblinear_wi = nested_cv_optuna(
    df=df_dropped_first_imp,
    outcome='wound_infection',
    min_follow_up_days=follow_up_wi,
    drop_cols=drop_cols_wi,
    scaler='None',
    model=LogisticRegression(max_iter=1000, random_state=0),
    classifier=lr_liblinear_classifier,
)

##### kNN

In [None]:
# Nested-CV hyperparameter search for kNN on `wound_infection`;
# a QuantileTransformer (200 quantiles) is passed as the scaler.
# The three return values are captured, as in the other model cells, so they
# stay available to later cells instead of being dumped as raw cell output.
params_knn_wi, x_knn_wi, y_knn_wi = nested_cv_optuna(
    outcome='wound_infection',
    model=KNeighborsClassifier(),
    min_follow_up_days=follow_up_wi,
    scaler=QuantileTransformer(n_quantiles=200, random_state=0),
    df=df_all_levels_imp,
    classifier=knn_classifier,
    drop_cols=drop_cols_wi
)

##### Random Forest

In [None]:
# Nested-CV hyperparameter search for a seeded random forest on `wound_infection`;
# a QuantileTransformer (200 quantiles) is passed as the scaler.
params_rf_wi, x_rf_wi, y_rf_wi = nested_cv_optuna(
    df=df_all_levels_imp,
    outcome='wound_infection',
    min_follow_up_days=follow_up_wi,
    drop_cols=drop_cols_wi,
    scaler=QuantileTransformer(n_quantiles=200, random_state=0),
    model=RandomForestClassifier(random_state=0),
    classifier=rf_classifier,
)

In [None]:
# Reduce the nested-CV parameter sets with get_avg_params (presumably an average
# over the outer folds — confirm), then pass them to get_shap_rf as a one-element list.
avg_params_rf_wi = get_avg_params(params_rf_wi)
get_shap_rf([avg_params_rf_wi], x_rf_wi, y_rf_wi)

##### XGBoost

In [None]:
# Nested-CV hyperparameter search for a seeded, silenced XGBoost classifier on
# `wound_infection`; a QuantileTransformer (200 quantiles) is passed as the scaler.
params_xgb_wi, x_xgb_wi, y_xgb_wi = nested_cv_optuna(
    df=df_all_levels_imp,
    outcome='wound_infection',
    min_follow_up_days=follow_up_wi,
    drop_cols=drop_cols_wi,
    scaler=QuantileTransformer(n_quantiles=200, random_state=0),
    model=XGBClassifier(random_state=0, verbosity=0),
    classifier=xgb_classifier,
)

In [None]:
# Reduce the nested-CV parameter sets with get_avg_params (presumably an average
# over the outer folds — confirm), then pass them to get_shap_xgb as a one-element list.
avg_params_xgb_wi = get_avg_params(params_xgb_wi)
get_shap_xgb([avg_params_xgb_wi], x_xgb_wi, y_xgb_wi)

#### Fistula

In [None]:
# Fistula reuses the wound-infection configuration.
follow_up_fistula = follow_up_wi
# Copy the list rather than aliasing it, so a later in-place change to one
# outcome's drop list cannot silently alter the other's.
drop_cols_fistula = list(drop_cols_wi)

##### Logistic Regression

###### Statsmodel

In [None]:
# statsmodels logistic regression for `infectious_complication___fistula` on `df_dropped_first_imp`, no scaler.
lr_statsmodels(
    df=df_dropped_first_imp,
    outcome='infectious_complication___fistula',
    min_follow_up_days=follow_up_fistula,
    drop_cols=drop_cols_fistula,
    scaler='None',
)

###### Newton

In [None]:
# Nested-CV search for logistic regression on `infectious_complication___fistula` via
# lr_newton_classifier; a QuantileTransformer (200 quantiles) is passed as the scaler.
params_newton_fistula, x_newton_fistula, y_newton_fistula = nested_cv_optuna(
    df=df_dropped_first_imp,
    outcome='infectious_complication___fistula',
    min_follow_up_days=follow_up_fistula,
    drop_cols=drop_cols_fistula,
    scaler=QuantileTransformer(n_quantiles=200, random_state=0),
    model=LogisticRegression(max_iter=10000, random_state=0),
    classifier=lr_newton_classifier,
)

###### Liblinear

In [None]:
# Nested-CV search for liblinear logistic regression on `infectious_complication___fistula`;
# a QuantileTransformer (200 quantiles) is passed as the scaler.
# Uses the fistula follow-up setting (previously `follow_up_wi`, a copy-paste
# leftover that only worked because both currently hold the same value) and
# captures the return values like the other fistula cells.
params_liblinear_fistula, x_liblinear_fistula, y_liblinear_fistula = nested_cv_optuna(
    outcome='infectious_complication___fistula',
    model=LogisticRegression(max_iter=1000, random_state=0),
    min_follow_up_days=follow_up_fistula,
    scaler=QuantileTransformer(n_quantiles=200, random_state=0),
    df=df_dropped_first_imp,
    classifier=lr_liblinear_classifier,
    drop_cols=drop_cols_fistula
)

##### kNN

In [None]:
# Nested-CV hyperparameter search for kNN on `infectious_complication___fistula`;
# a QuantileTransformer (200 quantiles) is passed as the scaler.
params_knn_fistula, x_knn_fistula, y_knn_fistula = nested_cv_optuna(
    df=df_all_levels_imp,
    outcome='infectious_complication___fistula',
    min_follow_up_days=follow_up_fistula,
    drop_cols=drop_cols_fistula,
    scaler=QuantileTransformer(n_quantiles=200, random_state=0),
    model=KNeighborsClassifier(),
    classifier=knn_classifier,
)

##### Random Forest

In [None]:
# Nested-CV hyperparameter search for a seeded random forest on
# `infectious_complication___fistula`; a QuantileTransformer (200 quantiles) is passed as the scaler.
params_rf_fistula, x_rf_fistula, y_rf_fistula = nested_cv_optuna(
    df=df_all_levels_imp,
    outcome='infectious_complication___fistula',
    min_follow_up_days=follow_up_fistula,
    drop_cols=drop_cols_fistula,
    scaler=QuantileTransformer(n_quantiles=200, random_state=0),
    model=RandomForestClassifier(random_state=0),
    classifier=rf_classifier,
)

In [None]:
# Reduce the nested-CV parameter sets with get_avg_params (presumably an average
# over the outer folds — confirm), then pass them to get_shap_rf as a one-element list.
avg_params_rf_fistula = get_avg_params(params_rf_fistula)
get_shap_rf([avg_params_rf_fistula], x_rf_fistula, y_rf_fistula)

##### XGBoost

In [None]:
# Nested-CV hyperparameter search for a seeded, silenced XGBoost classifier on
# `infectious_complication___fistula`; a QuantileTransformer (200 quantiles) is passed as the scaler.
params_xgb_fistula, x_xgb_fistula, y_xgb_fistula = nested_cv_optuna(
    df=df_all_levels_imp,
    outcome='infectious_complication___fistula',
    min_follow_up_days=follow_up_fistula,
    drop_cols=drop_cols_fistula,
    scaler=QuantileTransformer(n_quantiles=200, random_state=0),
    model=XGBClassifier(random_state=0, verbosity=0),
    classifier=xgb_classifier,
)

In [None]:
# Reduce the nested-CV parameter sets with get_avg_params (presumably an average
# over the outer folds — confirm), then pass them to get_shap_xgb as a one-element list.
avg_params_xgb_fistula = get_avg_params(params_xgb_fistula)
get_shap_xgb([avg_params_xgb_fistula], x_xgb_fistula, y_xgb_fistula)

#### Flap loss
No prediction possible - Not enough positives.

In [None]:
# Flap loss configuration: no minimum follow-up; the two post-surgery therapy
# columns are listed so the flap-loss cells can pass them as `drop_cols`.
follow_up_fl = 0
drop_cols_fl = ['radiotherapy___post_surgery', 'chemotherapy___post_surgery']

##### Logistic Regression

In [None]:
# Statsmodel LR
# Did not converge
#lr_statsmodels(
#    outcome='flap_loss', 
#    min_follow_up_days=follow_up_fl, 
#    scaler='None',
#    df=df_dropped_first_imp, 
#    drop_cols=drop_cols_fl
#)

In [None]:
def lr_fl_no_cv():
    """Fit a statsmodels logit for `flap_loss` on `df_dropped_first_imp` and print its summary.

    No cross-validation is performed. The predictors are those returned by
    `get_x_y` after removing `drop_cols_fl`. Reads the notebook globals
    `df_dropped_first_imp`, `follow_up_fl` and `drop_cols_fl`; returns nothing.
    """
    x, y = get_x_y(
        df=df_dropped_first_imp,
        outcome='flap_loss',
        min_follow_up_days=follow_up_fl,
        scaler='None',
        drop_cols=drop_cols_fl,
    )

    # Cast booleans to int first, then every numeric column (including those) to float64.
    bool_cols = x.select_dtypes(include=bool).columns
    x[bool_cols] = x[bool_cols].astype('int')
    num_cols = x.select_dtypes(include='number').columns
    x[num_cols] = x[num_cols].astype('float64')
    y = y.astype('int')

    # Formula of the form "flap_loss~col_a+col_b+...".
    formula = 'flap_loss' + '~' + "+".join(x.columns)

    design = pd.concat([x, y], axis=1)
    fitted = smf.logit(formula, design).fit()
    print(fitted.summary())


lr_fl_no_cv()

In [None]:
# Logistic Regression (lbfgs, newton-cholesky, newton-cg)
# Did not converge
# NOTE(review): this cell passes `lr_lnn_classifier`, whereas the nonunion,
# soft-tissue and fistula Newton cells pass `lr_newton_classifier` — confirm that
# `lr_lnn_classifier` is still defined.
# `drop_cols=drop_cols_fl` was missing here, so the post-surgery therapy columns
# configured for flap loss were not passed for dropping; it is now supplied as in
# the XGBoost flap-loss cell. The return values are captured instead of being
# dumped as raw cell output.
params_newton_fl, x_newton_fl, y_newton_fl = nested_cv_optuna(
    outcome='flap_loss',
    model=LogisticRegression(max_iter=10000, random_state=0),
    min_follow_up_days=follow_up_fl,
    scaler=QuantileTransformer(n_quantiles=200, random_state=0),
    df=df_dropped_first_imp,
    classifier=lr_lnn_classifier,
    drop_cols=drop_cols_fl
)

In [None]:
# Liblinear Logistic Regression
# Did not converge
#params_liblinear_fl, x_liblinear_fl, y_liblinear_fl = nested_cv_optuna(
#    outcome='flap_loss', 
#    model=LogisticRegression(max_iter=1000, random_state=0),
#    min_follow_up_days=follow_up_fl, 
#    scaler=QuantileTransformer(n_quantiles=200, random_state=0), 
#    df=df_dropped_first_imp, 
#    classifier=lr_liblinear_classifier,
#    drop_cols=drop_cols_fl
#)

##### kNN

In [None]:
# Nested-CV hyperparameter search for kNN on `flap_loss`;
# a QuantileTransformer (200 quantiles) is passed as the scaler.
# `drop_cols=drop_cols_fl` was missing here, so the post-surgery therapy columns
# configured for flap loss were not passed for dropping; it is now supplied as in
# the XGBoost flap-loss cell. The return values are captured instead of being
# dumped as raw cell output.
params_knn_fl, x_knn_fl, y_knn_fl = nested_cv_optuna(
    outcome='flap_loss',
    model=KNeighborsClassifier(),
    min_follow_up_days=follow_up_fl,
    scaler=QuantileTransformer(n_quantiles=200, random_state=0),
    df=df_all_levels_imp,
    classifier=knn_classifier,
    drop_cols=drop_cols_fl
)

##### Random Forest

In [None]:
# Nested-CV hyperparameter search for a seeded random forest on `flap_loss`;
# a QuantileTransformer (200 quantiles) is passed as the scaler.
# `drop_cols=drop_cols_fl` was missing here, so the post-surgery therapy columns
# configured for flap loss were not passed for dropping; it is now supplied as in
# the XGBoost flap-loss cell. The return values are captured instead of being
# dumped as raw cell output.
params_rf_fl, x_rf_fl, y_rf_fl = nested_cv_optuna(
    outcome='flap_loss',
    model=RandomForestClassifier(random_state=0),
    min_follow_up_days=follow_up_fl,
    scaler=QuantileTransformer(n_quantiles=200, random_state=0),
    df=df_all_levels_imp,
    classifier=rf_classifier,
    drop_cols=drop_cols_fl
)

##### XGBoost

In [None]:
# Nested-CV hyperparameter search for a seeded, silenced XGBoost classifier on `flap_loss` (no scaler).
params_xgb_fl, x_xgb_fl, y_xgb_fl = nested_cv_optuna(
    df=df_all_levels_imp,
    outcome='flap_loss',
    min_follow_up_days=follow_up_fl,
    drop_cols=drop_cols_fl,
    scaler='None',
    model=XGBClassifier(random_state=0, verbosity=0),
    classifier=xgb_classifier,
)

In [None]:
# Keep whatever get_shap_xgb returns for the flap-loss XGBoost run.
# NOTE(review): unlike the other SHAP cells, the raw `params_xgb_fl` from
# nested_cv_optuna is passed directly rather than `[get_avg_params(...)]` —
# confirm this is intended.
sv_xgb_fl = get_shap_xgb(params_xgb_fl, x_xgb_fl, y_xgb_fl)