# Model training for 3-segment reconstructions

## Import

In [91]:
import warnings
import numpy as np
import pandas as pd
import copy
from statistics import mean, stdev
from sklearn.preprocessing import QuantileTransformer
from sklearn.metrics import make_scorer, matthews_corrcoef, f1_score, accuracy_score, average_precision_score, roc_auc_score, brier_score_loss
from sklearn.model_selection import cross_validate, StratifiedKFold, RepeatedStratifiedKFold
from sklearn.linear_model import LogisticRegression 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from scipy.stats import chi2_contingency
import shap
import statsmodels.formula.api as smf
import seaborn as sns
import matplotlib.pyplot as plt

In [65]:
# Load the two imputed datasets from parquet.
# Judging by the file names, they presumably differ only in how categorical
# levels are encoded (first level dropped vs. all levels kept) — TODO confirm.
# NOTE(review): both paths are absolute and user-specific, so this cell only
# runs on a machine with exactly this directory layout.
df_dropped_first = pd.read_parquet('/Users/philipp.lampert/repositories/mymandible/data/dropped_first_imputed.parquet')
df_all_levels = pd.read_parquet('/Users/philipp.lampert/repositories/mymandible/data/all_levels_imputed.parquet')

In [66]:
def get_3_segment_df(df):
    """Return the rows and columns of ``df`` used for the 3-segment analysis.

    A row is kept when ``flap_segment_count`` equals 3 and each of the
    exclusion flags listed below compares equal to ``False``. Afterwards a
    fixed set of columns is removed. ``df`` itself is left untouched; the
    result is a new DataFrame.
    """
    exclusion_flags = (
        'flap_donor_site___scapula',
        'plate_type___cad_mini',
        'flap_loss',
        'indication___secondary_reconstruction',
        'indication___osteoradionecrosis',
    )

    keep = df['flap_segment_count'] == 3
    for flag in exclusion_flags:
        # Explicit comparison with False (not `~`) so the element-wise
        # comparison behaves the same for every column dtype.
        keep = keep & (df[flag] == False)
    subset = df[keep].copy()

    # drop unused variables
    unused = [
        'flap_donor_site___scapula',
        'flap_segment_count',
        'plate_type___cad_mini',
        'urkens_classification___s',
        'indication___osteoradionecrosis',
        'indication___secondary_reconstruction',
        'prior_flap___bony',
    ]
    return subset.drop(columns=unused)

In [67]:
# Apply the same 3-segment cohort filter to both loaded datasets.
# NOTE(review): df_all is not referenced again in the visible part of the
# notebook; every model and test below uses df_df.
df_df = get_3_segment_df(df_dropped_first)
df_all = get_3_segment_df(df_all_levels)

## Preprocessing

In [68]:
from modules.functions import preprocessing as prp
from modules.functions import threshold_optimized_metrics as tom

In [69]:
# Scorers built with needs_proba=True; three wrap the threshold-optimised
# metrics from `tom`, the last wraps average precision (PR AUC).
# NOTE(review): newer scikit-learn releases deprecate `needs_proba` in favour
# of `response_method` — confirm against the installed version.
acc_scorer, f1_scorer, mcc_scorer, pr_auc_scorer = (
    make_scorer(metric, needs_proba=True)
    for metric in (
        tom.optimized_accuracy,
        tom.optimized_f1,
        tom.optimized_mcc,
        average_precision_score,
    )
)

## Model setup

In [70]:
def logreg_regularized(outcome, scaler, df, method, alpha):
    """Fit a statsmodels logit model for ``outcome`` and print its summary.

    Predictors and target are obtained from ``prp.get_x_y`` (always called
    with ``min_follow_up_days=90`` and ``inverse_pos=False``); which rows and
    columns that helper keeps is not visible in this notebook. The columns to
    exclude are taken from the module-level ``drop_cols`` list, looked up at
    call time, so that list has to exist before the first call.

    Parameters
    ----------
    outcome : str
        Name of the target column; also used as the left-hand side of the
        model formula.
    scaler
        Passed through unchanged to ``prp.get_x_y``.
    df : pandas.DataFrame
        Data handed to ``prp.get_x_y``.
    method, alpha
        Passed through unchanged to ``fit_regularized``.

    Returns
    -------
    None
        The fitted model is printed via ``summary()`` and not returned.
    """
    features, target = prp.get_x_y(df=df, outcome=outcome, min_follow_up_days=90, scaler=scaler, drop_cols=drop_cols, inverse_pos=False)

    # Booleans are cast to int first; the numeric cast that follows then
    # picks those columns up as well and turns everything into float64.
    bool_cols = features.select_dtypes(include=bool).columns
    features[bool_cols] = features[bool_cols].astype('int')
    num_cols = features.select_dtypes(include='number').columns
    features[num_cols] = features[num_cols].astype('float64')
    target = target.astype('int')

    # Formula of the form "<outcome>~<col1>+<col2>+...": every remaining
    # predictor enters as a main effect.
    formula = '~'.join([outcome, '+'.join(features.columns)])

    design = pd.concat([features, target], axis=1)
    fitted = smf.logit(formula, design).fit_regularized(method=method, alpha=alpha)
    print(fitted.summary())

In [71]:
# Columns that logreg_regularized forwards to prp.get_x_y as `drop_cols`
# (the function reads this list as a global when it is called).
# Entries that are commented out are not in the list, so presumably those
# columns stay in the model as predictors — TODO confirm against prp.get_x_y.
drop_cols = [
    #'sex_female', 
    #'comorbidity___smoking', 
    #'comorbidity___alcohol',
    'comorbidity___copd', 
    'comorbidity___hypertension',
    'comorbidity___diabetes', 
    #'comorbidity___atherosclerosis',
    'comorbidity___hyperlipidemia', 
    'comorbidity___hypothyroidism',
    'comorbidity___chronic_kidney_disease',
    'comorbidity___autoimmune_disease', 
    #'age_surgery_years',
    'radiotherapy___pre_surgery', 
    #'radiotherapy___post_surgery',
    'chemotherapy___pre_surgery', 
    'chemotherapy___post_surgery',
    'urkens_classification___c', 
    'urkens_classification___r',
    'surgery_duration_min', 
    #'bmi', 
    'skin_transplanted',
    'prior_flap___non_bony', 
    #'plate_type___cad_mix'
]

In [72]:
# Distribution of the prior non-bony flap flag in the 3-segment cohort
# (value_counts leaves out missing values by default).
df_df['prior_flap___non_bony'].value_counts()

prior_flap___non_bony
False    61
True      4
Name: count, dtype: Int64

## Logistic Regression

### Any complication

In [73]:
# NOTE(review): the other outcome sections begin by tabulating the outcome
# column itself; this cell tabulates a comorbidity instead — confirm whether
# 'any_complication' was meant here.
df_df['comorbidity___autoimmune_disease'].value_counts()

comorbidity___autoimmune_disease
False    61
True      4
Name: count, dtype: Int64

In [74]:
# Fit the logit model with any complication as the outcome.
# scaler is the string 'None' (not the None object); how prp.get_x_y treats it
# is not visible here. alpha=0 gives the L1 penalty term zero weight —
# NOTE(review): confirm that an unpenalised fit is intended.
logreg_regularized('any_complication', 'None', df_df, 'l1', alpha=0)

Optimization terminated successfully    (Exit mode 0)
            Current function value: 0.4086571679232351
            Iterations: 52
            Function evaluations: 57
            Gradient evaluations: 52
                           Logit Regression Results                           
Dep. Variable:       any_complication   No. Observations:                   55
Model:                          Logit   Df Residuals:                       46
Method:                           MLE   Df Model:                            8
Date:                Fri, 23 Feb 2024   Pseudo R-squ.:                  0.3223
Time:                        13:59:10   Log-Likelihood:                -22.476
converged:                       True   LL-Null:                       -33.163
Covariance Type:            nonrobust   LLR p-value:                  0.006218
                                    coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------

### Soft tissue complication

In [75]:
# Class balance of the soft tissue complication outcome
# (missing values are not counted by value_counts by default).
df_df['soft_tissue_complication'].value_counts()

soft_tissue_complication
True     38
False    25
Name: count, dtype: Int64

In [76]:
# Logit model with soft tissue complication as the outcome; scaler is the
# string 'None' and alpha=0 gives the L1 penalty term zero weight.
logreg_regularized('soft_tissue_complication', 'None', df_df, 'l1', alpha=0)

Optimization terminated successfully    (Exit mode 0)
            Current function value: 0.5048345240365472
            Iterations: 51
            Function evaluations: 55
            Gradient evaluations: 51
                              Logit Regression Results                              
Dep. Variable:     soft_tissue_complication   No. Observations:                   55
Model:                                Logit   Df Residuals:                       46
Method:                                 MLE   Df Model:                            8
Date:                      Fri, 23 Feb 2024   Pseudo R-squ.:                  0.2408
Time:                              13:59:10   Log-Likelihood:                -27.766
converged:                             True   LL-Null:                       -36.572
Covariance Type:                  nonrobust   LLR p-value:                   0.02433
                                    coef    std err          z      P>|z|      [0.025      0.975]
------------

### Nonunion

In [77]:
# Class balance of the nonunion outcome
# (missing values are not counted by value_counts by default).
df_df['nonunion'].value_counts()

nonunion
False    19
True     18
Name: count, dtype: Int64

In [78]:
# Logit model with nonunion as the outcome; scaler is the string 'None' and
# alpha=0 gives the L1 penalty term zero weight.
logreg_regularized('nonunion', 'None', df_df, 'l1', alpha=0)

Optimization terminated successfully    (Exit mode 0)
            Current function value: 0.6033640946183092
            Iterations: 47
            Function evaluations: 51
            Gradient evaluations: 47
                           Logit Regression Results                           
Dep. Variable:               nonunion   No. Observations:                   36
Model:                          Logit   Df Residuals:                       27
Method:                           MLE   Df Model:                            8
Date:                Fri, 23 Feb 2024   Pseudo R-squ.:                  0.1276
Time:                        13:59:10   Log-Likelihood:                -21.721
converged:                       True   LL-Null:                       -24.898
Covariance Type:            nonrobust   LLR p-value:                    0.6077
                                    coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------

### Wound infection

In [79]:
# Class balance of the wound infection outcome
# (missing values are not counted by value_counts by default).
df_df['wound_infection'].value_counts()

wound_infection
False    44
True     19
Name: count, dtype: Int64

In [80]:
# Logit model with wound infection as the outcome; scaler is the string
# 'None' and alpha=0 gives the L1 penalty term zero weight.
logreg_regularized('wound_infection', 'None', df_df, 'l1', alpha=0)

Optimization terminated successfully    (Exit mode 0)
            Current function value: 0.43258859730883764
            Iterations: 52
            Function evaluations: 56
            Gradient evaluations: 52
                           Logit Regression Results                           
Dep. Variable:        wound_infection   No. Observations:                   55
Model:                          Logit   Df Residuals:                       46
Method:                           MLE   Df Model:                            8
Date:                Fri, 23 Feb 2024   Pseudo R-squ.:                  0.3158
Time:                        13:59:10   Log-Likelihood:                -23.792
converged:                       True   LL-Null:                       -34.773
Covariance Type:            nonrobust   LLR p-value:                  0.004989
                                    coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------

### Plate exposure

In [81]:
# Class balance of the plate exposure outcome
# (missing values are not counted by value_counts by default).
df_df['complication_plate___exposure'].value_counts()

complication_plate___exposure
False    43
True     20
Name: count, dtype: Int64

In [82]:
# Logit model with plate exposure as the outcome; scaler is the string 'None'
# and alpha=0 gives the L1 penalty term zero weight.
logreg_regularized('complication_plate___exposure', 'None', df_df, 'l1', alpha=0)

Optimization terminated successfully    (Exit mode 0)
            Current function value: 0.5244285667627183
            Iterations: 48
            Function evaluations: 53
            Gradient evaluations: 48
                                 Logit Regression Results                                
Dep. Variable:     complication_plate___exposure   No. Observations:                   55
Model:                                     Logit   Df Residuals:                       46
Method:                                      MLE   Df Model:                            8
Date:                           Fri, 23 Feb 2024   Pseudo R-squ.:                  0.1999
Time:                                   13:59:10   Log-Likelihood:                -28.844
converged:                                  True   LL-Null:                       -36.051
Covariance Type:                       nonrobust   LLR p-value:                   0.07155
                                    coef    std err          z      P>

## Univariate Analysis

In [113]:
def chi2_test(outcome, df, predictor='plate_type___cad_mix'):
    """Print the contingency table and chi-square p-value of two columns.

    Cross-tabulates ``df[outcome]`` (rows) against ``df[predictor]``
    (columns), runs ``scipy.stats.chi2_contingency`` on the table with its
    default settings, and prints the table followed by the p-value.

    Parameters
    ----------
    outcome : str
        Column of ``df`` used for the rows of the table.
    df : pandas.DataFrame
        Data containing both columns.
    predictor : str, optional
        Column of ``df`` used for the columns of the table. Defaults to
        ``'plate_type___cad_mix'``, the grouping previously hard-coded here.

    Returns
    -------
    None
        Results are printed only.

    Notes
    -----
    NOTE(review): the chi-square approximation is commonly considered
    unreliable when expected cell counts are small (below about 5); for
    sparse 2x2 tables an exact test may be more appropriate.
    """
    contingency = pd.crosstab(df[outcome], df[predictor])
    # Only the p-value is reported; statistic, dof and expected counts are
    # discarded.
    _, p, _, _ = chi2_contingency(contingency)
    print(contingency)
    print(f"p-value: {p}")

### ORN

In [114]:
# Chi-square test of ORN against the plate_type___cad_mix flag.
chi2_test('orn', df_df)

plate_type___cad_mix  False  True 
orn                               
False                    38     17
True                      4      4
p-value: 0.5035516539877656


### Plate failure

In [108]:
# Chi-square test of plate failure against the plate_type___cad_mix flag.
# NOTE(review): the recorded table below contains a zero cell and very small
# counts, where the chi-square approximation is commonly considered
# unreliable — consider an exact test before interpreting this p-value.
chi2_test('plate_failure', df_df)

plate_type___cad_mix  False  True 
plate_failure                     
False                    44     17
True                      0      4
p-value: 0.0148275781432512


### Any complication

In [110]:
# Chi-square test of any complication against the plate_type___cad_mix flag.
chi2_test('any_complication', df_df)

plate_type___cad_mix  False  True 
any_complication                  
False                    17      4
True                     27     17
p-value: 0.19507196999197834


### Soft tissue complication

In [115]:
# Chi-square test of soft tissue complication against the plate_type___cad_mix flag.
chi2_test('soft_tissue_complication', df_df)

plate_type___cad_mix      False  True 
soft_tissue_complication              
False                        18      7
True                         24     14
p-value: 0.6489418131874136


### Nonunion

In [109]:
# Chi-square test of nonunion against the plate_type___cad_mix flag.
chi2_test('nonunion', df_df)

plate_type___cad_mix  False  True 
nonunion                          
False                    13      6
True                      9      9
p-value: 0.420397139637945


### Wound infection

In [116]:
# Chi-square test of wound infection against the plate_type___cad_mix flag.
chi2_test('wound_infection', df_df)

plate_type___cad_mix  False  True 
wound_infection                   
False                    32     12
True                     10      9
p-value: 0.20704693531945387


### Plate exposure

In [117]:
# Chi-square test of plate exposure against the plate_type___cad_mix flag.
chi2_test('complication_plate___exposure', df_df)

plate_type___cad_mix           False  True 
complication_plate___exposure              
False                             31     12
True                              11      9
p-value: 0.2925183839115074
