# Model training for 3-segment reconstructions

## Import

In [758]:
np.exp(0.269)

1.3086551410464666

In [689]:
import warnings
import numpy as np
import pandas as pd
import copy
from statistics import mean, stdev
from sklearn.preprocessing import QuantileTransformer, RobustScaler
from sklearn.metrics import make_scorer, matthews_corrcoef, f1_score, accuracy_score, average_precision_score, roc_auc_score, brier_score_loss
from sklearn.model_selection import cross_validate, StratifiedKFold, RepeatedStratifiedKFold
from sklearn.linear_model import LogisticRegression 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from scipy.stats import chi2_contingency, chi2, ttest_ind, barnard_exact
import shap
import statsmodels.formula.api as smf
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant
import seaborn as sns
import matplotlib.pyplot as plt

In [650]:
df_dropped_first = pd.read_parquet('/Users/philipp.lampert/repositories/mymandible/data/dropped_first_imputed.parquet')
df_all_levels = pd.read_parquet('/Users/philipp.lampert/repositories/mymandible/data/all_levels_imputed.parquet')

In [698]:
def get_3_segment_df(df):
    df3 = df[
        (df['flap_segment_count'] == 3) 
        & (df['flap_donor_site___scapula'] == False) 
        & (df['plate_type___cad_mini'] == False)
        & (df['indication___secondary_reconstruction'] == False)
        & (df['indication___osteoradionecrosis'] == False)
        & (df['days_to_follow_up'] >= 91)
    ].copy()
    
    df3['follow_up_completed'] = df3['days_to_follow_up'] > 365
    
    flap_loss_mix = df3[(df3['plate_type___cad_mix'] == True) & (df3['days_to_flap_loss'] < 366)]['flap_loss'].sum()
    flap_loss_reco = df3[(df3['plate_type___cad_mix'] == False) & (df3['days_to_flap_loss'] < 366)]['flap_loss'].sum()

    print("Number of flap loss within 12 months for mixed plates:", flap_loss_mix)
    print("Number of flap loss within 12 months for reco plates:", flap_loss_reco)
    
    df3 = df3[~((df3['flap_loss'] == True) & (df3['days_to_flap_loss'] < 366))]

    # drop unused variables
    df3 = df3.drop(['flap_donor_site___scapula', 
                    'flap_segment_count', 
                    'plate_type___cad_mini', 
                    'urkens_classification___s',
                    'indication___osteoradionecrosis',
                    'indication___secondary_reconstruction',
                    'prior_flap___bony'
                   ], axis=1)
    return df3

In [699]:
df_df = get_3_segment_df(df_dropped_first)
df_all = get_3_segment_df(df_all_levels)

Number of flap loss within 12 months for mixed plates: 1
Number of flap loss within 12 months for reco plates: 2
Number of flap loss within 12 months for mixed plates: 1
Number of flap loss within 12 months for reco plates: 2


## Preprocessing

In [653]:
from modules.functions import preprocessing as prp
from modules.functions import threshold_optimized_metrics as tom

In [654]:
acc_scorer = make_scorer(tom.optimized_accuracy, needs_proba=True)
f1_scorer = make_scorer(tom.optimized_f1, needs_proba=True)
mcc_scorer = make_scorer(tom.optimized_mcc, needs_proba=True)
pr_auc_scorer = make_scorer(average_precision_score, needs_proba=True)

## Model setup

In [655]:
def logreg_regularized(outcome, scaler, df, method, alpha):
    
    x, y = prp.get_x_y(df=df, outcome=outcome, min_follow_up_days=91, scaler=scaler, drop_cols=drop_cols, inverse_pos=False)
    boolean_columns = x.select_dtypes(include=bool).columns
    x[boolean_columns] = x[boolean_columns].astype('int')
    numeric_columns = x.select_dtypes(include='number').columns
    x[numeric_columns] = x[numeric_columns].astype('float64')
    y = y.astype('int')    
    x_columns = x.columns
    all_columns = "+".join(x_columns)
    formula = outcome +  '~' + all_columns
    
    data = pd.concat([x, y], axis=1)
    final_model = smf.logit(formula, data).fit_regularized(method=method, alpha=alpha)
    print(final_model.summary())

In [656]:
drop_cols = [
    #'sex_female', 
    #'comorbidity___smoking', 
    #'comorbidity___alcohol',
    'comorbidity___copd', 
    'comorbidity___hypertension',
    'comorbidity___diabetes', 
    #'comorbidity___atherosclerosis',
    'comorbidity___vascular_disease',
    'comorbidity___hyperlipidemia', 
    'comorbidity___hypothyroidism',
    'comorbidity___chronic_kidney_disease',
    'comorbidity___autoimmune_disease', 
    #'age_surgery_years',
    'radiotherapy___pre_surgery', 
    #'radiotherapy___post_surgery',
    'chemotherapy___pre_surgery', 
    #'chemotherapy___post_surgery',
    'urkens_classification___c', 
    'urkens_classification___r',
    #'surgery_duration_min', 
    #'bmi', 
    #'skin_transplanted',
    'prior_flap___non_bony', 
    #'plate_type___cad_mix'
]

## Patient characteristics

### Numeric variable

In [657]:
def num_variable(variable):
    cad_mix_values = df_df.loc[df_df['plate_type___cad_mix'], variable]
    cad_long_values = df_df.loc[~df_df['plate_type___cad_mix'], variable]
    
    cad_mix_mean = round(cad_mix_values.mean(), 1)
    cad_mix_std = round(cad_mix_values.std(), 1)
    cad_long_mean = round(cad_long_values.mean(), 1)
    cad_long_std = round(cad_long_values.std(), 1)
    
    overall_mean = round(df_df[variable].mean(), 1)
    overall_std = round(df_df[variable].std(), 1)
    
    data = {
        'Mix': [cad_mix_mean, cad_mix_std],
        'Reco': [cad_long_mean, cad_long_std],
        'Overall': [overall_mean, overall_std]
    }
    
    df = pd.DataFrame(data, index=['mean', 'std'])
    print(df)

In [658]:
def t_test(variable, df):

    df_mix = df[df['plate_type___cad_mix'] == True]
    df_reco = df[df['plate_type___cad_mix'] == False]
    array_mix = df_mix[variable].values.astype('int')
    array_reco = df_reco[variable].values.astype('int')
    t_test = ttest_ind(a=array_mix, b=array_reco, nan_policy='raise')
    print(num_variable(variable))
    print(f"p-value: {t_test.pvalue}")

### Nominal variables

In [659]:
def cat_variable(variable):
    # Absolute frequencies
    cad_mix_counts = df_df.loc[df_df['plate_type___cad_mix'], variable].value_counts()
    cad_long_counts = df_df.loc[~df_df['plate_type___cad_mix'], variable].value_counts()
    overall_counts = df_df[variable].value_counts()
    
    # Relative probabilities
    cad_mix_probs = round((cad_mix_counts / cad_mix_counts.sum())*100, 1)
    cad_long_probs = round((cad_long_counts / cad_long_counts.sum())*100, 1)
    overall_probs = round((overall_counts / overall_counts.sum())*100, 1)
    
    # Create DataFrames for absolute frequencies and relative probabilities
    absolute_freq_df = pd.DataFrame({
        'Mix': cad_mix_counts,
        'Reco': cad_long_counts,
        'Overall': overall_counts
    }).fillna(0)  # Fill NaN values with 0
    
    relative_prob_df = pd.DataFrame({
        'Mix': cad_mix_probs,
        'Reco': cad_long_probs,
        'Overall': overall_probs
    }).fillna(0)  # Fill NaN values with 0
    
    print("Absolute frequencies:")
    print(absolute_freq_df)
    
    print("\nRelative probabilities:")
    print(relative_prob_df)

In [660]:
def chi2_test(variable, df):
    contingency = pd.crosstab(df[variable], df['plate_type___cad_mix'])
    chi2_object = chi2_contingency(observed=contingency, correction=False)
    N = contingency.sum().sum()
    n_minus_1 = chi2_object.statistic * (N - 1) / N
    p_value = chi2.sf(n_minus_1, 1)
    print(cat_variable(variable))
    print(f"p-value: {p_value}")

In [692]:
def barnard_test(variable, df):
    contingency = pd.crosstab(df[variable], df['plate_type___cad_mix'])
    obj = barnard_exact(contingency)
    print(cat_variable(variable))
    print(f"p-value: {obj.pvalue}")

### Results

In [661]:
t_test('age_surgery_years', df_df)

       Mix  Reco  Overall
mean  62.3  63.0     62.8
std    7.5   9.3      8.7
None
p-value: 0.7797515178913486


In [662]:
chi2_test('sex_female', df_df)

Absolute frequencies:
            Mix  Reco  Overall
sex_female                    
False        14    30       44
True          5    10       15

Relative probabilities:
             Mix  Reco  Overall
sex_female                     
False       73.7  75.0     74.6
True        26.3  25.0     25.4
None
p-value: 0.9143674293438667


In [663]:
t_test('bmi', df_df)

            Mix  Reco  Overall
mean  24.299999  23.0     23.4
std    3.700000   4.2      4.1
None
p-value: 0.29794716373633423


In [664]:
chi2_test('comorbidity___smoking', df_df)

Absolute frequencies:
                       Mix  Reco  Overall
comorbidity___smoking                    
False                   12    22       34
True                     7    18       25

Relative probabilities:
                        Mix  Reco  Overall
comorbidity___smoking                     
False                  63.2  55.0     57.6
True                   36.8  45.0     42.4
None
p-value: 0.5568847020132546


In [688]:
chi2_test('comorbidity___alcohol', df_df)

Absolute frequencies:
                       Mix  Reco  Overall
comorbidity___alcohol                    
False                   14    29       43
True                     5    11       16

Relative probabilities:
                        Mix  Reco  Overall
comorbidity___alcohol                     
False                  73.7  72.5     72.9
True                   26.3  27.5     27.1
None
p-value: 0.9244828491119699


In [665]:
chi2_test('comorbidity___atherosclerosis', df_df)

Absolute frequencies:
                               Mix  Reco  Overall
comorbidity___atherosclerosis                    
False                           17    31       48
True                             2     9       11

Relative probabilities:
                                Mix  Reco  Overall
comorbidity___atherosclerosis                     
False                          89.5  77.5     81.4
True                           10.5  22.5     18.6
None
p-value: 0.2739391689162969


In [666]:
chi2_test('radiotherapy___post_surgery', df_df)

Absolute frequencies:
                             Mix  Reco  Overall
radiotherapy___post_surgery                    
True                          12    23       35
False                          7    17       24

Relative probabilities:
                              Mix  Reco  Overall
radiotherapy___post_surgery                     
True                         63.2  57.5     59.3
False                        36.8  42.5     40.7
None
p-value: 0.6819083187890074


In [667]:
chi2_test('chemotherapy___post_surgery', df_df)

Absolute frequencies:
                             Mix  Reco  Overall
chemotherapy___post_surgery                    
False                         14    21       35
True                           5    19       24

Relative probabilities:
                              Mix  Reco  Overall
chemotherapy___post_surgery                     
False                        73.7  52.5     59.3
True                         26.3  47.5     40.7
None
p-value: 0.12488364142845684


In [668]:
t_test('surgery_duration_min', df_df)

        Mix   Reco  Overall
mean  702.4  576.5    617.1
std   107.3  102.2    118.8
None
p-value: 5.660997643798076e-05


In [669]:
chi2_test('skin_transplanted', df_df)

Absolute frequencies:
                   Mix  Reco  Overall
skin_transplanted                    
True                14    33       47
False                5     7       12

Relative probabilities:
                    Mix  Reco  Overall
skin_transplanted                     
True               73.7  82.5     79.7
False              26.3  17.5     20.3
None
p-value: 0.43576309397587243


In [670]:
chi2_test('follow_up_completed', df_df)

Absolute frequencies:
                     Mix  Reco  Overall
follow_up_completed                    
True                  12    24       36
False                  7    16       23

Relative probabilities:
                      Mix  Reco  Overall
follow_up_completed                     
True                 63.2  60.0     61.0
False                36.8  40.0     39.0
None
p-value: 0.8177723874627011


## Logistic Regression

In [671]:
vif_df, y = prp.get_x_y(df=df_df, outcome='any_complication', min_follow_up_days=91, scaler='None', drop_cols=drop_cols, inverse_pos=False)
boolean_columns = vif_df.select_dtypes(include=bool).columns
vif_df[boolean_columns] = vif_df[boolean_columns].astype('int')
numeric_columns = vif_df.select_dtypes(include='number').columns
vif_df[numeric_columns] =vif_df[numeric_columns].astype('float64')
X = add_constant(vif_df)
pd.Series([variance_inflation_factor(X.values, i) 
               for i in range(X.shape[1])], 
              index=X.columns)

const                            132.988364
sex_female                         1.195649
comorbidity___smoking              1.559293
comorbidity___alcohol              1.617233
comorbidity___atherosclerosis      1.102941
age_surgery_years                  1.255529
radiotherapy___post_surgery        1.284153
chemotherapy___post_surgery        1.432032
surgery_duration_min               1.632967
bmi                                1.138792
skin_transplanted                  1.260866
plate_type___cad_mix               1.493307
dtype: float64

### Any complication

In [672]:
df_df['comorbidity___autoimmune_disease'].value_counts()

comorbidity___autoimmune_disease
False    56
True      3
Name: count, dtype: Int64

In [673]:
logreg_regularized('any_complication', QuantileTransformer(output_distribution='normal'), df_df, 'l1', alpha=0)

Optimization terminated successfully    (Exit mode 0)
            Current function value: 0.4032554132111667
            Iterations: 66
            Function evaluations: 66
            Gradient evaluations: 66
                           Logit Regression Results                           
Dep. Variable:       any_complication   No. Observations:                   59
Model:                          Logit   Df Residuals:                       47
Method:                           MLE   Df Model:                           11
Date:                Tue, 05 Mar 2024   Pseudo R-squ.:                  0.3284
Time:                        19:51:04   Log-Likelihood:                -23.792
converged:                       True   LL-Null:                       -35.428
Covariance Type:            nonrobust   LLR p-value:                   0.01618
                                    coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------

n_quantiles (1000) is greater than the total number of samples (59). n_quantiles is set to n_samples.


### Soft tissue complication

In [674]:
df_df['soft_tissue_complication'].value_counts()

soft_tissue_complication
True     35
False    24
Name: count, dtype: Int64

In [675]:
logreg_regularized('soft_tissue_complication', QuantileTransformer(output_distribution='normal'), df_df, 'l1', alpha=0)

Optimization terminated successfully    (Exit mode 0)
            Current function value: 0.5361068224720393
            Iterations: 57
            Function evaluations: 57
            Gradient evaluations: 57
                              Logit Regression Results                              
Dep. Variable:     soft_tissue_complication   No. Observations:                   59
Model:                                Logit   Df Residuals:                       47
Method:                                 MLE   Df Model:                           11
Date:                      Tue, 05 Mar 2024   Pseudo R-squ.:                  0.2065
Time:                              19:51:04   Log-Likelihood:                -31.630
converged:                             True   LL-Null:                       -39.864
Covariance Type:                  nonrobust   LLR p-value:                    0.1246
                                    coef    std err          z      P>|z|      [0.025      0.975]
------------

n_quantiles (1000) is greater than the total number of samples (59). n_quantiles is set to n_samples.


### Nonunion

In [676]:
df_df['nonunion'].value_counts()

nonunion
True     21
False    19
Name: count, dtype: Int64

In [677]:
logreg_regularized('nonunion', QuantileTransformer(output_distribution='normal'), df_df, 'l1', alpha=0)

Optimization terminated successfully    (Exit mode 0)
            Current function value: 0.5254445105478194
            Iterations: 60
            Function evaluations: 60
            Gradient evaluations: 60
                           Logit Regression Results                           
Dep. Variable:               nonunion   No. Observations:                   40
Model:                          Logit   Df Residuals:                       28
Method:                           MLE   Df Model:                           11
Date:                Tue, 05 Mar 2024   Pseudo R-squ.:                  0.2406
Time:                        19:51:04   Log-Likelihood:                -21.018
converged:                       True   LL-Null:                       -27.676
Covariance Type:            nonrobust   LLR p-value:                    0.2732
                                    coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------

n_quantiles (1000) is greater than the total number of samples (40). n_quantiles is set to n_samples.


### Wound infection

In [678]:
df_df['wound_infection'].value_counts()

wound_infection
False    38
True     21
Name: count, dtype: Int64

In [679]:
logreg_regularized('wound_infection', QuantileTransformer(output_distribution='normal'), df_df, 'l1', alpha=0)

Optimization terminated successfully    (Exit mode 0)
            Current function value: 0.4962138845688427
            Iterations: 60
            Function evaluations: 60
            Gradient evaluations: 60
                           Logit Regression Results                           
Dep. Variable:        wound_infection   No. Observations:                   59
Model:                          Logit   Df Residuals:                       47
Method:                           MLE   Df Model:                           11
Date:                Tue, 05 Mar 2024   Pseudo R-squ.:                  0.2378
Time:                        19:51:04   Log-Likelihood:                -29.277
converged:                       True   LL-Null:                       -38.411
Covariance Type:            nonrobust   LLR p-value:                   0.07553
                                    coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------

n_quantiles (1000) is greater than the total number of samples (59). n_quantiles is set to n_samples.


### Plate exposure

In [680]:
df_df['complication_plate___exposure'].value_counts()

complication_plate___exposure
False    39
True     20
Name: count, dtype: Int64

In [681]:
logreg_regularized('complication_plate___exposure', QuantileTransformer(output_distribution='normal'), df_df, 'l1', alpha=0)

n_quantiles (1000) is greater than the total number of samples (59). n_quantiles is set to n_samples.


Optimization terminated successfully    (Exit mode 0)
            Current function value: 0.4744457710061555
            Iterations: 62
            Function evaluations: 62
            Gradient evaluations: 62
                                 Logit Regression Results                                
Dep. Variable:     complication_plate___exposure   No. Observations:                   59
Model:                                     Logit   Df Residuals:                       47
Method:                                      MLE   Df Model:                           11
Date:                           Tue, 05 Mar 2024   Pseudo R-squ.:                  0.2591
Time:                                   19:51:04   Log-Likelihood:                -27.992
converged:                                  True   LL-Null:                       -37.781
Covariance Type:                       nonrobust   LLR p-value:                   0.05148
                                    coef    std err          z      P>

## Univariate Analysis

### Plate failure

In [682]:
chi2_test('plate_failure', df_df)

Absolute frequencies:
               Mix  Reco  Overall
plate_failure                    
False           15    39       54
True             4     1        5

Relative probabilities:
                Mix  Reco  Overall
plate_failure                     
False          78.9  97.5     91.5
True           21.1   2.5      8.5
None
p-value: 0.017762871103301235


### Any complication

In [683]:
chi2_test('any_complication', df_df)

Absolute frequencies:
                  Mix  Reco  Overall
any_complication                    
True               15    27       42
False               4    13       17

Relative probabilities:
                   Mix  Reco  Overall
any_complication                     
True              78.9  67.5     71.2
False             21.1  32.5     28.8
None
p-value: 0.36841266267085904


### Soft tissue complication

In [684]:
chi2_test('soft_tissue_complication', df_df)

Absolute frequencies:
                          Mix  Reco  Overall
soft_tissue_complication                    
True                       11    24       35
False                       8    16       24

Relative probabilities:
                           Mix  Reco  Overall
soft_tissue_complication                     
True                      57.9  60.0     59.3
False                     42.1  40.0     40.7
None
p-value: 0.8787876895620828


### Nonunion

In [685]:
chi2_test('nonunion', df_df)

Absolute frequencies:
          Mix  Reco  Overall
nonunion                    
False       6    13       19
True        8    13       21

Relative probabilities:
           Mix  Reco  Overall
nonunion                     
False     42.9  50.0     47.5
True      57.1  50.0     52.5
None
p-value: 0.6700657354052906


### Wound infection

In [686]:
chi2_test('wound_infection', df_df)

Absolute frequencies:
                 Mix  Reco  Overall
wound_infection                    
False             11    27       38
True               8    13       21

Relative probabilities:
                  Mix  Reco  Overall
wound_infection                     
False            57.9  67.5     64.4
True             42.1  32.5     35.6
None
p-value: 0.47529792916920766


### Plate exposure

In [687]:
chi2_test('complication_plate___exposure', df_df)

Absolute frequencies:
                               Mix  Reco  Overall
complication_plate___exposure                    
False                           11    28       39
True                             8    12       20

Relative probabilities:
                                Mix  Reco  Overall
complication_plate___exposure                     
False                          57.9  70.0     66.1
True                           42.1  30.0     33.9
None
p-value: 0.3628155101053636
