<a href="https://colab.research.google.com/github/philipp-lampert/mymandible/blob/main/data_science/05_model_training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Model training

### Importing libraries

In [2]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import RobustScaler, StandardScaler, MinMaxScaler, Normalizer, QuantileTransformer
from sklearn.metrics import make_scorer, matthews_corrcoef, f1_score, accuracy_score, average_precision_score, roc_auc_score
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import cross_validate, StratifiedKFold, RepeatedStratifiedKFold, GridSearchCV, HalvingGridSearchCV
from sklearn.linear_model import LogisticRegression 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

### Importing data

In [6]:
df_dropped_first_cca = pd.read_parquet('/Users/philipp.lampert/repositories/mymandible/data/dropped_first_cca.parquet')
df_dropped_first_imp = pd.read_parquet('/Users/philipp.lampert/repositories/mymandible/data/dropped_first_imputed.parquet')

df_all_levels_cca = pd.read_parquet('/Users/philipp.lampert/repositories/mymandible/data/all_levels_cca.parquet')
df_all_levels_imp = pd.read_parquet('/Users/philipp.lampert/repositories/mymandible/data/dropped_first_imputed.parquet')

In [7]:
df_dropped_first_cca.columns.values

array(['sex_female', 'comorbidity___smoking', 'comorbidity___alcohol',
       'comorbidity___copd', 'comorbidity___hypertension',
       'comorbidity___diabetes', 'comorbidity___atherosclerosis',
       'comorbidity___hyperlipidemia', 'comorbidity___hypothyroidism',
       'comorbidity___chronic_kidney_disease',
       'comorbidity___autoimmune_disease', 'age_surgery_years',
       'radiotherapy___pre_surgery', 'radiotherapy___post_surgery',
       'chemotherapy___pre_surgery', 'chemotherapy___post_surgery',
       'urkens_classification___c', 'urkens_classification___r',
       'urkens_classification___s', 'flap_segment_count',
       'surgery_duration_min', 'bmi', 'skin_transplanted',
       'venous_anastomosis_type___end_end',
       'venous_anastomosis_type___end_side',
       'indication___osteoradionecrosis',
       'indication___secondary_reconstruction', 'prior_flap___bony',
       'prior_flap___non_bony', 'flap_donor_site___scapula',
       'plate_type___cad_mini', 'plate_type

## Defining functions

### Preprocessing

In [8]:
def get_x_y(df, outcome, min_follow_up_days, scaling, remove_cols):

    first_outcome_var = df.columns.get_loc('days_to_follow_up')
    predictors = df.columns[:first_outcome_var].tolist()

    data = df[df['days_to_follow_up'] >= min_follow_up_days].copy()
    data['days_to_flap_loss'] = data['days_to_flap_loss'].fillna(10000)
    data = data[data['days_to_flap_loss'] >= min_follow_up_days]
    data = data[predictors + [outcome]].dropna()

    data.drop(remove_cols, axis=1)

    if scaling != 'None':
        numeric_columns = data[predictors].select_dtypes(np.number).columns.tolist()
        scaler = scaling
        data[numeric_columns] = scaler.fit_transform(data[numeric_columns])

    return data[predictors], data[outcome]

### Scoring metrics

In [9]:
def optimized_accuracy(y_test, y_pred):
    thresholds = np.arange(0.05, 0.95, 0.05)
    best_acc = 0

    for threshold in thresholds:
        predicted_labels = (y_pred >= threshold).astype(int)
        acc = accuracy_score(y_test, predicted_labels)
        if acc > best_acc:
            best_acc = acc

    return best_acc

In [10]:
def optimized_f1(y_test, y_pred):
    thresholds = np.arange(0.05, 0.95, 0.05)
    best_f1 = 0

    for threshold in thresholds:
        predicted_labels = (y_pred >= threshold).astype(int)
        f1 = f1_score(y_test, predicted_labels)
        if f1 > best_f1:
            best_f1 = f1

    return best_f1

In [11]:
def optimized_mcc(y_test, y_pred):
    thresholds = np.arange(0.05, 0.95, 0.05)
    best_mcc = -1

    for threshold in thresholds:
        predicted_labels = (y_pred >= threshold).astype(int)
        mcc = matthews_corrcoef(y_test, predicted_labels)
        if mcc > best_mcc:
            best_mcc = mcc

    return best_mcc

In [12]:
acc_scorer = make_scorer(optimized_accuracy, needs_proba=True)
f1_scorer = make_scorer(optimized_f1, needs_proba=True)
mcc_scorer = make_scorer(optimized_mcc, needs_proba=True)

### Nested CV

In [13]:
# Draft, not yet in use
outcome_counts = df_all_levels_cca['complication_plate___exposure'].value_counts().tolist()
outcome_counts = outcome_counts[1]
nested_inner_k = round(outcome_counts / 20, 0)
nested_outer_k = round(outcome_counts / 20, 0)
repeated_inner_k = round(outcome_counts / 12, 0)
repeated_outer_k = round(outcome_counts / 20, 0)

In [14]:
def nested_cv(outcome, min_follow_up_days, inner_k, outer_k, remove_cols, model, parameter_grid, scaling, df):
    
    inner_cv = StratifiedKFold(n_splits=inner_k, shuffle=True, random_state=0)
    outer_cv = StratifiedKFold(n_splits=outer_k, shuffle=True, random_state=0)   

    x, y = get_x_y(df, outcome, min_follow_up_days, scaling, remove_cols) 

    # Inner cross-validation for parameter search
    inner_model = GridSearchCV(estimator=model, param_grid=parameter_grid, cv=inner_cv, n_jobs=-1, scoring='average_precision')

    # Outer cross-validation to compute the testing score
    cv_results = cross_validate(inner_model, x, y, cv=outer_cv, n_jobs=-1, scoring={'mcc': mcc_scorer, 'f1': f1_scorer, 'accuracy': acc_scorer, 'pr_auc': 'average_precision', 'roc_auc': 'roc_auc'})

    print("Mean MCC: "f"{cv_results['test_mcc'].mean():.3f} ± {cv_results['test_mcc'].std():.3f}")
    print("Mean F1: "f"{cv_results['test_f1'].mean():.3f} ± {cv_results['test_f1'].std():.3f}")
    print("Mean Accuracy: "f"{cv_results['test_accuracy'].mean():.3f} ± {cv_results['test_accuracy'].std():.3f}")
    print("Mean PR AUC: "f"{cv_results['test_pr_auc'].mean():.3f} ± {cv_results['test_pr_auc'].std():.3f}")
    print("Mean ROC AUC: "f"{cv_results['test_roc_auc'].mean():.3f} ± {cv_results['test_roc_auc'].std():.3f}")

### Nested repeated CV

In [20]:
def nested_repeated_cv(outcome, min_follow_up_days, inner_k, outer_k, remove_cols, model, parameter_grid, scaling, df):

    inner_cv = StratifiedKFold(n_splits=inner_k, shuffle=True, random_state=0)
    outer_cv = RepeatedStratifiedKFold(n_splits=outer_k, n_repeats=8, random_state=0)

    x, y = get_x_y(df, outcome, min_follow_up_days, scaling, remove_cols)

    # Inner cross-validation for parameter search
    inner_model = GridSearchCV(estimator=model, param_grid=parameter_grid, cv=inner_cv, n_jobs=-1, scoring='average_precision')

    # Outer cross-validation to compute the testing score
    cv_results = cross_validate(inner_model, x, y, cv=outer_cv, n_jobs=-1, scoring={'mcc': mcc_scorer, 'f1': f1_scorer, 'accuracy': acc_scorer, 'pr_auc': 'average_precision', 'roc_auc': 'roc_auc'})

    print("Mean MCC: "f"{cv_results['test_mcc'].mean():.3f} ± {cv_results['test_mcc'].std():.3f}")
    print("Mean F1: "f"{cv_results['test_f1'].mean():.3f} ± {cv_results['test_f1'].std():.3f}")
    print("Mean Accuracy: "f"{cv_results['test_accuracy'].mean():.3f} ± {cv_results['test_accuracy'].std():.3f}")
    print("Mean PR AUC: "f"{cv_results['test_pr_auc'].mean():.3f} ± {cv_results['test_pr_auc'].std():.3f}")
    print("Mean ROC AUC: "f"{cv_results['test_roc_auc'].mean():.3f} ± {cv_results['test_roc_auc'].std():.3f}")

### Logistic Regression

In [16]:
weights = np.arange(0, 1, 0.05)
lr_param_grid = {
    'max_iter': [5000],
    'C': np.arange(0.1, 10, 0.2),
    'class_weight': [{0:x, 1:1-x} for x in weights],
    #'solver': ['lbfgs', 'liblinear', 'newton-cg', 'sag', 'saga']
    'solver': ['lbfgs', 'liblinear', 'newton-cg']
}

#### Plate exposure

In [17]:
remove_cols_pe = ['venous_anastomosis_type___end_end', 'venous_anastomosis_type___end_side', 'urkens_classification___c', 'surgery_duration_min']

##### Complete-case-analysis

In [18]:
nested_cv(
    outcome = 'complication_plate___exposure', 
    min_follow_up_days = 60, 
    inner_k = 3,
    outer_k = 3,
    remove_cols = remove_cols_pe,
    model = LogisticRegression(), 
    parameter_grid = lr_param_grid, 
    scaling = RobustScaler(), 
    df = df_dropped_first_cca
)

Mean MCC: 0.064 ± 0.090
Mean F1: 0.396 ± 0.021
Mean Accuracy: 0.413 ± 0.251
Mean PR AUC: 0.338 ± 0.093
Mean ROC AUC: 0.570 ± 0.105


In [19]:
nested_repeated_cv(
    outcome = 'complication_plate___exposure', 
    min_follow_up_days = 60,
    inner_k = 5,
    outer_k = 4,
    remove_cols = remove_cols_pe, 
    model = LogisticRegression(), 
    parameter_grid = lr_param_grid,  
    scaling = RobustScaler(), 
    df = df_dropped_first_cca
)

Mean MCC: 0.220 ± 0.124
Mean F1: 0.439 ± 0.092
Mean Accuracy: 0.665 ± 0.197
Mean PR AUC: 0.375 ± 0.083
Mean ROC AUC: 0.643 ± 0.095


##### Imputed data

In [21]:
nested_cv(
    outcome = 'complication_plate___exposure', 
    min_follow_up_days = 60, 
    inner_k = 3,
    outer_k = 3,
    remove_cols = remove_cols_pe,
    model = LogisticRegression(), 
    parameter_grid = lr_param_grid, 
    scaling = RobustScaler(), 
    df = df_dropped_first_imp
)

Mean MCC: 0.190 ± 0.141
Mean F1: 0.454 ± 0.039
Mean Accuracy: 0.582 ± 0.233
Mean PR AUC: 0.325 ± 0.077
Mean ROC AUC: 0.595 ± 0.123


In [22]:
nested_repeated_cv(
    outcome = 'complication_plate___exposure', 
    min_follow_up_days = 60,
    inner_k = 5,
    outer_k = 4,
    remove_cols = remove_cols_pe, 
    model = LogisticRegression(), 
    parameter_grid = lr_param_grid,  
    scaling = RobustScaler(), 
    df = df_dropped_first_imp
)

Mean MCC: 0.212 ± 0.128
Mean F1: 0.450 ± 0.079
Mean Accuracy: 0.610 ± 0.216
Mean PR AUC: 0.378 ± 0.084
Mean ROC AUC: 0.649 ± 0.082


#### Nonunion

In [None]:
remove_cols_nu = ['venous_anastomosis_type___end_end', 'venous_anastomosis_type___end_side', 'urkens_classification___c', 'surgery_duration_min']

In [None]:
nested_cv(outcome='nonunion', model=LogisticRegression(), parameter_grid=lr_param_grid, min_follow_up_days=180, scaling=RobustScaler(), df=df_dropped_first_cca, remove_cols=remove_cols_nu)

In [None]:
nested_repeated_cv(outcome='nonunion', model=LogisticRegression(), parameter_grid=lr_param_grid, min_follow_up_days=180, scaling=True, df=df_dropped_first_cca, remove_cols=remove_cols_nu)

#### Soft tissue complication

In [None]:
remove_cols_stx = ['venous_anastomosis_type___end_end', 'venous_anastomosis_type___end_side', 'urkens_classification___c']

In [None]:
nested_cv(outcome='soft_tissue_complication', model=LogisticRegression(), parameter_grid=lr_param_grid, min_follow_up_days=30, scaling='None', df=df_dropped_first_cca, remove_cols=remove_cols_stx)

In [None]:
nested_cv_bootstrapping(outcome='soft_tissue_complication', model=LogisticRegression(), parameter_grid=lr_param_grid, min_follow_up_days=30, scaling=True, df=df_dropped_first_cca, remove_cols=remove_cols_stx)

#### Wound infection

In [None]:
remove_cols_wi = ['venous_anastomosis_type___end_end', 'venous_anastomosis_type___end_side', 'urkens_classification___c']

In [None]:
nested_cv(outcome='wound_infection', model=LogisticRegression(), parameter_grid=lr_param_grid, min_follow_up_days=60, scaling=True, df=df_dropped_first_cca, remove_cols=remove_cols_wi)

In [None]:
nested_cv_bootstrapping(outcome='wound_infection', model=LogisticRegression(), parameter_grid=lr_param_grid, min_follow_up_days=60, scaling=True, df=df_dropped_first_cca, remove_cols=remove_cols_wi)

#### Flap loss

In [None]:
remove_cols_fl = []

In [None]:
nested_cv(outcome='flap_loss', model=LogisticRegression(), parameter_grid=lr_param_grid, min_follow_up_days=0, scaling=True, df=df_dropped_first_cca, remove_cols=remove_cols_fl)

In [None]:
nested_cv_bootstrapping(outcome='flap_loss', model=LogisticRegression(), parameter_grid=lr_param_grid, min_follow_up_days=0, scaling=True, df=df_dropped_first_cca, remove_cols=remove_cols_fl)

## Random Forest

In [29]:
rf_param_grid = {
    'n_estimators': [50, 100, 500, 1000],
    'max_depth': np.arange(2, 8, 1),
    'min_samples_split': np.arange(0, 1, 0.1),
    #'min_weight_fraction_leaf': np.arange(0, 1, 0.1),
    'max_features': np.arange(0, 1, 0.1)
}

#### Plate exposure

##### CCA

In [None]:
nested_cv(
    outcome = 'complication_plate___exposure', 
    min_follow_up_days = 60, 
    inner_k = 3,
    outer_k = 3,
    remove_cols = remove_cols_pe,
    model = RandomForestClassifier(), 
    parameter_grid = rf_param_grid, 
    scaling = 'None', 
    df = df_all_levels_cca
)

In [None]:
nested_cv_bootstrapping(outcome='complication_plate___exposure', model=RandomForestClassifier(n_jobs=-1, random_state=0), parameter_grid=rf_param_grid, min_follow_up_days=90, scaling=False, df=df_all_levels_cca, remove_cols=remove_cols_pe)

##### Imputed

#### Nonunion

In [None]:
nested_cv(outcome='nonunion', model=RandomForestClassifier(n_jobs=-1, random_state=0), parameter_grid=rf_param_grid, min_follow_up_days=180, scaling=False, df=df_all_levels_cca, remove_cols=remove_cols_nu)

In [None]:
nested_cv_bootstrapping(outcome='nonunion', model=RandomForestClassifier(n_jobs=-1, random_state=0), parameter_grid=rf_param_grid, min_follow_up_days=180, scaling=False, df=df_all_levels_cca, remove_cols=remove_cols_nu)

#### Soft tissue complication

In [None]:
nested_cv(outcome='soft_tissue_complication', model=RandomForestClassifier(n_jobs=-1, random_state=0), parameter_grid=rf_param_grid, min_follow_up_days=180, scaling=False, df=df_all_levels_cca, remove_cols=remove_cols_stx)

In [None]:
nested_cv_bootstrapping(outcome='soft_tissue_complication', model=RandomForestClassifier(n_jobs=-1, random_state=0), parameter_grid=rf_param_grid, min_follow_up_days=180, scaling=False, df=df_all_levels_cca, remove_cols=remove_cols_stx)

#### Wound infection

In [None]:
nested_cv(outcome='wound_infection', model=RandomForestClassifier(n_jobs=-1, random_state=0), parameter_grid=rf_param_grid, min_follow_up_days=180, scaling=False, df=df_all_levels_cca, remove_cols=remove_cols_wi)

In [None]:
nested_cv_bootstrapping(outcome='wound_infection', model=RandomForestClassifier(n_jobs=-1, random_state=0), parameter_grid=rf_param_grid, min_follow_up_days=180, scaling=False, df=df_all_levels_cca, remove_cols=remove_cols_wi)

#### Flap loss

In [None]:
nested_cv(outcome='flap_loss', model=RandomForestClassifier(n_jobs=-1, random_state=0), parameter_grid=rf_param_grid, min_follow_up_days=0, scaling=False, df=df_all_levels_cca, remove_cols=remove_cols_fl)

In [None]:
nested_cv_bootstrapping(outcome='flap_loss', model=RandomForestClassifier(n_jobs=-1, random_state=0), parameter_grid=rf_param_grid, min_follow_up_days=0, scaling=False, df=df_all_levels_cca, remove_cols=remove_cols_fl)

## kNN Classifier

In [None]:
knn_param_grid = {
    'n_neighbors': np.arange(2, 8, 1),
    'weights': ['uniform', 'distance'],
    'leaf_size': np.arange(5, 45, 10),
    'p': np.arange(0.5, 5, 0.5)
}

In [None]:
nested_cv(outcome='complication_plate___exposure', model=KNeighborsClassifier(n_jobs=-1), parameter_grid=knn_param_grid, min_follow_up_days=60, scaling=True, df=df_all_levels_cca, remove_cols=remove_cols_pe)

#### Nonunion

In [None]:
nested_cv(outcome='nonunion', model=KNeighborsClassifier(n_jobs=-1), parameter_grid=knn_param_grid, min_follow_up_days=180, scaling=True, df=df_all_levels_cca, remove_cols=remove_cols_nu)

## DEPRECATED

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import cross_validate, StratifiedKFold, HalvingGridSearchCV
from sklearn.metrics import make_scorer, matthews_corrcoef, f1_score, accuracy_score, average_precision_score, roc_auc_score

inner_cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=0)

def nested_cv_bootstrapping(outcome, model, parameter_grid, min_follow_up_days, scaling, df, remove_cols):

  metrics = {'f1': [], 'mcc': [], 'accuracy': [], 'pr_auc': [], 'roc_auc': []}

  n_bootstrap = 10
  x, y = get_x_y(df, outcome, min_follow_up_days, scaling, remove_cols)

  for i in range(n_bootstrap):

    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, shuffle=True, stratify=y, random_state=i)
    print(y_train.value_counts())

    # Inner loop: Grid search for hyperparameter tuning
    inner_model = HalvingGridSearchCV(estimator=model, param_grid=parameter_grid, cv=inner_cv, factor=2, n_jobs=-1, scoring='average_precision')
    inner_model.fit(x_train, y_train)

    # Get the best hyperparameters from the inner loop
    best_params = inner_model.best_params_

    # Evaluate the selected hyperparameters on the remaining 30% of the data
    model.set_params(**best_params)
    model.fit(x_train, y_train)
    y_pred = model.predict_proba(x_test)

    metrics['f1'].append(optimized_f1(y_test, y_pred[:, 1]))
    metrics['mcc'].append(optimized_mcc(y_test, y_pred[:, 1]))
    metrics['accuracy'].append(optimized_accuracy(y_test, y_pred[:, 1]))
    metrics['pr_auc'].append(average_precision_score(y_test, y_pred[:, 1]))
    metrics['roc_auc'].append(roc_auc_score(y_test, y_pred[:, 1]))


  print("Mean MCC: "
      f"{np.mean(metrics['mcc']):.3f} ± {np.std(metrics['mcc']):.3f}")
  print("Mean F1: "
      f"{np.mean(metrics['f1']):.3f} ± {np.std(metrics['f1']):.3f}")
  print("Mean Accuracy: "
      f"{np.mean(metrics['accuracy']):.3f} ± {np.std(metrics['accuracy']):.3f}")
  print("Mean PR AUC: "
      f"{np.mean(metrics['pr_auc']):.3f} ± {np.std(metrics['pr_auc']):.3f}")
  print("Mean ROC AUC: "
      f"{np.mean(metrics['roc_auc']):.3f} ± {np.std(metrics['roc_auc']):.3f}")