<a href="https://colab.research.google.com/github/philipp-lampert/mymandible/blob/main/data_science/05_model_training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Model training

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [24]:
!pip install notebook scikit-learn-intelex

Collecting scikit-learn-intelex
  Downloading scikit_learn_intelex-2024.0.1-py310-none-manylinux1_x86_64.whl (122 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m122.7/122.7 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
Collecting daal4py==2024.0.1 (from scikit-learn-intelex)
  Downloading daal4py-2024.0.1-py310-none-manylinux1_x86_64.whl (10.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.0/10.0 MB[0m [31m67.1 MB/s[0m eta [36m0:00:00[0m
Collecting daal==2024.0.1 (from daal4py==2024.0.1->scikit-learn-intelex)
  Downloading daal-2024.0.1-py2.py3-none-manylinux1_x86_64.whl (76.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.0/76.0 MB[0m [31m22.1 MB/s[0m eta [36m0:00:00[0m
Collecting jedi>=0.16 (from ipython>=5.0.0->ipykernel->notebook)
  Downloading jedi-0.19.1-py2.py3-none-any.whl (1.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m69.5 MB/s[0m eta [36m0:00:00[0m
I

In [25]:
from sklearnex import patch_sklearn
patch_sklearn()

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [26]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import RobustScaler, StandardScaler, MinMaxScaler, Normalizer, QuantileTransformer
from sklearn.metrics import make_scorer, matthews_corrcoef, f1_score, accuracy_score, average_precision_score, roc_auc_score
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import cross_validate, StratifiedKFold, HalvingGridSearchCV

In [27]:
df_dropped_first_cca = pd.read_parquet('/content/drive/MyDrive/mymandible/data_science/data/dropped_first_cca.parquet')
df_dropped_first_imp = pd.read_parquet('/content/drive/MyDrive/mymandible/data_science/data/dropped_first_imputed.parquet')

df_all_levels_cca = pd.read_parquet('/content/drive/MyDrive/mymandible/data_science/data/all_levels_cca.parquet')
df_all_levels_imp = pd.read_parquet('/content/drive/MyDrive/mymandible/data_science/data/all_levels_imputed.parquet')

We first define a function that imports the prepared CCA and imputed datasets and splits them into train and test sets.

In [28]:
def get_x_y(df, outcome, min_follow_up_days, scaling, remove_cols):

  first_outcome_var = df.columns.get_loc('days_to_follow_up')
  predictors = df.columns[:first_outcome_var].tolist()

  data = df[df['days_to_follow_up'] >= min_follow_up_days].copy()
  data['days_to_flap_loss'] = data['days_to_flap_loss'].fillna(10000)
  data = data[data['days_to_flap_loss'] >= min_follow_up_days]
  data = data[predictors + [outcome]].dropna()

  data.drop(remove_cols, axis=1)

  if scaling != False:
    numeric_columns = data[predictors].select_dtypes(np.number).columns.tolist()
    scaler = scaling
    data[numeric_columns] = scaler.fit_transform(data[numeric_columns])

  return data[predictors], data[outcome]

In [29]:
def optimized_accuracy(y_test, y_pred):
  thresholds = np.arange(0.1, 0.9, 0.05)
  best_acc = 0

  for threshold in thresholds:
    predicted_labels = (y_pred >= threshold).astype(int)
    acc = accuracy_score(y_test, predicted_labels)
    if acc > best_acc:
      best_acc = acc

  return best_acc


In [30]:
def optimized_f1(y_test, y_pred):
  thresholds = np.arange(0.1, 0.9, 0.05)
  best_f1 = 0

  for threshold in thresholds:
    predicted_labels = (y_pred >= threshold).astype(int)
    f1 = f1_score(y_test, predicted_labels)
    if f1 > best_f1:
      best_f1 = f1

  return best_f1


In [31]:
def optimized_mcc(y_test, y_pred):
  thresholds = np.arange(0.1, 0.9, 0.05)
  best_mcc = -1

  for threshold in thresholds:
    predicted_labels = (y_pred >= threshold).astype(int)
    mcc = matthews_corrcoef(y_test, predicted_labels)
    if mcc > best_mcc:
        best_mcc = mcc

  return best_mcc

In [32]:
#Make scoring functions
acc_scorer = make_scorer(optimized_accuracy, needs_proba=True)
f1_scorer = make_scorer(optimized_f1, needs_proba=True)
mcc_scorer = make_scorer(optimized_mcc, needs_proba=True)

# Declare the inner and outer cross-validation strategies
inner_cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=0)
outer_cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=0)

def nested_cv(outcome, model, parameter_grid, min_follow_up_days, scaling, df, remove_cols):

  x, y = get_x_y(df, outcome, min_follow_up_days, scaling, remove_cols)

  # Inner cross-validation for parameter search
  inner_model = HalvingGridSearchCV(estimator=model, param_grid=parameter_grid, cv=inner_cv, n_jobs=-1, factor=2, scoring='average_precision')

  # Outer cross-validation to compute the testing score
  cv_results = cross_validate(inner_model, x, y, cv=outer_cv, n_jobs=-1, scoring={'mcc': mcc_scorer, 'f1': f1_scorer, 'accuracy': acc_scorer, 'pr_auc': 'average_precision', 'roc_auc': 'roc_auc'})

  print("Mean MCC: "
      f"{cv_results['test_mcc'].mean():.3f} ± {cv_results['test_mcc'].std():.3f}")
  print("Mean F1: "
      f"{cv_results['test_f1'].mean():.3f} ± {cv_results['test_f1'].std():.3f}")
  print("Mean Accuracy: "
      f"{cv_results['test_accuracy'].mean():.3f} ± {cv_results['test_accuracy'].std():.3f}")
  print("Mean PR AUC: "
      f"{cv_results['test_pr_auc'].mean():.3f} ± {cv_results['test_pr_auc'].std():.3f}")
  print("Mean ROC AUC: "
      f"{cv_results['test_roc_auc'].mean():.3f} ± {cv_results['test_roc_auc'].std():.3f}")

In [33]:
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import cross_validate, StratifiedKFold, RepeatedStratifiedKFold, HalvingGridSearchCV
from sklearn.metrics import make_scorer, matthews_corrcoef, f1_score, accuracy_score, average_precision_score, roc_auc_score

#Make scoring functions
acc_scorer = make_scorer(optimized_accuracy, needs_proba=True)
f1_scorer = make_scorer(optimized_f1, needs_proba=True)
mcc_scorer = make_scorer(optimized_mcc, needs_proba=True)

# Declare the inner and outer cross-validation strategies
inner_cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=0)
outer_cv = RepeatedStratifiedKFold(n_splits=3, n_repeats=10, random_state=0)

def nested_repeated_cv(outcome, model, parameter_grid, min_follow_up_days, scaling, df, remove_cols):

  x, y = get_x_y(df, outcome, min_follow_up_days, scaling, remove_cols)

  # Inner cross-validation for parameter search
  inner_model = HalvingGridSearchCV(estimator=model, param_grid=parameter_grid, cv=inner_cv, n_jobs=-1, factor=2, scoring='average_precision')

  # Outer cross-validation to compute the testing score
  cv_results = cross_validate(inner_model, x, y, cv=outer_cv, n_jobs=-1, scoring={'mcc': mcc_scorer, 'f1': f1_scorer, 'accuracy': acc_scorer, 'pr_auc': 'average_precision', 'roc_auc': 'roc_auc'})

  print("Mean MCC: "
      f"{cv_results['test_mcc'].mean():.3f} ± {cv_results['test_mcc'].std():.3f}")
  print("Mean F1: "
      f"{cv_results['test_f1'].mean():.3f} ± {cv_results['test_f1'].std():.3f}")
  print("Mean Accuracy: "
      f"{cv_results['test_accuracy'].mean():.3f} ± {cv_results['test_accuracy'].std():.3f}")
  print("Mean PR AUC: "
      f"{cv_results['test_pr_auc'].mean():.3f} ± {cv_results['test_pr_auc'].std():.3f}")
  print("Mean ROC AUC: "
      f"{cv_results['test_roc_auc'].mean():.3f} ± {cv_results['test_roc_auc'].std():.3f}")

### Logistic Regression

In [35]:
from sklearn.linear_model import LogisticRegression

weights = np.arange(0, 1, 0.1)
lr_param_grid = {
    'max_iter': [7500],
    "C": np.arange(0.1, 5, 0.2),
    'class_weight': [{0:x, 1:1-x} for x in weights],
    'solver': ['lbfgs', 'liblinear', 'newton-cg', 'sag', 'saga']
}

In [36]:
df_dropped_first_imp['complication_plate___exposure'].value_counts()

False    229
True      68
Name: complication_plate___exposure, dtype: Int64

####Plate exposure

In [37]:
remove_cols_pe = ['venous_anastomosis_type___end_end', 'venous_anastomosis_type___end_side', 'urkens_classification___c', 'surgery_duration_min']

In [39]:
nested_cv(outcome='complication_plate___exposure', model=LogisticRegression(), parameter_grid=lr_param_grid, min_follow_up_days=60, scaling=RobustScaler(), df=df_dropped_first_cca, remove_cols=remove_cols_pe)

Mean MCC: 0.151 ± 0.131
Mean F1: 0.384 ± 0.118
Mean Accuracy: 0.607 ± 0.242
Mean PR AUC: 0.374 ± 0.071
Mean ROC AUC: 0.643 ± 0.073


In [38]:
nested_repeated_cv(outcome='complication_plate___exposure', model=LogisticRegression(), parameter_grid=lr_param_grid, min_follow_up_days=60, scaling=RobustScaler(), df=df_dropped_first_cca, remove_cols=remove_cols_pe)

Mean MCC: 0.151 ± 0.139
Mean F1: 0.420 ± 0.091
Mean Accuracy: 0.542 ± 0.249
Mean PR AUC: 0.381 ± 0.069
Mean ROC AUC: 0.647 ± 0.076


In [None]:
nested_cv_bootstrapping(outcome='complication_plate___exposure', model=LogisticRegression(), parameter_grid=lr_param_grid, min_follow_up_days=60, scaling=RobustScaler(), df=df_dropped_first_cca, remove_cols=remove_cols_pe)

####Nonunion

In [149]:
remove_cols_nu = ['venous_anastomosis_type___end_end', 'venous_anastomosis_type___end_side', 'urkens_classification___c', 'surgery_duration_min']

In [150]:
nested_cv(outcome='nonunion', model=LogisticRegression(), parameter_grid=lr_param_grid, min_follow_up_days=180, scaling=True, df=df_dropped_first_cca, remove_cols=remove_cols_nu)

Mean MCC: 0.180 ± 0.040
Mean F1: 0.631 ± 0.003
Mean Accuracy: 0.552 ± 0.004
Mean PR AUC: 0.446 ± 0.006
Mean ROC AUC: 0.512 ± 0.036


In [151]:
nested_cv_bootstrapping(outcome='nonunion', model=LogisticRegression(), parameter_grid=lr_param_grid, min_follow_up_days=180, scaling=True, df=df_dropped_first_cca, remove_cols=remove_cols_nu)

Mean MCC: 0.109 ± 0.091
Mean F1: 0.569 ± 0.057
Mean Accuracy: 0.521 ± 0.104
Mean PR AUC: 0.447 ± 0.061
Mean ROC AUC: 0.512 ± 0.057


####Soft tissue complication

In [152]:
remove_cols_stx = ['venous_anastomosis_type___end_end', 'venous_anastomosis_type___end_side', 'urkens_classification___c']

In [153]:
nested_cv(outcome='soft_tissue_complication', model=LogisticRegression(), parameter_grid=lr_param_grid, min_follow_up_days=30, scaling=True, df=df_dropped_first_cca, remove_cols=remove_cols_stx)

Mean MCC: 0.099 ± 0.091
Mean F1: 0.602 ± 0.084
Mean Accuracy: 0.542 ± 0.047
Mean PR AUC: 0.558 ± 0.040
Mean ROC AUC: 0.556 ± 0.032


In [154]:
nested_cv_bootstrapping(outcome='soft_tissue_complication', model=LogisticRegression(), parameter_grid=lr_param_grid, min_follow_up_days=30, scaling=True, df=df_dropped_first_cca, remove_cols=remove_cols_stx)

Mean MCC: 0.248 ± 0.097
Mean F1: 0.666 ± 0.088
Mean Accuracy: 0.608 ± 0.065
Mean PR AUC: 0.642 ± 0.090
Mean ROC AUC: 0.642 ± 0.057


####Wound infection

In [155]:
remove_cols_wi = ['venous_anastomosis_type___end_end', 'venous_anastomosis_type___end_side', 'urkens_classification___c']

In [156]:
nested_cv(outcome='wound_infection', model=LogisticRegression(), parameter_grid=lr_param_grid, min_follow_up_days=60, scaling=True, df=df_dropped_first_cca, remove_cols=remove_cols_wi)

Mean MCC: 0.107 ± 0.096
Mean F1: 0.426 ± 0.036
Mean Accuracy: 0.578 ± 0.217
Mean PR AUC: 0.320 ± 0.064
Mean ROC AUC: 0.519 ± 0.058


In [None]:
nested_cv_bootstrapping(outcome='wound_infection', model=LogisticRegression(), parameter_grid=lr_param_grid, min_follow_up_days=60, scaling=True, df=df_dropped_first_cca, remove_cols=remove_cols_wi)

1250 fits failed out of a total of 3750.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1000 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_logistic.py", line 1241, in fit
    raise ValueError(
ValueError: This solver needs samples of at least 2 classes in the data, but the data contains only one class: 0.0

--------------------------------------------------------------------------------
250 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/lo

####Flap loss

In [None]:
remove_cols_fl = []

In [159]:
nested_cv(outcome='flap_loss', model=LogisticRegression(), parameter_grid=lr_param_grid, min_follow_up_days=0, scaling=True, df=df_dropped_first_cca, remove_cols=remove_cols_fl)

Mean MCC: nan ± nan
Mean F1: nan ± nan
Mean Accuracy: nan ± nan
Mean PR AUC: nan ± nan
Mean ROC AUC: nan ± nan


1 fits failed out of a total of 3.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_search_successive_halving.py", line 273, in fit
    super().fit(X, y=y, groups=groups, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_search.py", line 874, in fit
    self._run_search(evaluate_candidates)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_search_successive_halving.py", 

In [None]:
nested_cv_bootstrapping(outcome='flap_loss', model=LogisticRegression(), parameter_grid=lr_param_grid, min_follow_up_days=0, scaling=True, df=df_dropped_first_cca, remove_cols=remove_cols_fl)

2500 fits failed out of a total of 3750.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
2000 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_logistic.py", line 1241, in fit
    raise ValueError(
ValueError: This solver needs samples of at least 2 classes in the data, but the data contains only one class: 0.0

--------------------------------------------------------------------------------
500 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/lo

##Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

rf_param_grid = {
    'n_estimators': [50, 100, 500, 1000, 5000],
    'max_depth': np.arange(2, 8, 1),
    'min_samples': np.arange(0, 1, 0.1),
    'min_weight_fraction_leaf': np.arange(0, 1, 0.1),
    'max_features': np.arange(0, 1, 0.1)
}

####Plate exposure

In [None]:
nested_cv(outcome='complication_plate___exposure', model=RandomForestClassifier(n_jobs=-1, random_state=0), parameter_grid=rf_param_grid, min_follow_up_days=90, scaling=False, df=df_all_levels_cca, remove_cols=remove_cols_pe)

In [None]:
nested_cv_bootstrapping(outcome='complication_plate___exposure', model=RandomForestClassifier(n_jobs=-1, random_state=0), parameter_grid=rf_param_grid, min_follow_up_days=90, scaling=False, df=df_all_levels_cca, remove_cols=remove_cols_pe)

####Nonunion

In [None]:
nested_cv(outcome='nonunion', model=RandomForestClassifier(n_jobs=-1, random_state=0), parameter_grid=rf_param_grid, min_follow_up_days=180, scaling=False, df=df_all_levels_cca, remove_cols=remove_cols_nu)

In [None]:
nested_cv_bootstrapping(outcome='nonunion', model=RandomForestClassifier(n_jobs=-1, random_state=0), parameter_grid=rf_param_grid, min_follow_up_days=180, scaling=False, df=df_all_levels_cca, remove_cols=remove_cols_nu)

####Soft tissue complication

In [None]:
nested_cv(outcome='soft_tissue_complication', model=RandomForestClassifier(n_jobs=-1, random_state=0), parameter_grid=rf_param_grid, min_follow_up_days=180, scaling=False, df=df_all_levels_cca, remove_cols=remove_cols_stx)

In [None]:
nested_cv_bootstrapping(outcome='soft_tissue_complication', model=RandomForestClassifier(n_jobs=-1, random_state=0), parameter_grid=rf_param_grid, min_follow_up_days=180, scaling=False, df=df_all_levels_cca, remove_cols=remove_cols_stx)

####Wound infection

In [None]:
nested_cv(outcome='wound_infection', model=RandomForestClassifier(n_jobs=-1, random_state=0), parameter_grid=rf_param_grid, min_follow_up_days=180, scaling=False, df=df_all_levels_cca, remove_cols=remove_cols_wi)

In [None]:
nested_cv_bootstrapping(outcome='wound_infection', model=RandomForestClassifier(n_jobs=-1, random_state=0), parameter_grid=rf_param_grid, min_follow_up_days=180, scaling=False, df=df_all_levels_cca, remove_cols=remove_cols_wi)

####Flap loss

In [None]:
nested_cv(outcome='flap_loss', model=RandomForestClassifier(n_jobs=-1, random_state=0), parameter_grid=rf_param_grid, min_follow_up_days=0, scaling=False, df=df_all_levels_cca, remove_cols=remove_cols_fl)

In [None]:
nested_cv_bootstrapping(outcome='flap_loss', model=RandomForestClassifier(n_jobs=-1, random_state=0), parameter_grid=rf_param_grid, min_follow_up_days=0, scaling=False, df=df_all_levels_cca, remove_cols=remove_cols_fl)

##kNN Classifier

In [None]:
from sklearn.neighbors import KNeighborsClassifier

knn_param_grid = {
    'n_neighbors': np.arange(2, 8, 1),
    'weights': ['uniform', 'distance'],
    'leaf_size': np.arange(5, 45, 10),
    'p': np.arange(0.5, 5, 0.5)
}

In [None]:
nested_cv(outcome='complication_plate___exposure', model=KNeighborsClassifier(n_jobs=-1), parameter_grid=knn_param_grid, min_follow_up_days=60, scaling=True, df=df_all_levels_cca, remove_cols=remove_cols_pe)

####Nonunion

In [None]:
nested_cv(outcome='nonunion', model=KNeighborsClassifier(n_jobs=-1), parameter_grid=knn_param_grid, min_follow_up_days=180, scaling=True, df=df_all_levels_cca, remove_cols=remove_cols_nu)

##Old

In [34]:
from sklearn.model_selection import train_test_split
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import cross_validate, StratifiedKFold, HalvingGridSearchCV
from sklearn.metrics import make_scorer, matthews_corrcoef, f1_score, accuracy_score, average_precision_score, roc_auc_score

inner_cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=0)

def nested_cv_bootstrapping(outcome, model, parameter_grid, min_follow_up_days, scaling, df, remove_cols):

  metrics = {'f1': [], 'mcc': [], 'accuracy': [], 'pr_auc': [], 'roc_auc': []}

  n_bootstrap = 10
  x, y = get_x_y(df, outcome, min_follow_up_days, scaling, remove_cols)

  for i in range(n_bootstrap):

    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, shuffle=True, stratify=y, random_state=i)
    print(y_train.value_counts())

    # Inner loop: Grid search for hyperparameter tuning
    inner_model = HalvingGridSearchCV(estimator=model, param_grid=parameter_grid, cv=inner_cv, factor=2, n_jobs=-1, scoring='average_precision')
    inner_model.fit(x_train, y_train)

    # Get the best hyperparameters from the inner loop
    best_params = inner_model.best_params_

    # Evaluate the selected hyperparameters on the remaining 30% of the data
    model.set_params(**best_params)
    model.fit(x_train, y_train)
    y_pred = model.predict_proba(x_test)

    metrics['f1'].append(optimized_f1(y_test, y_pred[:, 1]))
    metrics['mcc'].append(optimized_mcc(y_test, y_pred[:, 1]))
    metrics['accuracy'].append(optimized_accuracy(y_test, y_pred[:, 1]))
    metrics['pr_auc'].append(average_precision_score(y_test, y_pred[:, 1]))
    metrics['roc_auc'].append(roc_auc_score(y_test, y_pred[:, 1]))


  print("Mean MCC: "
      f"{np.mean(metrics['mcc']):.3f} ± {np.std(metrics['mcc']):.3f}")
  print("Mean F1: "
      f"{np.mean(metrics['f1']):.3f} ± {np.std(metrics['f1']):.3f}")
  print("Mean Accuracy: "
      f"{np.mean(metrics['accuracy']):.3f} ± {np.std(metrics['accuracy']):.3f}")
  print("Mean PR AUC: "
      f"{np.mean(metrics['pr_auc']):.3f} ± {np.std(metrics['pr_auc']):.3f}")
  print("Mean ROC AUC: "
      f"{np.mean(metrics['roc_auc']):.3f} ± {np.std(metrics['roc_auc']):.3f}")