<a href="https://colab.research.google.com/github/philipp-lampert/mymandible/blob/main/data_science/05_model_training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Model training

In [224]:
import numpy as np
import pandas as pd

We first define a function that loads the prepared complete-case analysis (CCA) and imputed datasets and splits each of them into training and test sets.

In [225]:
def cca_imp_train_test_splits(outcome):
  """Load the CCA and imputed datasets and split each into train and test sets.

  Both parquet files are read from the project's GitHub repository. The
  predictor columns are every column of the CCA frame located before
  'flap_revision'; the same column list is applied to the imputed frame.
  Rows with a missing value in any predictor or in `outcome` are dropped
  before splitting, and the outcome is cast to int.

  Args:
    outcome: Name of the outcome column to use as the target.

  Returns:
    An 8-tuple
    (x_train_cca, x_test_cca, y_train_cca, y_test_cca,
     x_train_imp, x_test_imp, y_train_imp, y_test_imp),
    where the x_* items are DataFrames of predictors and the y_* items are
    integer arrays. Both splits hold out 20 % of the rows with a fixed seed.
  """
  from sklearn.model_selection import train_test_split

  split_options = {'test_size': 0.2, 'random_state': 0}

  cca_frame = pd.read_parquet('https://github.com/philipp-lampert/mymandible/blob/main/data_science/data/dropped_first_cca.parquet?raw=true')
  imp_frame = pd.read_parquet('https://github.com/philipp-lampert/mymandible/blob/main/data_science/data/dropped_first_imputed.parquet?raw=true')

  # Everything to the left of 'flap_revision' is treated as a predictor.
  outcome_start = cca_frame.columns.get_loc('flap_revision')
  predictor_cols = cca_frame.columns[:outcome_start].tolist()

  def _split(frame):
    # Keep only predictors + the requested outcome, then drop incomplete rows.
    complete_rows = frame[predictor_cols].join(frame[outcome]).dropna()
    features = complete_rows[predictor_cols]
    target = complete_rows[outcome].astype('int').values
    return tuple(train_test_split(features, target, **split_options))

  # CCA split first, imputed split second (four items each).
  return _split(cca_frame) + _split(imp_frame)

Next, we write a function that calculates all performance metrics.

In [232]:
def metrics(predictions, y_test):
    """Print performance metrics for predicted scores and return them.

    The decision threshold is picked from ``np.arange(0.1, 0.8, 0.05)`` as the
    first value that maximises the Matthews correlation coefficient (MCC)
    against ``y_test``; accuracy is reported at that threshold. Brier score
    and log loss are computed from the unthresholded ``predictions``.

    Note that the threshold is tuned on the same labels it is evaluated on,
    so the reported MCC and accuracy are optimistic estimates.

    Args:
        predictions: Array-like of predicted scores for the positive class.
        y_test: True labels, passed unchanged to the scikit-learn metrics
            (presumably binary 0/1 -- confirm against the caller).

    Returns:
        dict with keys 'mcc', 'accuracy', 'brier_score', 'log_loss' and
        'threshold' (the selected decision threshold). The same values,
        except the threshold, are also printed.
    """
    from sklearn.metrics import matthews_corrcoef, accuracy_score, brier_score_loss, log_loss

    predictions = np.asarray(predictions)

    thresholds = np.arange(0.1, 0.8, 0.05)

    # MCC can legitimately equal -1, so start strictly below that. With the
    # previous start value of -1 and a strict '>' comparison, a run in which
    # every threshold scored exactly -1 never assigned the best labels and
    # crashed with UnboundLocalError further down.
    best_mcc = -np.inf
    best_predicted_labels = None
    optimal_threshold = None

    for threshold in thresholds:
        predicted_labels = (predictions >= threshold).astype(int)
        mcc = matthews_corrcoef(y_test, predicted_labels)

        # Strict comparison keeps the first (lowest) threshold on ties.
        if mcc > best_mcc:
            best_predicted_labels = predicted_labels
            best_mcc = mcc
            optimal_threshold = threshold

    accuracy = accuracy_score(y_test, best_predicted_labels)
    brier_score = brier_score_loss(y_test, predictions)
    # Use a distinct name so the imported log_loss function is not shadowed.
    log_loss_value = log_loss(y_test, predictions)

    print('MCC:', best_mcc)
    print('Accuracy:', accuracy)
    print('Brier Score:', brier_score)
    print('Log Loss:', log_loss_value)
    print('\n')

    return {
        'mcc': best_mcc,
        'accuracy': accuracy,
        'brier_score': brier_score,
        'log_loss': log_loss_value,
        'threshold': float(optimal_threshold),
    }

In [227]:
def regression(outcome, max_iter, cv):
  """Fit elastic-net models on the CCA and imputed data and print test metrics.

  The train/test splits come from `cca_imp_train_test_splits(outcome)`. One
  `ElasticNetCV` model is fitted per dataset, its test-set predictions are
  clipped to [0, 1], and the results are reported through `metrics`.

  Note: `ElasticNetCV` is a linear regression model, so its outputs are
  scores rather than calibrated probabilities.

  Args:
    outcome: Name of the outcome column, forwarded to
      `cca_imp_train_test_splits`.
    max_iter: Forwarded to `ElasticNetCV(max_iter=...)`.
    cv: Forwarded to `ElasticNetCV(cv=...)`.

  Returns:
    None. Metrics for both datasets are printed.
  """
  from sklearn.linear_model import ElasticNetCV

  random_state = 0
  l1_ratio = [.1, .5, .7, .8, .9, .95, .99, 1]

  x_train_cca, x_test_cca, y_train_cca, y_test_cca, x_train_imp, x_test_imp, y_train_imp, y_test_imp = cca_imp_train_test_splits(outcome)

  def _fit_predict(x_train, y_train, x_test):
    # Build a fresh estimator for every dataset. fit() returns the estimator
    # itself, so reusing a single instance would leave both "models" pointing
    # at the same object, holding only the most recent fit.
    model = ElasticNetCV(random_state=random_state, max_iter=max_iter, cv=cv, l1_ratio=l1_ratio)
    model.fit(x_train, y_train)
    # Linear-model outputs are unbounded; brier_score_loss (used in metrics)
    # rejects values outside [0, 1]. Clipping does not affect the
    # threshold-based metrics because every threshold lies inside [0, 1].
    return np.clip(model.predict(x_test), 0, 1)

  predictions_cca = _fit_predict(x_train_cca, y_train_cca, x_test_cca)
  predictions_imp = _fit_predict(x_train_imp, y_train_imp, x_test_imp)

  print("Complete Case Analysis:")
  metrics(predictions_cca, y_test_cca)
  print("Imputed Data:")
  metrics(predictions_imp, y_test_imp)

In [238]:
# Evaluate the elastic-net models on the 'nonunion' outcome (max_iter=500, cv=10).
regression('nonunion', 500, 10)

Complete Case Analysis:
MCC: 0.7211102550927979
Accuracy: 0.8888888888888888
Brier Score: 0.095224359878728
Log Loss: 0.31988113266779494


Imputed Data:
MCC: 0.2692307692307692
Accuracy: 0.6842105263157895
Brier Score: 0.2233138055388888
Log Loss: 0.7054729398729287


