<a href="https://colab.research.google.com/github/philipp-lampert/mymandible/blob/main/data_science/05_model_training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Model training

In [29]:
import numpy as np
import pandas as pd

We first define a function that imports the prepared CCA and imputed datasets and splits them into train and test sets.

In [125]:
def cca_imp_train_test_splits(outcome, min_follow_up_days):

  from sklearn.model_selection import train_test_split

  random_state=100
  test_size=0.3

  df_cca = pd.read_parquet('https://github.com/philipp-lampert/mymandible/blob/main/data_science/data/dropped_first_cca.parquet?raw=true')
  df_imp = pd.read_parquet('https://github.com/philipp-lampert/mymandible/blob/main/data_science/data/dropped_first_imputed.parquet?raw=true')

  first_outcome_var = df_cca.columns.get_loc('flap_revision')
  predictors = df_cca.columns[:first_outcome_var].tolist()

  for i, df in enumerate([df_cca, df_imp]):
    dfx = df[df['days_to_follow_up'] >= min_follow_up_days].copy()
    dfx['days_to_flap_loss'] = dfx['days_to_flap_loss'].fillna(10000)
    dfx = dfx[dfx['days_to_flap_loss'] >= min_follow_up_days]
    dfx = dfx.drop('days_to_flap_loss', axis=1)
    dfx = dfx[predictors + [outcome]].dropna()
    if i == 0:
        df_cca = dfx
    else:
        df_imp = dfx

  x_train_cca, x_test_cca, y_train_cca, y_test_cca = train_test_split(df_cca[predictors], df_cca[outcome].astype('int').values,
                                                                      test_size=test_size, random_state=random_state)

  x_train_imp, x_test_imp, y_train_imp, y_test_imp = train_test_split(df_imp[predictors], df_imp[outcome].astype('int').values,
                                                                      test_size=test_size, random_state=random_state)

  return x_train_cca, x_test_cca, y_train_cca, y_test_cca, x_train_imp, x_test_imp, y_train_imp, y_test_imp


Next, we write a function that calculates all performance metrics.

In [146]:
def metrics(predictions, y_test):

    from sklearn.metrics import matthews_corrcoef, accuracy_score, brier_score_loss, log_loss, f1_score, confusion_matrix

    thresholds = np.arange(0.2, 0.8, 0.05)
    best_f1 = 0

    for threshold in thresholds:
        predicted_labels = (predictions >= threshold).astype(int)
        f1 = f1_score(y_test, predicted_labels)

        if f1 > best_f1:
            best_predicted_labels = predicted_labels
            best_f1 = f1
            optimal_threshold = threshold

    tn, fp, fn, tp = confusion_matrix(y_test, predicted_labels).ravel()

    print('F1 Score:', best_f1)
    print('MCC:', matthews_corrcoef(y_test, best_predicted_labels))
    print('Accuracy:', accuracy_score(y_test, best_predicted_labels))
    print('Brier Score:', brier_score_loss(y_test, predictions))
    print('Log Loss:', log_loss(y_test, predictions))
    print('\n')
    print('Decision threshold:', optimal_threshold)
    print('TP:', tp)
    print('FP:', fp)
    print('TN:', tn)
    print('FN:', fn)
    print('-----------------------------------------')

In [140]:
def logistic_regression(outcome, min_follow_up_days, max_iter, cv):

  from sklearn.linear_model import LogisticRegressionCV

  random_state=0

  x_train_cca, x_test_cca, y_train_cca, y_test_cca, x_train_imp, x_test_imp, y_train_imp, y_test_imp = cca_imp_train_test_splits(outcome, min_follow_up_days)

  lrcv = LogisticRegressionCV(solver='newton-cholesky', random_state=random_state, max_iter=max_iter, cv=cv)

  lrcv_cca = lrcv.fit(x_train_cca, y_train_cca)
  predictions_cca = lrcv_cca.predict_proba(x_test_cca)[:, 1]

  lrcv_imp = lrcv.fit(x_train_imp, y_train_imp)
  predictions_imp = lrcv_imp.predict_proba(x_test_imp)[:, 1]

  print("Complete Case Analysis:")
  metrics(predictions_cca, y_test_cca)
  print("Imputed Data:")
  metrics(predictions_imp, y_test_imp)

In [147]:
logistic_regression('soft_tissue_complication', min_follow_up_days=30, max_iter=1000, cv=10)

Complete Case Analysis:
F1 Score: 0.6666666666666666
MCC: 0.2358640882624316
Accuracy: 0.5555555555555556
Brier Score: 0.22622879233887477
Log Loss: 0.6432580350794923


Decision threshold: 0.39999999999999997
TP: 0
FP: 0
TN: 24
FN: 21
-----------------------------------------
Imputed Data:
F1 Score: 0.5957446808510638
MCC: 0.19202898550724637
Accuracy: 0.5957446808510638
Brier Score: 0.3977927207080263
Log Loss: 2.277157820829183


Decision threshold: 0.25
TP: 11
FP: 8
TN: 15
FN: 13
-----------------------------------------
