# Simple base model for all 3 prediction targets

In this notebook, a simple base model is created for all 3 prediction targets: cancellation, paracetamol and length of stay.

For the binary prediction targets, a model is only considered useful if it at least outperforms a random model that is aware of the class distribution, also known as a random rate classifier (weighted guessing). For length of stay, a simple linear regression model is created.


In [37]:
import pandas as pd
import numpy as np
import os
from other_lib import globalvar
from other_lib.general_functions import find_all_csv_locations
from other_lib.auk_score import AUK

from sklearn.model_selection import train_test_split, cross_validate
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import RandomOverSampler
from sklearn import metrics
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score, recall_score, precision_score, r2_score, make_scorer

from matplotlib import pyplot
pd.options.mode.chained_assignment = None  
import warnings
warnings.filterwarnings("ignore")

#function to load and prepare data
def prepare_dataset_for_model(file_location, model_type):
    """Load an encoded log from CSV and split it into train/test sets for a model.

    The dataset name (CSV filename without extension) decides the processing:
    names containing 'los' are treated as regression targets, everything else as
    binary; 'tokenized' / 'additional' select which columns are standardised;
    'can' enables random oversampling of the training data.

    Parameters
    ----------
    file_location : str
        Path to the CSV file. Must contain a 'Label' or 'outcome' column.
    model_type : str
        Model identifier. Only 'lstm' and 'transformer' change the processing
        (scaling of tokenized data and the extra trailing dimension).

    Returns
    -------
    tuple
        (x_train, x_test, y_train, y_test, binary, X, y, model_type), where X is
        the complete, NaN-filled but unscaled feature frame and y the full label
        column.
    """
    df = pd.read_csv(file_location)

    #get filename (without .csv); both "\\" and "/" are accepted as path separators
    model_name = file_location.replace('\\', '/').rsplit('/', 1)[-1].split('.')[0]
    model_name_lower = model_name.lower()

    #binary prediction (paracetamol/cancel datasets) unless this is a length of stay (regression) dataset
    binary = 'los' not in model_name_lower

    #define label (aka outcome) and prediction data
    label_col = 'Label' if 'Label' in df else 'outcome'
    y = df[label_col]
    X = df.drop(columns=[label_col])

    #remove TraceID (aka case_id) from the training and testing data
    #(keyword `columns=` is used because pandas 2.x no longer accepts a positional axis)
    id_col = next((col for col in ('TraceID', 'case_id') if col in X.columns), None)
    if id_col is not None:
        X = X.drop(columns=[id_col])

    #train/test split, must be done before scaling and upsampling to prevent data leakage between train/test data
    #stratify only for the binary targets
    x_train, x_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, shuffle=True, stratify=y if binary else None)

    #fill NaN values with the mean of the training data for train, test and the complete data.
    #Cant do mean per group since many groups have no data at all
    train_means = x_train.mean(numeric_only=True)
    x_train = x_train.fillna(train_means)
    x_test = x_test.fillna(train_means)
    X = X.fillna(train_means)

    #scaling for non-additional features, only on train/test data to prevent data leakage, complete X returned without scaling
    additional_features = ['MedicationCode_B01AA04', 'MedicationCode_B01AA07', 'MedicationCode_B01AE07', 'MedicationCode_B01AF01',
                           'MedicationCode_B01AF02', 'MedicationCode_B01AF03', 'MedicationCode_N02AJ13', 'MedicationCode_N02BE01',
                           'PlannedDuration', 'Duration', 'MedicationType', 'NOAC', 'MedicationStatus', 'temperature',
                           'bloodPressure', 'Test_Hemoglobine', 'Test_eGFR', 'Test_INR', 'Test_Trombocyten']

    scaler = StandardScaler()

    #the scaler is fitted on the training data only; the test data is transformed with the
    #training statistics so both splits end up on the same scale
    if 'tokenized' in model_name and 'transformer' not in model_type: #means all columns need to be encoded, regardless of additional or not
        x_train = pd.DataFrame(scaler.fit_transform(x_train))
        x_test = pd.DataFrame(scaler.transform(x_test))
    elif 'additional' in model_name_lower and 'ae_agg' not in model_name_lower: #means only the additionally added columns need to be scaled
        x_train[additional_features] = scaler.fit_transform(x_train[additional_features])
        x_test[additional_features] = scaler.transform(x_test[additional_features])

    #oversampling of training data for cancellation data, skip test data (data leakage)
    #fixed random_state so repeated runs produce the same resampled training set
    if 'can' in model_name:
        oversampler = RandomOverSampler(sampling_strategy='minority', random_state=42)
        x_train, y_train = oversampler.fit_resample(x_train, y_train)

    #For lstm models, the input needs to be 3d instead of 2d. Therefore, add another dimension to the data so the data passes correctly
    #NOTE(review): `and` binds tighter than `or`, so 'lstm' is always expanded while 'transformer' is only
    #expanded for non-additional datasets; parentheses make the existing behaviour explicit - confirm this is intended
    if model_type == 'lstm' or (model_type == 'transformer' and 'additional' not in model_name_lower):
        x_train = np.expand_dims(x_train, -1)
        x_test = np.expand_dims(x_test, -1)

    return x_train, x_test, y_train, y_test, binary, X, y, model_type


## Randomly weighted baseline

For the two binary prediction targets, the performance of a randomly weighted classifier is calculated

In [3]:
#the labels do not depend on the encoding strategy, so any encoded dataset can be used to count them
par_csv = 'C:\\Users\\20190337\\Downloads\\Tracebook_v2 (Projectfolder)\\encoded_logs\\one_hot_encoded_logs\\one_hot_par.csv'
can_csv = 'C:\\Users\\20190337\\Downloads\\Tracebook_v2 (Projectfolder)\\encoded_logs\\one_hot_encoded_logs\\one_hot_can.csv'
df_par, df_can = (pd.read_csv(csv_path) for csv_path in (par_csv, can_csv))

#class distribution per binary prediction target
par_label_counts = df_par['Label'].value_counts().to_dict()
can_label_counts = df_can['Label'].value_counts().to_dict()

print('Number of true/false values for paracetamol: ', par_label_counts)
print('Number of true/false values for cancellation: ', can_label_counts)

Number of true/false values for paracetamol:  {1.0: 700, 0.0: 528}
Number of true/false values for cancellation:  {0.0: 994, 1.0: 234}


In [4]:
def random_rate_classifier(true_vals):
    """Return the expected accuracy of a random rate classifier (weighted guessing).

    A random rate classifier predicts each class with a probability equal to that
    class' share of the data, so its expected accuracy is p(1)**2 + p(0)**2.

    Parameters
    ----------
    true_vals : array-like
        True binary labels (0/1), e.g. a pandas Series, numpy array or list.

    Returns
    -------
    float
        Expected baseline accuracy, rounded to 4 decimals.

    Raises
    ------
    ValueError
        If `true_vals` is empty.
    """
    labels = np.asarray(true_vals)
    total_predictions = len(labels)
    if total_predictions == 0:
        raise ValueError('true_vals must contain at least one label')

    positive_chance = (labels == 1).sum() / total_predictions
    negative_chance = (labels == 0).sum() / total_predictions

    #the accuracy is computed analytically, so no random predictions have to be drawn
    #(this also leaves the global numpy random state untouched)
    baseline_accuracy = round(positive_chance**2 + negative_chance**2, 4)

    return baseline_accuracy
    
#baseline accuracy per binary target, paracetamol first and cancellation second
baseline_acc_par, baseline_acc_can = (
    random_rate_classifier(frame['Label']) for frame in (df_par, df_can)
)

print('Baseline accuracy for paracetamol dataset: ', baseline_acc_par)
print('Baseline accuracy for cancellation dataset: ', baseline_acc_can)


Baseline accuracy for paracetamol dataset:  0.5098
Baseline accuracy for cancellation dataset:  0.6915


Furthermore, I've also created a very simple baseline logistic regression model that can be used as a comparison for the more complex neural networks.

## Logistic regression baseline model

In [19]:
file_location = 'C:\\Users\\20190337\\Downloads\\Tracebook_v2 (Projectfolder)\\encoded_logs\\one_hot_encoded_logs\\one_hot_can_additional.csv'
model_name = file_location.split("\\")[-1:][0].split('.')[0] #get filename (without.csv)

#show which dataset is being used, same as the length of stay preparation cell
print('file location: ', file_location)

x_train, x_test, y_train, y_test, binary, X, y, model_type = prepare_dataset_for_model(file_location, model_type='logistic')

print('x_train shape: ', x_train.shape, '| x_test shape: ', x_test.shape, '| X shape: ', X.shape)
print('y_train shape: ', y_train.shape, '| y_test shape: ', y_test.shape, '| y shape: ', y.shape)

#custom scorer function that calculates AUK
def calc_auk_score(y_true, y_pred):
    return(AUK(y_true, y_pred).calculate_auk())

auk_scorer = make_scorer(calc_auk_score, greater_is_better=True)

#fit once on the prepared train split and predict the test split
log_regression = LogisticRegression()
log_regression.fit(x_train, y_train)
y_pred = log_regression.predict(x_test)

scoring = {'acc': 'accuracy',
           'f1': 'f1',
           'precision':'precision',
           'recall': 'recall',
           'auc': 'roc_auc',
           'auk': auk_scorer}

#5-fold cross validation on the complete data (X, y); error_score="raise" makes a failing fold raise instead of being scored
scores = cross_validate(log_regression, X, y, scoring=scoring, cv=5, return_train_score=False, error_score="raise")

file location:  C:\Users\20190337\Downloads\Tracebook_v2 (Projectfolder)\encoded_logs\one_hot_encoded_logs\one_hot_can_additional.csv
x_train shape:  (1590, 41) | x_test shape:  (246, 41) | X shape:  (1228, 41)
y_train shape:  (1590,) | y_test shape:  (246,) | y shape:  (1228,)


In [None]:
#custom scorer function that calculates AUK
def calc_auk_score(y_true, y_pred):
    """Return the AUK value computed by the project's AUK class for the given labels and predictions."""
    auk_metric = AUK(y_true, y_pred)
    return auk_metric.calculate_auk()

#wrap the function as a scorer object so it can be passed in a `scoring` dict; higher values are better
auk_scorer = make_scorer(calc_auk_score, greater_is_better=True)

def logistic_regression(x_train, x_test, y_train, y_test, X, y):
    """Return the mean 5-fold cross-validation scores of a logistic regression model.

    Parameters
    ----------
    x_train, x_test, y_train, y_test
        Unused. Kept so existing callers keep working; the scores come from
        cross validation on the complete data only.
    X : features of the complete dataset.
    y : binary labels of the complete dataset.

    Returns
    -------
    dict
        Mean score per metric, keyed (in this order) by
        '# acc', ' f1', ' precision', ' recall', ' auc', ' auk'.
    """
    #cross_validate fits a fresh clone of the estimator per fold, so fitting the
    #model on x_train beforehand would not influence the reported scores
    log_regression = LogisticRegression()

    scoring = {'acc': 'accuracy',
               'f1': 'f1',
               'precision':'precision',
               'recall': 'recall',
               'auc': 'roc_auc',
               'auk': auk_scorer
               }

    cv_scores = cross_validate(log_regression, X, y, scoring=scoring, cv=5, return_train_score=False)

    #keep only the test scores and rename them; the explicit order matches the
    #header that is written to the results csv
    fold_scores = {'# acc': cv_scores['test_acc'],
                   ' f1': cv_scores['test_f1'],
                   ' precision': cv_scores['test_precision'],
                   ' recall': cv_scores['test_recall'],
                   ' auc': cv_scores['test_auc'],
                   ' auk': cv_scores['test_auk']}

    mean_scores = {key: sum(values)/len(values) for key, values in fold_scores.items()}

    return mean_scores

#simple function to output mean cv scores to csv file
def save_cv_results(mean_scores, output_dir, model_name):
    """Write the mean cv scores as a single row to `output_dir + model_name + '.csv'`.

    `output_dir` is concatenated as-is, so it has to end with a path separator.
    The file gets a fixed header naming the classification metrics.
    """
    target_path = output_dir + model_name + '.csv'
    score_row = np.atleast_2d(pd.DataFrame(mean_scores, index=[0]))
    np.savetxt(target_path, score_row, fmt='%6f', delimiter=',',
               header='acc, f1, precision, recall, auc, auk')
    
output_dir = 'C:\\Users\\20190337\\Downloads\\Tracebook_v2 (Projectfolder)\\model_results\\baseline\\'
file_locations = [
    'C:\\Users\\20190337\\Downloads\\Tracebook_v2 (Projectfolder)\\encoded_logs\\one_hot_encoded_logs\\one_hot_can.csv',
    'C:\\Users\\20190337\\Downloads\\Tracebook_v2 (Projectfolder)\\encoded_logs\\one_hot_encoded_logs\\one_hot_can_additional.csv',
    'C:\\Users\\20190337\\Downloads\\Tracebook_v2 (Projectfolder)\\encoded_logs\\one_hot_encoded_logs\\one_hot_par.csv',
    'C:\\Users\\20190337\\Downloads\\Tracebook_v2 (Projectfolder)\\encoded_logs\\one_hot_encoded_logs\\one_hot_par_additional.csv',
]

#for each of the 4 files: prepare the data, cross validate the logistic regression model and store the mean scores
for file_location in file_locations:
    #filename without directory and without .csv
    model_name = file_location.rsplit("\\", 1)[-1].split('.')[0]
    print('Now calculating cv scores for: ', model_name)

    #train/test split
    prepared = prepare_dataset_for_model(file_location, model_type='logistic')
    x_train, x_test, y_train, y_test, binary, X, y, model_type = prepared

    #calc cv scores
    scores = logistic_regression(x_train, x_test, y_train, y_test, X, y)
    print(scores)

    save_cv_results(scores, output_dir, model_name)


## Linear regression baseline model

For the LOS baseline model, a simple linear regression model is used. Again, start with preparing the data

In [126]:
file_location = 'C:\\Users\\20190337\\Downloads\\Tracebook_v2 (Projectfolder)\\encoded_logs\\one_hot_encoded_logs\\one_hot_los.csv'
#filename without directory and without .csv
model_name = file_location.rsplit("\\", 1)[-1].split('.')[0]

print('file location: ', file_location)

#load the length of stay data and split it into train/test
prepared = prepare_dataset_for_model(file_location, model_type='linear')
x_train, x_test, y_train, y_test, binary, X, y, model_type = prepared

#sanity check of the resulting shapes
print('x_train shape: ', x_train.shape, '| x_test shape: ', x_test.shape, '| X shape: ', X.shape)
print('y_train shape: ', y_train.shape, '| y_test shape: ', y_test.shape, '| y shape: ', y.shape)

file location:  C:\Users\20190337\Downloads\Tracebook_v2 (Projectfolder)\encoded_logs\one_hot_encoded_logs\one_hot_los.csv
x_train shape:  (933, 23) | x_test shape:  (234, 23) | X shape:  (1167, 23)
y_train shape:  (933,) | y_test shape:  (234,) | y shape:  (1167,)


In [38]:
#Function that calculates the linear regression performance
def linear_regression(x_train, x_test, y_train, y_test, X, y):
    """Return the mean 5-fold cross-validation errors of a linear regression model.

    Parameters
    ----------
    x_train, x_test, y_train, y_test
        Unused. Kept so existing callers keep working; the scores come from
        cross validation on the complete data only.
    X : features of the complete dataset.
    y : regression target of the complete dataset.

    Returns
    -------
    dict
        Mean error per metric, keyed (in this order) by '# mae', ' mape', ' mse'.
        The negated scorer outputs are sign-flipped before averaging.
    """
    #cross_validate fits a fresh clone of the estimator per fold, so fitting the
    #model on x_train beforehand would not influence the reported scores
    lin_regression = LinearRegression()

    scoring = {'mae': 'neg_mean_absolute_error',
               'mape':'neg_mean_absolute_percentage_error',
               'mse': 'neg_mean_squared_error',}

    #cross validate and clean up scores
    cv_scores = cross_validate(lin_regression, X, y, scoring=scoring, cv=5, return_train_score=False)

    #keep only the test scores and rename them; the explicit order matches the
    #header that is written to the results csv
    fold_scores = {'# mae': cv_scores['test_mae'],
                   ' mape': cv_scores['test_mape'],
                   ' mse': cv_scores['test_mse']}

    #the scorers return negated errors, so flip the sign before taking the mean
    mean_scores = {key: sum(-values)/len(values) for key, values in fold_scores.items()}

    return mean_scores

#simple function to output mean cv scores to csv file
def save_cv_results(mean_scores, output_dir, model_name):
    """Write the mean cv scores as a single row to `output_dir + model_name + '.csv'`.

    Parameters
    ----------
    mean_scores : dict
        Mean score per metric; written in dict order as one row.
    output_dir : str
        Target directory. Concatenated as-is, so it has to end with a path separator.
    model_name : str
        File name without extension.

    NOTE: this definition replaces the earlier `save_cv_results` of this notebook
    (the one writing the classification header) once this cell has been run.
    """
    #use the function argument; the previous version read the global `scores` instead
    df = pd.DataFrame(mean_scores, index=[0])
    np.savetxt(output_dir + model_name + '.csv', np.atleast_2d(df),
                      delimiter=',', fmt='%6f', header='mae, mape, mse')

#calculate the performance
output_dir = 'C:\\Users\\20190337\\Downloads\\Tracebook_v2 (Projectfolder)\\model_results\\baseline\\'
file_locations = [
    'C:\\Users\\20190337\\Downloads\\Tracebook_v2 (Projectfolder)\\encoded_logs\\one_hot_encoded_logs\\one_hot_los.csv',
    'C:\\Users\\20190337\\Downloads\\Tracebook_v2 (Projectfolder)\\encoded_logs\\one_hot_encoded_logs\\one_hot_los_additional.csv',
]

#for each length of stay file: prepare the data, cross validate the linear regression model and store the mean scores
for file_location in file_locations:
    #filename without directory and without .csv
    model_name = file_location.rsplit("\\", 1)[-1].split('.')[0]
    print('Now calculated cv scores for: ', model_name)

    #train/test split
    prepared = prepare_dataset_for_model(file_location, model_type='linear')
    x_train, x_test, y_train, y_test, binary, X, y, model_type = prepared

    #calc cv scores
    scores = linear_regression(x_train, x_test, y_train, y_test, X, y)
    print(scores)
    save_cv_results(scores, output_dir, model_name)


Now calculated cv scores for:  one_hot_los
{'# mae': 267.5477487002304, ' mape': 0.2887656443021881, ' mse': 116210.07829962033}
Now calculated cv scores for:  one_hot_los_additional
{'# mae': 261.60554400109027, ' mape': 0.2796983705723021, ' mse': 108992.89140402494}
