# Cross-hospital evaluation of model performance on log data

### 1. Import required libraries

In [None]:
from mlpipeline.evaluation import model_evaluater
from mlpipeline import mlpipeline_settings
import pandas as pd
import math
import numpy as np
from datetime import timedelta
from sklearn.metrics import roc_auc_score
from scipy.stats import norm
from math import sqrt

### 2. Select customer data function 

In [None]:
# General inputs
# Short use-case code: used to build log paths / file names and to select the model versions.
use_case = 'sep'
# Full use-case name: passed to model_evaluater.run when computing predictions.
use_case_complete_name = 'sepsis'  

def select_customer_data(customer):
    """Load the discharge observations of one hospital and attach outcome labels.

    Everything is read from the ``logs`` directory, using the most recent log
    date known for ``customer`` and the module-level ``use_case``:

    * ``<use_case>/discharged_df_complete.csv`` -- the observations returned,
    * ``<date>_<customer>_<use_case[0:3]>.txt`` -- the outcome file,
    * ``<date>_<customer>_discharges.txt`` of every log date -- the discharges.

    The added ``LABEL`` column holds:

    * ``1``  -- the CASEID is a FALLID of the outcome file with no CANCELDATE,
    * ``0``  -- the case is coded but not listed in the outcome file,
    * ``-1`` -- unknown: the case was not discharged more than 60 days before
      the date of the outcome file (or is in no discharge log at all).

    Args:
        customer: Hospital code, one of ``'HDZ'``, ``'MHS'`` or ``'MKN'``.

    Returns:
        The DataFrame read from ``discharged_df_complete.csv`` with the
        ``LABEL`` column added.

    Raises:
        ValueError: If ``customer`` is not one of the known hospital codes.
    """
    date_logs_by_customer = {
        'HDZ': ['20210219', '20210226', '20210305', '20210319', '20210409', '20210416', '20210507', '20210521',
                '20210528', '20210604', '20210611', '20210618', '20210625', '20210709', '20210723', '20210806'],
        'MHS': ['20210219', '20210226', '20210305', '20210319', '20210409', '20210430', '20210507', '20210521',
                '20210528', '20210604', '20210611', '20210618', '20210625', '20210709', '20210723', '20210811'],
        'MKN': ['20210219', '20210226', '20210305', '20210625', '20210709', '20210723', '20210806'],
    }
    if customer not in date_logs_by_customer:
        raise ValueError(
            f'Unknown customer {customer!r}; expected one of {sorted(date_logs_by_customer)}')
    date_logs = date_logs_by_customer[customer]
    latest_log = date_logs[-1]

    path = r'logs'
    path_log_file = f'{path}/{customer}/{latest_log}'
    path_outcome_file = f'{path_log_file}/{latest_log}_{customer.lower()}_{use_case[0:3]}.txt'

    # The outcome file is named after the latest log date, so that date is the
    # date of the outcome (no need to slice it back out of the path string).
    outcome_date = pd.to_datetime(latest_log, format="%Y%m%d")

    # Define the code deadline as 60 days before the outcome date
    code_deadline = outcome_date - timedelta(days=60)

    # Read the medical cases at discharge
    obs_df_complete = pd.read_csv(f'{path_log_file}/{use_case}/discharged_df_complete.csv', sep=";")

    # Assign the labels; outcome rows carrying a CANCELDATE are ignored
    outcome_file = pd.read_csv(path_outcome_file, sep=';')
    outcome_file = outcome_file[outcome_file.CANCELDATE.isnull()]
    obs_df_complete['LABEL'] = np.where(obs_df_complete['CASEID'].isin(outcome_file.FALLID), 1, 0)

    # Include all the discharged cases of every log
    dtypes = {'FALLID': int, 'AUFNDAT': str, 'ENTLDAT': str}
    discharge_frames = []
    for dis_log in date_logs:
        discharges = pd.read_csv(
            f'{path}/{customer}/{dis_log}/{dis_log}_{customer.lower()}_discharges.txt',
            sep=';', dtype=dtypes)
        # Only the leading 'dd.mm.YYYY' part of each timestamp is used.
        for date_column in ('AUFNDAT', 'ENTLDAT'):
            discharges[date_column] = pd.to_datetime(
                discharges[date_column].str[0:10], format='%d.%m.%Y')
        discharge_frames.append(discharges)

    # Newest log first, so that drop_duplicates keeps the most recent entry of a case
    discharged_df_total = pd.concat(discharge_frames[::-1])
    discharged_file = discharged_df_total.drop_duplicates(subset="FALLID")

    # If a case is discharged more than 60 days before the outcome file, it is considered as coded
    coded_cases = discharged_file[discharged_file.ENTLDAT < code_deadline].FALLID

    # If a case is considered as coded, keep the label; otherwise set the label to -1, meaning unknown
    obs_df_complete['LABEL'] = np.where(obs_df_complete.CASEID.isin(coded_cases), obs_df_complete['LABEL'], -1)

    return obs_df_complete

### 3. Function for selecting the models

In [None]:
# different model versions for each use case and hospital.
def select_models(use_case):
    """Return the model versions of a use case for all hospitals.

    Args:
        use_case: Short use-case code, one of ``'aki'``, ``'del'`` or ``'sep'``.

    Returns:
        A list of six model version identifiers ordered as
        ``[old_HDZ, new_HDZ, old_MHS, new_MHS, old_MKN, new_MKN]``.

    Raises:
        ValueError: If ``use_case`` is not one of the known codes.
    """
    # (old, new) model version per hospital and use case
    model_versions = {
        'aki': {
            'HDZ': (1611094012, 1618013699),
            'MHS': (1607525093, 1616933703),
            'MKN': (1607562693, 1614873887),
        },
        'del': {
            'HDZ': (1611085421, 1618005122),
            'MHS': (1607518422, 1616927034),
            'MKN': (1607553470, 1614857540),
        },
        'sep': {
            'HDZ': (1611088812, 1618008511),
            'MHS': (1607520825, 1616929424),
            'MKN': (1607556923, 1614865190),
        },
    }
    if use_case not in model_versions:
        raise ValueError(
            f'Unknown use case {use_case!r}; expected one of {sorted(model_versions)}')

    # Flatten to the [old, new] per hospital layout, hospitals in a fixed order
    models = []
    for hospital in ('HDZ', 'MHS', 'MKN'):
        models.extend(model_versions[use_case][hospital])
    return models

# Model versions of the selected use case, ordered [old, new] per hospital (HDZ, MHS, MKN).
models = select_models(use_case)

In [None]:
def rescale_belief(o, belief):
    """Map a model belief onto the risk-score scale.

    Args:
        o: Predicted outcome; when it equals 0 the belief is flipped to
            ``1 - exp(belief)``.
        belief: Value that is exponentiated to obtain the belief in the
            predicted outcome.

    Returns:
        The belief interpolated piecewise-linearly from ``belief_thresholds``
        to ``risk_score_thresholds``.
    """
    probability = math.exp(belief)
    # belief in DISEASE 0 is 1 - belief in DISEASE 1
    disease_belief = 1 - probability if o == 0 else probability
    # transform the belief thresholds to risk score thresholds
    return np.interp(disease_belief, belief_thresholds, risk_score_thresholds)


# Matching break points of the interpolation used by rescale_belief:
# a belief of 0.2 becomes a risk score of 0.5, a belief of 0.4 becomes 0.75.
belief_thresholds = [0, 0.2, 0.4, 1]
risk_score_thresholds = [0, 0.5, 0.75, 1]

def generate_observations(obs_df):
    """Yield one observation dict per row of ``obs_df``.

    Args:
        obs_df: DataFrame with the columns ``OBS_COMPLETE`` and ``CASEID``.

    Yields:
        A dict with the row's ``OBS_COMPLETE`` under ``"inputs"`` and its
        ``CASEID`` under ``"caseid"``; ``"label"``, ``"med_day"`` and
        ``"hour_group"`` are always empty strings.
    """
    for position in range(obs_df.shape[0]):
        observation = dict(
            inputs=obs_df.OBS_COMPLETE.iloc[position],
            label="",
            caseid=obs_df.CASEID.iloc[position],
            med_day="",
            hour_group="",
        )
        yield observation
        
def get_confidence_interval(p, n, confidence = 0.95):
    """Return a normal-approximation confidence interval around ``p``.

    Args:
        p: Point estimate; ``p * (1 - p) / n`` is used as its variance.
        n: Number of samples behind the estimate.
        confidence: Two-sided confidence level, 0.95 by default.

    Returns:
        ``[lower, upper]`` rounded to three decimals, or two NaNs when
        ``n`` is smaller than 1.
    """
    if n < 1:
        nan = float('NaN')
        return [nan, nan]

    half_level = confidence / 2
    standard_error = sqrt(p * (1 - p) / n)
    # Lower and upper quantiles of the standard normal distribution
    return [
        round(p + norm.ppf(quantile) * standard_error, 3)
        for quantile in (0.5 - half_level, 0.5 + half_level)
    ]


### 4. Create the cross-hospital evaluation table

In [None]:
# Create the cross-hospital evaluation table: one column per hospital whose
# data is evaluated, one row per hospital whose models make the predictions.
# Each cell holds the AUROC followed by its confidence interval.
customers = ['HDZ', 'MHS', 'MKN']
path = r'logs'

# Confidence level of the interval reported next to each AUROC
confidence = 0.95

# Per data hospital: observations before this moment are scored with the old
# model, observations from this moment on with the new model.
hf_datetimes = {
    'HDZ': pd.to_datetime('2021-04-16 07:57:47', format="%Y-%m-%d %H:%M:%S"),
    'MHS': pd.to_datetime('2021-04-09 06:46:51', format="%Y-%m-%d %H:%M:%S"),
    'MKN': pd.to_datetime('2021-03-11 07:37:05', format="%Y-%m-%d %H:%M:%S"),
}

df = pd.DataFrame(columns=customers, index=customers)
for customer_data in customers:
    auroc_score_all = []
    hf_datetime = hf_datetimes[customer_data]
    print('Customer data:', customer_data)

    # The labelled observations only depend on the data hospital: load them
    # once and let every model evaluation work on its own copy.
    customer_obs_complete = select_customer_data(customer_data)

    for model_position, customer in enumerate(customers):
        print(customer)
        copy_obs_complete = customer_obs_complete.copy()

        # `models` is laid out as [old, new] per hospital, in the order of `customers`
        model_old = models[2 * model_position]
        model_new = models[2 * model_position + 1]

        # Do the predictions
        print('prediction with model old: ', model_old)
        prediction_data_old = model_evaluater.run(
            generate_observations(copy_obs_complete), use_case_complete_name, mlpipeline_settings,
            model_version=model_old)
        copy_obs_complete['BELIEF_OLD'] = prediction_data_old.apply(
            lambda row: rescale_belief(row['PREDICTION'], row['SCORE']), axis=1)

        print('prediction with model new: ', model_new)
        prediction_data_new = model_evaluater.run(
            generate_observations(copy_obs_complete), use_case_complete_name, mlpipeline_settings,
            model_version=model_new)
        copy_obs_complete['BELIEF_NEW'] = prediction_data_new.apply(
            lambda row: rescale_belief(row['PREDICTION'], row['SCORE']), axis=1)

        # Old model before the switch-over moment of the data hospital, new model afterwards
        observed_at = pd.to_datetime(copy_obs_complete.DATETIME, format="%Y-%m-%d %H:%M:%S")
        copy_obs_complete['ADJUSTED_BELIEF'] = np.where(
            observed_at < hf_datetime, copy_obs_complete.BELIEF_OLD, copy_obs_complete.BELIEF_NEW)

        # Drop the medical cases not coded (label -1 means unknown)
        copy_obs_complete = copy_obs_complete[copy_obs_complete.LABEL != -1]

        copy_obs_complete.to_csv(f'{path}/belief_{use_case[0:3]}_{customer_data}_{customer}.csv', index=False, sep=";")

        # Calculate metrics
        auroc_score = roc_auc_score(copy_obs_complete.LABEL, copy_obs_complete.ADJUSTED_BELIEF)
        auroc_interval = get_confidence_interval(auroc_score, len(copy_obs_complete), confidence)
        auroc_score_all.append(str(round(auroc_score, 3)) + ' ' + str(auroc_interval))

    # Column = data hospital; rows follow the order of the model hospitals
    df[customer_data] = auroc_score_all

df.to_excel(f'{path}/cross_validation_{use_case[0:3]}.xlsx')