# Prepare the log data for analysis

### 1. Import the required libraries

In [None]:
import pandas as pd 
import re
import math

### 2. Provide some general information about the log

In [None]:
### General input
# Identifiers selecting which log export is processed.
use_case = 'aki'
customer = 'HDZ'
date_logs = ['20210806']
path = r'logs'

# Folder of the last entry in date_logs; used as the base of the output path.
path_log_file = '/'.join((path, customer, date_logs[-1]))
# All processed dates joined with underscores.
dates_indicator = '_'.join(date_logs)

### 3. Read in the log files and merge all their lines into a single list

In [None]:
# Merge the log files: gather the lines of every dated model log into one
# list, following the order of date_logs.
aggregated_log = []

for date_log in date_logs:
    log_file = f'{path}/{customer}/{date_log}/{customer}_{date_log}-{use_case[0:3]}-model.log'
    with open(log_file, "r") as log:
        # Iterating the file yields one line at a time (trailing newline kept).
        aggregated_log.extend(log)


### 4. Extract the predictions from the RESPONSE lines in the log

In [None]:
# Patterns used to dissect a RESPONSE log line; compiled once, outside the loop.
# Everything up to and including the last ' RESPONSE ' on the line.
_RESPONSE_PREFIX = re.compile(r'^.* RESPONSE ')
# From the first literal backslash + 'n' + '"' up to the end of the line.
_RESPONSE_TAIL = re.compile(r'\\n".*$')
# Everything up to and including the last ':"' on the line.
_BEFORE_LAST_COLON_QUOTE = re.compile(r'^.*:"')
# Substrings that select the "exp(value)" branch of the belief computation.
_POSITIVE_MARKERS = ('DELIRIUM 1', 'SEPSIS 1', 'AKI 1')


def _parse_response_line(line):
    """Split one RESPONSE log line into (case, date, obs, belief).

    * case   -- text before the first ': ' of the response payload.
    * date   -- the 19 characters following the last ':"' on the line.
    * obs    -- third '": "'-separated field of the line after removing all
                backslashes and replacing '}n' with ': '.
    * belief -- exp(value) when the line contains one of _POSITIVE_MARKERS,
                otherwise 1 - exp(value), where value is the number found in
                the third ': '-separated field of the payload (up to its
                first comma).
                NOTE(review): presumably the model logs a log-probability of
                the predicted class -- confirm against the model's logging.

    Raises IndexError if the line lacks the expected separators and
    ValueError if the belief field is not a number.
    """
    payload = _RESPONSE_PREFIX.sub('', line)
    payload = _RESPONSE_TAIL.sub('', payload).replace('\n', '')
    timestamp = _BEFORE_LAST_COLON_QUOTE.sub('', line)[:19]

    obs = line.replace('\\', '').replace('}n', ': ').split('": "')[2]

    fields = payload.split(': ')
    case = fields[0]
    logged_value = float(fields[2].split(',')[0])

    if any(marker in line for marker in _POSITIVE_MARKERS):
        belief = math.exp(logged_value)
    else:
        belief = 1 - math.exp(logged_value)
    return case, timestamp, obs, belief


cases = []
beliefs = []
observations = []
dates = []
for line in aggregated_log:
    # Only lines mentioning RESPONSE carry a prediction.
    if 'RESPONSE' not in line:
        continue
    case, date, obs, belief = _parse_response_line(line)
    cases.append(case)
    dates.append(date)
    observations.append(obs)
    beliefs.append(belief)

# Build the frame in one step instead of attribute-style column assignment,
# which silently creates a plain attribute if a column name is ever mistyped.
prediction_data = pd.DataFrame(
    {'CASEID': cases, 'DATE': dates, 'OBS': observations, 'BELIEF': beliefs},
    columns=['CASEID', 'DATE', 'OBS', 'BELIEF'],
)
# CASEID and DATE are still strings here, so this ordering is lexicographic.
prediction_data = prediction_data.sort_values(['CASEID', 'DATE'])

In [None]:
# Remove rows that are identical in every column, then make CASEID numeric.
prediction_data = prediction_data.drop_duplicates()
prediction_data['CASEID'] = prediction_data['CASEID'].astype(int)

# Keep the full timestamp in its own column before DATE is reduced to a day.
prediction_data['DATETIME'] = pd.to_datetime(prediction_data['DATE'], format="%Y-%m-%dT%H:%M:%S")

# date only on day level
prediction_data['DATE'] = prediction_data['DATETIME'].dt.date

# For predictions that belong to the same medical case and have the same
# observations and belief on the same day, set DATETIME to the latest
# timestamp of that group. The string alias 'max' is used because passing the
# builtin `max` is deprecated in recent pandas versions.
# NOTE(review): rows of such a group become identical here but are not
# removed; if only one row per group is wanted, a drop_duplicates() is still
# needed after this step -- confirm the intended output.
prediction_data['DATETIME'] = prediction_data.groupby(['CASEID','DATE','OBS','BELIEF'])['DATETIME'].transform('max')


In [None]:
# Write the prepared predictions as a semicolon-separated CSV without the index.
output_file = f'{path_log_file}/{use_case}/prediction_data.csv'
prediction_data.to_csv(output_file, sep=";", index=False)