# Characteristic analysis of log data

### 1. Import the necessary libraries

In [None]:
import pandas as pd 
import numpy as np
import re
from datetime import timedelta
import statistics

### 2. Provide some general information about the log

In [None]:
### General input
use_case = 'aki'
customer = 'MKN'

# Log dates (YYYYMMDD) that are available for each supported customer.
date_logs_by_customer = {
    'HDZ': ['20210219', '20210226', '20210305','20210319','20210409','20210416','20210507','20210521','20210528','20210604','20210611','20210618','20210625','20210709','20210723','20210806'],
    'MHS': ['20210219', '20210226', '20210305','20210319','20210409','20210430','20210507','20210521','20210528','20210604','20210611','20210618','20210625','20210709','20210723', '20210811'],
    'MKN': ['20210219', '20210226', '20210305','20210625','20210709','20210723','20210806'],
}

# Fail loudly for an unknown customer: otherwise `date_logs` would stay
# undefined (or keep a stale value from an earlier run of this cell).
if customer not in date_logs_by_customer:
    raise ValueError(
        f"Unknown customer {customer!r}; expected one of {sorted(date_logs_by_customer)}"
    )
date_logs = date_logs_by_customer[customer]

dates_indicator = '_'.join(date_logs)
path = r'logs'
# Folder of the most recent log, i.e. the last entry of `date_logs`
path_log_file = f'{path}/{customer}/{date_logs[-1]}'

# Last outcome file path: <date>_<customer in lower case>_<first 3 chars of use case>.txt
path_outcome_file = f'{path_log_file}/{date_logs[-1]}_{customer.lower()}_{use_case[0:3]}.txt'

### 3. Read the log data of the discharged cases

In [None]:
# Column types of the discharged log. DATETIME is read as plain text first and
# converted afterwards, because `pd.datetime` was removed in pandas 2.0 and the
# `date_parser` argument of `read_csv` is deprecated.
dtypes = {'CASEID':int,'OBS':str, 'BELIEF':float,'OBS_COMPLETE':str}
datetime_format = '%Y-%m-%d %H:%M:%S'

discharged_df = pd.read_csv(
    f'{path_log_file}/{use_case}/discharged_df_complete.csv',
    sep=";",
    dtype={**dtypes, 'DATETIME': str},
)

# Same slice ([0:20]) and format as before, applied to the whole column at once;
# missing values become NaT.
discharged_df['DATETIME'] = pd.to_datetime(
    discharged_df['DATETIME'].str[0:20], format=datetime_format
)

### 4. Keep the medical cases discharged more than 60 days before the date of the last outcome file

In [None]:
# Load the last outcome file (expected to list the positive medical cases)
# and keep only the rows that have no cancel date.
outcome_file = pd.read_csv(path_outcome_file, sep=';')
has_no_cancel_date = outcome_file['CANCELDATE'].isnull()
outcome_file = outcome_file[has_no_cancel_date]

In [None]:
# Label a prediction row 1 when its CASEID occurs among the FALLIDs of the
# outcome file, otherwise 0.
is_in_outcome_file = discharged_df['CASEID'].isin(outcome_file['FALLID'])
discharged_df['LABEL'] = np.where(is_in_outcome_file, 1, 0)

In [None]:
# Read the discharge list of every log date and merge them into one table
dis_logs = date_logs
dtypes = {'FALLID':int, 'AUFNDAT':str, 'ENTLDAT':str}
date_columns = ['AUFNDAT', 'ENTLDAT']

discharge_frames = []
for dis_log in dis_logs:
    discharged_file1 = pd.read_csv(
        f'{path}/{customer}/{dis_log}/{dis_log}_{customer.lower()}_discharges.txt',
        sep=';',
        dtype=dtypes,
    )
    # Convert the dates after reading: `pd.datetime` was removed in pandas 2.0
    # and the `date_parser` argument of `read_csv` is deprecated. Only the
    # first 10 characters (DD.MM.YYYY) are parsed, as before.
    for date_column in date_columns:
        discharged_file1[date_column] = pd.to_datetime(
            discharged_file1[date_column].str[0:10], format='%d.%m.%Y'
        )
    discharge_frames.append(discharged_file1)

if discharge_frames:
    # Concatenate once instead of once per file. Files listed later in
    # `dis_logs` come first, so that `drop_duplicates` below keeps the row
    # from the latest listed log, exactly as the former prepend-in-a-loop did.
    discharged_df_total = pd.concat(discharge_frames[::-1])
else:
    # No logs at all: keep an empty table with the expected columns
    discharged_df_total = pd.DataFrame(columns=['FALLID','AUFNDAT','ENTLDAT'])

# One row per case (FALLID)
discharged_file = discharged_df_total.drop_duplicates(subset="FALLID")

In [None]:
# The date of the outcome file is the leading YYYYMMDD part of its file name.
# It is taken from the file name itself rather than from fixed character
# offsets, so it no longer depends on the length of the customer/use-case names.
outcome_file_name = path_outcome_file.rsplit('/', 1)[-1]
outcome_date = pd.to_datetime(outcome_file_name.split('_', 1)[0], format="%Y%m%d")

# Define the code deadline as 60 days before the outcome date
code_deadline = outcome_date - timedelta(days=60)

# If a case was discharged more than 60 days before the outcome file, it is considered as coded
discharged_before_deadline = discharged_file['ENTLDAT'] < code_deadline
coded_cases = discharged_file.loc[discharged_before_deadline, 'FALLID']

# If a case is considered as coded, keep its label; otherwise set the label to -1, meaning unknown
is_coded = discharged_df['CASEID'].isin(coded_cases)
discharged_df['LABEL'] = np.where(is_coded, discharged_df['LABEL'], -1)

In [None]:
# Remove every medical case whose label is unknown (LABEL == -1)
unknown_label_rows = discharged_df.index[discharged_df['LABEL'] == -1]
discharged_df = discharged_df.drop(index=unknown_label_rows)

### 5. Extract the feature information from the observations

In [None]:
def obs_list(t):
    """Return the observation string ``t`` split on single spaces.

    Consecutive spaces therefore produce empty strings in the result.
    """
    tokens = t.split(' ')
    return tokens

# Tokenize every complete observation string; the find_* helpers below read OBS_SPLIT
complete_observations = discharged_df['OBS_COMPLETE']
discharged_df['OBS_SPLIT'] = complete_observations.map(obs_list)

In [None]:
# Analyze the different characteristics of the log data:
# start with the number of records that are left after the label filtering
num_records = discharged_df.shape[0]

def find_age_group_median(df):
    """Return the average age group over all observations, rounded to 1 decimal.

    For every entry of ``df['OBS_SPLIT']`` the first token containing
    ``AGE_GROUP`` is used and the number at its end is taken as the age group.

    NOTE(review): despite the function name, the arithmetic mean (not the
    median) is computed; this is kept unchanged for backward compatibility.

    Args:
        df: mapping/DataFrame whose ``'OBS_SPLIT'`` entry is an iterable of
            token lists.

    Returns:
        The rounded mean age group, or ``nan`` when no observation carries an
        ``AGE_GROUP`` token.

    Raises:
        ValueError: if an ``AGE_GROUP`` token does not end in a digit.
    """
    age_groups = []
    for obs in df['OBS_SPLIT']:
        token = next((s for s in obs if 'AGE_GROUP' in s), None)
        if token is None:
            # Records without age information are skipped instead of crashing
            continue
        # Use all trailing digits so that groups >= 10 are parsed completely;
        # without trailing digits int() raises ValueError as it did before.
        trailing_digits = re.search(r'\d+$', token)
        age_groups.append(int(trailing_digits.group() if trailing_digits else token[-1]))
    if not age_groups:
        # statistics.mean would raise on empty data
        return float('nan')
    return round(statistics.mean(age_groups), 1)

def find_sex(df):
    """Count the observations whose first GENDER token marks a female patient.

    Only the first token containing ``GENDER`` is inspected per observation;
    it counts when it contains ``GENDER-FEMALE``. Observations without any
    GENDER token are ignored.
    """
    female_total = 0
    for tokens in df['OBS_SPLIT']:
        first_gender = next((tok for tok in tokens if 'GENDER' in tok), None)
        if first_gender is not None and 'GENDER-FEMALE' in first_gender:
            female_total += 1
    return female_total

def find_historical(df):
    """Count the observations that contain at least one token with ``HISTORI`` in it."""
    return sum(
        1
        for tokens in df['OBS_SPLIT']
        if any('HISTORI' in tok for tok in tokens)
    )

def find_admission_kind(df):
    """Count the observations whose admission code is 1, 8, 14 or 41.

    Per observation only the first token containing ``ADMISSION`` is used; the
    text after its first ``-`` is converted to an integer code. Observations
    without an ADMISSION token are ignored.
    (The caller stores the result as the number of normal admissions.)
    """
    counted_codes = {1, 8, 14, 41}
    matching_total = 0
    for tokens in df['OBS_SPLIT']:
        first_admission = next((tok for tok in tokens if 'ADMISSION' in tok), None)
        if first_admission is None:
            continue
        _, _, code_text = first_admission.partition('-')
        if int(code_text) in counted_codes:
            matching_total += 1
    return matching_total

def find_medication(df):
    """Count the observations that contain at least one ``MEDICATION`` token."""
    with_medication = [
        tokens
        for tokens in df['OBS_SPLIT']
        if any('MEDICATION' in tok for tok in tokens)
    ]
    return len(with_medication)

def find_lab_results(df):
    """Count the observations that contain at least one ``LAB_RESULT`` token."""
    total = 0
    for tokens in df['OBS_SPLIT']:
        # Skip observations without any lab result token
        if not any('LAB_RESULT' in tok for tok in tokens):
            continue
        total += 1
    return total

def find_vital_sign(df):
    """Count the observations that contain at least one ``VITAL_SIGN`` token."""
    def has_vital_sign(tokens):
        # True as soon as one token mentions a vital sign
        return any('VITAL_SIGN' in tok for tok in tokens)

    return sum(1 for tokens in df['OBS_SPLIT'] if has_vital_sign(tokens))

def find_clinical_entities(df):
    """Count the observations with at least one clinical-entity token.

    A token qualifies when it contains ``SELECTED_KEYWORD`` or ``DISORDER``.
    """
    entity_markers = ('SELECTED_KEYWORD', 'DISORDER')
    total = 0
    for tokens in df['OBS_SPLIT']:
        if any(marker in tok for tok in tokens for marker in entity_markers):
            total += 1
    return total

# Compute every characteristic once over the remaining records
age_group_median = find_age_group_median(discharged_df)
fem_gender_total = find_sex(discharged_df)
historical_total = find_historical(discharged_df)
normal_admission = find_admission_kind(discharged_df)
medication = find_medication(discharged_df)
lab_results = find_lab_results(discharged_df)
vital_sign = find_vital_sign(discharged_df)
clinical_entities = find_clinical_entities(discharged_df)

total_records = len(discharged_df)
# Every record that is not counted as a normal admission is reported as an emergency admission
emergency_admission = total_records - normal_admission


def _count_with_share(count):
    # Format a count as "<count> (<rounded percentage of all records>%)"
    return f"{count} ({round(count / total_records * 100)}%)"


# Dataframe fill: a single-row table with one column per characteristic
characteristic_df = pd.DataFrame([{
    'N_of_records': total_records,
    'Age_group': age_group_median,
    'Female_sex': _count_with_share(fem_gender_total),
    'Normal_admission': _count_with_share(normal_admission),
    'Emergency_admission': _count_with_share(emergency_admission),
    'History': _count_with_share(historical_total),
    'Medication': _count_with_share(medication),
    'Lab_results': _count_with_share(lab_results),
    'Vital_sign': _count_with_share(vital_sign),
    'Named_Clinical_Entities': _count_with_share(clinical_entities),
}])


In [None]:
# Save the characteristic table as a ';'-separated CSV in the use-case folder of the last log
characteristic_table_path = f'{path_log_file}/{use_case}/characteristic_table_{customer}_{use_case}.csv'
characteristic_df.to_csv(characteristic_table_path, sep=";", index=False)