# Preprocessing the public Event Log

## Reading Raw Event data in XES format as an Event Log ##

In [3]:
import pandas as pd
import pm4py
from pm4py.objects.log.importer.xes import importer as xes_importer
from pm4py.objects.conversion.log import converter as xes_converter

hospital_log_all = xes_importer.apply('Data\Hospital_log_all.xes')

print("Number of traces present in the full event log:", len(hospital_log_all))

num_of_events = 0
for trace in hospital_log_all:
    num_of_events = num_of_events + len(trace)

print("Number of events in full event log:", num_of_events)

parsing log, completed traces ::   0%|          | 0/1143 [00:00<?, ?it/s]

Number of traces present in the full event log: 1143
Number of events in full event log: 150291


In [4]:
# from IPython import get_ipython
# ip = get_ipython()
# ip.magic("reload_ext autoreload")
# ip.magic("autoreload 2")

  ip.magic("reload_ext autoreload")
  ip.magic("autoreload 2")


## Getting a subset of the full event log based on traces ##

We consider 33.3% of the traces in the full event log. This is for the ease of computation later.

In [6]:
from Preprocessing import DivideDatasets

hospital_log_initial = DivideDatasets.get_subset(hospital_log_all, 3)

print("Number of traces present in the event log subset:", len(hospital_log_initial))

num_of_events = 0
for trace in hospital_log_initial:
     num_of_events = num_of_events + len(trace)

print("Number of events in the event log subset:", num_of_events)

pm4py.write_xes(hospital_log_initial, "Data\Processed\Hospital_Log_Initial.xes")

Number of traces present in the event log subset: 381
Number of events in the event log subset: 51451




exporting log, completed traces ::   0%|          | 0/381 [00:00<?, ?it/s]

## Translating the event log content into English

Mainly the activity name is in English. For a better clarity while analysing, we convert that to English

In [40]:
#hospital_log_initial_temp = hospital_log_initial

In [60]:
from Preprocessing import TranslationManager

hospital_log_translated = TranslationManager.translate(hospital_log_initial, 'en')

pm4py.write_xes(hospital_log_translated, "Data\Processed\Hospital_Log_Translated.xes")

KeyboardInterrupt: 

In [7]:
hospital_log_translated = xes_importer.apply('Data\Processed\Hospital_Log_Translated.xes')

parsing log, completed traces ::   0%|          | 0/381 [00:00<?, ?it/s]

In [8]:
len(hospital_log_translated)

381

## Remove duplicates

In [50]:
## Here the approach is to remove the duplicate events by converting it into a Pandas dataframe.

import pm4py.utils as pm4py_utils

# Define new lists for trace and event data
all_traces = []
all_events = []

# Extract trace and event data from the event log object
for trace in hospital_log_translated:
    # Get trace name for each trace
    trace_name = trace.attributes['concept:name']

    # Make the event name as unique as possible
    for event in trace:
        all_traces.append(trace_name)
        # Extract event name
        event_name = event['concept:name'] + ' - ' + event['Producer code'] + str(event['Specialism code']) + ' - ' + str(event['time:timestamp'])
        if 'org:group' in event:
            event_name = event_name + ' - ' + event['org:group'] 
        if 'Section' in event:
            event_name = event_name + ' - ' + event['Section']
            
        all_events.append(event_name)

# Raise error if the lists length are different
if len(all_traces) != len(all_events):
    raise ValueError("Lengths of all_traces and all_events do not match.")

# Create a Pandas DataFrame & remove duplicates
hospital_log_translated_df = pd.DataFrame({'trace': all_traces, 'event': all_events})
#print("All Events:", len(hospital_log_translated_df))
hospital_log_translated_df = hospital_log_translated_df.drop_duplicates(subset=['trace', 'event'])
#print("Dup removed:", len(hospital_log_translated_df))

# Initialize an empty event log object
hospital_log_dup_removed = pm4py_utils.EventLog()
trace_dict = {}

# Iterate over the unique traces found in the DataFrame
for trace_name in hospital_log_translated_df['trace'].unique():
    # Filter events corresponding to the current trace
    trace_events = hospital_log_translated_df[hospital_log_translated_df['trace'] == trace_name]['event']
    # Create a new trace object and add it to the dictionary
    new_trace = pm4py_utils.Trace()
    hospital_log_dup_removed.append(new_trace)
    # Copy trace attributes
    for key, value in hospital_log_translated_df[hospital_log_translated_df['trace'] == trace_name].iloc[0].items():
        if key != 'event':  # Skip the 'event' column
            new_trace.attributes[key] = value
    # Associate the trace name with all events within the trace
    for event_name in trace_events:
        # Create a new event object
        new_event = pm4py_utils.Event()
        # Copy event attributes
        for key, value in hospital_log_translated_df[hospital_log_translated_df['event'] == event_name].iloc[0].items():
            if key != 'trace':  # Skip the 'trace' column
                new_event[key] = value
        new_trace.append(new_event)

print("Number of traces present in the duplicate removed event log:", len(hospital_log_dup_removed))

num_of_events = 0
for trace in hospital_log_dup_removed:
     num_of_events = num_of_events + len(trace)

print("Number of events in the duplicate removed event:", num_of_events)

# write into file
pm4py.write_xes(hospital_log_dup_removed, "Data\Processed\Hospital_Log_Dup_Removed.xes")


Number of traces present in the duplicate removed event log: 381
Number of events in the duplicate removed event: 44982




exporting log, completed traces ::   0%|          | 0/381 [00:00<?, ?it/s]

In [53]:
import pm4py.utils as pm4py_utils

# Define new lists for trace and event data
all_traces = []
all_events = []

# Extract trace and event data from the event log object
for trace in hospital_log_translated:
    # Get trace name for each trace
    trace_name = trace.attributes['concept:name']

    # Make the event name as unique as possible
    for event in trace:
        all_traces.append(trace_name)
        # Extract event name
        event_name = event['concept:name'] + ' - ' + event['Producer code'] + str(event['Specialism code']) + ' - ' + str(event['time:timestamp'])
        if 'org:group' in event:
            event_name = event_name + ' - ' + event['org:group'] 
        if 'Section' in event:
            event_name = event_name + ' - ' + event['Section']
            
        all_events.append(event_name)

# Raise error if the lists length are different
if len(all_traces) != len(all_events):
    raise ValueError("Lengths of all_traces and all_events do not match.")

# Create a Pandas DataFrame & remove duplicates
hospital_log_translated_df = pd.DataFrame({'trace': all_traces, 'event': all_events})
print("All Events:", len(hospital_log_translated_df))
hospital_log_translated_df = hospital_log_translated_df.drop_duplicates(subset=['trace', 'event'])
print("Du removed:", len(hospital_log_translated_df))

hospital_log_dup_removed = pm4py_utils.EventLog()

# Iterate over the original event log to reconstruct the duplicate removed event log
for trace in hospital_log_translated:
    # Get trace name
    trace_name = trace.attributes['concept:name']
    
    # Check if trace_name already exists in the DataFrame
    if trace_name in hospital_log_translated_df['trace'].values:
        # Create a new trace object
        new_trace = pm4py_utils.Trace()
        hospital_log_dup_removed.append(new_trace)
        # Copy trace attributes
        for key, value in trace.attributes.items():
            new_trace.attributes[key] = value
        # Associate the trace name with all events within the trace
        for event in trace:
            event_name = event['concept:name'] + ' - ' + event['Producer code'] + str(event['Specialism code']) + ' - ' + str(event['time:timestamp'])
            if 'org:group' in event:
                event_name = event_name + ' - ' + event['org:group']
            if 'Section' in event:
                event_name = event_name + ' - ' + event['Section']
            
            # Check if the event name is in the cleaned DataFrame
            if event_name in hospital_log_translated_df[hospital_log_translated_df['trace'] == trace_name]['event'].values:
                # Create a new event object
                new_event = pm4py_utils.Event()
                # Copy event attributes
                for key, value in event.items():
                    new_event[key] = value
                # Add the event to the trace
                new_trace.append(new_event)

print("Number of traces present in the duplicate removed event log:", len(hospital_log_dup_removed))

num_of_events = 0
for trace in hospital_log_dup_removed:
     num_of_events = num_of_events + len(trace)

print("Number of events in the duplicate removed event:", num_of_events)

# write into file
pm4py.write_xes(hospital_log_dup_removed, "Data\Processed\Hospital_Log_Dup_Removed.xes")

All Events: 51451
Du removed: 44982
Number of traces present in the duplicate removed event log: 381
Number of events in the duplicate removed event: 51451


exporting log, completed traces ::   0%|          | 0/381 [00:00<?, ?it/s]

In [55]:
import pm4py.utils as pm4py_utils

# Define neww lists for trace and event data
all_traces = []
all_events = []

# Get trace and event data from the event log object
for trace in hospital_log_translated:
    # Get trace name for each trace
    trace_name = trace.attributes['concept:name']

    # Make the event name as unique as possible
    for event in trace:
        all_traces.append(trace_name)
        # Extract event name
        event_name = event['concept:name'] + ' - ' + event['Producer code'] + str(event['Specialism code']) + ' - ' + str(event['time:timestamp'])
        if 'org:group' in event:
            event_name = event_name + ' - ' + event['org:group'] 
        if 'Section' in event:
            event_name = event_name + ' - ' + event['Section']
            
        all_events.append(event_name)

# Raise error if the liists length are different
if len(all_traces) != len(all_events):
    raise ValueError("Lengths of all_traces and all_events do not match.")

# Creat a Pandas DataFrame & remove duplicates
hospital_log_translated_df = pd.DataFrame({'trace': all_traces, 'event': all_events})
#print("All Events:", len(hospital_log_translated_df))
hospital_log_translated_df = hospital_log_translated_df.drop_duplicates(subset=['trace', 'event'])
#print("Dup removed removed:", len(hospital_log_translated_df))

hospital_log_dup_removed = pm4py_utils.EventLog()

# Initialize a dictionary to keep track of added events for each trace
events_included = {}

# Iterate over the original event log to reconstruct the dupliacate removed event log
for trace in hospital_log_translated:
    # Get trace name
    trace_name = trace.attributes['concept:name']
    
    # Check if trace_name already exists in the DataFrame
    if trace_name in hospital_log_translated_df['trace'].values:
        # Create a new trace object
        new_trace = pm4py_utils.Trace()
        hospital_log_dup_removed.append(new_trace)
        # Copy trace attributes
        for key, value in trace.attributes.items():
            new_trace.attributes[key] = value
        # Initialize a set to keep track of added events for the current trace
        events_included[trace_name] = set()
        # Associate the trace name with all events within the trace
        for event in trace:
            event_name = event['concept:name'] + ' - ' + event['Producer code'] + str(event['Specialism code']) + ' - ' + str(event['time:timestamp'])
            if 'org:group' in event:
                event_name = event_name + ' - ' + event['org:group']
            if 'Section' in event:
                event_name = event_name + ' - ' + event['Section']
            
            # Check if the event name is in the cleaned DataFrame and not already added
            if event_name in hospital_log_translated_df[hospital_log_translated_df['trace'] == trace_name]['event'].values \
                    and event_name not in events_included[trace_name]:
                # Create a new event object
                new_event = pm4py_utils.Event()
                # Copy event attributes
                for key, value in event.items():
                    new_event[key] = value
                # Add the event to the trace
                new_trace.append(new_event)
                # Add the event name to the set of added events for the current trace
                events_included[trace_name].add(event_name)

print("Number of traces present in the duplicate removed event log:", len(hospital_log_dup_removed))

num_of_events = 0
for trace in hospital_log_dup_removed:
     num_of_events = num_of_events + len(trace)

print("Number of events in the duplicate removed event:", num_of_events)

# write into file
pm4py.write_xes(hospital_log_dup_removed, "Data\Processed\Hospital_Log_Dup_Removed.xes")

All Events: 51451
Du removed: 44982
Number of traces present in the duplicate removed event log: 381
Number of events in the duplicate removed event: 44982




exporting log, completed traces ::   0%|          | 0/381 [00:00<?, ?it/s]