# Preprocessing the public Event Log

## Reading Raw Event data in XES format as an Event Log ##

In [None]:
import pandas as pd
import pm4py
from pm4py.objects.log.importer.xes import importer as xes_importer
from pm4py.objects.conversion.log import converter as xes_converter

# Load the complete hospital event log. Forward slashes keep the relative path
# valid on every OS and avoid the invalid "\H" escape sequence that a
# backslash-separated, non-raw string literal would contain.
hospital_log_all = xes_importer.apply('Data/Hospital_log_all.xes')

print("Number of traces present in the full event log:", len(hospital_log_all))

# The event total of the log is the sum of the lengths of its traces.
num_of_events = sum(len(trace) for trace in hospital_log_all)

print("Number of events in full event log:", num_of_events)

### Utility code to automatically load newly compiled classes into the Jupyter notebook

In [None]:
from IPython import get_ipython

# Turn on the autoreload extension (mode 2) for this kernel.
ip = get_ipython()
# `InteractiveShell.magic()` is deprecated; `run_line_magic(name, args)` is the
# supported way to invoke a line magic programmatically.
ip.run_line_magic("reload_ext", "autoreload")
ip.run_line_magic("autoreload", "2")

## Getting a subset of the full event log based on traces ##

We consider 33.3% of the traces in the full event log. This is for the ease of computation later.

In [None]:
from Preprocessing import DivideDatasets

# Reduce the full log to a subset of its traces.
# NOTE(review): the notebook text states 33.3% of the traces; presumably the
# argument 3 means "one third" - confirm against DivideDatasets.get_subset.
hospital_log_initial = DivideDatasets.get_subset(hospital_log_all, 3)

print("Number of traces present in the event log subset:", len(hospital_log_initial))

# The event total of the subset is the sum of the lengths of its traces.
num_of_events = sum(len(trace) for trace in hospital_log_initial)

print("Number of events in the event log subset:", num_of_events)

# Persist the subset so a later session can resume from this point.
# Forward slashes avoid the invalid "\P" / "\H" escape sequences and work on every OS.
pm4py.write_xes(hospital_log_initial, "Data/Processed/Hospital_Log_Initial.xes")

### When starting from the middle (with the already divided event log)

In [None]:
import pandas as pd
import pm4py
from pm4py.objects.log.importer.xes import importer as xes_importer
from pm4py.objects.conversion.log import converter as xes_converter

# Reload the previously saved subset. Forward slashes avoid the invalid
# "\P" / "\H" escape sequences and keep the path valid on every OS.
hospital_log_initial = xes_importer.apply('Data/Processed/Hospital_Log_Initial.xes')

## Translating the event log content into English

The activity names are mainly in Dutch. For better clarity during analysis, we translate them into English.

In [None]:
#hospital_log_initial_temp = hospital_log_initial

In [None]:
from Preprocessing import TranslationManager

# Translate the log content, passing 'en' as the target language code.
hospital_log_translated = TranslationManager.translate(hospital_log_initial, 'en')

# Persist the translated log so the translation does not have to be repeated.
# Forward slashes avoid the invalid "\P" / "\H" escape sequences and work on every OS.
pm4py.write_xes(hospital_log_translated, "Data/Processed/Hospital_Log_Translated.xes")

### When starting from the middle (since translation is costly)

In [None]:
import pandas as pd
import pm4py
from pm4py.objects.log.importer.xes import importer as xes_importer
from pm4py.objects.conversion.log import converter as xes_converter

# Reload the previously saved translated log. Forward slashes avoid the invalid
# "\P" / "\H" escape sequences and keep the path valid on every OS.
hospital_log_translated = xes_importer.apply('Data/Processed/Hospital_Log_Translated.xes')

In [None]:
# Number of traces in the translated event log (shown as the cell's output)
len(hospital_log_translated)

## Remove duplicates

The logic we use here is:
  1. Extract traces & events into two lists from the original EventLog
  2. Create a Pandas dataframe using the above two lists as columns & remove duplicates from it
  3. Rebuild the EventLog from the original EventLog, adding only the values present in the dataframe

In [None]:
import pm4py.utils as pm4py_utils

# Parallel lists holding, for every event, its trace name and its event key
all_traces = []
all_events = []


def _build_event_key(event):
    """Return the string used to decide whether two events of a trace are duplicates.

    The key concatenates the event's 'concept:name', 'Producer code',
    'Specialism code' and 'time:timestamp' values, followed by 'org:group'
    and 'Section' when the event carries them.
    """
    event_key = event['concept:name'] + ' - ' + event['Producer code'] + str(event['Specialism code']) + ' - ' + str(event['time:timestamp'])
    if 'org:group' in event:
        event_key = event_key + ' - ' + event['org:group']
    if 'Section' in event:
        event_key = event_key + ' - ' + event['Section']
    return event_key


# Get trace and event data from the event log object.
# NOTE: the subset log (not the translated log) is used on purpose here; the
# author's remark in this cell is that the translated log "seems giving low hits".
for trace in hospital_log_initial:
    trace_name = trace.attributes['concept:name']
    for event in trace:
        all_traces.append(trace_name)
        all_events.append(_build_event_key(event))

# Create a Pandas DataFrame with one row per event and drop repeated (trace, event) pairs
hospital_log_initial_df = pd.DataFrame({'trace': all_traces, 'event': all_events})
hospital_log_initial_df = hospital_log_initial_df.drop_duplicates(subset=['trace', 'event'])

# Index the surviving event keys by trace name once, so the rebuild loop below
# does constant-time set lookups instead of re-filtering the DataFrame per event.
unique_events_by_trace = {}
for trace_name, event_key in zip(hospital_log_initial_df['trace'], hospital_log_initial_df['event']):
    unique_events_by_trace.setdefault(trace_name, set()).add(event_key)

hospital_log_dup_removed = pm4py_utils.EventLog()

# Event keys already copied into the rebuilt log, per trace name
events_included = {}

# Iterate over the original event log to reconstruct the duplicate-removed event log
for trace in hospital_log_initial:
    trace_name = trace.attributes['concept:name']

    # Trace names that never made it into the DataFrame (no event recorded
    # under that name) are not carried over.
    if trace_name not in unique_events_by_trace:
        continue

    # Create the new trace and copy the trace-level attributes
    new_trace = pm4py_utils.Trace()
    hospital_log_dup_removed.append(new_trace)
    for key, value in trace.attributes.items():
        new_trace.attributes[key] = value

    allowed_keys = unique_events_by_trace[trace_name]
    seen_keys = events_included[trace_name] = set()
    for event in trace:
        event_key = _build_event_key(event)
        # Keep only the first occurrence of each event key within the trace
        if event_key in allowed_keys and event_key not in seen_keys:
            new_event = pm4py_utils.Event()
            for key, value in event.items():
                new_event[key] = value
            new_trace.append(new_event)
            seen_keys.add(event_key)

print("Number of traces present in the duplicate removed event log:", len(hospital_log_dup_removed))

num_of_events = sum(len(trace) for trace in hospital_log_dup_removed)

print("Number of events in the duplicate removed event:", num_of_events)

# Write into file. Forward slashes avoid the invalid "\P" / "\H" escape
# sequences and keep the path valid on every OS.
pm4py.write_xes(hospital_log_dup_removed, "Data/Processed/Hospital_Log_Dup_Removed.xes")

### When starting from the middle (duplicates removed)

In [9]:
import pandas as pd
import pm4py
from pm4py.objects.log.importer.xes import importer as xes_importer
from pm4py.objects.conversion.log import converter as xes_converter

# Reload the previously saved duplicate-removed log. Forward slashes avoid the
# invalid "\P" / "\H" escape sequences and keep the path valid on every OS.
hospital_log_dup_removed = xes_importer.apply('Data/Processed/Hospital_Log_Dup_Removed.xes')

  hospital_log_dup_removed = xes_importer.apply('Data\Processed\Hospital_Log_Dup_Removed.xes')


parsing log, completed traces ::   0%|          | 0/381 [00:00<?, ?it/s]

## Remove traces with no events

In [None]:
import xml.etree.ElementTree as ET

def count_traces_without_events(Logg, report_after=379):
    """Count the traces of an event log that contain no events.

    Args:
        Logg: Iterable of traces, where each trace is itself an iterable of events.
        report_after: Traces whose 1-based position is greater than this value
            get their position and event count printed. The default of 379
            keeps the notebook's existing output; pass None to print nothing.

    Returns:
        A tuple ``(count, all_traces)``: the number of traces without any
        event and the total number of traces visited.
    """
    count = 0
    all_traces = 0  # stays 0 when the log is empty
    for all_traces, trace in enumerate(Logg, start=1):
        # Count by iteration so any iterable of events is accepted
        present = sum(1 for _ in trace)
        if present == 0:
            count += 1
        if report_after is not None and all_traces > report_after:
            print(all_traces, ": ", present)

    return count, all_traces

# Run the empty-trace check on the duplicate-removed log loaded in the cell above.
# NOTE(review): confirm this is the log the check is meant for; no other
# candidate log for this step is defined in the notebook cells before this one.
traces_without_events, all_tracesss = count_traces_without_events(hospital_log_dup_removed)
print("Number of traces:", all_tracesss)
print("Number of traces without any events:", traces_without_events)


# Frequent Pattern Mining

## Finding most frequent patterns

### Using FP-Growth algorithm

We use the FP-Growth algorithm here to find frequent flow variants.
When creating the transactions to be fed into FP-Growth, we use concept:name as the key of each event.
We also send the transaction list to the FP-Growth algorithm in chunks to reduce the processing complexity,
and later merge the results by removing duplicates, etc.

In [None]:
import pandas as pd
from collections import defaultdict
from FrequentPatternMining import FPGrowthHandler, TransactionManager


# Get the list of transactions based on the events in each trace
transactions_list = TransactionManager.create_transactions(hospital_log_dup_removed)

# Split the transactions list into chunks for computational ease
chunk_size = 5
num_of_chunks = len(transactions_list) // chunk_size + (len(transactions_list) % chunk_size > 0)

# Per-chunk results are collected here and concatenated once after the loop,
# instead of growing a DataFrame inside the loop.
frequent_variants_per_chunk = []

# Process chunks by looping
for i in range(num_of_chunks):
    start_idx = i * chunk_size
    end_idx = min((i + 1) * chunk_size, len(transactions_list))
    transactions_chunk = transactions_list[start_idx:end_idx]

    # Mine frequent variants for the current chunk using the FP-Growth algorithm
    print(f"Processing chunk {i+1} of {num_of_chunks}...")
    frequent_variants_chunk = FPGrowthHandler.mine_frequent_variants(transactions_chunk, min_support=0.1)

    # Keep this chunk's result for the merge below
    frequent_variants_per_chunk.append(frequent_variants_chunk)
    print(f"Mining finshed for chunk {i+1} of {num_of_chunks}...")

# Merge all chunk results; fall back to an empty frame when there were no chunks
if frequent_variants_per_chunk:
    frequent_variants_all = pd.concat(frequent_variants_per_chunk)
else:
    frequent_variants_all = pd.DataFrame(columns=['support', 'itemsets'])

# Remove duplicates and sort the combined frequent variants.
# NOTE(review): supports of the same itemset are summed across chunks; if the
# handler reports support as a per-chunk fraction, the sum is not a global
# support value - confirm the intended interpretation.
frequent_variants_all = frequent_variants_all.groupby('itemsets').agg({'support': 'sum'}).reset_index()
frequent_variants_all = frequent_variants_all.sort_values(by='support', ascending=False)

print("Full Frequent Variants List: ")
frequent_variants_all

### Using Apriori

In [None]:
import pandas as pd
from collections import defaultdict
from FrequentPatternMining import AprioriHandler, TransactionManager


# Get the list of transactions based on the events in each trace
transactions_list = TransactionManager.create_transactions(hospital_log_dup_removed)

# Split the transactions list into chunks for computational ease
chunk_size = 5
num_of_chunks = len(transactions_list) // chunk_size + (len(transactions_list) % chunk_size > 0)

# Per-chunk results are collected here and concatenated once after the loop,
# instead of growing a DataFrame inside the loop.
frequent_variants_per_chunk = []

# Process chunks by looping
for i in range(num_of_chunks):
    start_idx = i * chunk_size
    end_idx = min((i + 1) * chunk_size, len(transactions_list))
    transactions_chunk = transactions_list[start_idx:end_idx]

    # Mine frequent variants for the current chunk using the Apriori algorithm
    print(f"Processing chunk {i+1} of {num_of_chunks}...")
    frequent_variants_chunk = AprioriHandler.mine_frequent_variants(transactions_chunk, min_support=0.1)

    # Keep this chunk's result for the merge below
    frequent_variants_per_chunk.append(frequent_variants_chunk)
    print(f"Mining finshed for chunk {i+1} of {num_of_chunks}...")

# Merge all chunk results; fall back to an empty frame when there were no chunks
if frequent_variants_per_chunk:
    frequent_variants_all = pd.concat(frequent_variants_per_chunk)
else:
    frequent_variants_all = pd.DataFrame(columns=['support', 'itemsets'])

# Remove duplicates and sort the combined frequent variants.
# NOTE(review): supports of the same itemset are summed across chunks; if the
# handler reports support as a per-chunk fraction, the sum is not a global
# support value - confirm the intended interpretation.
frequent_variants_all = frequent_variants_all.groupby('itemsets').agg({'support': 'sum'}).reset_index()
frequent_variants_all = frequent_variants_all.sort_values(by='support', ascending=False)

print("Full Frequent Variants List: ")
frequent_variants_all

### Direct method (with no ML)

Since neither the FP-Growth nor the Apriori algorithm above was able to produce the final list of frequent flow variants,
we now use a direct text-matching approach that involves no machine learning.

In [11]:
import pandas as pd
from collections import defaultdict
from FrequentPatternMining import DirectPatternMatchHandler


# Direct text matching over the duplicate-removed log; "DESC" is handed to the
# handler as the requested sort order.
event_flows_with_counts_df = DirectPatternMatchHandler.mine_frequent_variants(
    hospital_log_dup_removed,
    sort_order="DESC",
)

print("Full Frequent Variants List: ")
# Last expression of the cell, rendered as the cell's output
event_flows_with_counts_df

Full Frequent Variants List: 


Unnamed: 0,Event Flow,Count
0,"(vervolgconsult poliklinisch, administratief t...",11
1,"(vervolgconsult poliklinisch, administratief t...",6
2,"(1e consult poliklinisch, administratief tarie...",4
3,"(vervolgconsult poliklinisch, administratief t...",3
4,"(vervolgconsult poliklinisch, administratief t...",3
...,...,...
342,"(e.c.g. - elektrocardiografie, coupe ter ...",1
343,"(echografie - genitalia interna, thorax, 1e c...",1
344,"(aanname laboratoriumonderzoek, ca-125 mbv mei...",1
345,"(aanname laboratoriumonderzoek, aanname labora...",1
