# Basic analysis of the nature of the event log

## Reading Raw Event data in XES format as an Event Log ##

In [None]:
import pandas as pd
import pm4py
from pm4py.objects.log.importer.xes import importer as xes_importer

# Load the complete hospital event log from its XES file
hospital_log_all = xes_importer.apply('Data\\Hospital_log_all.xes')

print('Number of traces present in the full event log:', len(hospital_log_all))

# The total event count is the sum of the lengths of all traces
num_of_events = sum(len(trace) for trace in hospital_log_all)

print('Number of events in full event log:', num_of_events)

### Utility code to automatically reload newly compiled classes into the Jupyter notebook

In [None]:
from IPython import get_ipython

# get_ipython() returns None when no IPython shell is running, so guard against it
ip = get_ipython()
if ip is not None:
    # InteractiveShell.magic() is deprecated; run_line_magic() is the supported call
    ip.run_line_magic("reload_ext", "autoreload")
    # Mode 2: reload all modules before executing code
    ip.run_line_magic("autoreload", "2")

In [None]:
%reload_ext autoreload
# Configure autoreload to automatically reload all modules
%autoreload 2

## Getting a subset of the full event log based on traces ##

We consider 33.3% of the traces in the full event log. This is for the ease of computation later.

In [None]:
from Preprocessing import DivideDatasets

# NOTE(review): the argument 3 presumably selects one third of the traces, matching the
# 33.3% stated in the notebook text — confirm against DivideDatasets.get_subset
hospital_log_initial = DivideDatasets.get_subset(hospital_log_all, 3)

print('Number of traces present in the event log subset:', len(hospital_log_initial))

# The total event count is the sum of the lengths of all traces in the subset
num_of_events = sum(len(trace) for trace in hospital_log_initial)

print('Number of events in the event log subset:', num_of_events)

# Backslashes are escaped explicitly: '\P' and '\H' are invalid escape sequences that
# only work by accident and trigger a warning on recent Python versions
pm4py.write_xes(hospital_log_initial, 'Data\\Processed\\Hospital_Log_Initial.xes')

## Numerical & statistical analysis of the selected dataset

### Converting to a Pandas dataframe

In [None]:
import pandas as pd
from pm4py.objects.conversion.log import converter as log_converter

hospital_log_analysis_df = log_converter.apply(hospital_log_initial, variant=log_converter.Variants.TO_DATA_FRAME)

In [None]:
hospital_log_analysis_df.shape[0]

In [None]:
hospital_log_analysis_df

### Number of events per trace

In [None]:
import pandas as pd

# Get the count group by 'case:concept:name'
event_count_per_trace = hospital_log_analysis_df.groupby('case:concept:name').size()

event_count_per_trace.describe().round()

### The oldest & the newest trace

In [None]:
print('Oldest trace: ',hospital_log_analysis_df['time:timestamp'].min())
print('Newest trace: ',hospital_log_analysis_df['time:timestamp'].max())

### Age range of the patients

In [None]:
hospital_log_analysis_df['case:Age'].describe().round()

### Most widely used event

In [None]:
import pandas as pd

# Get the count for events names
event_counts = hospital_log_analysis_df['concept:name'].value_counts()

print("Most widely recorded event:", event_counts.idxmax())
print("Number of occurrences:", event_counts.max())

### The org group which the most number of patients visited

In [None]:
import pandas as pd

# Group pandas Data Frame by the 'case:concept_name' 
# and get the first instance of 'org:group'
org_group_usage= hospital_log_analysis_df.groupby('case:concept:name')['org:group'].first()

# Count the occurrences of each 'org:group'
org_group_counts = org_group_usage.value_counts()

# Get the 'org:group' inthe most of traces
most_recorded_org_group = org_group_counts.idxmax()
max_org_group_count = org_group_counts.max()

print("Org group recorded in most of the traces:", most_recorded_org_group)
print("Number of instances:", max_org_group_count)

### ``When starting from middle with divided

In [None]:
import pandas as pd
import pm4py
from pm4py.objects.log.importer.xes import importer as xes_importer
from pm4py.objects.conversion.log import converter as xes_converter

# Reload the previously saved subset; backslashes are escaped explicitly because
# '\P' and '\H' are invalid escape sequences
hospital_log_initial = xes_importer.apply('Data\\Processed\\Hospital_Log_Initial.xes')

# Preprocessing the public Event Log

## Translating the event log content into English

The activity names are mainly in Dutch. For better clarity while analysing, we translate them to English.

In [None]:
#hospital_log_initial_temp = hospital_log_initial

In [None]:
from Preprocessing import TranslationManager

# Translate the log content with target language code 'en'
hospital_log_translated = TranslationManager.translate(hospital_log_initial, 'en')

# Persist the translated log; backslashes are escaped explicitly because
# '\P' and '\H' are invalid escape sequences
pm4py.write_xes(hospital_log_translated, "Data\\Processed\\Hospital_Log_Translated.xes")

### ``When starting from middle (since Translation is costly)

In [None]:
import pandas as pd
import pm4py
from pm4py.objects.log.importer.xes import importer as xes_importer
from pm4py.objects.conversion.log import converter as xes_converter

# Reload the previously saved translated log; backslashes are escaped explicitly because
# '\P' and '\H' are invalid escape sequences
hospital_log_translated = xes_importer.apply('Data\\Processed\\Hospital_Log_Translated.xes')

In [None]:
len(hospital_log_translated)

## Remove duplicates

The logic here we are using is:
  1. Extract traces & events into two lists from the original EventLog
  2. Create a Pandas dataframe using above two lists as columns & remove duplicates from there
  3. Rebuild the EventLog from the original EventLog, adding only the values present in the dataframe

In [None]:
import pm4py.utils as pm4py_utils


def _event_identity(event):
    """Return the string used to decide whether two events of a trace are duplicates.

    The identity combines the activity name, producer code, specialism code and
    timestamp, plus the organisational group and section when the event has them.
    """
    event_name = (event['concept:name'] + ' - ' + event['Producer code']
                  + str(event['Specialism code']) + ' - ' + str(event['time:timestamp']))
    if 'org:group' in event:
        event_name = event_name + ' - ' + event['org:group']
    if 'Section' in event:
        event_name = event_name + ' - ' + event['Section']
    return event_name


# NOTE: de-duplication runs on hospital_log_initial rather than hospital_log_translated;
# the original author noted that the translated log gave low hits.

# (trace name, event identity) pairs of the events that are kept
all_traces = []
all_events = []

hospital_log_dup_removed = pm4py_utils.EventLog()

# Event identities already kept, per trace name
events_included = {}

# Single pass over the log: a set per trace replaces the per-event DataFrame
# filtering, which made the original reconstruction quadratic in the event count
for trace in hospital_log_initial:
    trace_name = trace.attributes['concept:name']

    # Traces without events are left out of the result
    if len(trace) == 0:
        continue

    new_trace = pm4py_utils.Trace()
    # Copy trace attributes
    for key, value in trace.attributes.items():
        new_trace.attributes[key] = value

    seen_in_trace = set()
    for event in trace:
        event_name = _event_identity(event)
        # Only the first occurrence of each identity within a trace is kept
        if event_name in seen_in_trace:
            continue
        seen_in_trace.add(event_name)

        new_event = pm4py_utils.Event()
        # Copy event attributes
        for key, value in event.items():
            new_event[key] = value
        new_trace.append(new_event)

        all_traces.append(trace_name)
        all_events.append(event_name)

    events_included[trace_name] = seen_in_trace
    hospital_log_dup_removed.append(new_trace)

# Tabular view of the kept (trace, event) pairs
hospital_log_initial_df = pd.DataFrame({'trace': all_traces, 'event': all_events})
hospital_log_initial_df = hospital_log_initial_df.drop_duplicates(subset=['trace', 'event'])

print("Number of traces present in the duplicate removed event log:", len(hospital_log_dup_removed))

num_of_events = sum(len(trace) for trace in hospital_log_dup_removed)

print('Number of events in the duplicate removed event:', num_of_events)

# Write into file; backslashes are escaped explicitly because '\P' and '\H'
# are invalid escape sequences
pm4py.write_xes(hospital_log_dup_removed, 'Data\\Processed\\Hospital_Log_Dup_Removed.xes')

### ``When starting from middle (dup removed)

In [None]:
import pandas as pd
import pm4py
from pm4py.objects.log.importer.xes import importer as xes_importer
from pm4py.objects.conversion.log import converter as xes_converter

# Reload the previously saved duplicate-free log; backslashes are escaped explicitly
# because '\P' and '\H' are invalid escape sequences
hospital_log_dup_removed = xes_importer.apply('Data\\Processed\\Hospital_Log_Dup_Removed.xes')

## Remove traces with no events

In [None]:
import xml.etree.ElementTree as ET

def count_traces_without_events(Logg):
    """Count the traces in *Logg* that contain no events.

    Args:
        Logg: An iterable of traces, each of which is iterable over its events.

    Returns:
        A ``(empty_traces, total_traces)`` tuple.
    """
    total_traces = 0
    empty_traces = 0
    for trace in Logg:
        total_traces += 1
        # A trace is empty when iterating over it yields nothing at all
        if not any(True for _ in trace):
            empty_traces += 1

    return empty_traces, total_traces

# Count empty traces in the duplicate-free log and report both totals
traces_without_events, all_tracesss = count_traces_without_events(hospital_log_dup_removed)
print(f'Number of traces: {all_tracesss}')
print(f'Number of traces without any events: {traces_without_events}')


## Remove traces with extreme number of events, as outliers

### Plot Boxplot for number of events present in traces

In [None]:
len(hospital_log_dup_removed)

In [None]:
from pm4py.objects.conversion.log import converter as log_converter
import matplotlib.pyplot as plt

# Flatten the duplicate-free event log into a pandas DataFrame (one row per event)
hospital_log_dup_removed_df = log_converter.apply(
    hospital_log_dup_removed,
    variant=log_converter.Variants.TO_DATA_FRAME,
)

# Number of events recorded for each case, first as a Series and then as a two-column frame
trace_counts = hospital_log_dup_removed_df.groupby('case:concept:name').size()
trace_lengths = trace_counts.reset_index(name='event_count')

# Boxplot of the per-trace event counts
fig, ax = plt.subplots(figsize=(8, 5))
ax.boxplot(trace_lengths['event_count'])
ax.set_title('Number of events per trace')
ax.set_xlabel('Trace')
ax.set_ylabel('Number of Events')
plt.show()

### Calculate the upper bound for the number of events per trace

In [None]:
# First and third quartiles of the per-trace event counts, and the range between them
counts_col = trace_lengths['event_count']
Q1 = counts_col.quantile(0.25)
Q3 = counts_col.quantile(0.75)
IQR = Q3 - Q1

# Upper fence: 1.5 interquartile ranges above the third quartile
upper_bound_val = Q3 + IQR * 1.5

print('Upper Bound for number of events per trace:', round(upper_bound_val))

### Number of traces beyond the upper bound of the number of events

In [None]:
# Select the traces whose event count exceeds the rounded upper bound
upper_bound_events = round(upper_bound_val)
records_beyond_upper_bound = trace_lengths[trace_lengths['event_count'] > upper_bound_events]

# Report the count and the share of the duplicate-free log. The messages name the
# computed bound; the original percentage line wrongly said "800".
print('Number of traces beyond the upper bound of events per trace:', records_beyond_upper_bound.shape[0])
print('Percentage of traces beyond', upper_bound_events, 'events:',
      round(records_beyond_upper_bound.shape[0] / len(hospital_log_dup_removed) * 100, 2), '%')


### Analyzing traces beyond upper bound

#### Traces with number of events above 800

In [None]:
# Select the traces holding more than 800 events
records_beyond_upper_bound = trace_lengths.loc[trace_lengths['event_count'] > 800]

# How many such traces there are, and their share of the duplicate-free log
outlier_count = len(records_beyond_upper_bound)
outlier_share = round(outlier_count / len(hospital_log_dup_removed) * 100, 2)
print('Number of traces with beyond 800 events:', outlier_count)
print('Percentage of traces with beyond 800 events: ', outlier_share, '%')

#### Traces with number of events above 600

In [None]:
# Select the traces holding more than 600 events
records_beyond_amount = trace_lengths.loc[trace_lengths['event_count'] > 600]

# How many such traces there are, and their share of the duplicate-free log
outlier_count = len(records_beyond_amount)
outlier_share = round(outlier_count / len(hospital_log_dup_removed) * 100, 2)
print('Number of traces with beyond 600 events:', outlier_count)
print('Percentage of traces with beyond 600 events: ', outlier_share, '%')

#### Traces with number of events above 500

In [None]:
# Select the traces holding more than 500 events
records_beyond_amount = trace_lengths.loc[trace_lengths['event_count'] > 500]

# How many such traces there are, and their share of the duplicate-free log
outlier_count = len(records_beyond_amount)
outlier_share = round(outlier_count / len(hospital_log_dup_removed) * 100, 2)
print('Number of traces with beyond 500 events:', outlier_count)
print('Percentage of traces with beyond 500 events: ', outlier_share, '%')

#### Traces with number of events above 400

In [None]:
# Select the traces holding more than 400 events
records_beyond_amount = trace_lengths.loc[trace_lengths['event_count'] > 400]

# How many such traces there are, and their share of the duplicate-free log
outlier_count = len(records_beyond_amount)
outlier_share = round(outlier_count / len(hospital_log_dup_removed) * 100, 2)
print('Number of traces with beyond 400 events: ', outlier_count)
print('Percentage of traces with beyond 400 events: ', outlier_share, '%')

Remove traces beyond 500 as outliers

### Calculate lower bound for the number of events per trace

In [None]:
# First and third quartiles of the per-trace event counts, and the range between them
counts_col = trace_lengths['event_count']
Q1 = counts_col.quantile(0.25)
Q3 = counts_col.quantile(0.75)
IQR = Q3 - Q1

# Lower fence: 1.5 interquartile ranges below the first quartile
lower_bound_val = Q1 - IQR * 1.5

print('Lower Bound for number of events per trace:', round(lower_bound_val))

Having traces around this value is unrealistic

### Analyzing traces below lower bound

#### Traces with number of events below 2

In [None]:
# Select the traces holding fewer than 2 events
records_below_lower_bound = trace_lengths.loc[trace_lengths['event_count'] < 2]

# How many such traces there are, and their share of the duplicate-free log
short_trace_count = len(records_below_lower_bound)
short_trace_share = round(short_trace_count / len(hospital_log_dup_removed) * 100, 2)
print('Number of traces with below 2 events: ', short_trace_count)
print('Percentage of traces with below 2 events: ', short_trace_share, '%')

#### Traces with number of events below 3

In [None]:
# Select the traces holding fewer than 3 events
records_below_lower_bound = trace_lengths[trace_lengths['event_count'] < 3]

# Count & the percentage. The messages now name the threshold actually applied (3);
# the original text was copied from the "below 2" cell.
print('Number of traces with below 3 events: ', records_below_lower_bound.shape[0])
print('Percentage of traces with below 3 events: ', round(records_below_lower_bound.shape[0]/len(hospital_log_dup_removed)*100,2), '%')

Remove traces below 2 as outliers

### Remove traces beyond number of events 500 & below 2 as outliers

#### Remove the selected traces

In [None]:
# Keep the traces with more than 1 and at most 500 events
per_trace_counts = trace_lengths['event_count']
filtered_traces = trace_lengths[(per_trace_counts > 1) & (per_trace_counts <= 500)]

# Restrict the event-level DataFrame to the events of the kept cases
kept_case_ids = filtered_traces['case:concept:name']
hospital_log_cleansed_df = hospital_log_dup_removed_df[
    hospital_log_dup_removed_df['case:concept:name'].isin(kept_case_ids)
]

# Number of distinct cases that remain
hospital_log_cleansed_df['case:concept:name'].nunique()


#### Reconstruct the event log after removal of outliers

In [None]:
# Imported here as well so the cell also works when the notebook is resumed from the
# saved duplicate-free log
import pm4py.utils as pm4py_utils

hospital_log_cleansed = pm4py_utils.EventLog()

# Names of the traces that survived the outlier filter
cleansed_trace_names_set = set(hospital_log_cleansed_df['case:concept:name'])

# Rebuild from the duplicate-free log. The original iterated hospital_log_initial, which
# copied the removed duplicates back in and made the result disagree with
# hospital_log_cleansed_df (itself derived from the duplicate-free log).
for trace in hospital_log_dup_removed:
    trace_name = trace.attributes['concept:name']

    # Skip traces that were filtered out as outliers
    if trace_name not in cleansed_trace_names_set:
        continue

    new_trace = pm4py_utils.Trace()
    # Copy trace attributes
    for key, value in trace.attributes.items():
        new_trace.attributes[key] = value

    for event in trace:
        new_event = pm4py_utils.Event()
        # Copy event attributes
        for key, value in event.items():
            new_event[key] = value
        new_trace.append(new_event)

    hospital_log_cleansed.append(new_trace)

In [None]:
len(hospital_log_cleansed)

In [None]:
hospital_log_cleansed_df

In [None]:
# Persist the cleansed log; backslashes are escaped explicitly because
# '\P' and '\H' are invalid escape sequences
pm4py.write_xes(hospital_log_cleansed, "Data\\Processed\\Hospital_Log_Cleansed.xes")

### ``When starting from cleansed

In [None]:
from pm4py.objects.log.importer.xes import importer as xes_importer

# Reload the previously saved cleansed log; backslashes are escaped explicitly because
# '\P' and '\H' are invalid escape sequences
hospital_log_cleansed = xes_importer.apply('Data\\Processed\\Hospital_Log_Cleansed.xes')

In [None]:
hospital_log_cleansed

# Frequent Pattern Mining

## Finding most frequent patterns

### Using FP-Growth algorithm

We use the FP-Growth algorithm here to find frequent flow variants.
When creating the transactions to be fed into FP-Growth, we use concept:name as the key of each event.
We also send the transaction list to the FP-Growth algorithm in chunks to reduce the processing complexity,
and later merge the results by removing duplicates, etc.

In [None]:
import pandas as pd
from collections import defaultdict
from FrequentPatternMining import FPGrowthHandler, TransactionManager


# Get the list of transactions based on events in each trace
transactions_list = TransactionManager.create_transactions(hospital_log_cleansed)

# Split transactions list into chunks for the computational ease
chunk_size = 25
# Ceiling division, so a final partial chunk still counts as a chunk
num_of_chunks = (len(transactions_list) + chunk_size - 1) // chunk_size

# Per-chunk results are collected and concatenated once after the loop
chunk_results = []

# Process chunks by looping
for i in range(num_of_chunks):
    start_idx = i * chunk_size
    # Slicing past the end of the list is safe, so no explicit clamp is needed
    transactions_chunk = transactions_list[start_idx:start_idx + chunk_size]

    # Mine frequent variants for the current chunk using FP-Growth algorithm
    print("Processing chunk", i+1, " of ", num_of_chunks, "...")
    frequent_variants_chunk = FPGrowthHandler.mine_frequent_variants(transactions_chunk, min_support=0.1)

    # Keep this chunk's result; the original referenced the undefined name
    # 'frequent_itemsets_chunk' here and raised NameError
    chunk_results.append(frequent_variants_chunk)
    print("Mining finished for chunk", i+1, " of ", num_of_chunks, "...")

# Merge the frequent variants of all chunks (empty frame when there were no chunks)
if chunk_results:
    frequent_variants_all = pd.concat(chunk_results, ignore_index=True)
else:
    frequent_variants_all = pd.DataFrame(columns=['support', 'itemsets'])

# Remove duplicates in combined frequent variants
# NOTE(review): this sums per-chunk supports; if 'support' is a per-chunk fraction the
# sum is not a support over the whole log — confirm this is the intended ranking score
frequent_variants_all = frequent_variants_all.groupby('itemsets').agg({'support':'sum'}).reset_index()
# Sort variants
frequent_variants_final = frequent_variants_all.sort_values(by='support', ascending=False)

print("Full Frequent Variants List: ")
frequent_variants_final

### Using Apriori

In [None]:
import pandas as pd
from collections import defaultdict
from FrequentPatternMining import AprioriHandler, TransactionManager


# Get the list of transactions based on events in each trace
transactions_list = TransactionManager.create_transactions(hospital_log_cleansed)

# Split transactions list into chunks for the computational ease
chunk_size = 5
# Ceiling division, so a final partial chunk still counts as a chunk
num_of_chunks = (len(transactions_list) + chunk_size - 1) // chunk_size

# Per-chunk results are collected and concatenated once after the loop
chunk_results = []

# Process chunks by looping
for i in range(num_of_chunks):
    start_idx = i * chunk_size
    # Slicing past the end of the list is safe, so no explicit clamp is needed
    transactions_chunk = transactions_list[start_idx:start_idx + chunk_size]

    # Mine frequent variants for the current chunk using Apriori algorithm
    print("Processing chunk", i+1, " of ", num_of_chunks, "...")
    frequent_variants_chunk = AprioriHandler.mine_frequent_variants(transactions_chunk, min_support=0.1)

    # Keep this chunk's result; the original referenced the undefined name
    # 'frequent_itemsets_chunk' here and raised NameError
    chunk_results.append(frequent_variants_chunk)
    print("Mining finished for chunk", i+1, " of ", num_of_chunks, "...")

# Merge the frequent variants of all chunks (empty frame when there were no chunks)
if chunk_results:
    frequent_variants_all = pd.concat(chunk_results, ignore_index=True)
else:
    frequent_variants_all = pd.DataFrame(columns=['support', 'itemsets'])

# Remove duplicates in combined frequent variants
# NOTE(review): this sums per-chunk supports; if 'support' is a per-chunk fraction the
# sum is not a support over the whole log — confirm this is the intended ranking score
frequent_variants_all = frequent_variants_all.groupby('itemsets').agg({'support': 'sum'}).reset_index()
# Sort variants
frequent_variants_all = frequent_variants_all.sort_values(by='support', ascending=False)

print("Full Frequent Variants List: ")
frequent_variants_all

### Direct method (with no ML)

Since it was unable to find the final list of frequent flow variants using both the above FP-Growth & the Apriori algorithms,
now we go for a direct text mapping approach without any machine learning approach involved.

In [None]:
import pandas as pd
from collections import defaultdict
from FrequentPatternMining import DirectPatternMatchHandler


# Mine frequent variants using Direct text match
event_flows_with_counts_df = DirectPatternMatchHandler.mine_frequent_variants(hospital_log_cleansed, sort_order="DESC")


print("Full Frequent Variants List: ")
event_flows_with_counts_df

In [None]:
event_flows_with_counts_df.to_csv("Data\\Processed\\EventFlowsWithCounts.csv", index=False)

In [None]:
import matplotlib.pyplot as plt

# # Extracting the event flow and counts
# event_flow = event_flows_with_counts_df['Event Flow']
# counts = event_flows_with_counts_df['Count']

# Draw the variant counts, in DataFrame row order, as a single line
fig, ax = plt.subplots(figsize=(10, 5))
variant_positions = range(len(event_flows_with_counts_df['Event Flow']))
ax.plot(variant_positions, event_flows_with_counts_df['Count'], color='skyblue', linestyle='-')

ax.set_ylabel('Flow Variant Count')
ax.set_title('Distribution of Event Flow Varients ')
# The x axis only carries row positions, so its ticks are hidden
ax.set_xticks([])
plt.show()

#### Traces with their event flow

In [None]:
import pandas as pd
from FrequentPatternMining import DirectPatternMatchHandler


# Get the traces along with theri flow (for future usage)
trace_event_flows_df = DirectPatternMatchHandler.get_trace_variants(hospital_log_cleansed)

In [None]:
trace_event_flows_df[trace_event_flows_df['Event Flow']==('verlosk.-gynaec.   jaarkaart kosten-out', 'vervolgconsult poliklinisch', 'administratief tarief       - eerste pol')]

In [None]:
trace_event_flows_df.to_csv("Data\\Processed\\Trace_Event_Flows.csv", index=False)

### ``When starting from middle (flow variants with counts)

In [None]:
import pandas as pd

event_flows_with_counts_df = pd.read_csv("Data\\Processed\\EventFlowsWithCounts.csv")

In [None]:
event_flows_with_counts_df

## Set weights for each event flow variant based on its frequency 

In [None]:
from Clustering import ClusterUtil

# Assign weights for each variant. The original passed the undefined name
# 'event_flows_with_counts_df1', which raised NameError.
event_flows_with_counts_df = ClusterUtil.assign_weights(event_flows_with_counts_df)

event_flows_with_counts_df

In [None]:
event_flows_with_counts_df

In [None]:
event_flows_with_counts_df.to_csv('Data\\Processed\\FlowVariantsWithWeights.csv', index=False)

### Assign a unique variant ID for ease of process

In [None]:
from Clustering import ClusterUtil

# Assign a unque variant id for each flow 
# focusing plotting later to avoid huge texts
event_flows_with_counts_df = ClusterUtil.assign_unique_variant_ids(event_flows_with_counts_df, col_name='Variant No')

In [None]:
event_flows_with_counts_df

In [None]:
event_flows_with_counts_df.to_csv('Data\\Processed\\FlowVariantsWithWeightsAndVariantNo.csv', index=False)

# Qualifying percentage of most frequent flow variants

## 66% of the frequent flow variants

In [None]:

# Number of rows that make up the first 66% of the records (fraction truncated)
all_records = len(event_flows_with_counts_df)
index_66 = int(all_records * 0.66)

# Slice off that leading portion of the DataFrame
event_flows_with_counts_df_66 = event_flows_with_counts_df.iloc[:index_66]

In [None]:
event_flows_with_counts_df_66

## 50% of the frequent flow variants

In [None]:
# Number of rows that make up the first 50% of the records (fraction truncated)
all_records = len(event_flows_with_counts_df)
index_50 = int(all_records * 0.5)

# Slice off that leading portion of the DataFrame
event_flows_with_counts_df_50 = event_flows_with_counts_df.iloc[:index_50]

In [None]:
event_flows_with_counts_df_50

## 33% of the frequent flow variants

In [None]:
# Number of rows that make up the first 33% of the records (fraction truncated)
all_records = len(event_flows_with_counts_df)
index_33 = int(all_records * 0.33)

# Slice off that leading portion of the DataFrame
event_flows_with_counts_df_33 = event_flows_with_counts_df.iloc[:index_33]

In [None]:
event_flows_with_counts_df_33

# Clustering
Here we perform Agglomerative Hierarchical clustering with the average linkage method.

## 66% of frequent flow variants

### Calculate distances among flow variant pairs

Here we use the Jaccard Similarity method for this purpose

In [None]:
from Clustering import ClusterHandler

# Calculate Jaccard similarity for flows
pairwise_distance_list_66, event_binary_matrix_np_66 = ClusterHandler.calculate_distances_with_jaccard(event_flows_with_counts_df_66)

In [None]:
pairwise_distance_list_66

### Perform Agglomerative Hierarchical Clustering

Here we use the average linkage method which considers distance among all other nodes.

In [None]:
from Clustering import ClusterHandler

# Perform Agglomerative Hierarchical clustering with Average Linkage method
clustered_flows_66 = ClusterHandler.perform_hierarchical_clustering(pairwise_distance_list_66, method='average')

In [None]:
clustered_flows_66

### Generate dendrogram for the hierarchical tree

In [None]:
from Clustering import ClusterHandler

# plot & save the dendogram
ClusterHandler.generate_dendogram(clustered_flows_66, event_flows_with_counts_df_66,
                                        title='Agglomerative Hierarchical Clustering Dendrogram - 66 percent of variants',
                                        col_name='Variant No',
                                        xlabel= 'Event Flow Variants', 
                                        ylabel= 'Distance', 
                                        save_location= 'Visualize\\HierarchicalClusteringDendrogram66.png')


### Finding optimal number of clusters

#### Using Silhouette Scores

Optimum number of clusters decided based on the maximum Silhouette score

In [None]:
from Clustering import ClusterHandler

# plot silhouett scores & return the optimum cluster number
opt_num_of_clusters_66 = ClusterHandler.plot_silhouette_scores(clustered_flows_66, event_binary_matrix_np_66)

#### Using Elbow method

Here we find the optimum number of clusters from where we find the elbow shape in the graph

In [None]:
from Clustering import ClusterHandler

# plot elbow method values 
ClusterHandler.plot_elbow_method(clustered_flows_66, event_binary_matrix_np_66)

### Cluster variants based on their weights

In [None]:
event_flows_with_counts_df_66

#### Divide into 2 Clusters based on Silhouette analysis

In [None]:
from Clustering import ClusterHandler

# Cluster based on weights
all_clusters_66 = ClusterHandler.form_cluster_variants(event_flows_with_counts_df_66, clustered_flows_66, num_of_clusters=2, max_variants=200)
cluster_66_1, cluster_66_2 = all_clusters_66

In [None]:
len(cluster_66_1)

In [None]:
len(cluster_66_2)

In [None]:
from Clustering import ClusterUtil

# Write each cluster as a csv to disk
ClusterUtil.write_cluster_to_csv(cluster_66_1,"Data\\Processed\\Cluster66_Sil_1.csv")
ClusterUtil.write_cluster_to_csv(cluster_66_2,"Data\\Processed\\Cluster66_Sil_2.csv")
#ClusterUtil.write_cluster_to_csv(cluster3,"Data\\Processed\\Cluster3_Event_Log.csv")

#### Divide into 4 Clusters based on Elbow method

In [None]:
from Clustering import ClusterHandler

# Cluster based on weights
all_clusters_66 = ClusterHandler.form_cluster_variants(event_flows_with_counts_df_66, clustered_flows_66, num_of_clusters=4, max_variants=200)
cluster_66_Elbow_1, cluster_66_Elbow_2, cluster_66_Elbow_3, cluster_66_Elbow_4 = all_clusters_66

In [None]:
len(cluster_66_Elbow_1)

In [None]:
len(cluster_66_Elbow_2)

In [None]:
len(cluster_66_Elbow_3)

In [None]:
len(cluster_66_Elbow_4)

## 50% of frequent flow variants

### Calculate distances among flow variant pairs

Here we use Jaccard Similaritiy method for this purpose

In [None]:
from Clustering import ClusterHandler

# Calculate Jaccard similarity for flows
pairwise_distance_list_50, event_binary_matrix_np_50 = ClusterHandler.calculate_distances_with_jaccard(event_flows_with_counts_df_50)

In [None]:
pairwise_distance_list_50

### Perform Agglomerative Hierarchical Clustering

Here we use the average linkage method which considers distance among all other nodes

In [None]:
from Clustering import ClusterHandler

# Perform Agglomerative Hierarchical clustering with Average Linkage method
clustered_flows_50 = ClusterHandler.perform_hierarchical_clustering(pairwise_distance_list_50, method='average')

In [None]:
clustered_flows_50

### Generate dendrogram for the heirachical tree

In [None]:
from Clustering import ClusterHandler

# plot & save the dendogram
ClusterHandler.generate_dendogram(clustered_flows_50, event_flows_with_counts_df_50,
                                        title='Agglomerative Hierarchical Clustering Dendrogram - 50 percent of variants',
                                        col_name='Variant No',
                                        xlabel= 'Event Flow Variants', 
                                        ylabel= 'Distance', 
                                        save_location= 'Visualize\\HierarchicalClusteringDendrogram50.png')

### Finding optimal number of clusters

#### Using Silhouette Scores

Optimum number of clusters decided based on the maximum Silhouette score

In [None]:
from Clustering import ClusterHandler

# plot silhouett scores & return the optimum cluster number
opt_num_of_clusters_50 = ClusterHandler.plot_silhouette_scores(clustered_flows_50, event_binary_matrix_np_50)

#### Using Elbow method

Here we find the optimum number of clusters from where we find the elbow shape in the graph

In [None]:
from Clustering import ClusterHandler

# plot elbow method values 
ClusterHandler.plot_elbow_method(clustered_flows_50, event_binary_matrix_np_50)

### Cluster variants based on their weights

In [None]:
event_flows_with_counts_df_50

#### Divide into 5 Clusters based on Elbow method

In [None]:
from Clustering import ClusterHandler

# Cluster based on weights
all_clusters_50 = ClusterHandler.form_cluster_variants(event_flows_with_counts_df_50, clustered_flows_50, num_of_clusters=5, max_variants=125)
cluster_50_1, cluster_50_2, cluster_50_3, cluster_50_4, cluster_50_5 = all_clusters_50

In [None]:
len(cluster_50_1)

In [None]:
len(cluster_50_2)

In [None]:
len(cluster_50_3)

In [None]:
len(cluster_50_4)

In [None]:
len(cluster_50_5)

## 33% of frequent flow variants

### Calculate distances among flow variant pairs

Here we use Jaccard Similaritiy method for this purpose

In [None]:
from Clustering import ClusterHandler

# Calculate Jaccard similarity for flows
pairwise_distance_list_33, event_binary_matrix_np_33 = ClusterHandler.calculate_distances_with_jaccard(event_flows_with_counts_df_33)

### Perform Agglomerative Hierarchical Clustering

Here we use the average linkage method which considers distance among all other nodes

In [None]:
from Clustering import ClusterHandler

# Perform Agglomerative Hierarchical clustering with Average Linkage method
clustered_flows_33 = ClusterHandler.perform_hierarchical_clustering(pairwise_distance_list_33, method='average')

In [None]:
clustered_flows_33

### Generate dendrogram for the heirachical tree

In [None]:
from Clustering import ClusterHandler

# Plot & save the dendrogram for the 33% subset
# (the handler method itself is spelled generate_dendogram)
ClusterHandler.generate_dendogram(clustered_flows_33, event_flows_with_counts_df_33,
                                        title='Agglomerative Hierarchical Clustering Dendrogram - 33 percent of variants',
                                        col_name='Variant No',
                                        xlabel= 'Event Flow Variants', 
                                        ylabel= 'Distance', 
                                        save_location= 'Visualize\\HierarchicalClusteringDendrogram33.png')

### Finding optimal number of clusters

#### Using Silhouette Scores

The optimum number of clusters is the one that yields the maximum Silhouette score.

In [None]:
from Clustering import ClusterHandler

# Plot the silhouette scores for the 33% subset and keep the optimum cluster number
# returned by the helper
opt_num_of_clusters_33 = ClusterHandler.plot_silhouette_scores(clustered_flows_33, event_binary_matrix_np_33)

#### Using Elbow method

Here the optimum number of clusters is read from the point where the curve bends into an elbow shape.

In [None]:
from Clustering import ClusterHandler

# Plot the elbow-method values for the 33% subset.
# Nothing is captured from this call; the cluster count is passed explicitly when the clusters are formed.
ClusterHandler.plot_elbow_method(clustered_flows_33, event_binary_matrix_np_33)

### Cluster variants based on their weights

In [None]:
# Display the variant table that is passed to form_cluster_variants for the 33% subset
event_flows_with_counts_df_33

#### Divide into 5 Clusters based on Elbow method

In [None]:
from Clustering import ClusterHandler

# Cluster based on weights: form 5 variant clusters from the variant table and the clustering result.
# max_variants is not passed here (unlike the 50% subset), so the handler's default applies.
all_clusters_33 = ClusterHandler.form_cluster_variants(event_flows_with_counts_df_33, clustered_flows_33, num_of_clusters=5)
# Unpack the 5 clusters into individual names used by the process-discovery cells
cluster_33_1, cluster_33_2, cluster_33_3, cluster_33_4, cluster_33_5 = all_clusters_33

In [None]:
# Size of cluster 1 (33% subset)
len(cluster_33_1)

In [None]:
# Size of cluster 2 (33% subset)
len(cluster_33_2)

In [None]:
# Size of cluster 3 (33% subset)
len(cluster_33_3)

In [None]:
# Size of cluster 4 (33% subset)
len(cluster_33_4)

In [None]:
# Size of cluster 5 (33% subset)
len(cluster_33_5)

# Process Discovery

In [None]:
from pathlib import Path

# Load the trace-to-event-flow table. The path is built from its components so the cell
# also works on platforms where '\\' is not a path separator; 'Case ID' is read as str.
trace_event_flows_df = pd.read_csv(Path("Data") / "Processed" / "Trace_Event_Flows.csv", dtype={'Case ID': str})

In [None]:
# Check the column dtypes after loading ('Case ID' was requested as str in read_csv)
trace_event_flows_df.dtypes

## Process Discovery - Full Event Log

### Generate the process model

In [None]:
from ProcessDiscovery import ProcessDiscoveryHandler

# Discover the process model for the full (cleansed) event log.
# NOTE(review): the second argument is True here and False for every per-cluster DataFrame —
# presumably it marks the input as an event log rather than a DataFrame; confirm in ProcessDiscoveryHandler.
hospital_log_full_process_model = ProcessDiscoveryHandler.discover_process(hospital_log_cleansed, True)

### Discover the process

## 66% of frequent flow variants

### With 2 clusters (Silhouette)

#### Process Discovery - Cluster1

##### Prepare final event log for the cluster using actual trace records for process discovery

In [None]:
# Display the trace-to-event-flow table loaded from Trace_Event_Flows.csv
trace_event_flows_df

In [None]:
from ProcessDiscovery import ProcessDiscoveryHandler

# Create the event log (with all attributes) for cluster 1 (66%, Silhouette) from the cluster's
# variants, the trace-to-event-flow table and the cleansed full log
cluster_66_1_event_log_actual_df = ProcessDiscoveryHandler.create_cluster_full_event_log(cluster_66_1, trace_event_flows_df, hospital_log_cleansed)

In [None]:
# Display the event log built for cluster 1 (66%, Silhouette)
cluster_66_1_event_log_actual_df

##### Discover the process

In [None]:
from ProcessDiscovery import ProcessDiscoveryHandler

# Discover the process for the cluster 1 log (66%, Silhouette).
# The second argument is False for per-cluster inputs; the full-log call passes True.
cluster_66_1_processs_model = ProcessDiscoveryHandler.discover_process(cluster_66_1_event_log_actual_df, False)


In [None]:
# Display the model object returned by discover_process for cluster 1
cluster_66_1_processs_model

##### Visualize the discovered process

In [None]:
# Import here as every other cell does, so this cell does not rely on an earlier cell's import
from ProcessDiscovery import ProcessDiscoveryHandler

# Visualize the process for the cluster 1 log (66%, Silhouette) and save it to save_file
save_file = 'Visualize\\ProcessDiscovery\\Percent66\\Cluster_66_Sil_1_Process_Visualization.png'
ProcessDiscoveryHandler.visualize_process(cluster_66_1_processs_model, save_file)

#### Process Discovery - Cluster2

##### Prepare final event log for the cluster using actual trace records for process discovery

In [None]:
from ProcessDiscovery import ProcessDiscoveryHandler

# Create the event log (with all attributes) for cluster 2 (66%, Silhouette) from the cluster's
# variants, the trace-to-event-flow table and the cleansed full log
cluster_66_2_event_log_actual_df = ProcessDiscoveryHandler.create_cluster_full_event_log(cluster_66_2, trace_event_flows_df, hospital_log_cleansed)

In [None]:
# Display the event log built for cluster 2 (66%, Silhouette)
cluster_66_2_event_log_actual_df

##### Discover the process

In [None]:
from ProcessDiscovery import ProcessDiscoveryHandler

# Discover the process for the cluster 2 log (66%, Silhouette).
# The second argument is False for per-cluster inputs; the full-log call passes True.
cluster_66_2_processs_model = ProcessDiscoveryHandler.discover_process(cluster_66_2_event_log_actual_df, False)

In [None]:
# Import here as every other cell does, so this cell does not rely on an earlier cell's import
from ProcessDiscovery import ProcessDiscoveryHandler

# Visualize the process for the cluster 2 log (66%, Silhouette) and save it to save_file
save_file = 'Visualize\\ProcessDiscovery\\Percent66\\Cluster_66_Sil_2_Process_Visualization.png'
ProcessDiscoveryHandler.visualize_process(cluster_66_2_processs_model, save_file)

### With 4 clusters (Elbow)

#### Process Discovery - Cluster1

##### Prepare final event log for the cluster using actual trace records for process discovery

In [None]:
from ProcessDiscovery import ProcessDiscoveryHandler

# Create the event log (with all attributes) for cluster 1 (66%, Elbow) from the cluster's
# variants, the trace-to-event-flow table and the cleansed full log
cluster_66_Elbow_1_event_log_actual_df = ProcessDiscoveryHandler.create_cluster_full_event_log(cluster_66_Elbow_1, trace_event_flows_df, hospital_log_cleansed)

In [None]:
# Display the event log built for cluster 1 (66%, Elbow)
cluster_66_Elbow_1_event_log_actual_df

##### Discover the process

In [None]:
from ProcessDiscovery import ProcessDiscoveryHandler

# Discover the process for the cluster 1 log (66%, Elbow).
# The second argument is False for per-cluster inputs; the full-log call passes True.
cluster_66_Elbow_1_processs_model = ProcessDiscoveryHandler.discover_process(cluster_66_Elbow_1_event_log_actual_df, False)

#### Process Discovery - Cluster2

##### Prepare final event log for the cluster using actual trace records for process discovery

In [None]:
from ProcessDiscovery import ProcessDiscoveryHandler

# Create the event log (with all attributes) for cluster 2 (66%, Elbow) from the cluster's
# variants, the trace-to-event-flow table and the cleansed full log
cluster_66_Elbow_2_event_log_actual_df = ProcessDiscoveryHandler.create_cluster_full_event_log(cluster_66_Elbow_2, trace_event_flows_df, hospital_log_cleansed)

In [None]:
# Display the event log built for cluster 2 (66%, Elbow)
cluster_66_Elbow_2_event_log_actual_df

##### Discover the process

In [None]:
from ProcessDiscovery import ProcessDiscoveryHandler

# Discover the process for the cluster 2 log (66%, Elbow).
# The second argument is False for per-cluster inputs; the full-log call passes True.
cluster_66_Elbow_2_processs_model = ProcessDiscoveryHandler.discover_process(cluster_66_Elbow_2_event_log_actual_df, False)

#### Process Discovery - Cluster3

##### Prepare final event log for the cluster using actual trace records for process discovery

In [None]:
from ProcessDiscovery import ProcessDiscoveryHandler

# Create the event log (with all attributes) for cluster 3 (66%, Elbow) from the cluster's
# variants, the trace-to-event-flow table and the cleansed full log
cluster_66_Elbow_3_event_log_actual_df = ProcessDiscoveryHandler.create_cluster_full_event_log(cluster_66_Elbow_3, trace_event_flows_df, hospital_log_cleansed)

In [None]:
# Display the event log built for cluster 3 (66%, Elbow)
cluster_66_Elbow_3_event_log_actual_df

##### Discover the process

In [None]:
from ProcessDiscovery import ProcessDiscoveryHandler

# Discover the process for the cluster 3 log (66%, Elbow).
# The second argument is False for per-cluster inputs; the full-log call passes True.
cluster_66_Elbow_3_processs_model = ProcessDiscoveryHandler.discover_process(cluster_66_Elbow_3_event_log_actual_df, False)

#### Process Discovery - Cluster4

##### Prepare final event log for the cluster using actual trace records for process discovery

In [None]:
from ProcessDiscovery import ProcessDiscoveryHandler

# Create the event log (with all attributes) for cluster 4 (66%, Elbow) from the cluster's
# variants, the trace-to-event-flow table and the cleansed full log
cluster_66_Elbow_4_event_log_actual_df = ProcessDiscoveryHandler.create_cluster_full_event_log(cluster_66_Elbow_4, trace_event_flows_df, hospital_log_cleansed)

In [None]:
# Display the event log built for cluster 4 (66%, Elbow)
cluster_66_Elbow_4_event_log_actual_df

##### Discover the process

In [None]:
from ProcessDiscovery import ProcessDiscoveryHandler

# Discover the process for the cluster 4 log (66%, Elbow).
# The second argument is False for per-cluster inputs; the full-log call passes True.
cluster_66_Elbow_4_processs_model = ProcessDiscoveryHandler.discover_process(cluster_66_Elbow_4_event_log_actual_df, False)

## 50% of frequent flow variants

### With 5 clusters

#### Process Discovery - Cluster1

##### Prepare final event log for the cluster using actual trace records for process discovery

In [None]:
from ProcessDiscovery import ProcessDiscoveryHandler

# Create the event log (with all attributes) for cluster 1 (50% subset) from the cluster's
# variants, the trace-to-event-flow table and the cleansed full log
cluster_50_1_event_log_actual_df = ProcessDiscoveryHandler.create_cluster_full_event_log(cluster_50_1, trace_event_flows_df, hospital_log_cleansed)

In [None]:
# Display the event log built for cluster 1 (50% subset)
cluster_50_1_event_log_actual_df

##### Discover the process

In [None]:
from ProcessDiscovery import ProcessDiscoveryHandler

# Discover the process for the cluster 1 log (50% subset).
# The second argument is False for per-cluster inputs; the full-log call passes True.
cluster_50_1_processs_model = ProcessDiscoveryHandler.discover_process(cluster_50_1_event_log_actual_df, False)

#### Process Discovery - Cluster2

##### Prepare final event log for the cluster using actual trace records for process discovery

In [None]:
from ProcessDiscovery import ProcessDiscoveryHandler

# Create the event log (with all attributes) for cluster 2 (50% subset) from the cluster's
# variants, the trace-to-event-flow table and the cleansed full log
cluster_50_2_event_log_actual_df = ProcessDiscoveryHandler.create_cluster_full_event_log(cluster_50_2, trace_event_flows_df, hospital_log_cleansed)

In [None]:
# Display the event log built for cluster 2 (50% subset)
cluster_50_2_event_log_actual_df

##### Discover the process

In [None]:
from ProcessDiscovery import ProcessDiscoveryHandler

# Discover the process for the cluster 2 log (50% subset).
# The second argument is False for per-cluster inputs; the full-log call passes True.
cluster_50_2_processs_model = ProcessDiscoveryHandler.discover_process(cluster_50_2_event_log_actual_df, False)

#### Process Discovery - Cluster3

##### Prepare final event log for the cluster using actual trace records for process discovery

In [None]:
from ProcessDiscovery import ProcessDiscoveryHandler

# Create the event log (with all attributes) for cluster 3 (50% subset) from the cluster's
# variants, the trace-to-event-flow table and the cleansed full log
cluster_50_3_event_log_actual_df = ProcessDiscoveryHandler.create_cluster_full_event_log(cluster_50_3, trace_event_flows_df, hospital_log_cleansed)

In [None]:
# Display the event log built for cluster 3 (50% subset)
cluster_50_3_event_log_actual_df

##### Discover the process

In [None]:
from ProcessDiscovery import ProcessDiscoveryHandler

# Discover the process for the cluster 3 log (50% subset).
# The second argument is False for per-cluster inputs; the full-log call passes True.
cluster_50_3_processs_model = ProcessDiscoveryHandler.discover_process(cluster_50_3_event_log_actual_df, False)

#### Process Discovery - Cluster4

##### Prepare final event log for the cluster using actual trace records for process discovery

In [None]:
from ProcessDiscovery import ProcessDiscoveryHandler

# Create the event log (with all attributes) for cluster 4 (50% subset) from the cluster's
# variants, the trace-to-event-flow table and the cleansed full log
cluster_50_4_event_log_actual_df = ProcessDiscoveryHandler.create_cluster_full_event_log(cluster_50_4, trace_event_flows_df, hospital_log_cleansed)

In [None]:
# Display the event log built for cluster 4 (50% subset)
cluster_50_4_event_log_actual_df

##### Discover the process

In [None]:
from ProcessDiscovery import ProcessDiscoveryHandler

# Discover the process for the cluster 4 log (50% subset).
# The second argument is False for per-cluster inputs; the full-log call passes True.
cluster_50_4_processs_model = ProcessDiscoveryHandler.discover_process(cluster_50_4_event_log_actual_df, False)

#### Process Discovery - Cluster5

##### Prepare final event log for the cluster using actual trace records for process discovery

In [None]:
from ProcessDiscovery import ProcessDiscoveryHandler

# Create the event log (with all attributes) for cluster 5 (50% subset) from the cluster's
# variants, the trace-to-event-flow table and the cleansed full log
cluster_50_5_event_log_actual_df = ProcessDiscoveryHandler.create_cluster_full_event_log(cluster_50_5, trace_event_flows_df, hospital_log_cleansed)

In [None]:
# Display the event log built for cluster 5 (50% subset)
cluster_50_5_event_log_actual_df

##### Discover the process

In [None]:
from ProcessDiscovery import ProcessDiscoveryHandler

# Discover the process for the cluster 5 log (50% subset).
# The second argument is False for per-cluster inputs; the full-log call passes True.
cluster_50_5_processs_model = ProcessDiscoveryHandler.discover_process(cluster_50_5_event_log_actual_df, False)

## 33% of frequent flow variants

### With 5 clusters

#### Process Discovery - Cluster1

##### Prepare final event log for the cluster using actual trace records for process discovery

In [None]:
from ProcessDiscovery import ProcessDiscoveryHandler

# Create the event log (with all attributes) for cluster 1 (33% subset) from the cluster's
# variants, the trace-to-event-flow table and the cleansed full log
cluster_33_1_event_log_actual_df = ProcessDiscoveryHandler.create_cluster_full_event_log(cluster_33_1, trace_event_flows_df, hospital_log_cleansed)

In [None]:
# Display the event log built for cluster 1 (33% subset)
cluster_33_1_event_log_actual_df

##### Discover the process

In [None]:
from ProcessDiscovery import ProcessDiscoveryHandler

# Discover the process for the cluster 1 log (33% subset).
# The second argument is False for per-cluster inputs; the full-log call passes True.
cluster_33_1_processs_model = ProcessDiscoveryHandler.discover_process(cluster_33_1_event_log_actual_df, False)

#### Process Discovery - Cluster2

##### Prepare final event log for the cluster using actual trace records for process discovery

In [None]:
from ProcessDiscovery import ProcessDiscoveryHandler

# Create the event log (with all attributes) for cluster 2 (33% subset) from the cluster's
# variants, the trace-to-event-flow table and the cleansed full log
cluster_33_2_event_log_actual_df = ProcessDiscoveryHandler.create_cluster_full_event_log(cluster_33_2, trace_event_flows_df, hospital_log_cleansed)

In [None]:
# Display the event log built for cluster 2 (33% subset)
cluster_33_2_event_log_actual_df

##### Discover the process

In [None]:
from ProcessDiscovery import ProcessDiscoveryHandler

# Discover the process for the cluster 2 log (33% subset).
# The second argument is False for per-cluster inputs; the full-log call passes True.
cluster_33_2_processs_model = ProcessDiscoveryHandler.discover_process(cluster_33_2_event_log_actual_df, False)

#### Process Discovery - Cluster3

##### Prepare final event log for the cluster using actual trace records for process discovery

In [None]:
from ProcessDiscovery import ProcessDiscoveryHandler

# Create the event log (with all attributes) for cluster 3 (33% subset) from the cluster's
# variants, the trace-to-event-flow table and the cleansed full log
cluster_33_3_event_log_actual_df = ProcessDiscoveryHandler.create_cluster_full_event_log(cluster_33_3, trace_event_flows_df, hospital_log_cleansed)

In [None]:
# Display the event log built for cluster 3 (33% subset)
cluster_33_3_event_log_actual_df

##### Discover the process

In [None]:
from ProcessDiscovery import ProcessDiscoveryHandler

# Discover the process for the cluster 3 log (33% subset).
# The second argument is False for per-cluster inputs; the full-log call passes True.
cluster_33_3_processs_model = ProcessDiscoveryHandler.discover_process(cluster_33_3_event_log_actual_df, False)

#### Process Discovery - Cluster4

##### Prepare final event log for the cluster using actual trace records for process discovery

In [None]:
from ProcessDiscovery import ProcessDiscoveryHandler

# Create the event log (with all attributes) for cluster 4 (33% subset) from the cluster's
# variants, the trace-to-event-flow table and the cleansed full log
cluster_33_4_event_log_actual_df = ProcessDiscoveryHandler.create_cluster_full_event_log(cluster_33_4, trace_event_flows_df, hospital_log_cleansed)

In [None]:
# Display the event log built for cluster 4 (33% subset)
cluster_33_4_event_log_actual_df

##### Discover the process

In [None]:
from ProcessDiscovery import ProcessDiscoveryHandler

# Discover the process for the cluster 4 log (33% subset).
# The second argument is False for per-cluster inputs; the full-log call passes True.
cluster_33_4_processs_model = ProcessDiscoveryHandler.discover_process(cluster_33_4_event_log_actual_df, False)

#### Process Discovery - Cluster5

##### Prepare final event log for the cluster using actual trace records for process discovery

In [None]:
from ProcessDiscovery import ProcessDiscoveryHandler

# Create the event log (with all attributes) for cluster 5 (33% subset) from the cluster's
# variants, the trace-to-event-flow table and the cleansed full log
cluster_33_5_event_log_actual_df = ProcessDiscoveryHandler.create_cluster_full_event_log(cluster_33_5, trace_event_flows_df, hospital_log_cleansed)

In [None]:
# Display the event log built for cluster 5 (33% subset)
cluster_33_5_event_log_actual_df

##### Discover the process

In [None]:
from ProcessDiscovery import ProcessDiscoveryHandler

# Discover the process for the cluster 5 log (33% subset).
# The second argument is False for per-cluster inputs; the full-log call passes True.
cluster_33_5_processs_model = ProcessDiscoveryHandler.discover_process(cluster_33_5_event_log_actual_df, False)

# Conformance Checking

## Full Log

In [None]:
from ConformanceChecking import ConformanceCheckingHandler

# Calculate the fitness & the precision of the full (cleansed) log against the full process model.
# The result is unpacked as (fitness, precision).
# NOTE(review): the last argument is True only for the full log; per-cluster calls pass False — confirm its meaning.
full_log_fitness, full_log_precision = ConformanceCheckingHandler.get_conformance(hospital_log_full_process_model, hospital_log_cleansed, True)

print ('Fitness: ', full_log_fitness)
print ('Precision: ', full_log_precision)

## 66% of frequent flow variants

### With 2 clusters (Silhouette)

#### Conformance Checking - Cluster 1

##### Checking the conformance of the cluster 1 log with its own process model

In [None]:
from ConformanceChecking import ConformanceCheckingHandler

# Calculate the fitness & the precision of cluster 1 (66%, Silhouette) against its own process model.
# The result is unpacked as (fitness, precision).
cluster_66_1_own_fitness, cluster_66_1_own_precision = ConformanceCheckingHandler.get_conformance(cluster_66_1_processs_model, cluster_66_1_event_log_actual_df, False)

# Backward-compatible alias: the precision used to be stored under a name with a doubled
# underscore, which any cell written against the old name may still read.
cluster_66_1__own_precision = cluster_66_1_own_precision

print ('Conformance of cluster 1 against own process model')
print ('--------------------------------------------------')
print ('Fitness: ', cluster_66_1_own_fitness)
print ('Precision: ', cluster_66_1_own_precision)

##### Checking the conformance of the cluster 1 log with the process model of the full event log

In [None]:
from ConformanceChecking import ConformanceCheckingHandler

# Calculate the fitness & the precision of cluster 1 (66%, Silhouette) against the full process model.
# The result is unpacked as (fitness, precision).
cluster_66_1_full_fitness, cluster_66_1_full_precision = ConformanceCheckingHandler.get_conformance(hospital_log_full_process_model, cluster_66_1_event_log_actual_df, False)

print ('Conformance of cluster 1 against full process model')
print ('---------------------------------------------------')
print ('Fitness: ', cluster_66_1_full_fitness)
print ('Precision: ', cluster_66_1_full_precision)

#### Conformance Checking - Cluster 2

##### Checking the conformance of the cluster 2 log with its own process model

In [None]:
from ConformanceChecking import ConformanceCheckingHandler

# Calculate the fitness & the precision of cluster 2 (66%, Silhouette) against its own process model.
# The result is unpacked as (fitness, precision).
cluster_66_2_own_fitness, cluster_66_2_own_precision = ConformanceCheckingHandler.get_conformance(cluster_66_2_processs_model, cluster_66_2_event_log_actual_df, False)

print ('Conformance of cluster 2 against own process model')
print ('--------------------------------------------------')
print ('Fitness: ', cluster_66_2_own_fitness)
print ('Precision: ', cluster_66_2_own_precision)

##### Checking the conformance of the cluster 2 log against the process model discovered from the full event log

In [None]:
from ConformanceChecking import ConformanceCheckingHandler

# Calculate the fitness & the precision of cluster 2 (66%, Silhouette) against the full process model.
# The result is unpacked as (fitness, precision).
cluster_66_2_full_fitness, cluster_66_2_full_precision = ConformanceCheckingHandler.get_conformance(hospital_log_full_process_model, cluster_66_2_event_log_actual_df, False)

print ('Conformance of cluster 2 against full process model')
print ('---------------------------------------------------')
print ('Fitness: ', cluster_66_2_full_fitness)
print ('Precision: ', cluster_66_2_full_precision)

### With 4 clusters (Elbow)

#### Conformance Checking - Cluster 1

##### Checking the conformance of the cluster 1 log with its own process model

In [None]:
from ConformanceChecking import ConformanceCheckingHandler

# Calculate the fitness & the precision of cluster 1 (66%, Elbow) against its own process model.
# The result is unpacked as (fitness, precision).
cluster_66_elbow_1_own_fitness, cluster_66_elbow_1_own_precision = ConformanceCheckingHandler.get_conformance(cluster_66_Elbow_1_processs_model, cluster_66_Elbow_1_event_log_actual_df, False)

print ('Conformance of cluster 1 against own process model')
print ('--------------------------------------------------')
print ('Fitness: ', cluster_66_elbow_1_own_fitness)
print ('Precision: ', cluster_66_elbow_1_own_precision)

##### Checking the conformance of the cluster 1 log against the process model of full event log

In [None]:
from ConformanceChecking import ConformanceCheckingHandler

# Calculate the fitness & the precision of cluster 1 (66%, Elbow) against the full process model.
# The result is unpacked as (fitness, precision).
cluster_66_elbow_1_full_fitness, cluster_66_elbow_1_full_precision = ConformanceCheckingHandler.get_conformance(hospital_log_full_process_model, cluster_66_Elbow_1_event_log_actual_df, False)

print ('Conformance of cluster 1 against full process model')
print ('---------------------------------------------------')
print ('Fitness: ', cluster_66_elbow_1_full_fitness)
print ('Precision: ', cluster_66_elbow_1_full_precision)

#### Conformance Checking - Cluster 2

##### Checking the conformance of the cluster 2 log with its own process model

In [None]:
from ConformanceChecking import ConformanceCheckingHandler

# Calculate the fitness & the precision of cluster 2 (66%, Elbow) against its own process model.
# The result is unpacked as (fitness, precision).
cluster_66_elbow_2_own_fitness, cluster_66_elbow_2_own_precision = ConformanceCheckingHandler.get_conformance(cluster_66_Elbow_2_processs_model, cluster_66_Elbow_2_event_log_actual_df, False)

print ('Conformance of cluster 2 against own process model')
print ('--------------------------------------------------')
print ('Fitness: ', cluster_66_elbow_2_own_fitness)
print ('Precision: ', cluster_66_elbow_2_own_precision)

##### Checking the conformance of the cluster 2 log against the process model of full event log

In [None]:
from ConformanceChecking import ConformanceCheckingHandler

# Calculate the fitness & the precision of cluster 2 (66%, Elbow) against the full process model.
# The result is unpacked as (fitness, precision).
cluster_66_elbow_2_full_fitness, cluster_66_elbow_2_full_precision = ConformanceCheckingHandler.get_conformance(hospital_log_full_process_model, cluster_66_Elbow_2_event_log_actual_df, False)

print ('Conformance of cluster 2 against full process model')
print ('---------------------------------------------------')
print ('Fitness: ', cluster_66_elbow_2_full_fitness)
print ('Precision: ', cluster_66_elbow_2_full_precision)

#### Conformance Checking - Cluster 3

##### Checking the conformance of the cluster 3 log with its own process model

In [None]:
from ConformanceChecking import ConformanceCheckingHandler

# Calculate the fitness & the precision of cluster 3 (66%, Elbow) against its own process model.
# The result is unpacked as (fitness, precision).
cluster_66_elbow_3_own_fitness, cluster_66_elbow_3_own_precision = ConformanceCheckingHandler.get_conformance(cluster_66_Elbow_3_processs_model, cluster_66_Elbow_3_event_log_actual_df, False)

print ('Conformance of cluster 3 against own process model')
print ('--------------------------------------------------')
print ('Fitness: ', cluster_66_elbow_3_own_fitness)
print ('Precision: ', cluster_66_elbow_3_own_precision)

##### Checking the conformance of the cluster 3 log against the process model of full event log

In [None]:
from ConformanceChecking import ConformanceCheckingHandler

# Calculate the fitness & the precision of cluster 3 (66%, Elbow) against the full process model.
# The result is unpacked as (fitness, precision).
cluster_66_elbow_3_full_fitness, cluster_66_elbow_3_full_precision = ConformanceCheckingHandler.get_conformance(hospital_log_full_process_model, cluster_66_Elbow_3_event_log_actual_df, False)

print ('Conformance of cluster 3 against full process model')
print ('---------------------------------------------------')
print ('Fitness: ', cluster_66_elbow_3_full_fitness)
print ('Precision: ', cluster_66_elbow_3_full_precision)

#### Conformance Checking - Cluster 4

##### Checking the conformance of the cluster 4 log with its own process model

In [None]:
from ConformanceChecking import ConformanceCheckingHandler

# Calculate the fitness & the precision of cluster 4 (66%, Elbow) against its own process model.
# The result is unpacked as (fitness, precision).
cluster_66_elbow_4_own_fitness, cluster_66_elbow_4_own_precision = ConformanceCheckingHandler.get_conformance(cluster_66_Elbow_4_processs_model, cluster_66_Elbow_4_event_log_actual_df, False)

print ('Conformance of cluster 4 against own process model')
print ('--------------------------------------------------')
print ('Fitness: ', cluster_66_elbow_4_own_fitness)
print ('Precision: ', cluster_66_elbow_4_own_precision)

##### Checking the conformance of the cluster 4 log against the process model of full event log

In [None]:
from ConformanceChecking import ConformanceCheckingHandler

# Calculate the fitness & the precision of cluster 4 (66%, Elbow) against the full process model.
# The result is unpacked as (fitness, precision).
cluster_66_elbow_4_full_fitness, cluster_66_elbow_4_full_precision = ConformanceCheckingHandler.get_conformance(hospital_log_full_process_model, cluster_66_Elbow_4_event_log_actual_df, False)

print ('Conformance of cluster 4 against full process model')
print ('---------------------------------------------------')
print ('Fitness: ', cluster_66_elbow_4_full_fitness)
print ('Precision: ', cluster_66_elbow_4_full_precision)

## 50% of frequent flow variants

### With 5 clusters 

#### Conformance Checking - Cluster 1

##### Checking the conformance of the cluster 1 log with its own process model

In [None]:
from ConformanceChecking import ConformanceCheckingHandler

# Calculate the fitness & the precision of cluster 1 (50% subset) against its own process model.
# The result is unpacked as (fitness, precision).
cluster_50_1_own_fitness, cluster_50_1_own_precision = ConformanceCheckingHandler.get_conformance(cluster_50_1_processs_model, cluster_50_1_event_log_actual_df, False)

print ('Conformance of cluster 1 against own process model')
print ('--------------------------------------------------')
print ('Fitness: ', cluster_50_1_own_fitness)
print ('Precision: ', cluster_50_1_own_precision)

##### Checking the conformance of the cluster 1 log with the process model of the full event log

In [None]:
from ConformanceChecking import ConformanceCheckingHandler

# Calculate the fitness & the precision of cluster 1 (50% subset) against the full process model.
# The result is unpacked as (fitness, precision).
cluster_50_1_full_fitness, cluster_50_1_full_precision = ConformanceCheckingHandler.get_conformance(hospital_log_full_process_model, cluster_50_1_event_log_actual_df, False)

print ('Conformance of cluster 1 against full process model')
print ('---------------------------------------------------')
print ('Fitness: ', cluster_50_1_full_fitness)
print ('Precision: ', cluster_50_1_full_precision)

#### Conformance Checking - Cluster 2

##### Checking the conformance of the cluster 2 log with its own process model

In [None]:
from ConformanceChecking import ConformanceCheckingHandler

# Calculate the fitness & the precision of cluster 2 (50% subset) against its own process model.
# The result is unpacked as (fitness, precision).
cluster_50_2_own_fitness, cluster_50_2_own_precision = ConformanceCheckingHandler.get_conformance(cluster_50_2_processs_model, cluster_50_2_event_log_actual_df, False)

print ('Conformance of cluster 2 against own process model')
print ('--------------------------------------------------')
print ('Fitness: ', cluster_50_2_own_fitness)
print ('Precision: ', cluster_50_2_own_precision)

##### Checking the conformance of the cluster 2 log with the process model of the full event log

In [None]:
from ConformanceChecking import ConformanceCheckingHandler

# Calculate the fitness & the precision of cluster 2 (50% subset) against the full process model.
# The result is unpacked as (fitness, precision).
cluster_50_2_full_fitness, cluster_50_2_full_precision = ConformanceCheckingHandler.get_conformance(hospital_log_full_process_model, cluster_50_2_event_log_actual_df, False)

print ('Conformance of cluster 2 against full process model')
print ('---------------------------------------------------')
print ('Fitness: ', cluster_50_2_full_fitness)
print ('Precision: ', cluster_50_2_full_precision)

#### Conformance Checking - Cluster 3

##### Checking the conformance of the cluster 3 log with its own process model

In [None]:
from ConformanceChecking import ConformanceCheckingHandler

# Calculate the fitness & the precision of cluster 3 (50% subset) against its own process model.
# The result is unpacked as (fitness, precision).
cluster_50_3_own_fitness, cluster_50_3_own_precision = ConformanceCheckingHandler.get_conformance(cluster_50_3_processs_model, cluster_50_3_event_log_actual_df, False)

print ('Conformance of cluster 3 against own process model')
print ('--------------------------------------------------')
print ('Fitness: ', cluster_50_3_own_fitness)
print ('Precision: ', cluster_50_3_own_precision)

##### Checking the conformance of the cluster 3 log with the process model of the full event log

In [None]:
from ConformanceChecking import ConformanceCheckingHandler

# Calculate the fitness & the precision of cluster 3 (50% subset) against the full process model.
# The result is unpacked as (fitness, precision).
cluster_50_3_full_fitness, cluster_50_3_full_precision = ConformanceCheckingHandler.get_conformance(hospital_log_full_process_model, cluster_50_3_event_log_actual_df, False)

print ('Conformance of cluster 3 against full process model')
print ('---------------------------------------------------')
print ('Fitness: ', cluster_50_3_full_fitness)
print ('Precision: ', cluster_50_3_full_precision)

#### Conformance Checking - Cluster 4

##### Checking the conformance of the cluster 4 log with its own process model

In [None]:
from ConformanceChecking import ConformanceCheckingHandler

# Calculate the fitness & the precision of cluster 4 (50% subset) against its own process model.
# The result is unpacked as (fitness, precision).
cluster_50_4_own_fitness, cluster_50_4_own_precision = ConformanceCheckingHandler.get_conformance(cluster_50_4_processs_model, cluster_50_4_event_log_actual_df, False)

print ('Conformance of cluster 4 against own process model')
print ('--------------------------------------------------')
print ('Fitness: ', cluster_50_4_own_fitness)
print ('Precision: ', cluster_50_4_own_precision)

##### Checking the conformance of the cluster 4 log with the process model of the full event log

In [None]:
from ConformanceChecking import ConformanceCheckingHandler

# Calculate the fitness & the precision of cluster 4 (50% subset) against the full process model.
# The result is unpacked as (fitness, precision).
cluster_50_4_full_fitness, cluster_50_4_full_precision = ConformanceCheckingHandler.get_conformance(hospital_log_full_process_model, cluster_50_4_event_log_actual_df, False)

print ('Conformance of cluster 4 against full process model')
print ('---------------------------------------------------')
print ('Fitness: ', cluster_50_4_full_fitness)
print ('Precision: ', cluster_50_4_full_precision)

#### Conformance Checking - Cluster 5

##### Checking the conformance of the cluster 5 log with its own process model

In [None]:
from ConformanceChecking import ConformanceCheckingHandler

# Calculate the fitness & the precision of cluster 5 (50% subset) against its own process model.
# The result is unpacked as (fitness, precision).
cluster_50_5_own_fitness, cluster_50_5_own_precision = ConformanceCheckingHandler.get_conformance(cluster_50_5_processs_model, cluster_50_5_event_log_actual_df, False)

print ('Conformance of cluster 5 against own process model')
print ('--------------------------------------------------')
print ('Fitness: ', cluster_50_5_own_fitness)
print ('Precision: ', cluster_50_5_own_precision)

##### Checking the conformance of the cluster 5 log with the process model of the full event log

In [None]:
from ConformanceChecking import ConformanceCheckingHandler

# Calculate the fitness & the precision of cluster 5 (50% subset) against the full process model.
# The result is unpacked as (fitness, precision).
cluster_50_5_full_fitness, cluster_50_5_full_precision = ConformanceCheckingHandler.get_conformance(hospital_log_full_process_model, cluster_50_5_event_log_actual_df, False)

print ('Conformance of cluster 5 against full process model')
print ('---------------------------------------------------')
print ('Fitness: ', cluster_50_5_full_fitness)
print ('Precision: ', cluster_50_5_full_precision)

## 33% of frequent flow variants

### With 5 clusters 

#### Conformance Checking - Cluster 1

##### Checking the conformance of the cluster 1 log with its own process model

In [None]:
from ConformanceChecking import ConformanceCheckingHandler

# Calculate the fitness & the precision of cluster 1 (33% subset) against its own process model.
# The result is unpacked as (fitness, precision).
cluster_33_1_own_fitness, cluster_33_1_own_precision = ConformanceCheckingHandler.get_conformance(cluster_33_1_processs_model, cluster_33_1_event_log_actual_df, False)

print ('Conformance of cluster 1 against own process model')
print ('--------------------------------------------------')
print ('Fitness: ', cluster_33_1_own_fitness)
print ('Precision: ', cluster_33_1_own_precision)

##### Checking the conformance of the cluster 1 log with the process model of the full event log

In [None]:
from ConformanceChecking import ConformanceCheckingHandler

# Calculate the fitness & the precision of cluster 1 (33% subset) against the full process model.
# The result is unpacked as (fitness, precision).
cluster_33_1_full_fitness, cluster_33_1_full_precision = ConformanceCheckingHandler.get_conformance(hospital_log_full_process_model, cluster_33_1_event_log_actual_df, False)

print ('Conformance of cluster 1 against full process model')
print ('---------------------------------------------------')
print ('Fitness: ', cluster_33_1_full_fitness)
print ('Precision: ', cluster_33_1_full_precision)

#### Conformance Checking - Cluster 2

##### Checking the conformance of the cluster 2 log with its own process model

In [None]:
from ConformanceChecking import ConformanceCheckingHandler

# Calculate the fitness & the precision of cluster 2 (33% subset) against its own process model.
# The result is unpacked as (fitness, precision).
cluster_33_2_own_fitness, cluster_33_2_own_precision = ConformanceCheckingHandler.get_conformance(cluster_33_2_processs_model, cluster_33_2_event_log_actual_df, False)

print ('Conformance of cluster 2 against own process model')
print ('--------------------------------------------------')
print ('Fitness: ', cluster_33_2_own_fitness)
print ('Precision: ', cluster_33_2_own_precision)

##### Checking the conformance of the cluster 2 log with the process model of the full event log

In [None]:
from ConformanceChecking import ConformanceCheckingHandler

# Conformance of the cluster 2 event log against the process model of the full event log.
# The call result is unpacked into a (fitness, precision) pair.
# NOTE(review): the meaning of the third positional argument (False) is not visible here -
# confirm against ConformanceCheckingHandler.get_conformance.
(
    cluster_33_2_full_fitness,
    cluster_33_2_full_precision,
) = ConformanceCheckingHandler.get_conformance(
    hospital_log_full_process_model,
    cluster_33_2_event_log_actual_df,
    False,
)

# Heading and underline go out on separate lines, followed by the two scores.
print(
    'Conformance of cluster 2 against full process model',
    '---------------------------------------------------',
    sep='\n',
)
print('Fitness: ', cluster_33_2_full_fitness)
print('Precision: ', cluster_33_2_full_precision)

#### Conformance Checking - Cluster 3

##### Checking the conformance of the cluster 3 log with its own process model

In [None]:
from ConformanceChecking import ConformanceCheckingHandler

# Conformance of the cluster 3 event log against the process model discovered from cluster 3.
# The call result is unpacked into a (fitness, precision) pair.
# NOTE(review): the meaning of the third positional argument (False) is not visible here -
# confirm against ConformanceCheckingHandler.get_conformance.
(
    cluster_33_3_own_fitness,
    cluster_33_3_own_precision,
) = ConformanceCheckingHandler.get_conformance(
    cluster_33_3_processs_model,
    cluster_33_3_event_log_actual_df,
    False,
)

# Heading and underline go out on separate lines, followed by the two scores.
print(
    'Conformance of cluster 3 against own process model',
    '--------------------------------------------------',
    sep='\n',
)
print('Fitness: ', cluster_33_3_own_fitness)
print('Precision: ', cluster_33_3_own_precision)

##### Checking the conformance of the cluster 3 log with the process model of the full event log

In [None]:
from ConformanceChecking import ConformanceCheckingHandler

# Conformance of the cluster 3 event log against the process model of the full event log.
# The call result is unpacked into a (fitness, precision) pair.
# NOTE(review): the meaning of the third positional argument (False) is not visible here -
# confirm against ConformanceCheckingHandler.get_conformance.
(
    cluster_33_3_full_fitness,
    cluster_33_3_full_precision,
) = ConformanceCheckingHandler.get_conformance(
    hospital_log_full_process_model,
    cluster_33_3_event_log_actual_df,
    False,
)

# Heading and underline go out on separate lines, followed by the two scores.
print(
    'Conformance of cluster 3 against full process model',
    '---------------------------------------------------',
    sep='\n',
)
print('Fitness: ', cluster_33_3_full_fitness)
print('Precision: ', cluster_33_3_full_precision)

#### Conformance Checking - Cluster 4

##### Checking the conformance of the cluster 4 log with its own process model

In [None]:
from ConformanceChecking import ConformanceCheckingHandler

# Conformance of the cluster 4 event log against the process model discovered from cluster 4.
# The call result is unpacked into a (fitness, precision) pair.
# NOTE(review): the meaning of the third positional argument (False) is not visible here -
# confirm against ConformanceCheckingHandler.get_conformance.
(
    cluster_33_4_own_fitness,
    cluster_33_4_own_precision,
) = ConformanceCheckingHandler.get_conformance(
    cluster_33_4_processs_model,
    cluster_33_4_event_log_actual_df,
    False,
)

# Heading and underline go out on separate lines, followed by the two scores.
print(
    'Conformance of cluster 4 against own process model',
    '--------------------------------------------------',
    sep='\n',
)
print('Fitness: ', cluster_33_4_own_fitness)
print('Precision: ', cluster_33_4_own_precision)

##### Checking the conformance of the cluster 4 log with the process model of the full event log

In [None]:
from ConformanceChecking import ConformanceCheckingHandler

# Conformance of the cluster 4 event log against the process model of the full event log.
# The call result is unpacked into a (fitness, precision) pair.
# NOTE(review): the meaning of the third positional argument (False) is not visible here -
# confirm against ConformanceCheckingHandler.get_conformance.
(
    cluster_33_4_full_fitness,
    cluster_33_4_full_precision,
) = ConformanceCheckingHandler.get_conformance(
    hospital_log_full_process_model,
    cluster_33_4_event_log_actual_df,
    False,
)

# Heading and underline go out on separate lines, followed by the two scores.
print(
    'Conformance of cluster 4 against full process model',
    '---------------------------------------------------',
    sep='\n',
)
print('Fitness: ', cluster_33_4_full_fitness)
print('Precision: ', cluster_33_4_full_precision)

#### Conformance Checking - Cluster 5

##### Checking the conformance of the cluster 5 log with its own process model

In [None]:
from ConformanceChecking import ConformanceCheckingHandler

# Conformance of the cluster 5 event log against the process model discovered from cluster 5.
# The call result is unpacked into a (fitness, precision) pair.
# NOTE(review): the meaning of the third positional argument (False) is not visible here -
# confirm against ConformanceCheckingHandler.get_conformance.
(
    cluster_33_5_own_fitness,
    cluster_33_5_own_precision,
) = ConformanceCheckingHandler.get_conformance(
    cluster_33_5_processs_model,
    cluster_33_5_event_log_actual_df,
    False,
)

# Heading and underline go out on separate lines, followed by the two scores.
print(
    'Conformance of cluster 5 against own process model',
    '--------------------------------------------------',
    sep='\n',
)
print('Fitness: ', cluster_33_5_own_fitness)
print('Precision: ', cluster_33_5_own_precision)

##### Checking the conformance of the cluster 5 log with the process model of the full event log

In [None]:
from ConformanceChecking import ConformanceCheckingHandler

# Conformance of the cluster 5 event log against the process model of the full event log.
# The call result is unpacked into a (fitness, precision) pair.
# NOTE(review): the meaning of the third positional argument (False) is not visible here -
# confirm against ConformanceCheckingHandler.get_conformance.
(
    cluster_33_5_full_fitness,
    cluster_33_5_full_precision,
) = ConformanceCheckingHandler.get_conformance(
    hospital_log_full_process_model,
    cluster_33_5_event_log_actual_df,
    False,
)

# Heading and underline go out on separate lines, followed by the two scores.
print(
    'Conformance of cluster 5 against full process model',
    '---------------------------------------------------',
    sep='\n',
)
print('Fitness: ', cluster_33_5_full_fitness)
print('Precision: ', cluster_33_5_full_precision)