In [1]:
import numpy as np 
import matplotlib.pyplot as plt
import pandas as pd
import pm4py
from feeed import extract_features
import math

## Comparing Python and Java

In [None]:
import warnings 
import glob
warnings.filterwarnings('ignore')

# Collect every XES event log found under test_logs/ (path is relative to the working directory).
log_files = glob.glob("test_logs/*.xes")
# Empty result table; the comparison loop appends one row per differing or one-sided feature key.
df_results = pd.DataFrame(columns=['File', 'Key', 'Java_Value', 'Python_Value', 'Difference'])

In [None]:
for file in log_files:
    INPUT_PATH = file
    # Extract features from the same log twice, once per feature type. Judging by the
    # variable names, "entropies" is presumably the Java-backed implementation and
    # "internal_entropies" the pure-Python one — TODO confirm against feeed.
    output_features_java=extract_features(INPUT_PATH, feature_types=["entropies",])
    output_features_python=extract_features(INPUT_PATH, feature_types=["internal_entropies"])
    
    # Compare the values of keys that both result dictionaries contain.
    # NOTE(review): the comparison is an exact inequality, so values that differ only by
    # floating-point noise are recorded as mismatches too.
    for key in output_features_java.keys():
        if key in output_features_python:
            if output_features_java[key] != output_features_python[key]:
                # Append the mismatch as a new last row; Difference is Java minus Python.
                df_results.loc[len(df_results)] = [file, key, output_features_java[key], output_features_python[key], output_features_java[key] - output_features_python[key]]

    # Keys that only the second ("python") extraction produced: no Java value, no difference.
    additional_keys = set(output_features_python.keys()) - set(output_features_java.keys())
    for key in additional_keys:
        # Append a row with the Java value and the difference left empty.
        df_results.loc[len(df_results)] = [file, key, None, output_features_python[key], None]

    # Keys that only the first ("java") extraction produced: no Python value, no difference.
    missing_keys = set(output_features_java.keys()) - set(output_features_python.keys())
    for key in missing_keys:
        # Append a row with the Python value and the difference left empty.
        df_results.loc[len(df_results)] = [file, key, output_features_java[key], None, None]

## Inspecting as the feeed library

In [40]:
from pm4py.objects.log.importer.xes import importer as xes_importer
from collections import Counter
from pm4py.algo.filtering.log.variants import variants_filter
# Load the Sepsis log with pm4py's XES importer.
# NOTE(review): the trace listing and the entropy values captured in the outputs of the
# following cells appear to come from an 8-trace toy log (activities 'a'..'h'), not from
# this file; the execution counts are out of order. Re-run top to bottom to refresh them.
event_log = xes_importer.apply('test_logs/Sepsis.xes')

parsing log, completed traces :: 100%|██████████| 1050/1050 [00:00<00:00, 1212.01it/s]


In [3]:
event_log

[{'attributes': {'concept:name': 'A'}, 'events': [{'InfectionSuspected': True, 'org:group': 'A', 'DiagnosticBlood': True, 'DisfuncOrg': True, 'SIRSCritTachypnea': True, 'Hypotensie': True, 'SIRSCritHeartRate': True, 'Infusion': True, 'DiagnosticArtAstrup': True, 'concept:name': 'ER Registration', 'Age': 85, 'DiagnosticIC': True, 'DiagnosticSputum': False, 'DiagnosticLiquor': False, 'DiagnosticOther': False, 'SIRSCriteria2OrMore': True, 'DiagnosticXthorax': True, 'SIRSCritTemperature': True, 'time:timestamp': datetime.datetime(2014, 10, 22, 11, 15, 41, tzinfo=datetime.timezone(datetime.timedelta(seconds=7200))), 'DiagnosticUrinaryCulture': True, 'SIRSCritLeucos': False, 'Oligurie': False, 'DiagnosticLacticAcid': True, 'lifecycle:transition': 'complete', 'Diagnose': 'A', 'Hypoxie': False, 'DiagnosticUrinarySediment': True, 'DiagnosticECG': True}, '..', {'org:group': 'E', 'lifecycle:transition': 'complete', 'concept:name': 'Release A', 'time:timestamp': datetime.datetime(2014, 11, 2, 15, 

In [4]:
[tuple(event["concept:name"] for event in trace) for trace in event_log]

[('a', 'b', 'c', 'd', 'f', 'g', 'h'),
 ('a', 'b', 'c', 'e', 'f', 'g', 'h'),
 ('a', 'b', 'c', 'd', 'f', 'g'),
 ('a', 'b', 'c', 'e', 'f', 'g'),
 ('a', 'b', 'b', 'c', 'd', 'f', 'g', 'h'),
 ('a', 'b', 'b', 'c', 'e', 'f', 'g', 'h'),
 ('a', 'b', 'b', 'c', 'd', 'f', 'g'),
 ('a', 'b', 'b', 'c', 'e', 'f', 'g')]

### trace entropies

In [16]:
def trace_entropies(log):
    """Shannon entropy (base 2) of the trace distribution of ``log``.

    Each trace is reduced to the tuple of its events' ``"concept:name"`` values;
    the probability of a trace is its number of occurrences divided by ``len(log)``.

    Args:
        log: sized iterable of traces, each an iterable of event mappings that
            contain a ``"concept:name"`` key.

    Returns:
        ``-sum(p * log2(p))`` over the distinct traces (0 for an empty log).
    """
    n_traces = len(log)

    # Frequency of every distinct activity sequence, in first-seen order.
    sequence_frequencies = Counter(
        tuple(event["concept:name"] for event in trace) for trace in log
    )
    probabilities = [freq / n_traces for freq in sequence_frequencies.values()]

    # The leading minus sign is part of the entropy definition H = -sum(p * log2 p).
    return -sum(p * np.log2(p) for p in probabilities)

In [17]:

print("Trace Entropy: ", trace_entropies(event_log))

Trace Entropy:  3.0


### Prefix entropies

In [18]:

def prefix_entropy(log):
    """Shannon entropy (base 2) over all non-empty prefixes of all traces in ``log``.

    Every trace (duplicates included) contributes each of its prefixes once; a
    prefix's probability is its count divided by the total number of prefixes.

    Args:
        log: iterable of traces, each an iterable of event mappings that
            contain a ``"concept:name"`` key.

    Returns:
        ``-sum(p * log2(p))`` over the distinct prefixes (0 for an empty log).
    """
    prefix_frequencies = Counter()
    n_prefixes = 0

    for trace in log:
        activities = tuple(event["concept:name"] for event in trace)
        # Prefixes of length 1 .. len(activities), shortest first.
        for end in range(1, len(activities) + 1):
            prefix_frequencies[activities[:end]] += 1
            n_prefixes += 1

    return -sum(
        (freq / n_prefixes) * math.log2(freq / n_prefixes)
        for freq in prefix_frequencies.values()
    )

def prefix_flattened_entropy(log):
    """Prefix entropy (base 2) computed over the distinct variants of ``log``.

    Iterates over the keys returned by ``variants_filter.get_variants(log)`` and
    counts every non-empty prefix of every variant once.
    Assumes each variant key is a sequence of activity names (e.g. a tuple);
    TODO confirm for the installed pm4py version.

    Args:
        log: event log accepted by ``variants_filter.get_variants``.

    Returns:
        ``-sum(p * log2(p))`` over the distinct prefixes.
    """
    variants = variants_filter.get_variants(log)

    # One entry per (variant, prefix length) pair, shortest prefix first.
    prefix_frequencies = Counter(
        tuple(variant[:end])
        for variant in variants
        for end in range(1, len(variant) + 1)
    )
    n_prefixes = sum(prefix_frequencies.values())

    return -sum(
        (freq / n_prefixes) * math.log2(freq / n_prefixes)
        for freq in prefix_frequencies.values()
    )

In [19]:
print("Prefix Entropy: ", prefix_entropy(event_log))
print("Prefix Flattened Entropy: ", prefix_flattened_entropy(event_log))

Prefix Entropy:  4.09306920777189
Prefix Flattened Entropy:  4.09306920777189


### Global block entropy

In [20]:
def global_block_entropy(log):
    """Shannon entropy (base 2) over all contiguous sub-sequences of all traces.

    Every trace in ``log`` (duplicates included) contributes each of its
    non-empty contiguous blocks ``trace[i:j]`` once.

    Args:
        log: iterable of traces, each an iterable of event mappings that
            contain a ``"concept:name"`` key.

    Returns:
        ``-sum(p * log2(p))`` over the distinct blocks (0 for an empty log).
    """
    block_frequencies = Counter()
    n_blocks = 0

    for trace in log:
        activities = tuple(event["concept:name"] for event in trace)
        length = len(activities)
        # Enumerate blocks by start position, then by increasing end position.
        for start in range(length):
            for stop in range(start + 1, length + 1):
                block_frequencies[activities[start:stop]] += 1
                n_blocks += 1

    return -sum(
        (freq / n_blocks) * math.log2(freq / n_blocks)
        for freq in block_frequencies.values()
    )

In [21]:
def flattened_global_block_entropy(log):
    """Global block entropy (base 2) computed over the distinct variants of ``log``.

    Iterates over the keys returned by ``variants_filter.get_variants(log)`` and
    counts every non-empty contiguous block ``variant[i:j]`` of every variant once.
    Assumes each variant key is a sequence of activity names (e.g. a tuple);
    TODO confirm for the installed pm4py version.

    Args:
        log: event log accepted by ``variants_filter.get_variants``.

    Returns:
        ``-sum(p * log2(p))`` over the distinct blocks.
    """
    block_frequencies = Counter()
    n_blocks = 0

    for variant in variants_filter.get_variants(log):
        length = len(variant)
        # Enumerate blocks by start position, then by increasing end position.
        for start in range(length):
            for stop in range(start + 1, length + 1):
                block_frequencies[tuple(variant[start:stop])] += 1
                n_blocks += 1

    return -sum(
        (freq / n_blocks) * math.log2(freq / n_blocks)
        for freq in block_frequencies.values()
    )


In [22]:
print("Global Block Entropy: ", global_block_entropy(event_log))
print("Flattened Global Block Entropy: ", flattened_global_block_entropy(event_log))

Global Block Entropy:  5.753720776536197
Flattened Global Block Entropy:  5.753720776536197


### K-block entropy

In [123]:
def entropy_k_block(log, k=1):
    """Shannon entropy (base 2) over all contiguous blocks of length ``k``.

    Every trace in ``log`` (duplicates included) contributes each window
    ``trace[i:i + k]`` for ``i`` in ``range(len(trace) - k + 1)``. Traces
    shorter than ``k`` contribute nothing.

    Args:
        log: iterable of traces, each an iterable of event mappings that
            contain a ``"concept:name"`` key.
        k: block length.

    Returns:
        ``-sum(p * log2(p))`` over the distinct blocks (0 when there are none).
    """
    block_frequencies = Counter()
    n_blocks = 0

    for trace in log:
        activities = tuple(event["concept:name"] for event in trace)
        n_windows = len(activities) - k + 1
        for start in range(n_windows):
            block_frequencies[activities[start:start + k]] += 1
            n_blocks += 1

    return -sum(
        (freq / n_blocks) * math.log2(freq / n_blocks)
        for freq in block_frequencies.values()
    )

In [24]:
entropy_k_block(event_log, 1)

2.8962915290459277

K_Block_ratio

In [25]:
# k-block entropy divided by the block length k, for k = 1..6.
# A loop replaces six copy-pasted print statements; the printed text is unchanged.
for k in range(1, 7):
    print(f"Entropy {k} Block Ratio: ", entropy_k_block(event_log, k)/k)

Entropy 1 Block Ratio:  2.8962915290459277
Entropy 2 Block Ratio:  1.5424812503605778
Entropy 3 Block Ratio:  1.1073093649624541
Entropy 4 Block Ratio:  0.84375
Entropy 5 Block Ratio:  0.6503258334775646
Entropy 6 Block Ratio:  0.5


K_Block_diff

In [26]:
# Difference between consecutive k-block entropies, H_k - H_(k-1), for k = 1..6.
# Each H_k is computed once (k = 0..6) instead of twice as in the copy-pasted version;
# the printed text is unchanged.
block_entropies = [entropy_k_block(event_log, k) for k in range(7)]
for k in range(1, 7):
    print(f"Entropy {k} Block diff: ", block_entropies[k] - block_entropies[k - 1])

Entropy 1 Block diff:  2.8962915290459277
Entropy 2 Block diff:  0.18867097167522795
Entropy 3 Block diff:  0.23696559416620655
Entropy 4 Block diff:  0.05307190511263782
Entropy 5 Block diff:  -0.1233708326121774
Entropy 6 Block diff:  -0.2516291673878226


### Lempel-Ziv entropy

In [15]:
def entropy_lempel_zev_flattened(log):
    """Lempel-Ziv style entropy estimate over the distinct variants of ``log``.

    Each variant is parsed left to right: activity names are appended to the
    current word until the word has not been seen before, at which point it is
    stored and a new word is started. The set of words is shared across variants.
    The estimate is ``N_w * log2(N) / N`` with ``N_w`` the number of distinct
    words and ``N`` the total number of activities over all variants.

    Assumes each key of ``variants_filter.get_variants(log)`` is a sequence of
    activity-name strings — TODO confirm for the installed pm4py version.

    NOTE(review): words are built by plain string concatenation without a
    separator, so different activity sequences can map to the same word
    (e.g. "ab" + "c" and "a" + "bc").

    Args:
        log: event log accepted by ``variants_filter.get_variants``.

    Returns:
        The entropy estimate as a float; ``0.0`` when the log has no activities
        (the formula is undefined for ``N == 0`` and ``math.log2`` would raise).
    """
    words = set()
    N = 0  # total number of activities seen across all variants

    for trace in variants_filter.get_variants(log):
        word = ""
        for activity in trace:
            word = word + activity
            if word not in words:
                words.add(word)
                word = ""
        N += len(trace)

    # Guard the empty case: log2(0) is a domain error and N is also the divisor.
    if N == 0:
        return 0.0

    N_w = len(words)
    return N_w * math.log2(N) / N

In [38]:
import math

def entropy_lempel_zev_flattened(log):
    
    unique_traces = [tuple(event["concept:name"] for event in trace) for trace in log]
    N, N_w, words = 0, 0, set()

    for trace in unique_traces:
        word = ""
        for activity in trace:
            word += activity
            if word not in words:
                words.add(word)
                word = ""

        N += len(trace)

    N_w = len(words)
    return N_w * math.log2(N) / N


In [39]:
entropy_lempel_zev_flattened(event_log)

1.7505649725632033

### KNN entropy

#### type-1: kinda slow

In [11]:
from Levenshtein import distance 
def harmonic_sum(j):
    """Return the harmonic number ``1/1 + 1/2 + ... + 1/j``.

    Args:
        j: number of terms.

    Returns:
        ``None`` for negative ``j``, ``0.0`` for ``j == 0``, otherwise the sum
        of the first ``j`` reciprocals as a float.
    """
    if j < 0:
        return None
    if j == 0:
        return 0.0

    # Accumulate from the largest term (1/1) to the smallest (1/j).
    total = 0.0
    for denominator in range(1, j + 1):
        total = total + 1.0 / float(denominator)
    return total
    
def find_nearest_neighbor(trace_list, k =1):
    """For every trace, find its k-th nearest neighbour by normalized edit distance.

    Each trace is compared against every entry of ``trace_list`` (itself
    included). Candidates with a normalized distance of 0 are discarded and the
    rest are ranked by increasing normalized distance.

    Args:
        trace_list: sequence of traces accepted by ``distance``.
        k: 1-based rank of the neighbour to pick.

    Returns:
        One tuple per trace: ``(neighbour, distance, max_len, distance / max_len)``,
        where ``max_len`` is the length of the longer of the two traces.

    Raises:
        IndexError: if a trace has fewer than ``k`` neighbours at non-zero distance.
    """
    def describe(anchor, candidate):
        # The edit distance is evaluated twice per pair, once raw and once normalized.
        return (
            candidate,
            distance(anchor, candidate),
            max(len(anchor), len(candidate)),
            distance(anchor, candidate) / max(len(anchor), len(candidate)),
        )

    picked = []
    for anchor in trace_list:
        ranked = [describe(anchor, candidate) for candidate in trace_list]
        ranked.sort(key=lambda entry: entry[3])
        at_nonzero_distance = [entry for entry in ranked if entry[3] != 0]
        picked.append(at_nonzero_distance[k - 1])
    return picked


import time
def entropy_flattened_knn(log, k=1):
    """k-nearest-neighbour entropy estimate over the distinct variants of ``log``.

    The variants are the keys of ``variants_filter.get_variants(log)``. For each
    variant the normalized edit distance to its k-th nearest neighbour is taken
    from ``find_nearest_neighbor`` and the per-variant terms are averaged.
    Prints the wall-clock time of the neighbour search and of the summation loop.

    Args:
        log: event log accepted by ``variants_filter.get_variants``.
        k: rank of the neighbour used for each variant.

    Returns:
        The accumulated estimate (natural-log based).
    """
    variants = list(variants_filter.get_variants(log))

    tic = time.time()
    nearest = find_nearest_neighbor(variants, k)
    print("Time for find_nearest_neighbor--- %s seconds ---" % (time.time() - tic))
    n_variants = len(variants)

    estimate = 0
    tic = time.time()
    # Each neighbour is a 4-tuple; only the normalized distance is used here.
    for _neighbour, _raw, _longest, normalized_lev in nearest:
        log_distance = math.log(normalized_lev)
        # log(pi^(1/2) / Gamma(1/2 + 1))
        log_ball_term = math.log(math.pow(math.pi, 1/ 2.0) / math.gamma(1.0 / 2.0 + 1.0))
        # presumably a truncated Euler-Mascheroni constant — TODO confirm
        gamma_term = 0.5772
        harmonic_term = harmonic_sum(k-1)
        log_n = math.log(n_variants)
        estimate += 1/n_variants * (log_distance + log_ball_term + gamma_term - harmonic_term + log_n)
    print("Time for for loop--- %s seconds ---" % (time.time() - tic))
    return estimate

entropy_flattened_knn(event_log, 4)

Time for find_nearest_neighbor--- 3.186993360519409 seconds ---
Time for for loop--- 0.0009992122650146484 seconds ---


4.700062906892501

#### type-2: fast

In [4]:
from Levenshtein import distance 
def harmonic_sum(j):
    """Return the harmonic number ``1/1 + 1/2 + ... + 1/j``.

    Args:
        j: number of terms.

    Returns:
        ``None`` when ``j`` is negative, ``0.0`` when ``j`` is zero, otherwise
        the float sum of the first ``j`` reciprocals.
    """
    # Negative input has no harmonic number; signal it with None.
    if j < 0:
        return None

    running = 0.0
    if j == 0:
        return running

    for term_index in range(1, j + 1):
        running += 1.0 / float(term_index)
    return running
    
def calculate_distance_matrix(trace_list):
    """Build the symmetric matrix of pairwise edit distances between traces.

    ``distance`` is evaluated once per unordered pair and mirrored; the
    diagonal stays 0.

    Args:
        trace_list: indexable sequence of traces accepted by ``distance``.

    Returns:
        A list of ``n`` lists of length ``n`` where entry ``[i][j]`` is the
        distance between ``trace_list[i]`` and ``trace_list[j]``.
    """
    size = len(trace_list)
    matrix = [[0 for _ in range(size)] for _ in range(size)]

    # Fill the upper triangle and mirror each value into the lower triangle.
    for row in range(size):
        for col in range(row + 1, size):
            pair_distance = distance(trace_list[row], trace_list[col])
            matrix[row][col] = pair_distance
            matrix[col][row] = pair_distance

    return matrix

def find_nearest_neighbor(trace_list, k=1):
    """For every trace, find its k-th nearest neighbour using a precomputed matrix.

    Pairwise distances come from ``calculate_distance_matrix``. For each trace,
    all other traces are ranked by normalized distance (distance divided by the
    length of the longer trace); entries whose raw distance is 0 are discarded.

    Args:
        trace_list: indexable sequence of traces.
        k: 1-based rank of the neighbour to pick.

    Returns:
        One tuple per trace: ``(neighbour_index, distance, max_len, distance / max_len)``.

    Raises:
        IndexError: if a trace has fewer than ``k`` neighbours at non-zero distance.
    """
    matrix = calculate_distance_matrix(trace_list)
    picked = []

    for anchor in range(len(trace_list)):
        candidates = []
        for other in range(len(trace_list)):
            if other == anchor:
                continue
            raw = matrix[anchor][other]
            longest = max(len(trace_list[anchor]), len(trace_list[other]))
            candidates.append((other, raw, longest, raw / longest))

        ranked = sorted(candidates, key=lambda entry: entry[3])
        at_nonzero_distance = [entry for entry in ranked if entry[1] != 0]
        picked.append(at_nonzero_distance[k - 1])

    return picked

import time
def entropy_flattened_knn(log, k=1):
    """k-nearest-neighbour entropy estimate over the distinct variants of ``log``.

    The variants are the keys of ``variants_filter.get_variants(log)``. The
    normalized distance of every variant to its k-th nearest neighbour is taken
    from ``find_nearest_neighbor`` and the per-variant terms are averaged.
    Prints the wall-clock time of the neighbour search and of the summation loop.

    Args:
        log: event log accepted by ``variants_filter.get_variants``.
        k: rank of the neighbour used for each variant.

    Returns:
        The accumulated estimate (natural-log based).
    """
    variants = list(variants_filter.get_variants(log))

    search_started = time.time()
    neighbours = find_nearest_neighbor(variants, k)
    print("Time for find_nearest_neighbor--- %s seconds ---" % (time.time() - search_started))
    n = len(variants)

    total = 0
    loop_started = time.time()
    for entry in neighbours:
        # entry is (index, distance, max_len, normalized distance)
        _index, _raw, _longest, normalized_lev = entry
        terms = (
            math.log(normalized_lev)
            # log(pi^(1/2) / Gamma(1/2 + 1))
            + math.log(math.pow(math.pi, 1/ 2.0) / math.gamma(1.0 / 2.0 + 1.0))
            # presumably a truncated Euler-Mascheroni constant — TODO confirm
            + 0.5772
            - harmonic_sum(k-1)
            + math.log(n)
        )
        total += 1/n * terms
    print("Time for for loop--- %s seconds ---" % (time.time() - loop_started))
    return total

entropy_flattened_knn(event_log, 4)

Time for find_nearest_neighbor--- 1.5599985122680664 seconds ---
Time for for loop--- 0.0019757747650146484 seconds ---


4.700062906892501

#### type-3: faster

In [7]:
# allll_traces = [tuple(event["concept:name"] for event in trace) for trace in event_log]
unique_traces = list(variants_filter.get_variants(event_log))

In [7]:
from Levenshtein import distance 
def harmonic_sum(j):
    """Return the harmonic number ``1/1 + 1/2 + ... + 1/j``.

    Args:
        j: number of terms.

    Returns:
        ``None`` if ``j < 0``, ``0.0`` if ``j == 0``, else the float sum of the
        reciprocals of ``1..j``.
    """
    if j <= 0:
        # Zero terms sum to 0.0; a negative count is reported as None.
        return None if j < 0 else 0.0

    harmonic = 0.0
    for n in range(1, j + 1):
        harmonic += 1.0 / float(n)
    return harmonic
    
def calculate_distance_matrix(trace_list):
    """Build the symmetric matrix of pairwise edit distances between traces.

    ``distance`` is evaluated once per unordered pair and written to both
    ``[i][j]`` and ``[j][i]``; the diagonal stays 0.

    Args:
        trace_list: indexable sequence of traces accepted by ``distance``.

    Returns:
        A list of ``n`` lists of length ``n`` holding the pairwise distances.
    """
    count = len(trace_list)

    # Independent rows, so writing one cell never aliases another row.
    matrix = []
    for _ in range(count):
        matrix.append([0] * count)

    for first in range(count):
        second = first + 1
        while second < count:
            matrix[first][second] = matrix[second][first] = distance(
                trace_list[first], trace_list[second]
            )
            second += 1

    return matrix

def find_nearest_neighbors(trace_list, k=1):
    """Return, for every trace, the normalized distance to its k-th nearest neighbour.

    Pairwise distances come from ``calculate_distance_matrix`` and are divided
    by the length of the longer trace of each pair. For each trace the
    normalized distances to all other traces are sorted ascending and zeros are
    discarded before the k-th value is picked.

    Args:
        trace_list: indexable sequence of traces.
        k: 1-based rank of the neighbour to pick.

    Returns:
        A list with one normalized distance per trace.

    Raises:
        IndexError: if a trace has fewer than ``k`` neighbours at non-zero distance.
    """
    matrix = calculate_distance_matrix(trace_list)
    lengths = [len(trace) for trace in trace_list]
    picked = []

    for anchor in range(len(trace_list)):
        ranked = sorted(
            matrix[anchor][other] / max(lengths[anchor], lengths[other])
            for other in range(len(trace_list))
            if other != anchor
        )
        nonzero = [value for value in ranked if value != 0]
        picked.append(nonzero[k-1])

    return picked

import time
def entropy_flattened_knn(log, k=1):
    """k-nearest-neighbour entropy estimate over the distinct variants of ``log``.

    The variants are the keys of ``variants_filter.get_variants(log)``.
    ``find_nearest_neighbors`` supplies, per variant, the normalized distance to
    its k-th nearest neighbour; the per-variant terms are then averaged.
    Prints the wall-clock time of the neighbour search and of the summation loop.

    Args:
        log: event log accepted by ``variants_filter.get_variants``.
        k: rank of the neighbour used for each variant.

    Returns:
        The accumulated estimate (natural-log based).
    """
    variants = list(variants_filter.get_variants(log))

    began = time.time()
    kth_distances = find_nearest_neighbors(variants, k)
    print("Time for find_nearest_neighbor--- %s seconds ---" % (time.time() - began))
    n = len(variants)

    accumulated = 0
    began = time.time()
    for normalized_lev in kth_distances:
        log_distance = math.log(normalized_lev)
        # log(pi^(1/2) / Gamma(1/2 + 1))
        log_ball_term = math.log(math.pow(math.pi, 1/ 2.0) / math.gamma(1.0 / 2.0 + 1.0))
        # presumably a truncated Euler-Mascheroni constant — TODO confirm
        gamma_term = 0.5772
        harmonic_term = harmonic_sum(k-1)
        log_n = math.log(n)
        contribution = 1/n * (log_distance + log_ball_term + gamma_term - harmonic_term + log_n)
        accumulated += contribution
    print("Time for for loop--- %s seconds ---" % (time.time() - began))
    return accumulated

entropy_flattened_knn(event_log, 4)

Time for find_nearest_neighbor--- 1.1830122470855713 seconds ---
Time for for loop--- 0.0019953250885009766 seconds ---


4.700062906892501

### Inspecting sepsis

In [70]:
event_log1 = pm4py.read_xes('test_logs/Sepsis.xes')

parsing log, completed traces :: 100%|██████████| 1050/1050 [00:00<00:00, 1093.76it/s]


In [71]:
event_log1

Unnamed: 0,InfectionSuspected,org:group,DiagnosticBlood,DisfuncOrg,SIRSCritTachypnea,Hypotensie,SIRSCritHeartRate,Infusion,DiagnosticArtAstrup,concept:name,...,DiagnosticLacticAcid,lifecycle:transition,Diagnose,Hypoxie,DiagnosticUrinarySediment,DiagnosticECG,case:concept:name,Leucocytes,CRP,LacticAcid
0,True,A,True,True,True,True,True,True,True,ER Registration,...,True,complete,A,False,True,True,A,,,
1,,B,,,,,,,,Leucocytes,...,,complete,,,,,A,9.6,,
2,,B,,,,,,,,CRP,...,,complete,,,,,A,,21.0,
3,,B,,,,,,,,LacticAcid,...,,complete,,,,,A,,,2.2
4,,C,,,,,,,,ER Triage,...,,complete,,,,,A,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15209,,B,,,,,,,,CRP,...,,complete,,,,,KNA,,66.0,
15210,,E,,,,,,,,Release A,...,,complete,,,,,KNA,,,
15211,False,L,False,False,False,False,False,False,False,ER Registration,...,False,complete,,False,False,False,LNA,,,
15212,,C,,,,,,,,ER Triage,...,,complete,,,,,LNA,,,


In [73]:
event_log1['concept:name'].value_counts()

concept:name
Leucocytes          3383
CRP                 3262
LacticAcid          1466
Admission NC        1182
ER Triage           1053
ER Registration     1050
ER Sepsis Triage    1049
IV Antibiotics       823
IV Liquid            753
Release A            671
Return ER            294
Admission IC         117
Release B             56
Release C             25
Release D             24
Release E              6
Name: count, dtype: int64

In [6]:
# Count the non-null values of every column per activity.
# Uses event_log1, the DataFrame returned by pm4py.read_xes. When the notebook runs top to
# bottom, event_log is still the object returned by xes_importer.apply at this point,
# which is not a pandas DataFrame.
event_log1.groupby('concept:name').count()

Unnamed: 0_level_0,InfectionSuspected,org:group,DiagnosticBlood,DisfuncOrg,SIRSCritTachypnea,Hypotensie,SIRSCritHeartRate,Infusion,DiagnosticArtAstrup,Age,...,DiagnosticLacticAcid,lifecycle:transition,Diagnose,Hypoxie,DiagnosticUrinarySediment,DiagnosticECG,case:concept:name,Leucocytes,CRP,LacticAcid
concept:name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Admission IC,0,117,0,0,0,0,0,0,0,0,...,0,117,0,0,0,0,117,0,0,0
Admission NC,0,1182,0,0,0,0,0,0,0,0,...,0,1182,0,0,0,0,1182,0,0,0
CRP,0,3262,0,0,0,0,0,0,0,10,...,0,3262,5,0,0,0,3262,0,3123,0
ER Registration,1050,1050,1050,1050,1050,1050,1050,1050,1050,995,...,1050,1050,754,1050,1050,1050,1050,0,0,0
ER Sepsis Triage,0,1049,0,0,0,0,0,0,0,7,...,0,1049,5,0,0,0,1049,0,0,0
ER Triage,0,1053,0,0,0,0,0,0,0,6,...,0,1053,4,0,0,0,1053,0,0,0
IV Antibiotics,0,823,0,0,0,0,0,0,0,0,...,0,823,0,0,0,0,823,0,0,0
IV Liquid,0,753,0,0,0,0,0,0,0,14,...,0,753,13,0,0,0,753,0,0,0
LacticAcid,0,1466,0,0,0,0,0,0,0,0,...,0,1466,0,0,0,0,1466,0,0,1454
Leucocytes,0,3383,0,0,0,0,0,0,0,18,...,0,3383,16,0,0,0,3383,3361,0,0


In [66]:
import pm4py
from collections import Counter
import math

# Load the event log
# NOTE(review): this rebinds the notebook-global `event_log` to the result of
# pm4py.read_xes; earlier cells bind the same name to the result of xes_importer.apply.
xes_file_path = "test_logs/Sepsis.xes"
event_log = pm4py.read_xes(xes_file_path)

# Extract activity names from the event log (one entry per event)
activities = event_log['concept:name']

# Calculate the frequency of each activity
activity_counts = Counter(activities)

# Calculate the probability of each activity
total_activities = len(activities)
activity_probabilities = {activity: count / total_activities for activity, count in activity_counts.items()}

# Calculate entropy based on activity probabilities: H = -sum(p * log2(p)).
# The `prob > 0` filter guards log2(0); every counted activity has count >= 1, so it never triggers.
# NOTE(review): the value printed in the captured output (3.238...) coincides with feeed's
# `entropy_k_block_ratio_3` (3.238) in the feature dump at the end of the notebook, not with
# `entropy_k_block_ratio_1` (2.262) — worth checking which k each feature name refers to.
entropy = -sum(prob * math.log2(prob) for prob in activity_probabilities.values() if prob > 0)

print("Entropy of the event log:", entropy)


parsing log, completed traces :: 100%|██████████| 1050/1050 [00:00<00:00, 1370.26it/s]


Entropy of the event log: 3.2381589112808022


using feeed

In [50]:
from feeed.feature_extractor import extract_features

features = extract_features("test_logs/Sepsis.xes", feature_types=['simple_stats','entropies'])

     INFO: Sepsis.xes 3 simple_stats took 0:00:00.009976 sec, next entropies...
     INFO: Sepsis.xes 16 entropies took 0:00:18.312158 sec, last feature.
SUCCESSFULLY: 17 features for Sepsis.xes took 0:00:18.322134 sec.


In [51]:
features

{'log': 'Sepsis',
 'n_traces': 1050,
 'n_unique_traces': 846,
 'ratio_unique_traces_per_trace': 0.8057142857142857,
 'entropy_trace': 9.334,
 'entropy_prefix': 10.227,
 'entropy_global_block': 14.501,
 'entropy_lempel_ziv': 1.727,
 'entropy_k_block_diff_1': -0.019,
 'entropy_k_block_diff_3': 1.837,
 'entropy_k_block_diff_5': 1.712,
 'entropy_k_block_ratio_1': 2.262,
 'entropy_k_block_ratio_3': 3.238,
 'entropy_k_block_ratio_5': 2.538,
 'entropy_knn_3': 4.956,
 'entropy_knn_5': 4.49,
 'entropy_knn_7': 4.191}