In [1]:
%load_ext autoreload
import datetime
# import ete3
import itertools
import json
import logging
import math
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import os
import pandas as pd
import typing
# import re
import xml.etree.ElementTree as ET
# import xml.dom.minidom
# import xmlschema

In [34]:
%autoreload
from maxes.xes_loader2 import XesLoader, XesLog, XesTrace, XesEvent, XES_STRING, XES_INT, XES_DATE
# from maxes.xes_file import XesFile
# from maxes.analyze_xes import AnalyzeXes
# import maxes.analyze_sequence
# import maxes.serialization.serialize
# import maxes.graphs
import maxes.metrics.mean_levenstein_distance
import maxes.notebooks.utils
import maxes.utils

In [35]:
maxes.notebooks.utils.init_notebook() # RUN ONLY ONCE

Loading config from: C:\vt\md\maxes\maxes\config.local.yml
Config:
{'project_directory_path': '/vt/md/maxes/maxes', 'data_directory_path': '/vt/md/maxes/maxes/data'}


In [10]:
ccc19_processed_path = os.path.join(maxes.notebooks.utils.get_project_path(), "output/data_processed/ccc19.xes")
ccc19_generated_path = os.path.join(maxes.notebooks.utils.get_project_path(), "output/data_generated/ccc19.xes")

ccc19_processed = maxes.notebooks.utils.load_xes(ccc19_processed_path)
ccc19_processed

ccc19_generated = maxes.notebooks.utils.generate_log(ccc19_processed)
maxes.notebooks.utils.write_xes_to_file(ccc19_generated, skeleton_log=ccc19_processed, destination_file_path=ccc19_generated_path)

In [27]:
eleven_original_path = "C:\\vt\\md\\maxes\\maxes\\data\\other\\11.xes"
eleven_generated_path = "C:\\vt\\md\\maxes\\maxes\\output\\data_generated\\11.xes"

eleven_processed = maxes.notebooks.utils.load_xes(eleven_original_path)
eleven_processed

eleven_generated = maxes.notebooks.utils.generate_log(eleven_processed)
maxes.notebooks.utils.write_xes_to_file(eleven_generated, skeleton_log=eleven_processed, destination_file_path=eleven_generated_path)

In [16]:

def get_attributes_weights1(original_log: XesLog) -> dict[str, float]:
    # All attributes are equally important

    event_attributes_count = len(original_log.event_attribute_type_map)
    attributes_weights = {attribute_name: 1.0 / float(event_attributes_count)
                          for attribute_name in original_log.event_attribute_type_map.keys()}

    return attributes_weights

def get_attributes_weights2(original_log: XesLog) -> dict[str, float]:
    # String attributes are less important

    attributes_weights: dict[str, float] = {}

    for attribute_name, attribute_type in original_log.event_attribute_type_map.items():
        weight = 1.0

        if attribute_type == 'str':
            weight = 0.2

        attributes_weights[attribute_name] = weight

    return maxes.utils.normalize(attributes_weights)

def calculate_attributes_range_lengths(event_attribute_type_map: dict[str, str], original_trace: XesTrace) -> dict[str, float]:
    attributes_range_lengths: dict[str, float] = {}

    for attribute_name, attribute_type in event_attribute_type_map.items():
        range_length = None

        if attribute_type == XES_INT:
            series = original_trace.df[attribute_name]
            range_length = series.max() - series.min()

        if attribute_type == XES_DATE:
            series = original_trace.df[attribute_name]
            range_length = (series.max() - series.min()).seconds

        if range_length != None:
            attributes_range_lengths[attribute_name] = range_length

    return attributes_range_lengths

def calculate_event_error(
        original_event: XesEvent,
        generated_event: XesEvent,

        event_attribute_type_map: dict[str, str],
        attributes_range_lengths: dict[str, float],
        attributes_weights: dict[str, float]
        ) -> float:
    event_error = 0

    for attribute_name, attribute_type in event_attribute_type_map.items():
        attribute_error = 0

        if attribute_type == XES_STRING:
            if original_event[attribute_name] != generated_event[attribute_name]:
                attribute_error = 1

        if attribute_type == XES_INT:
            delta = original_event[attribute_name] - generated_event[attribute_name]
            scaled_delta = delta / attributes_range_lengths[attribute_name]
            # attribute_error = scaled_delta ** 2
            attribute_error = scaled_delta

        if attribute_type == XES_DATE:
            delta = original_event[attribute_name].second - generated_event[attribute_name].second
            scaled_delta = delta / attributes_range_lengths[attribute_name]
            # attribute_error = scaled_delta ** 2
            attribute_error = scaled_delta

        print(f"attribute_name: {attribute_name}, attribute_error: {attribute_error}")

        # TODO: Other types

        weighted_attribute_error = attribute_error * attributes_weights[attribute_name]
        event_error += weighted_attribute_error

    return event_error

def calculate_trace_error(
        original_trace: XesTrace,
        generated_trace: XesTrace,

        event_attribute_type_map: dict[str, str],
        attributes_weights: dict[str, float],
        ) -> float:
    attributes_range_lengths = calculate_attributes_range_lengths(event_attribute_type_map, original_trace)

    accumulated_error = 0

    for original_event in original_trace.events:
        # TODO: Find best event by lowest error

        generated_events_candidates = [
            {
                "event": event,
                "error": calculate_event_error(
                    original_event=original_event,
                    generated_event=event,
                    event_attribute_type_map=event_attribute_type_map,
                    attributes_range_lengths=attributes_range_lengths,
                    attributes_weights=attributes_weights
                )
            }
            for event in generated_trace.events
            if event.concept_name == original_event.concept_name
        ]

        if len(generated_events_candidates) == 0:
            continue

        suitable_event_with_error = min(generated_events_candidates, key=lambda item: item["error"])
        event_error = suitable_event_with_error["error"]

        accumulated_error += event_error

    mean_error = accumulated_error / float(len(original_trace.events))
    return mean_error

def calculate_log_error(
        original_log: XesLog,
        traces_pairs: list[tuple[XesTrace, XesTrace]],
        attribute_weights: dict[str, float]
        ) -> float:
    accumulated_error = 0

    for original_trace, generated_trace in traces_pairs:
        trace_error = calculate_trace_error(
            original_trace,
            generated_trace,
            event_attribute_type_map=original_log.event_attribute_type_map,
            attributes_weights=attribute_weights
        )

        accumulated_error += trace_error

    mean_log_error = accumulated_error / float(len(original_log.traces))

    return mean_log_error


In [19]:
trace_mappings = maxes.metrics.mean_levenstein_distance.mean_levenstein_distance(ccc19_processed, ccc19_generated)
trace_mappings

trace_pairs = [(trace_pair.original_sequence.trace, trace_pair.generated_sequence.trace) for trace_pair in trace_mappings]
attribute_weights = get_attributes_weights1(ccc19_processed)

[(trace_pair.original_sequence.index, trace_pair.generated_sequence.index) for trace_pair in trace_mappings]

[(9, 1),
 (19, 8),
 (5, 11),
 (2, 2),
 (6, 0),
 (3, 12),
 (4, 4),
 (17, 9),
 (10, 14),
 (12, 10),
 (1, 6),
 (16, 19),
 (7, 7),
 (11, 16),
 (14, 17),
 (15, 5),
 (0, 13),
 (8, 3),
 (18, 15),
 (13, 18)]

In [25]:
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', None)

ccc19_processed.traces[9].df

Unnamed: 0,concept:name,lifecycle:transition,org:resource,time:timestamp,RESOURCE,ROUND,ACTIVITY,STAGE,VIDEOSTART,VIDEOEND
0,Prepare implements,start,R_45_2A,2019-01-19 14:00:00+00:00,R_45_2A,Post,Prepare implements,Operator and Patient Preparation,16,66
1,Prepare implements,complete,R_45_2A,2019-01-19 14:01:00+00:00,R_45_2A,Post,Prepare implements,Operator and Patient Preparation,16,66
2,Get in sterile clothes,start,R_45_2A,2019-01-19 14:01:00+00:00,R_45_2A,Post,Get in sterile clothes,Operator and Patient Preparation,67,106
3,Get in sterile clothes,complete,R_45_2A,2019-01-19 14:01:00+00:00,R_45_2A,Post,Get in sterile clothes,Operator and Patient Preparation,67,106
4,Hand washing,start,R_45_2A,2019-01-19 14:01:00+00:00,R_45_2A,Post,Hand washing,Operator and Patient Preparation,108,127
5,Hand washing,complete,R_45_2A,2019-01-19 14:02:00+00:00,R_45_2A,Post,Hand washing,Operator and Patient Preparation,108,127
6,Get in sterile clothes,start,R_45_2A,2019-01-19 14:02:00+00:00,R_45_2A,Post,Get in sterile clothes,Operator and Patient Preparation,129,255
7,Get in sterile clothes,complete,R_45_2A,2019-01-19 14:04:00+00:00,R_45_2A,Post,Get in sterile clothes,Operator and Patient Preparation,129,255
8,Prepare implements,start,R_45_2A,2019-01-19 14:04:00+00:00,R_45_2A,Post,Prepare implements,Operator and Patient Preparation,257,290
9,Prepare implements,complete,R_45_2A,2019-01-19 14:04:00+00:00,R_45_2A,Post,Prepare implements,Operator and Patient Preparation,257,290


In [26]:
ccc19_generated.traces[1].df

Unnamed: 0,concept:name,lifecycle:transition,time:timestamp,VIDEOSTART,VIDEOEND,org:resource,RESOURCE,ROUND,ACTIVITY,STAGE
0,Get in sterile clothes,start,2019-01-16 13:59:59.228876+00:00,1202,436,R_14_1D,R_33_1L,Pre,Drop probe,Venous puncture
1,Get in sterile clothes,complete,2019-01-16 14:01:59.168401+00:00,1838,1264,R_45_2A,R_47_2C,Post,Prepare implements,Install Guidewire
2,Prepare implements,complete,2019-01-16 14:02:59.915067+00:00,1888,317,R_14_1D,R_13_1C,Post,Check flow and reflow,Install Catheter
3,Get in sterile clothes,start,2019-01-16 14:02:59.548655+00:00,1364,473,R_21_1F,R_45_2A,Pre,Put sterile gel,Ultrasound Preparation
4,Get in sterile clothes,complete,2019-01-16 14:02:58.993454+00:00,683,1630,R_31_1G,R_47_2C,Pre,Blood return,Operator and Patient Preparation
5,Hand washing,start,2019-01-16 14:02:57.952583+00:00,1234,1153,R_46_2B,R_13_1C,Pre,Compression identification,Venous puncture
6,Hand washing,complete,2019-01-16 14:04:57.586655+00:00,801,1204,R_32_1H,R_13_1C,Pre,Gel in probe,Operator and Patient Preparation
7,Get in sterile clothes,start,2019-01-16 14:04:59.103730+00:00,2010,1413,R_32_1H,R_31_1G,Post,Position probe,Operator and Patient Preparation
8,Get in sterile clothes,complete,2019-01-16 14:06:59.198321+00:00,2014,1768,R_14_1D,R_33_1L,Post,Ultrasound configuration,Operator and Patient Preparation
9,Prepare implements,complete,2019-01-16 14:07:57.966333+00:00,1912,633,R_47_2C,R_33_1L,Post,Remove syringe,Locate Structures


In [18]:
calculate_log_error(
    ccc19_generated,
    trace_pairs,
    attribute_weights=attribute_weights
)

attribute_name: concept:name, attribute_error: 0
attribute_name: lifecycle:transition, attribute_error: 1
attribute_name: org:resource, attribute_error: 1
attribute_name: time:timestamp, attribute_error: 0.002984396433470508
attribute_name: RESOURCE, attribute_error: 1
attribute_name: ROUND, attribute_error: 0
attribute_name: ACTIVITY, attribute_error: 1
attribute_name: STAGE, attribute_error: 1
attribute_name: VIDEOSTART, attribute_error: 3.0212056082742995
attribute_name: VIDEOEND, attribute_error: 0.05949996269505997
attribute_name: concept:name, attribute_error: 0
attribute_name: lifecycle:transition, attribute_error: 1
attribute_name: org:resource, attribute_error: 1
attribute_name: time:timestamp, attribute_error: 0.0027854938271604937
attribute_name: RESOURCE, attribute_error: 1
attribute_name: ROUND, attribute_error: 0
attribute_name: ACTIVITY, attribute_error: 1
attribute_name: STAGE, attribute_error: 1
attribute_name: VIDEOSTART, attribute_error: 3.09916900086126
attribute_na

np.float64(0.3581842937937214)

In [29]:
len(trace_pairs)

20

In [36]:
trace_mappings = maxes.metrics.mean_levenstein_distance.mean_levenstein_distance(eleven_processed, eleven_generated)
trace_mappings

trace_pairs = [(trace_pair.original_sequence.trace, trace_pair.generated_sequence.trace) for trace_pair in trace_mappings]
attribute_weights = get_attributes_weights1(eleven_processed)

[(trace_pair.original_sequence.index, trace_pair.generated_sequence.index) for trace_pair in trace_mappings]

[(0, 4),
 (1, 0),
 (2, 2),
 (5, 3),
 (9, 6),
 (14, 7),
 (7, 8),
 (3, 1),
 (6, 9),
 (13, 5)]

In [30]:
len(eleven_processed.traces)

3512

In [37]:
calculate_log_error(
    eleven_generated,
    trace_pairs,
    attribute_weights=attribute_weights
)

attribute_name: call centre, attribute_error: 1
attribute_name: org:resource, attribute_error: 1
attribute_name: time:timestamp, attribute_error: 0.0003635808675667912
attribute_name: lifecycle:transition, attribute_error: 0
attribute_name: concept:name, attribute_error: 0
attribute_name: reason, attribute_error: 1
attribute_name: check, attribute_error: 0
attribute_name: age, attribute_error: 1
attribute_name: pvr, attribute_error: 1


KeyError: 'duration'

In [None]:


"C:\\vt\\md\\maxes\\maxes\\data\\other\\simple.xes"

In [38]:
simple_original_path = "C:\\vt\\md\\maxes\\maxes\\data\\other\\simple.xes"
simple_generated_path = "C:\\vt\\md\\maxes\\maxes\\output\\data_generated\\simple.xes"

simple_processed = maxes.notebooks.utils.load_xes(simple_original_path)
simple_processed

simple_generated = maxes.notebooks.utils.generate_log(simple_processed)
maxes.notebooks.utils.write_xes_to_file(simple_generated, skeleton_log=simple_processed, destination_file_path=simple_generated_path)

In [39]:
len(simple_generated.traces)

3512