In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import pm4py
import scipy
import stormpy
import datetime
import numpy as np
from fitter import Fitter

In [None]:
from simulation.markov_models import log_parser
from simulation.markov_chain import apply as mc_apply
from simulation.markov_chain_vis import view_markov_chain, view_resource_markov_chain, view_non_resource_markov_chain
import simulation.util as sim_util

## Load event log and clean roles

In [None]:
# Load the BPI Challenge 2013 incidents log and order events chronologically within each case.
event_log = pm4py.read_xes('BPI_Challenge_2013_incidents.xes.gz')
event_log = event_log.sort_values(['case:concept:name','time:timestamp'])
number_of_traces = event_log['case:concept:name'].nunique()

# Explicit .copy(): the column assignment below must target an independent frame,
# not a slice of `event_log` (avoids pandas' SettingWithCopyWarning).
subset_el = event_log[['case:concept:name','concept:name','time:timestamp','org:resource','org:role']].copy()

# Keep only the part of the role before the first '_' (missing roles become 'nan'),
# then map the bare letters C/D/E onto C1/D1/E1.
subset_el['org:role'] = (
    subset_el['org:role']
    .fillna('nan_1')
    .str.split('_')
    .str[0]
    .replace({'C':'C1','D':'D1','E':'E1'})
)

## Define the final states
The final states need to be reachable from the start states according to the DFG.
There can be many, but all of them have to point towards the end state.

In [None]:
# Activities that count as case completion; later cells turn each one into a
# "q_terminal_<state>" label of the reachability query.
final_states = ['Completed']

In [None]:
# NOTE(review): this cell replaces the cleaned BPI 2013 log and the final states
# defined above, so everything below runs on 'test.xes' with 'End' as the only
# final state. Confirm which input is intended before a Restart & Run All.
subset_el = pm4py.read_xes('test.xes')
final_states = ['End']

# Discovery

## Start Control Flow Model: Directly Follows Graph

In [None]:
# Convert to a pm4py event log, run the project's add_start_end helper
# (presumably adds artificial 'start'/'end' events — verify in log_parser),
# then discover the directly-follows graph.
subset_el = pm4py.convert_to_event_log(subset_el)
subset_el = log_parser.add_start_end(subset_el)
dfg, start_activities, end_activities = pm4py.discover_dfg(subset_el)
# Add an explicit end -> start arc with frequency 1.
# NOTE(review): looks like this closes the cycle between cases — confirm intent.
dfg["end", "start"] = 1

In [None]:
# Render the discovered DFG (including the manually added end -> start arc).
pm4py.view_dfg(dfg, start_activities, end_activities)

In [None]:
# Back to a DataFrame: later cells index the log by column, e.g. subset_el['concept:name'].
subset_el = pm4py.convert_to_dataframe(subset_el)

## (Optional step) Proportional filtering

In [None]:
#TODO: remove arrows in the dfg that account for less than x percent of transitions

## Extract the inputs to the ctmc from the event log

In [None]:
# Nested dict that the cells below iterate as {from_activity: {to_activity: {role: frequency}}}.
data_transition_role_frequency = sim_util.get_transition_resource_dict(subset_el)

In [None]:
from simulation.timings import Timings

# Toggle between the two timing-extraction strategies offered by Timings.
mine_declaratively = True

timings = Timings()
if not mine_declaratively:
    times_dictionary = timings.extract_resource_times_with_future(subset_el)
else:
    # Two-step variant: build the resource input array first, then derive the per-resource timings.
    resource_input_array = timings.create_resource_input_array_from_log(subset_el)
    res_timings = timings.get_timings_per_resource(subset_el, resource_input_array)
    times_dictionary = res_timings

In [None]:
# Fit an exponential distribution to the observed durations of every
# (from_activity, to_activity, role) triple and store loc / scale / rate.
# Triples that cannot be fitted are printed and collected in `tuples_to_discard`.
data_mean_transition_role_time = {}
tuples_to_discard = set()
for k, v in data_transition_role_frequency.items():
    if k in ['start','end']:
        continue
    for k2, v2 in v.items():
        if k2 in ['start','end']:
            continue
        for k3 in v2:
            # NOTE(review): triples without an entry in times_dictionary are skipped and
            # are NOT added to tuples_to_discard, so they stay in the frequency dict
            # without a fitted rate — confirm the PRISM generator tolerates that.
            if (k,k2,k3) not in times_dictionary:
                continue

            # Durations are divided by 3600 (seconds -> hours, assuming the inputs are
            # seconds — TODO confirm); exact zeros are dropped before fitting.
            times = np.array(times_dictionary[(k,k2,k3)]) / 3600
            times = times[times != 0]

            # A fit needs more than one observation; leaving the scale at 0 routes
            # under-sampled triples to the discard branch below.
            expon_scale = 0
            if len(times) > 1:
                expon_loc, expon_scale = scipy.stats.expon.fit(times)

            if expon_scale > 0:
                data_mean_transition_role_time.setdefault(k, {}).setdefault(k2, {})[k3] = {
                    'loc': expon_loc,
                    'scale': expon_scale,
                    'lambda': 1/expon_scale  # rate of the exponential
                }
            else:
                # Too few observations or a degenerate (non-positive / NaN) scale.
                print(k,k2,k3)
                tuples_to_discard.add((k,k2,k3))
                print(times)

In [None]:
# Remove every role that could not be fitted from the frequency dictionary;
# triples whose source or target is no longer present are ignored.
for e_from, e_to, role in tuples_to_discard:
    roles_for_edge = data_transition_role_frequency.get(e_from, {}).get(e_to, {})
    if role in roles_for_edge:
        del roles_for_edge[role]

In [None]:
# Drop the start -> start and end -> end self-loops if they exist.
# pop(..., None) keeps the cell safe to re-run and avoids a KeyError when a
# boundary state has no self-loop (or is not a source state at all).
for boundary in ('start', 'end'):
    data_transition_role_frequency.get(boundary, {}).pop(boundary, None)

In [None]:
def _is_empty_leaf(value):
    """Return True for None and for empty str / list / tuple leaves."""
    # Type-based check instead of `value in (None, "", [], {}, ())`: the membership
    # test compares with `==`, which raises for numpy arrays.
    return value is None or (isinstance(value, (str, list, tuple)) and len(value) == 0)


def remove_empty_keys(d):
    """Recursively remove empty entries from a nested dictionary of any depth.

    Parameters
    ----------
    d : dict or any
        The (possibly nested) dictionary to clean. Non-dict inputs are returned unchanged.

    Returns
    -------
    dict or any
        A new dictionary without keys whose value is None, an empty string/list/tuple,
        or a dictionary that is empty after cleaning. Falsy numbers such as 0 are kept.
        The input dictionary is not modified.
    """
    if not isinstance(d, dict):
        return d  # Return non-dict values as they are

    cleaned_dict = {}
    for key, value in d.items():
        if isinstance(value, dict):
            cleaned_value = remove_empty_keys(value)  # Recursively clean sub-dictionaries
            if cleaned_value:  # Add only if not empty
                cleaned_dict[key] = cleaned_value
        elif not _is_empty_leaf(value):
            cleaned_dict[key] = value

    return cleaned_dict

# Prune transitions (and source states) left without any role by the removals in the cells above.
data_transition_role_frequency = remove_empty_keys(data_transition_role_frequency)

In [None]:
# Per-role resource figures; used below as a {role: numeric value} mapping.
role_resources = sim_util.get_detailed_weighted_role(subset_el)

In [None]:
# Truncate each role's value to an int.
# NOTE(review): a later cell reassigns role_trials to the raw role_resources.
role_trials = {k:int(v) for k,v in role_resources.items()}

In [None]:
# Turn the transition/role frequencies into probabilities.
#   res[src][dst]                             -> probability of moving src -> dst
#   data_transition_role_prob[src][dst][role] -> probability of src -> dst via `role`
# Both are normalised by the total outgoing frequency of `src`; the artificial
# 'start'/'end' states are left out as sources and as targets.
res = {}
out_frequency = {}
data_transition_role_prob = {}

boundary_states = ('start', 'end')

for src, targets in data_transition_role_frequency.items():
    if src in boundary_states:
        continue
    data_transition_role_prob[src] = {}
    running_total = 0

    for dst, role_counts in targets.items():
        if dst in boundary_states:
            continue
        data_transition_role_prob[src][dst] = dict(role_counts)
        edge_total = sum(role_counts.values())
        res.setdefault(src, {})[dst] = edge_total
        running_total += edge_total
        out_frequency[src] = running_total

# Normalise the per-edge totals.
for src, edges in res.items():
    total = out_frequency[src]
    for dst in edges:
        edges[dst] = edges[dst] / total

# Normalise the per-role counts with the same denominator.
for src, edges in data_transition_role_prob.items():
    for role_probs in edges.values():
        for role_name in role_probs:
            role_probs[role_name] = role_probs[role_name] / out_frequency[src]

In [None]:
# Visualise the chain with the per-role transition probabilities computed above.
view_resource_markov_chain(data_transition_role_prob)

In [None]:
# Build the project's Markov-chain representation of the log (shown in the next cell).
semi_markov_json = mc_apply(subset_el)

In [None]:
# Show the chain returned by mc_apply next to the role-agnostic probabilities in `res`.
view_markov_chain(semi_markov_json)
view_non_resource_markov_chain(res)

In [None]:
# Display the per-role resource figures.
role_resources

In [None]:
# Map every activity (except the artificial 'start'/'end') to an integer state id.
states = set(subset_el['concept:name'].unique()).difference(set(['start','end']))
n = len(states)

# Enumerate in sorted order: plain set iteration order of strings changes between
# kernel restarts (hash randomisation), which made the numbering non-reproducible.
correspondence = {s: idx for idx, s in enumerate(sorted(states))}

# State id 0 must not be a final state: if a final state received id 0, swap ids
# with the first non-final state.
non_final_states = sorted(states.difference(set(final_states)))
for s in final_states:
    if correspondence[s] == 0:
        if not non_final_states:
            raise ValueError('every state is final; id 0 cannot be given to a non-final state')
        correspondence[s] = correspondence[non_final_states[0]]
        correspondence[non_final_states[0]] = 0
correspondence

## Analysis for resource allocation

In [None]:
# Display the per-role resource figures used as input for the allocation analysis.
role_resources

In [None]:
# Baseline run: role_trials is the very same mapping as role_resources
# (this replaces the int-cast version built in an earlier cell).
role_trials = role_resources
role_trials

In [None]:
from simulation.ctmc import create_prism_program_from_log

# Write the PRISM model for the current configuration to 'ctmc.sm' and keep
# whatever the generator returns for the visualisation further down.
probabilities = create_prism_program_from_log(
    correspondence,
    final_states,
    data_mean_transition_role_time,
    role_resources,
    data_transition_role_frequency,
    role_trials,
    'ctmc.sm',
)

prism_program = stormpy.parse_prism_program('ctmc.sm', prism_compat=True, simplify=True)
model = stormpy.build_model(prism_program)

# One "q_terminal_<state>" label per final state, OR-ed together.
terminal_labels = []
for fs in final_states:
    terminal_labels.append(f'"q_terminal_{fs}"')
labels = ' |'.join(terminal_labels)

# Query evaluated at the model's first initial state; the value is reported as hours.
formula_str = f'Tmin=? [F {labels}]'
properties = stormpy.parse_properties(formula_str, prism_program)
check_result = stormpy.model_checking(model, properties[0])
initial_state = model.initial_states[0]
result = check_result.at(initial_state)

print(f"Hours: {result}")
if result<np.inf:
    # timedelta cannot represent an infinite duration, hence the guard.
    print(f"Duration: {datetime.timedelta(hours=result)}")

In [None]:
mean, median, margin_of_error = sim_util.get_pm4py_reference_times(subset_el)

# Printed in the order median, mean, margin of error; each value is interpreted as seconds.
for seconds in (median, mean, margin_of_error):
    print(datetime.timedelta(seconds=seconds))

In [None]:
# Visualise the value returned by create_prism_program_from_log.
view_non_resource_markov_chain(probabilities)

In [None]:
import random

# Fixed seed so the sampled allocations (and the `durations` table built from
# them) are reproducible under Restart & Run All.
random.seed(42)

durations = []
x = list(range(1,50))  # candidate resource counts per role: 1..49
samples = 500

# The reachability query only depends on `final_states`, so build it once
# instead of on every iteration.
labels = ' |'.join(f'"q_terminal_{fs}"' for fs in final_states)
formula_str = f'Tmin=? [F {labels}]'

for i in range(samples):
    # Randomly chosen number of resources per role. Only 'R1' is sampled here;
    # other role names (e.g. "A2", "C1", "D1", "E1", "V3", "nan") can be added
    # the same way when analysing a log that contains them.
    role_trials = {
        'R1': random.choice(x)
    }
    print(role_trials)

    # Regenerate the PRISM model for this allocation and re-run the model checker.
    probabilities = create_prism_program_from_log(
                            correspondence,
                            final_states,
                            data_mean_transition_role_time,
                            role_resources,
                            data_transition_role_frequency,
                            role_trials,
                            'ctmc.sm')
    prism_program = stormpy.parse_prism_program('ctmc.sm', prism_compat=True, simplify=True)
    model = stormpy.build_model(prism_program)

    properties = stormpy.parse_properties(formula_str, prism_program)
    result = stormpy.model_checking(model, properties[0])
    initial_state = model.initial_states[0]
    result = result.at(initial_state)

    # One record per sample: the allocation plus the resulting duration.
    durations.append({**role_trials, "duration": result})
    print(f'{i + 1}/{samples}')  # 1-based progress counter

## Find the underestimation coefficient for the specific log

In [None]:
from copy import deepcopy
from pm4py.algo.filtering.log.variants import variants_filter

In [None]:
# Time-scale factors 1/32x .. 32x (powers of two) and their display labels.
exponents = range(-5, 6)
scales = [2**e for e in exponents]
labels = [f"{2**e}x" if e >= 0 else f"1/{2**-e}x" for e in exponents]

# For every factor keep the rescaled log and a variant-filtered version of it
# (percentage=0.8), the latter stored under the key 'filtered-<label>'.
sc_data = {}
for label, scale in zip(labels, scales):
    print(label)
    el = sim_util.scale_event_log_time(deepcopy(subset_el), multiplicity=scale)
    sc_data[label] = el
    filtered_log = variants_filter.filter_log_variants_percentage(deepcopy(el), percentage=0.8)
    filtered_el = pm4py.convert_to_dataframe(filtered_log)
    sc_data[f'filtered-{label}'] = filtered_el

In [None]:
# Run the project's sanity check on every (scaled / filtered) log; the result is
# used below as a table with one row per entry of sc_data.
sc_df = sim_util.sanity_check(sc_data,final_states)

In [None]:
# Show reference mean/median next to the analysis time and the offset column for each log.
sc_df[['label','ref-mean','ref-median','analysis-time','offset']]

In [None]:
import matplotlib.pyplot as plt

# Smallest duration (in hours) the lower edge of an error band may take. Keeps
# log() finite when the spread is as large as (or larger than) the median.
MIN_BAND_HOURS = 1e-3


def _to_hours(delta):
    """Convert a timedelta-like value to fractional hours.

    True division on purpose: flooring to whole hours turns sub-hour durations
    into 0, and log(0) is -inf.
    """
    return delta.total_seconds() / 3600


def _log_band(center, spread):
    """Return the (lower, upper) edges of `center ± spread`, mapped to log space.

    The band is log(center - spread) .. log(center + spread); this is not the
    same as log(center) ± log(spread).
    """
    lower = np.clip(center - spread, MIN_BAND_HOURS, None)
    return np.log(lower), np.log(center + spread)


# Split the sanity-check rows into the full-log series and the 'filtered-*' series.
el = []
el_errors = []
ctmc_el = []
el_filtered = []
el_filtered_errors = []
ctmc_elf = []
x = list(range(len(scales)))
for _, row in sc_df.iterrows():
    if str(row['label']).startswith('filtered'):
        medians, errors, analysis = el_filtered, el_filtered_errors, ctmc_elf
    else:
        medians, errors, analysis = el, el_errors, ctmc_el
    medians.append(_to_hours(row['ref-median']))
    errors.append(_to_hours(row['ref-std']))
    analysis.append(_to_hours(row['analysis-time']))

el = np.array(el)
el_errors = np.array(el_errors)
ctmc_el = np.array(ctmc_el)
el_filtered = np.array(el_filtered)
el_filtered_errors = np.array(el_filtered_errors)
ctmc_elf = np.array(ctmc_elf)

log_el = np.log(el)
log_ctmc_el = np.log(ctmc_el)
log_el_filtered = np.log(el_filtered)
log_ctmc_elf = np.log(ctmc_elf)
log_el_lower, log_el_upper = _log_band(el, el_errors)
log_el_filtered_lower, log_el_filtered_upper = _log_band(el_filtered, el_filtered_errors)

plt.figure(1,(16,4))

# Reference median of the full log, with its ± std band.
plt.scatter(x,log_el, color='r',marker='.')
plt.plot(x,log_el,label='el median',color='r',linestyle='-', marker='.')
plt.fill_between(x, log_el_lower, log_el_upper, alpha=0.2, color='r')

# CTMC analysis time for the full log.
plt.scatter(x,log_ctmc_el, color='b',marker='v')
plt.plot(x,log_ctmc_el,label='ctmc-el', color='b',marker='v',linestyle='--')

# Reference median of the variant-filtered log, with its ± std band.
plt.scatter(x,log_el_filtered, color='g',marker='s',s=10)
plt.plot(x,log_el_filtered,label='el-filtered median', color='g',marker='s',linestyle=':')
plt.fill_between(x, log_el_filtered_lower, log_el_filtered_upper, alpha=0.2, color='g')

# CTMC analysis time for the variant-filtered log.
plt.scatter(x,log_ctmc_elf, color='orange',marker='*',)
plt.plot(x,log_ctmc_elf,label='ctmc-elf', color='orange',marker='*',linestyle='-.')

plt.xticks(x,labels)
plt.xlabel('event log duration time scale')
plt.ylabel('duration (hours,log scale)')
plt.title('BIC 13 log')
plt.legend()

## Find regression coefficients

In [None]:
import pandas as pd

# One row per sampled allocation: the sampled role counts plus the 'duration' from the model checker.
durations_df = pd.DataFrame(durations)
durations_df

## Save the configurations

In [None]:
import json


def _write_json(payload, filename):
    """Serialise `payload` to `filename` as JSON indented by four spaces."""
    with open(filename, 'w') as file:
        json.dump(payload, file, indent=4)


# Persist the mined model inputs next to the notebook.
_write_json(data_mean_transition_role_time, 'mean_transition_role_time.json')
_write_json(role_resources, 'role_number_of_resources.json')
_write_json(data_transition_role_frequency, 'transition_role_frequency.json')

# role_trials is rebuilt from role_resources here (int-cast), replacing whatever
# an earlier cell last assigned to it.
role_trials = {name: int(count) for name, count in role_resources.items()}
_write_json(role_trials, 'role_trials.json')

# Evaluation

##

## Analyze timings for specific roles
TODO: run a Kolmogorov–Smirnov (KS) or chi-square test between the fitted functions to analyze the goodness of fit.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
import fitter
from simulation.markov_models.fit_distribution import fit_gauss

import scipy.stats as stats
from scipy.stats import entropy as kl_div

In [None]:
from pathlib import Path

# Figures are written relative to the notebook instead of an absolute path that
# only exists on one machine; the directory is created on demand.
FIG_DIR = Path('figs') / 'timeeval'
FIG_DIR.mkdir(parents=True, exist_ok=True)

# One summary frame per (from, to, role); concatenated once after the loop
# instead of growing a DataFrame inside it.
fit_summaries = []

seed=None

for k, v in times_dictionary.items():
    s1 = k[0]
    s2 = k[1]
    role = k[2]
    v = np.array(v)
    # NOTE(review): floor division yields whole hours here, whereas the CTMC rate
    # fitting earlier in the notebook uses `/3600` — confirm this is intended.
    v = v // 3600
    # do we remove 0 values?
    v = v[v != 0]
    #Note: on invalid values the fitting will return NaN. This means there was no fit.
    if len(v) > 1:# and role in ['V3']:
        # NOTE(review): the pdfs below are evaluated at the sample indices
        # 0..len(v)-1 and compared against the raw observations `v` via
        # scipy.stats.entropy — confirm this is the intended KL comparison.
        x = [i for i in range(len(v))]

        fig, ax = plt.subplots(1, 1, figsize=(10, 4))

        # Best two of fitter's common distributions.
        f = fitter.Fitter(v,distributions=fitter.get_common_distributions())
        f.fit()
        best_dist, fitted_params = f.get_best().popitem()
        best_two_summary = f.summary(Nbest=2,plot=True,lw=1)

        test_func = getattr(scipy.stats, best_dist)
        test_res = test_func.pdf(x, **fitted_params)
        best_two_summary.loc[best_dist,'kl_div'] = kl_div(test_res,v)
        second_best = best_two_summary.index[1]
        test_func = getattr(scipy.stats, second_best)
        test_res = test_func.pdf(x, *f.fitted_param[second_best])
        best_two_summary.loc[second_best,'kl_div'] = kl_div(test_res,v)

        # Gaussian mixture fitted on a kernel density estimate of the durations.
        kde = sm.nonparametric.KDEUnivariate(v)
        kde.fit(bw=4, kernel='gau')  # Estimate the densities
        multi_gauss = fit_gauss(kde.support, kde.density, v)
        multi_gauss.plot_mult_gauss(x,label='multi-gauss',color='g')
        ks_stat, ks_pval, kl_divergence = multi_gauss.fitted_results(v)
        # Positional row: assumes the last three summary columns are
        # kl_div, ks_statistic, ks_pvalue — TODO confirm against the installed fitter.
        best_two_summary.loc['multi-gauss'] = [np.inf,np.inf,np.inf,kl_divergence,ks_stat,ks_pval]

        # Exponential fit, i.e. the distribution the CTMC assumes.
        f = fitter.Fitter(v,distributions=['expon'])
        f.fit()

        f.plot_pdf(names=['expon'],lw=3)
        expon_summary = f.summary(Nbest=1,plot=False)
        best_expon, fitted_expon = f.get_best().popitem()
        # Named `expon_pdf` rather than `res`, so the transition-probability dict
        # `res` built earlier in the notebook is not overwritten.
        expon_pdf = stats.expon.pdf(x, **fitted_expon)
        expon_summary.loc['expon','kl_div'] = kl_div(expon_pdf,v)
        # Renamed so this row does not collide with 'expon' appearing among the best two.
        expon_summary.rename({'expon':'_expon'},axis=0,inplace=True)

        plt.title(f'{s1}-{role}->{s2}')
        plt.legend(labels=[f'Best fit {best_two_summary.index[0]}',f'Second best fit {best_two_summary.index[1]}','multi-gauss',f'CtMC Exponential'])
        plt.savefig(FIG_DIR / f"{s1}-{role}-{s2}.png",format='png',pad_inches=0)
        plt.show()
        plt.close(fig)  # release the figure; one is created per triple

        fit_summary = pd.concat([expon_summary, best_two_summary])
        fit_summary['from'] = s1
        fit_summary['to'] = s2
        fit_summary['role'] = role
        fit_summaries.append(fit_summary)

eval_functions = pd.concat(fit_summaries) if fit_summaries else pd.DataFrame()

In [None]:
# Independent copy of the KL-divergence column plus the transition identifiers.
kl_div_eval = deepcopy(eval_functions[['kl_div','from','role','to']])

In [None]:
# Keep rows that do not originate in 'start' and that have a KL-divergence value.
clean_kl_div_eval = kl_div_eval[(kl_div_eval['from']!='start') & (kl_div_eval['kl_div'].notna())]

In [None]:
# Mean KL divergence per index label (the index holds the fitted distribution's name).
clean_kl_div_eval['kl_div'].groupby(clean_kl_div_eval.index).mean()

In [None]:
# {distribution name: list of KL-divergence values}, grouped by index label.
kl_div_res = clean_kl_div_eval.groupby(clean_kl_div_eval.index).apply(lambda x: x['kl_div'].tolist()).to_dict()

In [None]:
# Mean KL divergence of the exponential ('_expon') fits and the half-width of
# its 95% Student-t confidence interval.
kl_divergence_values = kl_div_res['_expon']
print(kl_divergence_values)

kl_values_list = list(kl_divergence_values)
kl_divergence_average = np.mean(kl_values_list)
print("KL-divergence average:", kl_divergence_average, sep="\n")

kl_divergence_interval = stats.t.interval(
    0.95,
    df=len(kl_divergence_values) - 1,
    loc=kl_divergence_average,
    scale=stats.sem(kl_values_list),
)
# Printed value is upper bound minus mean, i.e. the half-width of the interval.
print("KL-divergence interval:", kl_divergence_interval[1] - kl_divergence_average, sep="\n", end="\n\n")

In [None]:
# Same summary as for the exponential fits, now for the 'multi-gauss' rows:
# mean KL divergence and half-width of the 95% Student-t confidence interval.
kl_divergence_values = kl_div_res['multi-gauss']
print(kl_divergence_values)

kl_sample = list(kl_divergence_values)
kl_divergence_average = np.mean(kl_sample)
print("KL-divergence average:")
print(kl_divergence_average)

degrees_of_freedom = len(kl_divergence_values) - 1
standard_error = stats.sem(kl_sample)
kl_divergence_interval = stats.t.interval(0.95, df=degrees_of_freedom,
                                          loc=kl_divergence_average,
                                          scale=standard_error)
half_width = kl_divergence_interval[1] - kl_divergence_average
print("KL-divergence interval:")
print(half_width)
print()