In [1]:
import numpy as np
import pandas
import pm4py
from matplotlib import pyplot as plt
from sklearn.mixture import GaussianMixture
import scipy.stats as stats
import ot
import os
from tqdm import tqdm

In [2]:
# Relative path to the XES event log that the rest of the notebook analyses.
file_path = '../../../data/BPI Challenge 2017.xes'
# Parse the XES file with pm4py; the result is bound to `event_log` and reused by the cells below.
event_log = pm4py.read_xes(file_path)

parsing log, completed traces ::   0%|          | 0/31509 [00:00<?, ?it/s]

# Get real case durations

In [None]:
# Annotate every event with the boundaries of the case it belongs to:
#   * 'time:timestamp_start_case_start'  - earliest 'time:timestamp_start' in the case
#   * 'time:timestamp_complete_case_end' - latest 'time:timestamp_complete' in the case
#
# A grouped transform is used instead of merging the log with itself on the case id.
# The self-merge produced one row per *pair* of events in a case (quadratic in case
# length), and the case end was taken from the rows left after the per-activity
# de-duplication, so later repetitions of an activity never contributed to it.
case_key = 'case:concept:name'
events_by_case = event_log.groupby(case_key)

merged_event_log = event_log.assign(**{
    'time:timestamp_start_case_start': events_by_case['time:timestamp_start'].transform('min'),
    'time:timestamp_complete_case_end': events_by_case['time:timestamp_complete'].transform('max'),
})

# One row per (case, activity); the first event of each activity is the one kept.
start_end_start_end_event_log = merged_event_log.drop_duplicates(subset=[case_key, 'concept:name'])

# Intermediate name kept so that any cell still referring to it keeps working:
# same rows as above, carrying only the case-start annotation.
start_end_start_case_event_log = start_end_start_end_event_log.drop(
    columns=['time:timestamp_complete_case_end']
)

In [None]:
# Collapse the per-(case, activity) rows to a single row per case. The case start and
# case end columns are case-level values, so any row of a case carries them;
# drop_duplicates keeps the first one. (Selecting the row via idxmin() over the
# activity-name strings is an arg-min on text and is not needed to pick a row.)
case_duration_log = (
    start_end_start_end_event_log
    .drop_duplicates(subset='case:concept:name')
    .copy()
)

# Case duration as a timedelta and as whole seconds (fraction truncated).
# .dt.total_seconds() gives the same result regardless of how the installed pandas
# version interprets astype('timedelta64[s]').
case_duration_log['case_duration'] = (
    case_duration_log['time:timestamp_complete_case_end']
    - case_duration_log['time:timestamp_start_case_start']
)
case_duration_log['case_duration_seconds'] = (
    case_duration_log['case_duration'].dt.total_seconds().astype('int64')
)

# Keep only the case-level columns.
case_duration_log = case_duration_log[['case:concept:name', 'time:timestamp_start_case_start', 'time:timestamp_complete_case_end',
                                       'case_duration', 'case_duration_seconds']]

case_duration_log

# Simulate case durations using model

In [None]:
from drbart_parser import *

### All predictors (resource, activity, time of day)

In [None]:
# Instantiate DRBART with `parser_dir` pointing at the stored model directory.
# NOTE(review): going by the directory name, this model appears to use resource, activity and
# time-of-day inputs - confirm against how the model was exported.
drbart = DRBART(parser_dir = '../../models/artificial/resource_concept_duration_seconds-day_resource-count_activity-count/')

In [None]:
def _sample_event_duration(event):
    """Draw one duration for a single event row and return it as an ``int``.

    The listed columns of the row are passed as the first argument of
    ``drbart.sample`` and ``seconds_in_day`` as the second; element ``[1][0]``
    of the returned value is rounded to zero decimals and cast to ``int``.
    """
    first_argument = [
        event['org:resource'], event['concept:name'],
        event['1'], event['Clark'], event['Jane'], event['Joe'], event['Karsten'],
        event['DIAGNOSIS'], event['REPAIR'],
    ]
    second_argument = [event['seconds_in_day']]
    drawn = drbart.sample(first_argument, second_argument)[1][0]
    return int(round(drawn, 0))


# NOTE(review): the columns '1', 'Clark', 'Jane', 'Joe', 'Karsten', 'DIAGNOSIS', 'REPAIR' and
# 'seconds_in_day' are not created in the cells shown above - confirm that `event_log`
# carries them before this cell runs.
event_log['duration_sampled'] = event_log.apply(_sample_event_duration, axis=1)

In [None]:
# Simulated duration of a case = sum of the sampled durations of its events.
cased_duration_simulated = event_log.groupby('case:concept:name')['duration_sampled'].sum()
bins = 100
# The first histogram determines the bin edges (r[1]); the second reuses them so both
# distributions are drawn on identical bins.
# Legend labels fixed: `case_duration_log` holds the durations observed in the log (real),
# `cased_duration_simulated` holds the model output - they were labelled the other way round.
r = plt.hist(case_duration_log['case_duration_seconds'], bins, alpha=0.5, label='real')
plt.hist(cased_duration_simulated, r[1], alpha=0.5, label='simulated')
plt.xlabel('case duration (seconds)')
plt.ylabel('number of cases')
plt.title('Real vs. simulated case durations')
plt.legend(loc='upper right')
plt.show()

### Only resource

In [None]:
# Swap in the model stored under the resource-only directory; `drbart` is rebound.
drbart = DRBART(parser_dir='../../models/artificial/resource_duration_10000_100_100/')


def _sample_duration_from_resource(event):
    """Draw one duration for an event row from its resource alone, as an ``int``.

    ``drbart.sample`` receives ``[org:resource]`` as its first argument and an
    empty list as its second; element ``[1][0]`` of the returned value is
    rounded to zero decimals and cast to ``int``.
    """
    drawn = drbart.sample([event['org:resource']], [])[1][0]
    return int(round(drawn, 0))


# Overwrites the 'duration_sampled' column with draws from the resource-only model.
event_log['duration_sampled'] = event_log.apply(_sample_duration_from_resource, axis=1)

In [None]:
# Simulated duration of a case = sum of the sampled durations of its events.
cased_duration_simulated = event_log.groupby('case:concept:name')['duration_sampled'].sum()
bins = 100
# The first histogram determines the bin edges (r[1]); the second reuses them so both
# distributions are drawn on identical bins.
# Legend labels fixed: `case_duration_log` holds the durations observed in the log (real),
# `cased_duration_simulated` holds the model output - they were labelled the other way round.
r = plt.hist(case_duration_log['case_duration_seconds'], bins, alpha=0.5, label='real')
plt.hist(cased_duration_simulated, r[1], alpha=0.5, label='simulated')
plt.xlabel('case duration (seconds)')
plt.ylabel('number of cases')
plt.title('Real vs. simulated case durations')
plt.legend(loc='upper right')
plt.show()