In [39]:
import math
import numpy as np
import pandas
import itertools
import scipy.stats as st
from pix_framework.statistics.distribution import DurationDistribution, get_best_fitting_distribution, DistributionType

In [2]:
# Load the pre-transformed event log from a pickle file located relative to this notebook.
# NOTE(review): unpickling can execute arbitrary code -- only load this file from a trusted source.
event_log = pandas.read_pickle('../transformed_event_logs/artificial_start_end_2.pickle')
# Bare last expression: the notebook renders the frame as the cell output.
event_log

Unnamed: 0,concept:name,lifecycle:transition_start,time:timestamp_start,org:resource,case:concept:name,lifecycle:transition_complete,time:timestamp_complete,duration,duration_seconds,seconds_in_day,1,Clark,Jane,Joe,Karsten,DIAGNOSIS,QUALITY_CONTROL,REPAIR
0,DIAGNOSIS,START,2020-01-01 03:57:40.044121+00:00,Jane,0,COMPLETE,2020-01-01 04:34:23.549454+00:00,0 days 00:36:43.505333,2203,14260,0,0,1,0,0,1,0,0
1,REPAIR,START,2020-01-01 04:34:23.549454+00:00,Joe,0,COMPLETE,2020-01-01 14:30:27.423999+00:00,0 days 09:56:03.874545,35763,16463,0,0,1,1,0,1,0,1
2,QUALITY_CONTROL,START,2020-01-01 14:30:27.423999+00:00,Joe,0,COMPLETE,2020-01-01 22:13:45.345445+00:00,0 days 07:43:17.921446,27797,52227,0,0,1,2,0,1,1,1
3,DIAGNOSIS,START,2020-01-01 08:16:35.844753+00:00,Jane,1,COMPLETE,2020-01-01 08:47:14.772217+00:00,0 days 00:30:38.927464,1838,29795,0,0,1,0,0,1,0,0
4,REPAIR,START,2020-01-01 08:47:14.772217+00:00,Karsten,1,COMPLETE,2020-01-01 13:27:22.316694+00:00,0 days 04:40:07.544477,16807,31634,0,0,1,0,1,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5399,QUALITY_CONTROL,START,2024-12-26 19:03:41.622915+00:00,Joe,1799,COMPLETE,2024-12-26 21:39:17.312456+00:00,0 days 02:35:35.689541,9335,68621,1,0,0,1,1,1,1,1
5400,DIAGNOSIS,START,2024-12-26 18:05:59.540931+00:00,Clark,1800,COMPLETE,2024-12-26 19:13:26.892809+00:00,0 days 01:07:27.351878,4047,65159,0,1,0,0,0,1,0,0
5401,REPAIR,START,2024-12-26 19:13:26.892809+00:00,Clark,1800,COMPLETE,2024-12-27 08:23:46.946760+00:00,0 days 13:10:20.053951,47420,69206,0,2,0,0,0,1,0,1
5402,QUALITY_CONTROL,START,2024-12-27 08:23:46.946760+00:00,Jane,1800,COMPLETE,2024-12-27 11:10:44.009123+00:00,0 days 02:46:57.062363,10017,30226,0,2,1,0,0,1,1,1


In [44]:
def get_likelihood(val, distribution):
    """Score ``val`` against a fitted distribution object.

    Reads ``type``, ``mean``, ``var``, ``std``, ``min`` and ``max`` from
    ``distribution`` and forwards them, in that order, to ``get_pd``.

    Note that ``get_pd`` evaluates scipy ``logpdf`` functions, so the value
    returned here is a log-density rather than a raw likelihood.
    """
    params = (
        distribution.type,
        distribution.mean,
        distribution.var,
        distribution.std,
        distribution.min,
        distribution.max,
    )
    return get_pd(val, *params)

def get_pd(val, distribution_type, mean, var, std, minimum, maximum):
    """Return the log-density of ``val`` under a distribution given by its moments.

    Despite the name, every continuous branch evaluates a scipy ``logpdf``,
    so the result is a natural-log density, not a plain probability density.

    Parameters
    ----------
    val : float or array-like
        Observed value(s) at which the density is evaluated.
    distribution_type : DistributionType
        Family of the distribution.
    mean, var, std : float
        Mean, variance and standard deviation of the distribution.
    minimum, maximum : float
        Lower and upper bound; ``minimum`` is the ``loc`` of the exponential,
        and both delimit the support of the uniform.

    Returns
    -------
    float or numpy.ndarray
        Log-density of ``val``. For ``FIXED`` the distribution is treated as a
        point mass: ``0.0`` where ``val`` equals ``mean`` and ``-inf`` elsewhere.

    Raises
    ------
    ValueError
        If ``distribution_type`` is not one of the handled families.
    """
    if distribution_type == DistributionType.FIXED:
        # Point mass (log-probability, not a density). Previously this branch returned
        # None, which breaks any sum over the results.
        # NOTE(review): assumes the fixed value is stored in `mean` -- confirm against
        # DurationDistribution.
        on_value = np.asarray(val) == mean
        # `[()]` unwraps a 0-d result to a scalar and leaves real arrays untouched.
        log_density = np.where(on_value, 0.0, -np.inf)[()]
    elif distribution_type == DistributionType.EXPONENTIAL:
        # 'loc' shifts the distribution: the exponential starts at `minimum`,
        # so its scale is the mean measured from that origin.
        scale = mean - minimum
        if scale < 0.0:
            print("Warning! Trying to generate EXPON sample with 'mean' < 'min', using 'mean' as scale value.")
            scale = mean
        log_density = st.expon.logpdf(val, loc=minimum, scale=scale)
    elif distribution_type == DistributionType.NORMAL:
        log_density = st.norm.logpdf(val, loc=mean, scale=std)
    elif distribution_type == DistributionType.UNIFORM:
        log_density = st.uniform.logpdf(val, loc=minimum, scale=maximum - minimum)
    elif distribution_type == DistributionType.LOG_NORMAL:
        # Known limitation: a log-normal fitted with loc != 0 is not represented
        # correctly here, because only the mean and variance are available.
        # Moment matching: recover (mu, sigma) of the underlying normal.
        pow_mean = pow(mean, 2)
        phi = math.sqrt(var + pow_mean)
        mu = math.log(pow_mean / phi)
        sigma = math.sqrt(math.log(phi ** 2 / pow_mean))
        log_density = st.lognorm.logpdf(val, sigma, loc=0, scale=math.exp(mu))
    elif distribution_type == DistributionType.GAMMA:
        # Known limitation: a gamma fitted with loc != 0 is not represented
        # correctly here, for the same reason as the log-normal.
        # Moment matching: shape = mean^2 / var, scale = var / mean.
        log_density = st.gamma.logpdf(
            val,
            pow(mean, 2) / var,
            loc=0,
            scale=var / mean
        )
    else:
        # Previously fell through to `return sample` and raised UnboundLocalError.
        raise ValueError(f"Unsupported distribution type: {distribution_type!r}")
    return log_density

In [24]:
# Fit one duration distribution for every (activity, resource) combination.
concepts = event_log['concept:name'].unique()
resources = event_log['org:resource'].unique()
dumas_models = {}
for concept, resource in itertools.product(concepts, resources):
    print(concept, resource)
    # Durations of the events where this resource performed this activity.
    pair_mask = (event_log['concept:name'] == concept) & (event_log['org:resource'] == resource)
    pair_durations = event_log.loc[pair_mask, 'duration_seconds']
    pair_model = get_best_fitting_distribution(pair_durations)
    dumas_models[(concept, resource)] = pair_model
    print(pair_model.type)

DIAGNOSIS Jane
DistributionType.GAMMA
DIAGNOSIS Joe
DistributionType.LOG_NORMAL
DIAGNOSIS Karsten
DistributionType.GAMMA
DIAGNOSIS Clark
DistributionType.GAMMA
DIAGNOSIS 1
DistributionType.LOG_NORMAL
REPAIR Jane
DistributionType.LOG_NORMAL
REPAIR Joe
DistributionType.LOG_NORMAL
REPAIR Karsten
DistributionType.GAMMA
REPAIR Clark
DistributionType.LOG_NORMAL
REPAIR 1
DistributionType.LOG_NORMAL
QUALITY_CONTROL Jane
DistributionType.EXPONENTIAL
QUALITY_CONTROL Joe
DistributionType.GAMMA
QUALITY_CONTROL Karsten
DistributionType.GAMMA
QUALITY_CONTROL Clark
DistributionType.NORMAL
QUALITY_CONTROL 1
DistributionType.LOG_NORMAL


In [65]:
# Shuffle the whole log into the evaluation frame. The fixed seed pins the row
# order so that reruns give the same frame and the same floating-point sum.
ev_event_log = event_log.sample(frac=1, random_state=42)

def get_likelihood_for_row(row):
    """Return the log-density of the row's duration under its (activity, resource) model."""
    return get_likelihood(row['duration_seconds'], dumas_models[(row['concept:name'], row['org:resource'])])

# Despite the column name, the values are log-densities, so the sum below is the
# total log-likelihood of the log under the (activity, resource) models.
ev_event_log['likelihood'] = ev_event_log.apply(get_likelihood_for_row, axis=1)
ev_event_log['likelihood'].sum()

-51810.3552042433

In [66]:
# Resource models: one duration distribution per resource, regardless of activity.

resource_models = {}
for resource in resources:
    print(resource)
    # All durations recorded for this resource.
    resource_durations = event_log.loc[event_log['org:resource'] == resource, 'duration_seconds']
    resource_fit = get_best_fitting_distribution(resource_durations)
    resource_models[resource] = resource_fit
    print(resource_fit.type)

def get_likelihood_for_row(row):
    """Score a row's duration against the model fitted for the row's resource."""
    model = resource_models[row['org:resource']]
    return get_likelihood(row['duration_seconds'], model)

ev_event_log['likelihood'] = ev_event_log.apply(get_likelihood_for_row, axis=1)
ev_event_log['likelihood'].sum()

Jane
DistributionType.LOG_NORMAL
Joe
DistributionType.LOG_NORMAL
Karsten
DistributionType.EXPONENTIAL
Clark
DistributionType.EXPONENTIAL
1
DistributionType.EXPONENTIAL


-56556.63256004917

In [67]:
# Activity models: one duration distribution per activity (concept:name),
# regardless of which resource performed it.

def get_likelihood_for_row(row):
    """Return the log-density of the row's duration under its activity's model."""
    return get_likelihood(row['duration_seconds'], activity_models[row['concept:name']])

# Kept under its own name: this cell used to store the per-activity models in
# `resource_models`, silently overwriting the per-resource models.
activity_models = dict()
for concept in concepts:
    print(concept)
    activity_models[concept] = get_best_fitting_distribution(
        event_log[(event_log['concept:name'] == concept)]['duration_seconds']
    )
    print(activity_models[concept].type)

ev_event_log['likelihood'] = ev_event_log.apply(get_likelihood_for_row, axis=1)
ev_event_log['likelihood'].sum()

DIAGNOSIS
DistributionType.GAMMA
REPAIR
DistributionType.LOG_NORMAL
QUALITY_CONTROL
DistributionType.LOG_NORMAL


-52689.82327883489