In [29]:
import pandas as pd
import pm4py


from pm4py.objects.log.importer.xes import importer as xes_importer
from pm4py.objects.conversion.log import converter as log_converter
from pm4py.statistics.traces.generic.log import case_statistics

import numpy as np
from scipy.spatial.distance import squareform
from scipy.cluster.hierarchy import linkage, fcluster

In [3]:

# Data Import:
xes_path = "/Users/moritz_hawener/Documents/Work/Studium/Master/WS25/BPI/BPI Challenge 2017_1_all/BPI Challenge 2017.xes.gz"

# Read the XES log
event_log = xes_importer.apply(xes_path)

  from .autonotebook import tqdm as notebook_tqdm
parsing log, completed traces :: 100%|██████████| 31509/31509 [00:31<00:00, 992.94it/s] 


In [4]:
# Transform the event log to a pandas DataFrame
df = log_converter.apply(event_log, variant=log_converter.Variants.TO_DATA_FRAME)

In [6]:
roles = pm4py.discover_organizational_roles(
    df,
    resource_key='org:resource',
    activity_key='concept:name',
    timestamp_key='time:timestamp',
    case_id_key='case:concept:name'
)

In [7]:
roles

[Activities: ['A_Accepted', 'A_Complete', 'O_Create Offer', 'O_Created', 'O_Sent (mail and online)', 'W_Complete application'] Originators importance {'User_10': 10090, 'User_100': 39, 'User_101': 22, 'User_102': 57, 'User_103': 78, 'User_104': 2401, 'User_105': 1037, 'User_106': 60, 'User_107': 17, 'User_108': 586, 'User_109': 198, 'User_11': 3059, 'User_110': 75, 'User_111': 24, 'User_12': 4627, 'User_13': 2570, 'User_132': 1161, 'User_135': 192, 'User_14': 5479, 'User_141': 145, 'User_15': 5692, 'User_16': 5681, 'User_17': 4555, 'User_18': 7642, 'User_19': 5039, 'User_2': 5111, 'User_20': 1018, 'User_21': 4145, 'User_22': 1160, 'User_23': 2815, 'User_24': 5066, 'User_25': 1941, 'User_26': 2990, 'User_27': 3419, 'User_28': 8097, 'User_29': 222, 'User_3': 11443, 'User_30': 108, 'User_31': 2061, 'User_32': 4064, 'User_33': 1691, 'User_34': 1837, 'User_35': 5790, 'User_36': 4412, 'User_37': 6965, 'User_38': 4084, 'User_39': 5876, 'User_4': 6170, 'User_40': 5762, 'User_41': 6064, 'User_4

In [15]:
roles[0].__dict__


{'activities': ['A_Accepted',
  'A_Complete',
  'O_Create Offer',
  'O_Created',
  'O_Sent (mail and online)',
  'W_Complete application'],
 'originator_importance': {'User_10': 10090,
  'User_100': 39,
  'User_101': 22,
  'User_102': 57,
  'User_103': 78,
  'User_104': 2401,
  'User_105': 1037,
  'User_106': 60,
  'User_107': 17,
  'User_108': 586,
  'User_109': 198,
  'User_11': 3059,
  'User_110': 75,
  'User_111': 24,
  'User_12': 4627,
  'User_13': 2570,
  'User_132': 1161,
  'User_135': 192,
  'User_14': 5479,
  'User_141': 145,
  'User_15': 5692,
  'User_16': 5681,
  'User_17': 4555,
  'User_18': 7642,
  'User_19': 5039,
  'User_2': 5111,
  'User_20': 1018,
  'User_21': 4145,
  'User_22': 1160,
  'User_23': 2815,
  'User_24': 5066,
  'User_25': 1941,
  'User_26': 2990,
  'User_27': 3419,
  'User_28': 8097,
  'User_29': 222,
  'User_3': 11443,
  'User_30': 108,
  'User_31': 2061,
  'User_32': 4064,
  'User_33': 1691,
  'User_34': 1837,
  'User_35': 5790,
  'User_36': 4412,
  'Use

In [16]:
# Pearson’s correlation coefficient 

activity_user = {}

for role in roles:
    for act in role.activities:
        activity_user[act] = role.originator_importance

In [17]:
resource_allocation_df = (
    pd.DataFrame(activity_user)
    .fillna(0)
    .astype(float)
)


In [18]:
resource_allocation_df

Unnamed: 0,A_Accepted,A_Complete,O_Create Offer,O_Created,O_Sent (mail and online),W_Complete application,A_Cancelled,A_Concept,A_Create Application,W_Handle leads,...,W_Call after offers,W_Call incomplete files,W_Validate application,O_Cancelled,O_Sent (online only),A_Incomplete,W_Assess potential fraud,W_Shortened completion,W_Personal Loan collection,A_Submitted
User_10,10090.0,10090.0,10090.0,10090.0,10090.0,10090.0,1197.0,1197.0,1197.0,1197.0,...,3424.0,1053.0,303.0,142.0,31.0,6.0,2.0,0.0,0.0,0.0
User_100,39.0,39.0,39.0,39.0,39.0,39.0,8.0,8.0,8.0,8.0,...,28.0,7659.0,7675.0,220.0,3.0,2486.0,188.0,0.0,0.0,0.0
User_101,22.0,22.0,22.0,22.0,22.0,22.0,6.0,6.0,6.0,6.0,...,6.0,3306.0,1408.0,0.0,3.0,1102.0,27.0,0.0,0.0,0.0
User_102,57.0,57.0,57.0,57.0,57.0,57.0,13.0,13.0,13.0,13.0,...,21.0,1861.0,3153.0,282.0,3.0,609.0,39.0,0.0,0.0,0.0
User_103,78.0,78.0,78.0,78.0,78.0,78.0,21.0,21.0,21.0,21.0,...,27.0,4.0,8.0,5.0,3.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
User_131,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,650.0,1062.0,3340.0,22.0,0.0,186.0,0.0,0.0,0.0,0.0
User_134,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,495.0,797.0,2221.0,30.0,0.0,165.0,2.0,0.0,0.0,0.0
User_137,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,133.0,171.0,594.0,2.0,0.0,35.0,3.0,0.0,0.0,0.0
User_139,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.0,28.0,44.0,0.0,0.0,9.0,0.0,0.0,0.0,0.0


In [None]:


df_log = np.log1p(resource_allocation_df)


In [None]:
corr_matrix = df_log.corr(method="pearson")
distance = 1 - corr_matrix


In [26]:
Z = linkage(squareform(distance), method="average")


In [None]:
clusters = fcluster(Z, t=3, criterion="maxclust")

activity_clusters = pd.Series(
    clusters,
    index=corr_matrix.columns,
    name="cluster"
).sort_values()


In [28]:
activity_clusters

A_Accepted                    1
A_Complete                    1
O_Create Offer                1
O_Created                     1
O_Sent (mail and online)      1
W_Complete application        1
A_Cancelled                   1
A_Concept                     1
A_Create Application          1
W_Handle leads                1
O_Sent (online only)          1
W_Call after offers           1
W_Shortened completion        2
A_Submitted                   3
A_Pending                     4
O_Accepted                    4
W_Call incomplete files       4
W_Validate application        4
O_Cancelled                   4
O_Returned                    4
A_Incomplete                  4
W_Assess potential fraud      4
A_Validating                  4
O_Refused                     4
A_Denied                      4
W_Personal Loan collection    5
Name: cluster, dtype: int32

In [30]:
cluster_resource_counts = {}

for cluster_id in activity_clusters.unique():
    # activities belonging to this cluster
    cluster_activities = activity_clusters[
        activity_clusters == cluster_id
    ].index

    # users who worked on at least one of those activities
    users_in_cluster = (
        resource_allocation_df[cluster_activities]
        .sum(axis=1) > 0
    )

    cluster_resource_counts[cluster_id] = users_in_cluster.sum()

cluster_resource_counts = pd.Series(
    cluster_resource_counts,
    name="num_resources"
).sort_index()


In [31]:
cluster_resource_counts

1    147
2     15
3      1
4    145
5      7
Name: num_resources, dtype: int64

In [32]:
# All activities mapped to clusters
all_cluster_activities = [activity for activity in activity_clusters.index]

# Users who worked in any cluster
users_any_cluster = (resource_allocation_df[all_cluster_activities].sum(axis=1) > 0).sum()

users_any_cluster


np.int64(149)