In [2]:
import pandas as pd

In [3]:
# Load raw data & Preprocess DataFrame (enrich with derived attributes)
log = 'bpic15'

preprocess = True

if preprocess:
    fn = f'../data/raw/{log}.csv'
else:
    fn = f'../data/processed/{log}.csv'

if preprocess:

    if log == 'bpic15':
        df = pd.read_csv(fn)[[
            'case:concept:name', 'activityNameEN', 'org:resource', 'time:timestamp',
            'case:last_phase', 'case:parts', 'action_code', 'r:municipality'
        ]]
        df = df.rename(columns={
            # Resource-related
            'municipality': 'r:municipality',
            'case:concept:name' : 'case_ID',
            'time:timestamp': 'Complete Timestamp',
            # CT-related
            'case:last_phase': 'ct:last_phase', 
            # AT-related
        })
        df = df.rename(columns={
            'case:parts': 'case_parts'
        })
        # TODO: derive 'ct:permit_type', 'at:phase'
        df = df[~df['case_parts'].isna()]
        df['ct:permit_type'] = df.apply(lambda row: 'Bouw' if 'Bouw' in str(row['case_parts']).split(',') else 'Non Bouw', axis=1)

        # only look at the main subprocess: "01_HOOFD"
        df = df[~df['action_code'].isna()]
        df = df[df['action_code'].str.startswith('01_HOOFD')]
        df['at:phase'] = df['action_code'].apply(lambda code: code[:10])
        
        # filter meaningless values

    # Universal (on Disco outputs)
    # derive and append TT related candidate attributes
    df['Complete Timestamp'] = pd.to_datetime(df['Complete Timestamp'], format='%Y-%m-%d %H:%M:%S.%f')
    MONTHS = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
    df['tt:month'] = df['Complete Timestamp'].apply(lambda ts: MONTHS[ts.month-1])
    df['tt:day'] = df['Complete Timestamp'].apply(lambda ts: 'Day_{}'.format(ts.day))
    WEEKDAYS = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
    df['tt:weekday'] = df['Complete Timestamp'].apply(lambda ts: WEEKDAYS[ts.dayofweek])
    df['tt:ampm'] = df['Complete Timestamp'].apply(lambda ts: 'AM' if ts.hour < 12 else 'PM')
    
    print(df)
    df.to_csv(f'../data/processed/{log}.csv')
else:
    df = pd.read_csv(fn, index_col=0)
    print(df)

  df = pd.read_csv(fn)[[


         case_ID                    activityNameEN  org:resource  \
0       10009138  register submission date request       9264148   
1       10009138              OLO messaging active       9264148   
2       10009138         send confirmation receipt       9264148   
5       10009138     create procedure confirmation       9264148   
6       10009138      create subcases completeness       9264148   
...          ...                               ...           ...   
262621   9998898       read publication date field        560600   
262622   9998898     registration date publication        560600   
262623   9998898      stop all running subcases 2b        560600   
262624   9998898                phase case handled        560600   
262625   9998898       read publication date field        560600   

              Complete Timestamp          ct:last_phase  \
0      2014-04-10 22:00:00+00:00  Beschikking verzonden   
1      2014-04-13 22:00:00+00:00  Beschikking verzonden   
2     

In [3]:
if log == 'bpic15':
#     attr = 'Activity'
#     attr = 'r:municipality'
#     attr = 'ct:last_phase'
#     attr = 'ct:permit_type'
#     attr = 'at:phase'
#     attr = 'tt:month'
#     attr = 'tt:day'
#     attr = 'tt:weekday'
    attr = 'tt:ampm'

l = df.groupby(['org:resource', attr]).size().groupby(level=0).size().to_numpy()
print(l)
avg_val_per_resource = l.mean()
    
df_grouped = df.groupby(['org:resource', attr]).size().groupby(level=0).apply(lambda x: 100 * x / float(x.sum()))
df_grouped = df_grouped.reset_index().pivot(index='org:resource', columns=attr, values=0)
#print(df_grouped)

print(f'Shape: {len(df_grouped)} x {len(df_grouped.columns)}')

import matplotlib.pyplot as plt
import seaborn as sns
#f, ax = plt.subplots(figsize=(20, 20))
#ax = sns.heatmap(df_grouped.T, square=True, cbar=False, ax=ax)

[2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 1 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 1 1 2 2 1 1 2 2]
Shape: 71 x 2


To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  df_grouped = df.groupby(['org:resource', attr]).size().groupby(level=0).apply(lambda x: 100 * x / float(x.sum()))


In [5]:
# NOTE: the hopkins stat in package `pyclustertend` is defined similarly to https://en.wikipedia.org/wiki/Hopkins_statistic
# Only that the complement is used, i.e., Hopkins = 1 - H, where H is calculated according to the definition shown on Wikipedia
# Hence a value closer to 1 suggests strong clustering tendency
from pyclustertend import vat, ivat, hopkins
from sklearn.preprocessing import scale
from scipy.spatial.distance import pdist
from numpy import mean

# scale
X = scale(df_grouped.fillna(0).to_numpy())
sample_size = int(0.2 * len(X))

# binarize (for hacking hamming distance)
B = (X > 0)

'''
# X-related
# avg pdist
avg_pdist = pdist(X).mean()
print(f'Avg. Pairwise distance (Euclidean): \n{avg_pdist}')
# hopkins stat
hopkins_stat = mean([hopkins(X, sampling_size=sample_size) for i in range(1000)])
print(f'Hopkins statistic averaged over 1k runs, sampling {sample_size} / {len(X)} (20%) points: \n{hopkins_stat}')
'''

# B-related
# avg pdist
avg_pdist_bin = pdist(B, metric='hamming').mean()
print(f'Avg. Pairwise distance: \n{avg_pdist_bin}')
# hopkins stat
#hopkins_stat_bin = mean([hopkins(B, sampling_size=sample_size) for i in range(1000)])
#print(f'Hopkins statistic averaged over 1k runs, sampling {sample_size} / {len(B)} (20%) points: \n{hopkins_stat_bin}')


#print('{:.3f},{:.3f},{:.3f},{:.3f},{:.3f}'.format(avg_val_per_resource, avg_pdist, hopkins_stat, avg_pdist_bin, hopkins_stat_bin))
#print('{:.3f},{:.3f},{:.3f}'.format(avg_val_per_resource, avg_pdist_bin, hopkins_stat_bin))
print('{:.3f},{:.3f}'.format(avg_val_per_resource, avg_pdist_bin))
    
#ivat(X)

ModuleNotFoundError: No module named 'pyclustertend'