## TL;DR
This notebook shows how to create an event log in XES format from a CSV file in which each row is an event. Because a full XES implementation (opyenxes) is used, classifiers and extensions can also be added to the log.

In [1]:
# %load /home/jonathan/.ipython/profile_default/startup/01-setup.py
# start up settings for jupyter notebook
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os, sys

plt.style.use('ggplot')
plt.rcParams['font.size'] = 15.0
plt.rcParams['axes.labelsize'] = 15.0
plt.rcParams['xtick.labelsize'] = 15.0
plt.rcParams['ytick.labelsize'] = 15.0
plt.rcParams['legend.fontsize'] = 15.0

%matplotlib inline

# set the max column width
pd.options.display.max_colwidth = 1000

# to avoid have warnings from chained assignments
pd.options.mode.chained_assignment = None


## Import event data in CSV format

In [2]:
csv_fp = './data/BPIC2012.csv'

# Column types used while parsing the CSV. Identifiers, resources and
# timestamps are all read as plain strings; only the requested amount is
# parsed as an integer.
dtypes = {
    'caseid': str,
    'trace:concept:name': str,
    'trace:REG_DATE': str,
    'trace:AMOUNT_REQ': int,
    'event:org:resource': str,
    'event:lifecycle:transition': str,
    'event:concept:name': str,
    'event:time:timestamp': str
}

log_df = pd.read_csv(csv_fp, dtype=dtypes)

In [3]:
# preview the first rows of the loaded log
log_df.head()

Unnamed: 0,trace:REG_DATE,trace:concept:name,trace:AMOUNT_REQ,caseid,event:org:resource,event:lifecycle:transition,event:concept:name,event:time:timestamp
0,2011-10-01 00:38:44.546000+02:00,173688,20000,173688,112.0,COMPLETE,A_SUBMITTED,2011-10-01 00:38:44.546000+02:00
1,2011-10-01 00:38:44.546000+02:00,173688,20000,173688,112.0,COMPLETE,A_PARTLYSUBMITTED,2011-10-01 00:38:44.880000+02:00
2,2011-10-01 00:38:44.546000+02:00,173688,20000,173688,112.0,COMPLETE,A_PREACCEPTED,2011-10-01 00:39:37.906000+02:00
3,2011-10-01 00:38:44.546000+02:00,173688,20000,173688,112.0,SCHEDULE,W_Completeren aanvraag,2011-10-01 00:39:38.875000+02:00
4,2011-10-01 00:38:44.546000+02:00,173688,20000,173688,,START,W_Completeren aanvraag,2011-10-01 11:36:46.437000+02:00


In [4]:
from opyenxes.factory.XFactory import XFactory
from opyenxes.extension.XExtensionManager import XExtensionManager
from opyenxes.extension.XExtension import XExtension
from opyenxes.classification.XEventAndClassifier import XEventAndClassifier
from opyenxes.classification.XEventLifeTransClassifier import XEventLifeTransClassifier
from opyenxes.classification.XEventNameClassifier import XEventNameClassifier
from opyenxes.classification.XEventResourceClassifier import XEventResourceClassifier

In [5]:
# filter event related attributes
event_attrib_names = list(filter(lambda val: 'event:' in val, log_df.columns))
trace_attrib_names = list(filter(lambda val: 'trace:' in val, log_df.columns))

# need to keep the special caseid column to connect events to traces
event_attrib_names = ['caseid'] + event_attrib_names
trace_attrib_names = ['caseid'] + trace_attrib_names

# separate trace attributes from event attributes
trace_df = log_df[trace_attrib_names].drop_duplicates()
event_df = log_df[event_attrib_names]

# removed the prefixes
trace_df.columns = ['caseid'] + [val.replace('trace:', '') for val in trace_attrib_names[1:]]
event_df.columns = ['caseid'] + [val.replace('event:', '') for val in event_attrib_names[1:]]

# convert timestamps
trace_df['REG_DATE'] = trace_df['REG_DATE'].astype(np.datetime64)
event_df['time:timestamp'] = event_df['time:timestamp'].astype(np.datetime64)

# make sure resource id gets read properly

In [6]:
# list the column names of both frames, separated by a blank line
print(
    'Event attributes: {}\n'.format(event_df.columns.values),
    'Trace attributes: {}'.format(trace_df.columns.values),
    sep='\n',
)

Event attributes: ['caseid' 'org:resource' 'lifecycle:transition' 'concept:name'
 'time:timestamp']

Trace attributes: ['caseid' 'REG_DATE' 'concept:name' 'AMOUNT_REQ']


In [7]:
# Lookup tables: attribute name -> XFactory method that builds the matching
# XES attribute (timestamp, literal or discrete).
trace_attrib_to_factory = dict([
    ('REG_DATE', XFactory.create_attribute_timestamp),
    ('concept:name', XFactory.create_attribute_literal),
    ('AMOUNT_REQ', XFactory.create_attribute_discrete),
])

event_attrib_to_factory = dict([
    ('org:resource', XFactory.create_attribute_literal),
    ('lifecycle:transition', XFactory.create_attribute_literal),
    ('concept:name', XFactory.create_attribute_literal),
    ('time:timestamp', XFactory.create_attribute_timestamp),
])


def get_extension(attrib_name):
    """Look up the extension registered for the prefix of an attribute name.

    The prefix is the text before the first ':' in ``attrib_name``.

    Returns:
        ``None`` when the name contains no ':'; otherwise whatever
        ``XExtensionManager().get_by_prefix`` returns for the prefix.
    """
    prefix, separator, _ = attrib_name.partition(':')

    if not separator:
        # no "prefix:" part, so there is nothing to look up
        return None

    return XExtensionManager().get_by_prefix(prefix)
    

def create_attribute(attrib, attrib_type, attrib_to_factory_map, extensions):
    """Build one XES attribute from a value and its attribute name.

    Args:
        attrib: the attribute value.
        attrib_type: the attribute name; it is both the key of the new
            attribute and the lookup key into ``attrib_to_factory_map``.
        attrib_to_factory_map: maps attribute names to a factory callable
            invoked as ``factory(key, value, extension)``.
        extensions: set collecting every extension that was used; it is
            updated in place.

    Returns:
        The object returned by the factory callable.

    Raises:
        KeyError: if ``attrib_type`` is not in ``attrib_to_factory_map``.
        AssertionError: if a prefixed name does not resolve to an XExtension.
    """
    build_attribute = attrib_to_factory_map[attrib_type]

    # names without a "prefix:" part are created without an extension
    if ':' not in attrib_type:
        return build_attribute(attrib_type, attrib, None)

    ext = get_extension(attrib_type)

    assert isinstance(ext, XExtension), \
        'Extension is a {}'.format(str(type(ext)))

    # remember the extension so it can be declared on the log later
    extensions.add(ext)

    return build_attribute(attrib_type, attrib, ext)

    
def convert_event_row_to_xevent(event_attrib_types, event_row, extensions):
    """Turn one row of event values into an XES event.

    Args:
        event_attrib_types: attribute names, in the same order as the values
            in ``event_row``.
        event_row: the attribute values of a single event.
        extensions: set collecting the extensions used; passed through to
            ``create_attribute``, which updates it in place.

    Returns:
        The event built by ``XFactory.create_event`` from the attribute map.

    Missing values (NaN / None / NaT, e.g. an empty ``org:resource`` cell)
    are skipped, so the event simply lacks that attribute instead of
    carrying a NaN value.
    """
    attrib_map = XFactory.create_attribute_map()

    for attrib_type, attrib in zip(event_attrib_types, event_row):
        if pd.isna(attrib):
            # no value recorded for this event: leave the attribute out
            continue

        xattrib = create_attribute(attrib, attrib_type,
                                   event_attrib_to_factory,
                                   extensions)

        # add attribute to attribute map
        attrib_map[xattrib.get_key()] = xattrib

    return XFactory.create_event(attrib_map)


def create_trace(trace_attrib_types, trace_row, 
                 event_attrib_types, event_df,
                 extensions):
    """Build one XES trace together with all of its events.

    Args:
        trace_attrib_types: trace attribute names, in the same order as the
            values in ``trace_row``.
        trace_row: the attribute values of the trace.
        event_attrib_types: names of the event columns to export; only these
            columns of ``event_df`` are read.
        event_df: DataFrame holding the events of this trace, one per row;
            the caller is expected to have filtered it already.
        extensions: set collecting the extensions used; updated in place by
            ``create_attribute``.

    Returns:
        The trace built by ``XFactory.create_trace`` with one event appended
        per row of ``event_df``, in row order.

    Trace attributes whose value is missing (NaN / None / NaT) are left out.
    """
    # need to add the trace attributes
    attrib_map = XFactory.create_attribute_map()

    for attrib_type, attrib in zip(trace_attrib_types, trace_row):
        if pd.isna(attrib):
            # nothing recorded for this trace attribute: leave it out
            continue

        xattrib = create_attribute(attrib, attrib_type,
                                   trace_attrib_to_factory,
                                   extensions)

        # add attribute to attribute map
        attrib_map[xattrib.get_key()] = xattrib

    xtrace = XFactory.create_trace(attrib_map)

    # now add the events, picking only the exported columns of each row
    for _, row in event_df.iterrows():
        event_row = [row[key] for key in event_attrib_types]
        xevent = convert_event_row_to_xevent(event_attrib_types,
                                             event_row, extensions)
        xtrace.append(xevent)

    return xtrace


def create_xlog(trace_df, event_df):
    """Assemble an XES log from a trace table and an event table.

    Args:
        trace_df: one row per trace; must have a 'caseid' column, every
            other column becomes a trace attribute.
        event_df: one row per event; must have a 'caseid' column linking the
            event to its trace, every other column becomes an event
            attribute.

    Returns:
        The log created by ``XFactory.create_log`` holding one trace per row
        of ``trace_df``, the extensions used by the attributes, and two
        classifiers ('Activity classifier' and 'Resource classifier').
        A trace without matching events is still added, with no events.
    """
    xlog = XFactory.create_log()

    trace_attrib_types = [col for col in trace_df.columns if col != 'caseid']
    event_attrib_types = [col for col in event_df.columns if col != 'caseid']

    extensions = set()

    # Group the events by case once instead of scanning the whole event
    # table for every trace. Events of cases not in trace_df are dropped
    # first so no groups are built for them.
    wanted = event_df['caseid'].isin(trace_df['caseid'])
    events_by_case = {
        caseid: case_events
        for caseid, case_events in event_df[wanted].groupby('caseid', sort=False)
    }
    # empty frame with the right columns for traces that have no events
    no_events = event_df.iloc[0:0]

    # iterate through trace rows
    for _, row in trace_df.iterrows():
        trace_row = [row[key] for key in trace_attrib_types]

        # get the events that are related to this trace
        events = events_by_case.get(row['caseid'], no_events)

        xtrace = create_trace(trace_attrib_types, trace_row,
                              event_attrib_types, events, extensions)

        xlog.append(xtrace)

    # add the used extensions
    for ext in extensions:
        xlog.get_extensions().add(ext)

    # create a classifier that classifies using activity and lifecycle
    activity_clf = XEventNameClassifier()
    lifecycle_clf = XEventLifeTransClassifier()
    activity_lifecycle_clf = XEventAndClassifier([activity_clf,
                                                  lifecycle_clf])
    activity_lifecycle_clf.set_name('Activity classifier')
    resource_clf = XEventResourceClassifier()
    resource_clf.set_name('Resource classifier')

    xlog.get_classifiers().append(activity_lifecycle_clf)
    xlog.get_classifiers().append(resource_clf)

    return xlog

In [8]:
# make an event log from the first 200 traces only
trace_df_200 = trace_df.iloc[:200]

print('Creating a XLog with {} traces'.format(len(trace_df_200)))

xlog = create_xlog(trace_df_200, event_df)

Creating a XLog with 200 traces


In [9]:
from opyenxes.data_out.XesXmlGZIPSerializer import XesXmlGZIPSerializer

In [10]:
# destination of the gzip-compressed XES file
xlog_fp = './data/BPIC2012-200.xes.gz'

# open the target file and let the serializer write the log to it
with open(xlog_fp, 'w') as xes_file:
    xes_writer = XesXmlGZIPSerializer()
    xes_writer.serialize(xlog, xes_file)

Importance: DEBUG
Message: Start serializing log to XES.XML

Importance: DEBUG
Message: finished serializing log (605.481689453125 msec.)

