In [1]:
## import all necessary libraries ##
## import all necessary libraries ##

import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt
import copy

#########################

from pm4py.objects.log.importer.xes import importer as xes_importer
from pm4py.algo.filtering.log.attributes import attributes_filter
from pm4py.algo.filtering.log.variants import variants_filter
from pm4py.statistics.traces.log import case_statistics
from pm4py.objects.log.util import insert_classifier
from pm4py.util import constants

#########################

import distance

from similarity.levenshtein import Levenshtein
levenshtein = Levenshtein()

from similarity.damerau import Damerau
damerau = Damerau()

from pyjarowinkler import distance as jwdistance
from similarity.jarowinkler import JaroWinkler
jarowinkler = JaroWinkler()

from similarity.weighted_levenshtein import WeightedLevenshtein
from similarity.weighted_levenshtein import CharacterSubstitutionInterface
import math
from random import sample
from random import seed
class CharacterSubstitution(CharacterSubstitutionInterface):
    """Substitution-cost policy that prices every character substitution at infinity."""

    def cost(self, c0, c1):
        """Return the cost of replacing ``c0`` with ``c1``; always positive infinity."""
        # Assign infinite weight to all substitutions, whatever the two characters are.
        # NOTE(review): presumably this leaves only insertions/deletions as usable
        # edits -- confirm against the WeightedLevenshtein implementation.
        return float("inf")


# Weighted Levenshtein instance driven by the infinite substitution cost defined above.
levenshtein2 = WeightedLevenshtein(CharacterSubstitution())

#########################

import pomegranate as pom
from sklearn import model_selection as ms

# LOF

## Distance class

In [2]:
class Distance:
    """Pairwise edit distances between the trace variants of an event log.

    Every distinct value of the ``customClassifier`` event attribute is mapped to
    a single character, every variant is encoded as a string of such characters,
    and ``calculate`` fills a symmetric variant-by-variant distance matrix using
    the module-level ``levenshtein`` object.

    Attributes set by this class:
        log: the event log passed to the constructor.
        transl: activity name -> single-character code.
        variants: mapping returned by ``variants_filter.get_variants``.
        strings: encoded string per variant, position == matrix index.
        variant_to_index / index_to_variant: variant key <-> matrix index.
        trace_to_Tindex: trace -> position of the trace in ``log``.
        matrix: distance matrix (empty list until ``calculate`` has run).
        Nk_res_dict: cache of ``N_k`` results keyed by ``(k, variant index)``.
    """

    ### SETUP ###

    ## load dataset, generate mapping and generate strings
    def __init__(self, log):
        """Store ``log`` and build the activity mapping and variant strings.

        Args:
            log: event log whose events carry a ``customClassifier`` attribute.
        """
        self.log = log

        self.strings = []
        self.matrix = []
        self.transl = {}
        self.variant_to_Vindex = {}
        self.index_to_variant = []
        # Optional subsampling for quick experiments (kept disabled):
        #seed(1633048)
        #self.log = sample(self.log, int(len(self.log)/10))
        self.clear_caches()

        self.gen_trace_to_Tindex()

        self.gen_mapping()
        self.gen_variant_strings()

    def clear_caches(self):
        """Drop all cached ``N_k`` results."""
        self.Nk_res_dict = {}  # N_k result cache, keyed by (k, variant index)

    ## generate mapping from activity to char
    def gen_mapping(self):
        """Assign one character to every ``customClassifier`` value found in the log."""
        # TODO read Activity Classifier for correct naming of activities
        activities = list(attributes_filter.get_attribute_values(self.log, "customClassifier").keys())
        for i, a in enumerate(activities):
            # code points start at 1, so the first activity becomes chr(1)
            self.transl[a] = chr(i + 1)

    def gen_trace_to_Tindex(self):
        """Build ``trace_to_Tindex``: trace -> its position in ``self.log``."""
        self.trace_to_Tindex = {}
        for i, t in enumerate(self.log):
            self.trace_to_Tindex[t] = i

    ## generate strings for all variants
    def gen_variant_strings(self):
        """Encode every variant of the log as a string and index the variants.

        Prints the number of variants. The per-variant lists are rebuilt from
        scratch so that calling this method again does not append duplicates.
        """
        # all variants as dictionary
        self.variants = variants_filter.get_variants(
            self.log,
            parameters={variants_filter.Parameters.ACTIVITY_KEY: "customClassifier"})

        # reset so that list positions and variant_to_index always agree
        self.strings = []
        self.index_to_variant = []
        self.variant_to_index = {}  # variant key -> matrix index, for later lookup of traces

        for i, v in enumerate(self.variants.keys()):
            # variant keys are comma-separated activity names
            self.strings.append("".join(self.transl[e] for e in v.split(",")))
            self.variant_to_index[v] = i
            self.index_to_variant.append(v)

        print("Number of variants: " + str(len(self.strings)))

    ### CALCULATION ###

    ## calculate distance matrix
    def calculate(self):
        """Compute and return the symmetric variant-by-variant distance matrix.

        Returns:
            ``self.matrix``: an ``(n, n)`` ``np.int32`` array, ``n`` being the
            number of variant strings. The diagonal is left at 0 (distance of a
            variant to itself).
        """
        n = len(self.strings)
        # int32 instead of uint8: a distance above 255 must not wrap around.
        self.matrix = np.zeros((n, n), dtype=np.int32)

        for i in range(n):
            x = self.strings[i]
            # only the upper right triangle is computed; the value is mirrored
            # right away, so the diagonal is never doubled by a later "+ .T"
            for j in range(i + 1, n):
                # alternatives tried by the author: distance.hamming,
                # levenshtein2.distance, damerau.distance,
                # (1 - jarowinkler.similarity(x, y)) * 255
                d = levenshtein.distance(x, self.strings[j])
                self.matrix[i, j] = d
                self.matrix[j, i] = d

        return self.matrix

    ### RETRIEVAL ###

    ## translate trace to its corresponding matrix index
    def trace_to_index(self, trace):
        """Return the matrix index of the variant that ``trace`` belongs to.

        The trace is turned into its variant key (``customClassifier`` values
        joined by commas) and looked up in ``variant_to_index``.

        Raises:
            KeyError: if the variant key is unknown.
        """
        trace_string = ",".join(e["customClassifier"] for e in trace)
        return self.variant_to_index[trace_string]

    ## translate matrix (variant) index to trace indices
    def index_to_traces(self, i):
        """Return the list of traces of the variant stored at matrix index ``i``."""
        variant_string = self.index_to_variant[i]
        # new variants dictionary that contains only the requested variant
        filtered_variants = {variant_string: self.variants[variant_string]}

        filtered_log = variants_filter.apply(
            self.log,
            filtered_variants,
            parameters={variants_filter.Parameters.ACTIVITY_KEY: "customClassifier"})
        return list(filtered_log)

    ## retrieve distance of two traces from matrix
    def dist(self, t1, t2):
        """Return the matrix entry for the variants of traces ``t1`` and ``t2``."""
        i1 = self.trace_to_index(t1)
        i2 = self.trace_to_index(t2)
        return self.matrix[i1][i2]

    # return traces of k nearest neighbors of A
    def N_k(self, k, A):
        """Return the traces in the k-neighbourhood of trace ``A``.

        Variants are visited in ascending distance from ``A``'s variant and all
        traces of a visited variant are added at once, so the result may hold
        more than ``k`` traces. ``A`` itself is removed from the result. If the
        log holds fewer than ``k`` other traces, every available trace is
        returned instead of raising ``IndexError``.

        Results are cached per ``(k, variant index)``; traces of the same
        variant therefore share the list computed for the first of them.
        """
        A_variant_index = self.trace_to_index(A)
        cache_key = (k, A_variant_index)
        if cache_key in self.Nk_res_dict:  # check result cache
            return self.Nk_res_dict[cache_key]

        # indices of neighbors in ascending distance
        idx_sort = np.argsort(self.matrix[A_variant_index])

        N_k_traces = []
        for variant_index in idx_sort:
            if len(N_k_traces) >= k:
                break
            N_k_traces.extend(self.index_to_traces(variant_index))
            if A in N_k_traces:
                N_k_traces.remove(A)

        self.Nk_res_dict[cache_key] = N_k_traces
        return N_k_traces

## LOF class

In [3]:
class LOF:
    """Local Outlier Factor of the traces of an event log.

    Distances and neighbourhoods come from a ``Distance`` instance built on the
    log. All intermediate results are cached per ``k`` and per variant index,
    so traces of the same variant share their values.
    """

    def __init__(self, log):
        """Build the ``Distance`` helper for ``log`` and compute its matrix."""
        self.dist = Distance(log)
        self.dist.calculate()
        self.clear_caches()

    def clear_caches(self):
        """Reset the neighbourhood cache of ``self.dist`` and all result caches."""
        self.dist.clear_caches()

        # result caches; every key starts with k so that results obtained for
        # one k are never returned for another
        self.kd_res_dict = {}
        self.rd_res_dict = {}
        self.lof_res_dict = {}
        self.lrd_res_dict = {}

    ## k-distance
    def k_distance(self, k, A):
        """Return the distance from ``A`` to the last trace of its k-neighbourhood.

        Raises:
            IndexError: if the neighbourhood of ``A`` is empty.
        """
        A_variant_index = self.dist.trace_to_index(A)
        key = (k, A_variant_index)
        if key in self.kd_res_dict:  # check result cache
            return self.kd_res_dict[key]

        neighbours = self.dist.N_k(k, A)
        # the neighbourhood is ordered by ascending distance, so its last trace
        # is the farthest one
        k_variant_index = self.dist.trace_to_index(neighbours[-1])

        res = self.dist.matrix[A_variant_index][k_variant_index]
        self.kd_res_dict[key] = res
        return res

    ## reachability distance
    def reachability_dist(self, k, A, B):
        """Return ``max(k_distance(B), d(A, B))``."""
        A_variant_index = self.dist.trace_to_index(A)
        B_variant_index = self.dist.trace_to_index(B)
        key = (k, A_variant_index, B_variant_index)
        if key in self.rd_res_dict:  # check result cache
            return self.rd_res_dict[key]

        res = max(self.k_distance(k, B), self.dist.matrix[A_variant_index][B_variant_index])
        self.rd_res_dict[key] = res
        return res

    ## local reachability density
    def lrd(self, k, A):
        """Return the inverse of the mean reachability distance from ``A`` to its neighbours.

        Raises:
            ZeroDivisionError: if the neighbourhood of ``A`` is empty.
        """
        A_variant_index = self.dist.trace_to_index(A)
        key = (k, A_variant_index)
        if key in self.lrd_res_dict:  # check result cache
            return self.lrd_res_dict[key]

        neighbours = self.dist.N_k(k, A)
        if not neighbours:
            raise ZeroDivisionError("lrd is undefined for an empty k-neighbourhood")

        # float accumulator: the sum must not inherit a narrow integer dtype
        # from the distance matrix
        total = np.float64(0)
        for b in neighbours:
            total = total + self.reachability_dist(k, A, b)  # sum of reachability distances in k-neighbourhood

        res = 1 / (total / len(neighbours))
        self.lrd_res_dict[key] = res
        return res

    def lof(self, k, A):
        """Return the mean ``lrd`` of ``A``'s neighbours divided by the ``lrd`` of ``A``."""
        A_variant_index = self.dist.trace_to_index(A)
        key = (k, A_variant_index)
        if key in self.lof_res_dict:  # check result cache
            return self.lof_res_dict[key]

        neighbours = self.dist.N_k(k, A)
        total = np.float64(0)
        for b in neighbours:
            total = total + self.lrd(k, b)

        res = total / (len(neighbours) * self.lrd(k, A))
        self.lof_res_dict[key] = res
        return res

    def calculate(self, k):
        """Return a float array with the LOF of every trace of the log, in log order."""
        return np.array([self.lof(k, a) for a in self.dist.log], dtype=float)
    

# Run

## Import log

In [4]:
path = "Datasets/BPIC13.xes"
log = xes_importer.apply(path)

# Generate the custom activity classifier: every event gets a "customClassifier"
# attribute made of the attributes named by the log's "Activity classifier",
# joined with "+".
try:
    classifier_keys = log.classifiers["Activity classifier"]
    for trace in log:
        for event in trace:
            event["customClassifier"] = "+".join(event[key] for key in classifier_keys)
except (AttributeError, KeyError, TypeError) as err:
    # Only the failures the lookup itself can produce are handled (classifier
    # not declared, attribute missing on an event, non-string attribute value);
    # anything else is a real error and must surface.
    print("Activity classifier unavailable (" + repr(err) + "); falling back to concept:name")
    for trace in log:
        for event in trace:
            event["customClassifier"] = event["concept:name"]

# generate index attribute for each event (later used to filter)
for trace in log:
    for i, event in enumerate(trace):
        event["index"] = i


parsing log, completed traces ::   0%|          | 0/1487 [00:00<?, ?it/s]

## Generate LOFs

In [6]:
start = datetime.now()
print(start)

# The longest trace bounds the range of event-index cutoffs examined below.
trace_len = list(map(len, log))
max_trace_len = max(trace_len)
print(f"max trace length: {max_trace_len}")

res = []

# One LOF run per cutoff l = 1 .. max_trace_len; res collects one score array per l.
for l in range(1, max_trace_len + 1):
    print(f"l: {l}")

    # Keep only the events whose "index" attribute lies between 0 and l.
    # NOTE(review): if both filter bounds are inclusive, iteration l keeps
    # l + 1 events per trace -- confirm this is the intended prefix length.
    log_tmp = attributes_filter.apply_numeric_events(
        log, 0, l,
        parameters={constants.PARAMETER_CONSTANT_ATTRIBUTE_KEY: "index"},
    )

    # Run the LOF calculation on the filtered log.
    lof = LOF(log_tmp)

    # k = number of traces in the largest variant + 1
    k = max(len(variant_traces) for variant_traces in lof.dist.variants.values()) + 1
    res_tmp = lof.calculate(k)
    res.append(res_tmp)

end = datetime.now()
print(end - start)

2021-06-29 10:26:08.601534
max trace length: 35
l: 1
Number of variants: 21
l: 2
Number of variants: 49
l: 3
Number of variants: 107
l: 4
Number of variants: 184
l: 5
Number of variants: 248
l: 6
Number of variants: 290
l: 7
Number of variants: 314
l: 8
Number of variants: 322
l: 9
Number of variants: 325
l: 10
Number of variants: 326
l: 11
Number of variants: 327
l: 12
Number of variants: 327
l: 13
Number of variants: 327
l: 14
Number of variants: 327
l: 15
Number of variants: 327
l: 16
Number of variants: 327
l: 17
Number of variants: 327
l: 18
Number of variants: 327
l: 19
Number of variants: 327
l: 20
Number of variants: 327
l: 21
Number of variants: 327
l: 22
Number of variants: 327
l: 23
Number of variants: 327
l: 24
Number of variants: 327
l: 25
Number of variants: 327
l: 26
Number of variants: 327
l: 27
Number of variants: 327
l: 28
Number of variants: 327
l: 29
Number of variants: 327
l: 30
Number of variants: 327
l: 31
Number of variants: 327
l: 32
Number of variants: 327
l: 

In [16]:
# Flip the per-cutoff results upside down, then rotate by 270 degrees.
# The net effect is a transpose of `res`.
res_rot = np.rot90(np.flipud(res), k=3)

In [69]:
# Load previously stored rotated results and undo the rotation/mirroring to get `res` back.
# NOTE(review): the file name refers to BPIC20 while the log cell loads BPIC13 --
# confirm that `log` and this CSV belong to the same dataset before running the cells below.
res_rot = np.genfromtxt("res_BPIC20_sample.csv", delimiter=",")
res = np.flipud(np.rot90(res_rot))

In [70]:
# define cutoffs for anomaly at each length
cutoffs = []

for l in range(len(res)):
    r = []
    for t in range(len(res_rot)):
        if len(log[t]) >= l + 1:
              r.append(res[l][t])        
    
    cutoffs_tmp = []
    for cutoff in range(10, 100, 10):
        cutoffs_tmp.append(np.percentile(r, cutoff))
    cutoffs.append(cutoffs_tmp)

In [71]:
# classify traces at each point
classification = [None] * len(res_rot)

for j, trace_res in enumerate(res_rot): # for every trace
    classification_trace = [10] * len(log[j])
    for l, r in enumerate(trace_res): # for every length cutoff
        if l < len(log[j]): # if cutoff is within trace length
            for cutoff in range(8, -1, -1):
                if r > cutoffs[l][cutoff]:
                    #print("*" + str(cutoff) + " - " + str(r) + " - " + str(cutoffs[l][cutoff]))
                    classification_trace[l] = str((cutoff+1)*10)
                    break
    classification[j] = classification_trace

## Setup data

In [73]:
## generate mapping from activities to chars ##
activities = list(attributes_filter.get_attribute_values(log, "customClassifier").keys())
transl = {a: chr(i + 1) for i, a in enumerate(activities)}

# One list of "customClassifier" values per trace; these are the observation sequences.
lists = [[event["customClassifier"] for event in trace] for trace in log]


# Split sequences and their labels into training and test parts (80 % / 20 %, fixed seed).
lists_train, lists_test, is_anomaly_train, is_anomaly_test = ms.train_test_split(
    lists,
    classification,
    test_size=0.2,
    random_state=3,
)

## Setup model
- Discrete observations
- 9 states

In [74]:
# Fit a 9-state HMM with discrete emissions from the labelled training sequences;
# the states are named after the percentile buckets "10" .. "90".
model = pom.HiddenMarkovModel.from_samples(
    pom.DiscreteDistribution,
    n_components=9,
    X=lists_train,
    labels=is_anomaly_train,
    state_names=[str(bucket) for bucket in range(10, 100, 10)],
    algorithm="labeled",
)
model.bake()

In [75]:
# Display the states of the fitted model together with their distributions.
model.states

[{
     "class" : "State",
     "distribution" : {
         "class" : "Distribution",
         "dtype" : "numpy.str_",
         "name" : "DiscreteDistribution",
         "parameters" : [
             {
                 "Declaration APPROVED by ADMINISTRATION" : 0.07926829268292683,
                 "Declaration APPROVED by BUDGET OWNER" : 0.01707317073170732,
                 "Declaration APPROVED by PRE_APPROVER" : 0.0,
                 "Declaration APPROVED by SUPERVISOR" : 0.0012195121951219512,
                 "Declaration FINAL_APPROVED by DIRECTOR" : 0.0012195121951219512,
                 "Declaration FINAL_APPROVED by SUPERVISOR" : 0.08536585365853659,
                 "Declaration REJECTED by ADMINISTRATION" : 0.00975609756097561,
                 "Declaration REJECTED by BUDGET OWNER" : 0.0012195121951219512,
                 "Declaration REJECTED by DIRECTOR" : 0.0,
                 "Declaration REJECTED by EMPLOYEE" : 0.020731707317073172,
                 "Declaration REJ

In [76]:
# Display the state-transition matrix of the fitted model as a dense array.
model.dense_transition_matrix()

array([[0.59718026, 0.15206445, 0.03121853, 0.0735146 , 0.00302115,
        0.04531722, 0.05639476, 0.02719033, 0.01409869, 0.        ,
        0.        ],
       [0.44649446, 0.11808118, 0.07749077, 0.10701107, 0.07380074,
        0.13653137, 0.        , 0.02214022, 0.01845018, 0.        ,
        0.        ],
       [0.23383085, 0.08955224, 0.2238806 , 0.16915423, 0.02487562,
        0.08457711, 0.04975124, 0.09452736, 0.02985075, 0.        ,
        0.        ],
       [0.05586592, 0.10055866, 0.10614525, 0.29329609, 0.25977654,
        0.04748603, 0.06424581, 0.03351955, 0.03910615, 0.        ,
        0.        ],
       [0.12980769, 0.25      , 0.17307692, 0.08653846, 0.10096154,
        0.12980769, 0.07211538, 0.01442308, 0.04326923, 0.        ,
        0.        ],
       [0.        , 0.00364964, 0.1350365 , 0.10218978, 0.16788321,
        0.22262774, 0.24452555, 0.06934307, 0.05474453, 0.        ,
        0.        ],
       [0.00344828, 0.01034483, 0.05517241, 0.1137931 , 0.

## Evaluate

In [77]:
# Predict the state sequence of every test trace and print it.
for i, t in enumerate(lists_test):
    prediction = model.predict(t)
    # name of the state predicted for the last event of the trace
    last_state_index = prediction[-1]
    prediction_last_name = model.states[last_state_index].name
    print(prediction)

[1, 3, 3, 4, 1, 0, 0, 0, 6, 6, 8, 8]
[1, 3, 3, 4, 1, 0]
[1, 0, 1, 0, 0, 0, 0, 0, 8, 8]
[8, 8, 8, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8]
[1, 0, 0, 0, 0, 0, 0, 0, 8, 8]
[1, 2, 6, 8, 8, 8]
[1, 0, 1, 0, 0, 3, 4, 1, 0, 0, 0, 8, 8]
[1, 0, 1, 3, 3, 4, 2, 2]
[1, 0, 0, 0, 0, 0, 0, 0, 8, 8]
[1, 3, 3, 6, 7, 7, 7, 8, 8, 8, 8, 8, 8, 7, 7, 7, 8, 8, 8, 8, 0, 0, 7, 6, 6, 8, 8]
[1, 0, 0, 0, 0, 3, 4, 1, 0, 0, 0, 8, 8]
[1, 0, 0, 0, 0, 0, 8, 8]
[1, 3, 3, 7, 3, 4, 6, 8, 8, 3, 4, 1, 7, 7, 7, 8, 8, 8, 8, 7, 7, 7, 8, 8, 8, 8, 8, 6, 7, 6, 7, 1, 3, 4, 1, 0, 0, 6, 6, 8, 8]
[1, 0, 0, 0, 0, 3, 4, 1, 3, 4, 1, 0, 0, 0, 8, 8]
[1, 3, 3, 4, 1, 0, 0, 0, 6, 6, 8, 8]
[1, 0, 0, 0, 0, 0, 0, 0, 8, 8]
[1, 0, 1, 0, 0, 3, 4, 1, 0, 0, 0, 8, 8]
[1, 2, 0, 0, 0, 6, 7, 8, 8, 8]
[1, 3, 8, 3, 1, 3, 3, 4, 1, 0, 2, 2]
[1, 3, 3, 4, 1, 0, 0, 0, 6, 6, 8, 8]
[1, 0, 0, 0, 0, 3, 4, 1, 0, 0, 0, 8, 8]
[1, 0, 0, 0, 0, 3, 4, 1, 0, 0, 0, 8, 8]
[1, 3, 3, 1, 0, 0, 2, 2]
[1, 0, 0, 0, 0, 3, 4, 1, 0, 0, 0, 8, 8]
[1, 0, 0, 0, 3, 4, 6, 0, 6, 6, 8, 8]
[1, 0, 6,