In [501]:
## import all necessary libraries ##

import numpy as np
from datetime import datetime

#########################

from pm4py.objects.log.importer.xes import importer as xes_importer
from pm4py.algo.filtering.log.attributes import attributes_filter
from pm4py.algo.filtering.log.variants import variants_filter

#########################

import distance

from similarity.levenshtein import Levenshtein
levenshtein = Levenshtein()

from similarity.damerau import Damerau
damerau = Damerau()

from pyjarowinkler import distance as jwdistance
from similarity.jarowinkler import JaroWinkler
jarowinkler = JaroWinkler()

from similarity.weighted_levenshtein import WeightedLevenshtein
from similarity.weighted_levenshtein import CharacterSubstitutionInterface
import math
class CharacterSubstitution(CharacterSubstitutionInterface):
    """Substitution cost model in which every substitution is infinitely expensive."""

    def cost(self, c0, c1):
        """Return the cost of substituting character ``c0`` with ``c1``.

        The arguments are ignored: the cost is positive infinity for any
        pair of characters.
        """
        # Infinite weight for all substitutions; presumably this leaves only
        # insertions/deletions as usable edits -- confirm against the
        # WeightedLevenshtein implementation.
        return float("inf")
levenshtein2 = WeightedLevenshtein(CharacterSubstitution())

#########################

import sklearn

# Distance class

In [563]:
class Distance:
    """Pairwise string-distance matrix over the trace variants of an event log.

    Every activity name ("concept:name") is mapped to a single character, every
    variant becomes a string of those characters, and ``calculate`` fills a
    symmetric matrix with the distance between each pair of variant strings.

    Attributes (all populated by ``__init__`` unless stated otherwise):
        log: the event log returned by ``xes_importer.apply``.
        transl: dict mapping activity name -> single character.
        strings: one encoded string per variant, in matrix index order.
        variant_to_index: dict mapping variant key -> matrix index.
        index_to_variant: list mapping matrix index -> variant key.
        matrix: distance matrix; an empty list until ``calculate`` is called.
    """

    ### SETUP ###

    ## load dataset, generate mapping and generate strings
    def __init__(self, path):
        """Import the XES log at ``path`` and encode its variants as strings.

        Args:
            path: location of the XES file (adjust for local file location).
        """
        self.strings = []
        self.matrix = []
        self.transl = {}
        self.variant_to_index = {}
        self.index_to_variant = []

        self.log = xes_importer.apply(path)

        self.gen_mapping()
        self.gen_variant_strings()

    ## generate mapping from activity to char
    def gen_mapping(self):
        """Assign one character to every activity name found in the log.

        Activities are numbered in the order returned by
        ``attributes_filter.get_attribute_values`` and mapped to the
        characters with code points 1, 2, 3, ...
        """
        # TODO read Activity classifier for correct naming of activities
        activities = attributes_filter.get_attribute_values(self.log, "concept:name")
        self.transl = {name: chr(i + 1) for i, name in enumerate(activities)}

    ## generate strings for all variants
    def gen_variant_strings(self):
        """Encode every variant of the log as a string and index it.

        Variant keys are split on "," and each activity is replaced by its
        character from ``self.transl``. Prints the number of strings built.
        """
        variants = list(variants_filter.get_variants(self.log).keys())

        # Reset all three containers together so that calling this method a
        # second time does not append duplicates to the string/variant lists.
        self.strings = []
        self.variant_to_index = {}  # variant -> index, for later lookup of traces
        self.index_to_variant = []

        for i, v in enumerate(variants):
            self.strings.append("".join(self.transl[e] for e in v.split(",")))
            self.variant_to_index[v] = i
            self.index_to_variant.append(v)

        print("Number of strings: " + str(len(self.strings)))

    ### CALCULATION ###

    ## calculate distance matrix
    def calculate(self):
        """Compute, store and return the symmetric pairwise distance matrix.

        Uses the module-level ``levenshtein`` instance. Other measures set up
        in this notebook (``distance.hamming``, ``levenshtein2``, ``damerau``,
        ``jarowinkler``) can be swapped in on the marked line.

        Returns:
            The n x n matrix, also stored in ``self.matrix``.
        """
        n = len(self.strings)
        # NOTE(review): this creates an integer matrix, so a measure returning
        # fractional values would be truncated on assignment.
        upper = np.full((n, n), 0)

        for i, x in enumerate(self.strings):
            for j in range(i, n):  # only the upper right triangle (incl. diagonal)
                upper[i][j] = levenshtein.distance(x, self.strings[j])  # <- measure

        # Mirror the strictly-upper triangle into the lower one. Leaving the
        # diagonal out of the mirrored part avoids counting it twice.
        self.matrix = upper + np.triu(upper, k=1).T

        return self.matrix

    ### RETRIEVAL ###

    ## translate trace to its corresponding matrix index
    def trace_to_index(self, trace):
        """Return the matrix index of the variant that ``trace`` belongs to.

        The trace is converted to the string notation of its variant: the
        "concept:name" of each event, separated by commas.

        Raises:
            KeyError: if the resulting variant string is not in
                ``self.variant_to_index``.
        """
        trace_string = ",".join(e["concept:name"] for e in trace)
        return self.variant_to_index[trace_string]

    ## retrieve distance of two traces from matrix
    def dist(self, t1, t2):
        """Look up the precomputed distance between the variants of two traces."""
        i1 = self.trace_to_index(t1)
        i2 = self.trace_to_index(t2)
        return self.matrix[i1][i2]

    # return indices of k nearest neighbors of A
    def N_k(self, k, A):
        """Return the indices of the k nearest neighbours of point ``A``.

        ``A`` itself is never part of the result. Every point that is exactly
        as far away as the k-th nearest neighbour is included as well, so the
        result may hold more than ``k`` indices. If fewer than ``k`` other
        points exist, all of them are returned.

        Args:
            k: requested neighbourhood size.
            A: matrix index of the query point.

        Returns:
            numpy array of matrix indices in ascending distance from ``A``;
            empty when ``k`` <= 0 or there is no other point.
        """
        row = np.asarray(self.matrix[A])
        # Stable sort makes the order among equally distant points deterministic.
        order = np.argsort(row, kind="stable")
        # Remove A by index rather than assuming it is sorted to position 0.
        order = order[order != A]

        if k <= 0 or len(order) == 0:
            return order[:0]

        end = min(k, len(order))
        cutoff = row[order[end - 1]]  # distance of the k-th nearest neighbour
        # Add neighbours with the same distance as the k-th nearest; the bound
        # check stops the scan when the ties run to the end of the row.
        while end < len(order) and row[order[end]] == cutoff:
            end += 1

        return order[:end]

# LOF

In [553]:
class LOF:
    """Local Outlier Factor scores for the trace variants of an event log.

    Points are addressed by their index in the distance matrix held by
    ``self.dist``; neighbourhoods come from ``self.dist.N_k``.
    """

    def __init__(self, path):
        """Load the log at ``path`` and compute its distance matrix up front."""
        self.dist = Distance(path)
        self.dist.calculate()

    ## k-distance
    def k_distance(self, k, A):
        """Return the distance from ``A`` to the last member of its k-neighbourhood."""
        N_k = self.dist.N_k(k, A)

        # The neighbourhood is in ascending distance, so its last entry is the
        # farthest neighbour (tied with the k-th nearest one).
        return self.dist.matrix[A][N_k[-1]]

    ## reachability distance
    def reachability_dist(self, k, A, B):
        """Return max(k-distance of ``B``, distance between ``A`` and ``B``)."""
        return max(self.k_distance(k, B), self.dist.matrix[A][B])

    ## local reachability density
    def lrd(self, k, A):
        """Return the inverse of the mean reachability distance from ``A`` to its neighbours."""
        N_k = self.dist.N_k(k, A)

        total = 0
        for b in N_k:
            total = total + self.reachability_dist(k, A, b)  # sum of reachability distances in k-neighbourhood

        return 1 / (total / len(N_k))

    def lof(self, k, A):
        """Return the local outlier factor of ``A``.

        This is the mean local reachability density of the neighbours of
        ``A`` divided by the local reachability density of ``A`` itself.
        """
        N_k = self.dist.N_k(k, A)

        total = 0
        for b in N_k:
            total = total + self.lrd(k, b)

        return total / (len(N_k) * self.lrd(k, A))

    def calculate(self, k):
        """Return the local outlier factor of every point as a float array.

        Applies the same formulas as ``lof``, but each neighbourhood,
        k-distance and local reachability density is computed once and
        reused, instead of being recomputed for every lookup.

        Args:
            k: neighbourhood size passed to ``self.dist.N_k``.

        Returns:
            1-D numpy array with one score per matrix index.
        """
        matrix = self.dist.matrix
        n = len(matrix)

        # One neighbourhood query per point.
        hoods = [self.dist.N_k(k, a) for a in range(n)]
        k_dists = [matrix[a][hood[-1]] for a, hood in enumerate(hoods)]

        lrds = []
        for a, hood in enumerate(hoods):
            total = 0
            for b in hood:
                total = total + max(k_dists[b], matrix[a][b])
            lrds.append(1 / (total / len(hood)))

        scores = []
        for a, hood in enumerate(hoods):
            total = 0
            for b in hood:
                total = total + lrds[b]
            scores.append(total / (len(hood) * lrds[a]))

        return np.array(scores, dtype=float)

# Run

In [580]:
# Event log to analyse; the path is relative, adjust for local file location.
path = "Datasets/BPIC20.xes"
# Constructing LOF builds the Distance object for the log and computes its
# full pairwise distance matrix right away.
lof = LOF(path)

parsing log, completed traces ::   0%|          | 0/7065 [00:00<?, ?it/s]

Number of strings: 1478


In [497]:
# %reset  # disabled: wipes the namespace (including `lof`) and asks for interactive confirmation, which breaks Restart & Run All; run it by hand if needed

Once deleted, variables cannot be recovered. Proceed (y/[n])?  y


In [566]:
# LOF score of every variant, using k = 4 as the neighbourhood size.
lofs = lof.calculate(4)

In [578]:
# Variant indices ordered from the lowest to the highest LOF score.
lofs_argsort = np.argsort(lofs)

# One line per variant: rounded score, a tab padded with spaces, the variant.
for i, score in zip(lofs_argsort, lofs[lofs_argsort]):
    print(str(round(score, 3)), "\t", lof.dist.index_to_variant[i])

0.825 	 Accepted,Queued,Accepted,Accepted,Accepted,Accepted,Queued,Accepted,Accepted,Accepted,Completed
0.86 	 Queued,Accepted,Completed
0.877 	 Accepted,Queued,Accepted,Accepted,Queued,Accepted,Queued,Accepted,Accepted,Accepted,Completed
0.879 	 Accepted,Queued,Accepted,Queued,Accepted,Queued,Accepted,Queued,Accepted,Accepted,Completed
0.891 	 Accepted,Accepted,Accepted,Accepted,Accepted,Accepted,Accepted,Accepted,Completed
0.893 	 Accepted,Queued,Accepted,Accepted,Queued,Accepted,Accepted,Accepted,Queued,Accepted,Accepted,Completed
0.899 	 Accepted,Queued,Accepted,Accepted,Queued,Accepted,Accepted,Queued,Accepted,Completed
0.912 	 Accepted,Queued,Accepted,Accepted,Accepted,Queued,Accepted,Accepted,Completed
0.917 	 Accepted,Queued,Accepted,Accepted,Queued,Accepted,Accepted,Accepted,Accepted,Accepted,Completed
0.924 	 Accepted,Queued,Accepted,Queued,Accepted,Accepted,Accepted,Completed
0.927 	 Accepted,Completed,Accepted,Completed
0.932 	 Accepted,Queued,Accepted,Queued,Queued,Accepte

In [571]:
# Raw LOF scores, one per variant, in distance-matrix index order.
lofs

array([0.96586942, 1.00129832, 1.001411  , 0.94725247, 0.9560998 ,
       0.96830484, 0.99971641, 0.95610119, 1.36549085, 1.3623756 ,
       1.11320916, 0.89101511, 1.42092487, 0.96904855, 1.01566629,
       0.9620171 , 1.44327731, 1.74665978, 1.64158814, 0.96598639,
       0.97363215, 0.99624285, 0.97037037, 1.37056764, 0.96014283,
       0.94606481, 1.81098446, 1.47465448, 0.97182307, 1.80454455,
       1.01684947, 1.04047038, 0.92435331, 1.4752533 , 0.98459893,
       0.98835232, 1.69772612, 0.82467532, 0.965386  , 1.63170316,
       1.15686252, 1.52498419, 0.89863636, 0.99822217, 0.95254332,
       0.98138999, 0.98552402, 0.98529412, 1.54534511, 1.12413987,
       1.03900232, 0.87688312, 1.13908261, 1.13145738, 0.86030303,
       0.96336996, 1.32557478, 0.99533284, 0.95648148, 0.98614609,
       1.41519026, 1.4227779 , 1.46646111, 1.06494593, 1.58726978,
       1.        , 1.35820686, 1.01294536, 1.50572831, 1.83594201,
       2.03027893, 0.95      , 1.23503745, 0.97479889, 0.95225