In [None]:
# We import the necessary packages at the beginning
import os
import pm4py
from pm4py.objects.conversion.log import converter as log_converter
from pm4py.objects.conversion.bpmn import converter as bpmn_converter
import numpy as np
import pandas as pd
import pickle
from pm4py.util import exec_utils
from enum import Enum
from pm4py.algo.discovery.footprints import algorithm as footprints_discovery
from pm4py.visualization.petri_net import visualizer as pn_viz
from pm4py.objects.process_tree.utils import generic as pt_util
from pm4py.objects.process_tree.utils.generic import tree_sort
from pm4py.util.variants_util import get_variant_from_trace
from pm4py.statistics.variants.log.get import get_variants_sorted_by_count
import time



In [None]:
# Returns a path to the file selected by the user
# Input: The folder in which to look for the files - the default is the current folder
def ask_for_path(rel_path='', index=-1):
    """List the entries of an input folder and return the path of the chosen one.

    Parameters
    ----------
    rel_path : str
        Text appended to ``os.getcwd()`` to form the folder that is listed.
        It is concatenated verbatim, so it has to carry its own leading and
        trailing separators (e.g. ``"/data/"``).
    index : int
        Position of the entry to pick. The default ``-1`` means "ask the user
        interactively"; any other value is used directly as the position.

    Returns
    -------
    str
        ``os.getcwd() + rel_path + <chosen entry name>``.

    Notes
    -----
    ``os.listdir`` returns entries in arbitrary order, so a position is only
    meaningful relative to the listing printed by this very call.
    """
    print("The following files are available in the input folder:\n")

    folder = os.getcwd() + rel_path
    entries = os.listdir(folder)
    for position, entry in enumerate(entries):
        print(str(position) + " - " + entry)

    if index != -1:
        # A position was supplied by the caller: no prompt needed.
        print('Automatic Iteration.')
        choice = index
    else:
        # Interactive mode: the user types the number shown in the listing.
        choice = input("Please choose from the list above which of the files shall be transformed by typing the corresponding number.")

    return folder + entries[int(choice)]
# this function converts a selected file in the path that is the input into a log
def transform_to_log(file_path):
    """Load the file at ``file_path`` and convert it into a pm4py log object.

    Supported extensions (matched case-insensitively):

    * ``.csv`` - read with pandas (delimiter is sniffed) and passed through
      ``log_converter.apply``. The two MobIS challenge files are recognised by
      their file name and get their columns parsed and renamed to the
      ``case:concept:name`` / ``concept:name`` / ``time:timestamp`` /
      ``org:resource`` / ``case:cost`` keys first.
    * ``.xes`` - read with ``pm4py.read_xes``, timestamps coerced to datetime,
      then converted with ``pm4py.convert_to_event_log``.
    * ``.dfg`` - whatever ``pm4py.read_dfg`` returns is handed back unchanged.

    Parameters
    ----------
    file_path : str
        Path of the file to load.

    Returns
    -------
    object or int
        The loaded log, or ``-1`` (after printing a hint) when the extension
        is not supported.
    """
    _, file_extension = os.path.splitext(file_path)
    file_name = os.path.basename(file_path)
    file_extension = file_extension.lower()

    if file_extension == '.csv':
        # Read the file that was actually passed in (previously a global named
        # `file` was read here). sep=None lets pandas sniff the delimiter,
        # which is only supported by the python engine.
        log_csv = pd.read_csv(file_path, sep=None, engine='python', encoding='utf-8-sig')
        if file_name in ('mobis_challenge_log_2019.csv', 'mobis_challenge_log_2019_only_complete_cases.csv'):
            log_csv['end'] = pd.to_datetime(log_csv['end'])
            log_csv['start'] = pd.to_datetime(log_csv['start'])
            # Non-numeric cost entries become NaN instead of raising.
            log_csv['cost'] = pd.to_numeric(log_csv['cost'], errors='coerce')
            log_csv = log_csv.rename(columns={'cost': 'case:cost', 'case': 'case:concept:name', 'activity': 'concept:name', 'end': 'time:timestamp', 'user': 'org:resource'})
        log = log_converter.apply(log_csv)
    elif file_extension == '.xes':
        log = pm4py.read_xes(file_path)
        log['time:timestamp'] = pd.to_datetime(log['time:timestamp'])
        log = pm4py.convert_to_event_log(log)
    elif file_extension == '.dfg':
        log = pm4py.read_dfg(file_path)
    else:
        print("Current filetype is equal to {}. \nPlease input a file with any of the following extensions: - csv; - xes; - dfg".format(str(file_extension)))
        return -1

    return log

In [None]:
##########
"""Settings"""
##########
# Folder, given relative to the current working directory, that is listed to
# pick the input files. Adjust it to a layout similar to the one in the GitHub
# repo to ensure functionality.
REL_INPUT_PATH = "/BINet/wide/"

# Entry 1 of the folder listing is loaded as the log below, entry 0 is kept as
# the model file -- adjust both positions to your folder content.
file = ask_for_path(rel_path=REL_INPUT_PATH, index=1)
model_file = ask_for_path(rel_path=REL_INPUT_PATH, index=0)

# Load the selected file and display the resulting log as the cell output.
log = transform_to_log(file_path=file)
log

In [None]:
from subprocess import *
# this function executes the ProConformance.jar file with the correct arguments passed
def jarWrapper(*args):
    """Run ``java -jar <args...>`` and collect what the process printed.

    Parameters
    ----------
    *args
        Arguments placed after ``java -jar``; the first one is the jar file,
        the remaining ones are forwarded to it.

    Returns
    -------
    list of bytes
        The non-empty lines of standard output (line endings removed),
        followed by the whitespace-separated tokens of standard error.
        Elements are ``bytes`` because the pipes are opened in binary mode.
    """
    process = Popen(['java', '-jar'] + list(args), stdout=PIPE, stderr=PIPE)

    # communicate() drains stdout and stderr together and waits for the
    # process to exit. Reading only stdout line by line while stderr is also
    # piped can block forever once the stderr pipe buffer fills up, and the
    # former `!= ''` checks were always true for bytes.
    stdout, stderr = process.communicate()

    ret = [line for line in stdout.splitlines() if line]
    ret += stderr.split()
    return ret



In [None]:
# use this function for the hosseinpour_jans log only
def garcia_perTrace_hosseinpour(log):
    """Run ProConformance on the pre-filtered hosseinpour logs and export the results.

    Steps:

    1. Build one row per trace variant of ``log`` holding the index of the
       first trace of that variant and the variant's count.
    2. Call ``ProConformance.jar`` once for each of the folders
       ``hosseinpour/t1/`` ... ``hosseinpour/t18/`` (these folders and the
       ``tN.xes`` logs inside them must already exist).
    3. Read ``BehavioralStatements.txt`` from each folder, skip its first two
       lines and store the remaining lines in columns ``pld_0``, ``pld_1``, ...
       of the row whose ``ID`` equals the folder name.
    4. Write the table (sheet ``Patterns``) and the elapsed time of steps 1-2
       (sheet ``Time``) to ``hosseinpour/garcia.xlsx`` below the current
       working directory.

    Parameters
    ----------
    log
        Event log to analyse; iterated trace by trace.

    Returns
    -------
    None
    """
    # time.clock() was removed in Python 3.8; perf_counter() measures elapsed
    # wall-clock time, which also covers the time spent in the Java process.
    time_start = time.perf_counter()

    # select only one trace per variant
    variants = pm4py.get_variants(log)
    variant_list = [','.join(variant) for variant in variants.keys()]
    var_counts = dict(get_variants_sorted_by_count(variants))

    variants_deviating = pd.DataFrame(data=None, columns=['first_idx', 'count'],
                                      index=variant_list)
    for i, trace in enumerate(log):
        variant = get_variant_from_trace(trace)
        key = ','.join(variant)
        # Only the first trace of every variant is recorded.
        if pd.isna(variants_deviating.at[key, 'first_idx']):
            variants_deviating.at[key, 'first_idx'] = i
            variants_deviating.at[key, 'count'] = var_counts[variant]

    # assumes that the filtered logs already lie in the folder structure
    # (i.e., one trace in a log for each of the 18 trace variants)
    traces = ['t' + str(number) for number in range(1, 19)]

    results = variants_deviating.copy(deep=True)
    # Row k (0-based) is labelled 't<k+1>', matching the folder names above.
    results['ID'] = ['t' + str(position + 1) for position in range(len(results))]

    for trace in traces:
        args = ['ProConformance.jar', str('hosseinpour/' + trace + '/'), str(trace + '.xes'),
                'hosseinpour.pnml']  # Any number of args to be passed to the jar file

        result = jarWrapper(*args)

        print(result)

    timer = pd.DataFrame({'time': [time.perf_counter() - time_start]})

    # we parse the textual output
    for trace in traces:
        with open(os.getcwd() + '/hosseinpour/' + trace + '/' + 'BehavioralStatements.txt') as f:
            lines = f.readlines()
        # Write into the row that belongs to this trace. Previously a stale
        # loop index was used, so every trace overwrote the last row.
        row_mask = (results['ID'] == trace).to_numpy()
        for j, pattern in enumerate(lines[2:]):
            column = 'pld_' + str(j)
            if column not in results.columns:
                # Default stays 0, but the column must be able to hold text.
                results[column] = 0
                results[column] = results[column].astype(object)
            results.loc[row_mask, column] = pattern

    # The context manager saves and closes the workbook even if a sheet fails.
    with pd.ExcelWriter(os.getcwd() + '/hosseinpour/' + 'garcia.xlsx',
                        engine="xlsxwriter") as writer:
        results.to_excel(writer, sheet_name='Patterns')
        timer.to_excel(writer, sheet_name='Time')

In [None]:
# use this function for the binet logs only
def garcia_perTrace_binet(log):
    """Run ProConformance once per deviating trace variant of a BINet log.

    Relies on the notebook-level names ``REL_INPUT_PATH`` (folder relative to
    the current working directory), ``model_file`` (path of the model that is
    copied next to every exported log) and ``jarWrapper``.

    Steps:

    1. Build one row per trace variant with the index of its first trace, its
       count, the ``label`` attribute of that first trace and a ``deviating``
       flag (``0`` when the label is ``'normal'``, otherwise ``1``).
    2. For every deviating variant, tag its first trace with a running id,
       export that single trace to ``<REL_INPUT_PATH>/<id>/<id>_log.xes`` and
       copy ``model_file`` into the same folder.
    3. Call ``ProConformance.jar`` for every exported folder.
    4. Read ``BehavioralStatements.txt`` from each folder, skip its first two
       lines and store the remaining lines in columns ``pld_0``, ``pld_1``, ...
    5. Write the table (sheet ``Patterns``) and the elapsed time of steps 1-3
       (sheet ``Time``) to ``<REL_INPUT_PATH>/garcia.xlsx``.

    Side effects: every trace and event of ``log`` gets an ``id`` attribute
    (``-1`` unless it belongs to an exported trace).

    Parameters
    ----------
    log
        Event log whose traces carry a ``label`` attribute.

    Returns
    -------
    None
    """
    import shutil

    # time.clock() was removed in Python 3.8; perf_counter() measures elapsed
    # wall-clock time, which also covers the time spent in the Java process.
    time_start = time.perf_counter()

    # select only one trace per variant
    variants = pm4py.get_variants(log)
    variant_list = [','.join(variant) for variant in variants.keys()]
    var_counts = dict(get_variants_sorted_by_count(variants))

    variants_deviating = pd.DataFrame(data=None, columns=['deviating', 'first_idx', 'count', 'label'],
                                      index=variant_list)
    for i, trace in enumerate(log):
        variant = get_variant_from_trace(trace)
        key = ','.join(variant)
        # Only the first trace of every variant is recorded.
        if not pd.isna(variants_deviating.at[key, 'first_idx']):
            continue
        label = trace.attributes['label']
        variants_deviating.at[key, 'first_idx'] = i
        variants_deviating.at[key, 'count'] = var_counts[variant]
        variants_deviating.at[key, 'label'] = label
        variants_deviating.at[key, 'deviating'] = 0 if label == 'normal' else 1

    # Work on an independent copy so that adding columns does not write into
    # a slice of `variants_deviating`.
    results = variants_deviating[variants_deviating['deviating'] == 1].copy()
    results['ID'] = list(range(len(results)))
    first_indices = results['first_idx'].tolist()

    # Mark every trace/event with -1, then give the first trace of each
    # deviating variant its running id so it can be filtered out by value.
    for trace in log:
        trace.attributes['id'] = -1
        for event in trace:
            event['id'] = -1
    for i, first_idx in enumerate(first_indices):
        log[first_idx].attributes['id'] = i
        for event in log[first_idx]:
            event['id'] = i

    # we store one event log with one trace per deviating variant
    for i in range(len(results)):
        filtered_log = pm4py.filter_event_attribute_values(log, "id", [i], level="case", retain=True)

        path = os.getcwd() + REL_INPUT_PATH + str(i)

        # Default permissions are used on purpose: a directory created with
        # mode 0o666 lacks the execute bit on POSIX and cannot be written
        # into. exist_ok=True allows re-running the cell.
        os.makedirs(path, exist_ok=True)

        shutil.copy(model_file, path)
        pm4py.write_xes(filtered_log, str(path + '/' + str(i) + '_log.xes'))

    # we run the ProConformance plugin for each created event log
    for i in range(len(results)):
        # The model name is derived from the third '/'-separated part of
        # REL_INPUT_PATH; this presumably matches the copied model file's
        # name -- verify against your folder layout.
        args = ['ProConformance.jar', str(os.getcwd() + REL_INPUT_PATH + str(i) + '/'), str(str(i) + '_log.xes'),
                str(REL_INPUT_PATH.split('/')[2] + '.pnml')]  # Any number of args to be passed to the jar file

        result = jarWrapper(*args)

        print(result)

    timer = pd.DataFrame({'time': [time.perf_counter() - time_start]})

    # we parse the textual output
    for i in range(len(results)):
        with open(os.getcwd() + REL_INPUT_PATH + str(i) + '/' + 'BehavioralStatements.txt') as f:
            lines = f.readlines()
        for j, pattern in enumerate(lines[2:]):
            column = 'pld_' + str(j)
            if column not in results.columns:
                # Default stays 0, but the column must be able to hold text.
                results[column] = 0
                results[column] = results[column].astype(object)
            # Row i (by position) belongs to the folder named str(i).
            results.iat[i, results.columns.get_loc(column)] = pattern

    # The context manager saves and closes the workbook even if a sheet fails.
    with pd.ExcelWriter(os.getcwd() + REL_INPUT_PATH + 'garcia.xlsx',
                        engine="xlsxwriter") as writer:
        results.to_excel(writer, sheet_name='Patterns')
        timer.to_excel(writer, sheet_name='Time')