#### Imports

In [1]:
import csv
import os
import subprocess

import matplotlib.pyplot as plt
import pandas as pd
import pm4py
import seaborn as sns
from pm4py.algo.discovery.dfg import algorithm as dfg_discovery
from pm4py.algo.discovery.footprints import algorithm as footprints_discovery
from pm4py.algo.discovery.heuristics import algorithm as heuristics_miner
from pm4py.algo.evaluation.generalization import algorithm as generalization_evaluator
from pm4py.algo.evaluation.simplicity import algorithm as simplicity_evaluator
from pm4py.algo.filtering.log.attributes import attributes_filter
from pm4py.statistics.concurrent_activities.log import get as conc_act_get
from pm4py.statistics.sojourn_time.log import get as soj_time_get
from pm4py.statistics.traces.generic.log import case_statistics
from pm4py.util import constants
from pm4py.visualization.dfg import visualizer as dfg_visualization
from pm4py.visualization.footprints import visualizer as fp_visualizer
from pm4py.visualization.graphs import visualizer as graphs_visualizer
from pm4py.visualization.heuristics_net import visualizer as hn_visualizer
from pm4py.visualization.petri_net import visualizer as pn_visualizer
log = None #defined here just for hide lint warnings


# Process mining

## Read log

In [2]:
%%capture
# Load the XES event log from disk; later cells index it by column name.
log = pm4py.read_xes("data.xes")
log

### Resource
Xes formátumban `org:resource` oszlop azonosítja.\
Azonosítja, hogy ki végezte az adott eventet (pl.: operators1).

In [4]:
# Distinct values of the `org:resource` event attribute.
resources = [*pm4py.get_event_attribute_values(log, 'org:resource')]
resources

['operators2', 'operators4', 'operators3', 'operators1', 'operators5']

### Activities
Xes formátumban `concept:name` oszlop azonosítja.\
Az activity egy bizonyos típusú akciót azonosít (pl.: email küldése).\
Egy aktivitás előfordulását pedig eventnek nevezünk.

In [5]:
# Distinct values of the `concept:name` event attribute.
activities = [*pm4py.get_event_attribute_values(log, 'concept:name')]
activities

['P11', 'P1', 'P8', 'P9', 'P4', 'P10', 'P2', 'P3', 'P5', 'P7', 'P6']

In [None]:
# Names of the activities that open at least one trace.
start_activities = [*pm4py.get_start_activities(log)]
start_activities

In [None]:
# Names of the activities that close at least one trace.
end_activities = [*pm4py.get_end_activities(log)]
end_activities

### Case/Trace/Sequence
Xes formátumban `case:concept:name` oszlop azonosítja.\
Több eventből áll.

In [None]:
%%capture
traces = sorted(list(set(log['case:concept:name'].astype(int))))
traces

## Filtering

### Time
Log szűrése idő alapján.

In [None]:
# Bounds of the time window passed to pm4py.filter_time_range in the cells below.
start_date = '2000-01-10 06:00:00'
end_date = '2000-01-10 14:00:00'

#### Traces contained
A kezdő és vég időpontnak is bele kell esnie a megadott intervallumba.

In [None]:
%%capture
filtered_log = pm4py.filter_time_range(log,start_date,end_date,mode='traces_contained', case_id_key='case:concept:name', timestamp_key='time:timestamp')
traces_in = sorted(list(set(filtered_log['case:concept:name'].astype(int))))
traces_in

#### Traces intersecting
Megtartja a trace-t, ha akár a kezdő, akár a végidőpontja beleesik a megadott intervallumba.

In [None]:
%%capture
filtered_log = pm4py.filter_time_range(log,start_date,end_date,mode='traces_intersecting', case_id_key='case:concept:name', timestamp_key='time:timestamp')
traces_in = sorted(list(set(filtered_log['case:concept:name'].astype(int))))
traces_in

### Activities and events

#### Filter start activities
Megtartja azon traceket amelyek a megadott activity-vel kezdődtek (lista, akár több is lehet).

In [None]:
%%capture
# Filter on a one-element list holding the first entry of start_activities.
start_activity = [start_activities[0]]
filtered_log_start = pm4py.filter_start_activities(log, start_activity)
filtered_log_start

#### Filter end activities
Megtartja azon traceket amelyek a megadott activity-vel végződtek (lista, akár több is lehet).

In [None]:
%%capture
# Filter on a one-element list holding the first entry of end_activities.
end_activity = [end_activities[0]]
filtered_log_end = pm4py.filter_end_activities(log, end_activity)
filtered_log_end

#### Filter event attribute values
`retain` --->  meghatározza, hogy azt tartsa meg amire igaz a feltétel, vagy azt amire nem\
`level` --->  case vagy event\
Ha case szintű, akkor az egész case-t megtartja vagy eldobja amibe benne van a megadott resource.\
Ha event szintű, akkor csak azt az eventet tartja meg vagy dobja el, amelyben a megadott resource szerepel.

In [None]:
# Resource value list used by the four filter cells below: the last entry of `resources`.
filter_resource = [resources[-1]]

In [None]:
%%capture
filter_log_resource_case_in = pm4py.filter_event_attribute_values(log, 'org:resource', filter_resource, level='case', retain=True)
filter_log_resource_case_in

In [None]:
%%capture
filter_log_resource_event_in = pm4py.filter_event_attribute_values(log, 'org:resource', filter_resource, level='event', retain=True)
filter_log_resource_event_in

In [None]:
%%capture
filter_log_resource_case_out = pm4py.filter_event_attribute_values(log, 'org:resource', filter_resource,level='case', retain=False)
filter_log_resource_case_out

In [None]:
%%capture
filter_log_resource_event_out = pm4py.filter_event_attribute_values(log, 'org:resource', filter_resource,level='event', retain=False)
filter_log_resource_event_out

## Statistics

### Footprint
Tartalmazza:
- Directly follows
- Sequence
- Parallel
- Start activities
- End activities
- Activities
- Minimum trace length

In [None]:
# Footprint computed over the entire event log; reused by the directly-follows heatmap cell.
fp_log = footprints_discovery.apply(
    log,
    variant=footprints_discovery.Variants.ENTIRE_EVENT_LOG,
)
fp_log

### Sojourn time
Kumulatív számolás arra vonatkozóan, hogy mennyi időt töltött a folyamat az egyes activity-kben.

In [None]:
# Tell the sojourn-time statistic which columns hold the timestamp and the start timestamp.
parameters = {
    soj_time_get.Parameters.TIMESTAMP_KEY: 'time:timestamp',
    soj_time_get.Parameters.START_TIMESTAMP_KEY: 'start:timestamp',
}
soj_time = soj_time_get.apply(log, parameters=parameters)
soj_time

### Concurrent activities
Egymással párhuzamosan futó activity-k ábrázolása.

In [None]:
# Build a square activity-by-activity matrix of concurrency values for the heatmap.
parameters = {
    conc_act_get.Parameters.TIMESTAMP_KEY: 'time:timestamp',
    conc_act_get.Parameters.START_TIMESTAMP_KEY: 'start:timestamp',
}

# Result is keyed by pairs of activity names.
concurrent_activities = conc_act_get.apply(log, parameters=parameters)

# Axis labels: every known activity plus any name that occurs in a pair, so that
# no `.at` write below can silently enlarge the frame. Sorted by the number after 'P'.
paired_names = {name for pair in concurrent_activities for name in pair}
activities_uniques = sorted(
    paired_names.union(activities),
    key=lambda name: int(name.split('P')[1]),
)

# Start from a numeric zero matrix instead of an object-dtype frame of NaN:
# the heatmap needs numeric data, and relying on fillna() to downcast object
# columns is deprecated in recent pandas.
ccact_matrix = pd.DataFrame(0, index=activities_uniques, columns=activities_uniques)

for (i, j), value in concurrent_activities.items():
    ccact_matrix.at[i, j] = value

In [None]:
# Clear the current figure so earlier plots do not bleed into this one,
# then draw the concurrency matrix and save it as a PNG.
plt.clf()
sns.heatmap(ccact_matrix, cmap='Reds', robust=True)
plt.savefig('./statistics/concurrent_activities_matrix.png', format='png', dpi=300, bbox_inches='tight')

### Case distribution
Case-k időbeli eloszlásának ábrázolása.

In [None]:
# KDE of the case durations, plotted with the CASES graph variant and saved to disk.
parameters = {constants.PARAMETER_CONSTANT_TIMESTAMP_KEY: 'time:timestamp'}
x, y = case_statistics.get_kde_caseduration(log, parameters=parameters)
gviz = graphs_visualizer.apply_plot(
    x,
    y,
    variant=graphs_visualizer.Variants.CASES,
)
graphs_visualizer.save(gviz, './statistics/case_distribution.png')

### Attributes filter
Event-k előfordulásának eloszlása idő szerint, gráfon ábrázolva. 

In [None]:
# KDE of the `time:timestamp` attribute, plotted with the DATES graph variant.
x, y = attributes_filter.get_kde_date_attribute(log, attribute='time:timestamp')
gviz = graphs_visualizer.apply_plot(
    x,
    y,
    variant=graphs_visualizer.Variants.DATES,
)
graphs_visualizer.save(gviz, './statistics/attributes_filter.png')

### Dotted chart
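Minden eventet egy pontként ábrázol: itt az x tengelyen az activity (`concept:name`), az y tengelyen a resource (`org:resource`) szerepel, így látható, melyik resource melyik activity-t végezte.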

In [None]:
%%capture
# Save a dotted chart built from the activity name and resource attributes.
pm4py.save_vis_dotted_chart(log, './statistics/dotted_chart.png', attributes=['concept:name', 'org:resource'])

### Events distribution
Event-k eloszlása a beállított idő szerint.

In [None]:
# Granularity of the events-distribution histogram.
d_type = 'days_month' #options: hours, days_week, days_month, months, years
pm4py.save_vis_events_distribution_graph(log, './statistics/event_distribution.jpg', distr_type=d_type) # PNG output does not work here for some reason, hence .jpg

### Directly follows statistics

#### Heatmap

In [None]:
%%capture
df = dict(fp_log['dfg'])

activities = list(fp_log['activities'])
activities = sorted(activities, key=lambda x: int(x.split('P')[1]))
df_mtx = pd.DataFrame(columns=activities,index=activities)

for (i, j), value in df.items():
    df_mtx.at[i, j] = value

df_mtx = df_mtx.fillna(int(0))

plt.clf()
sns.heatmap(df_mtx, cmap='coolwarm', robust=True, annot=True, fmt='.0f')
plt.savefig('./statistics/directly_follows_heatmap.png', format='png', dpi=300, bbox_inches='tight')

#### Graph

In [None]:
%%capture
dfg = dfg_discovery.apply(log)
gviz = dfg_visualization.apply(dfg, log=log, variant=dfg_visualization.Variants.FREQUENCY)
dfg_visualization.save(gviz, './statistics/directly_follows_graph.png')

# Sequence mining

### Miners


#### Alpha

In [None]:
# Alpha miner: Petri net plus two markings (im, fm), saved as an image.
# NOTE(review): `im`/`fm` are reused by the inductive and heuristic discovery cells,
# so the alpha cells below must run before those overwrite them.
net_a, im, fm = pm4py.discover_petri_net_alpha(log)
pm4py.save_vis_petri_net(net_a, im, fm, './miners/alpha/petri_net_alpha.png')

In [None]:
# Footprint of the alpha-miner Petri net, rendered as PNG.
fp_net = footprints_discovery.apply(net_a, im, fm)
gviz = fp_visualizer.apply(
    fp_net,
    parameters={fp_visualizer.Variants.SINGLE.value.Parameters.FORMAT: 'png'},
)
fp_visualizer.save(gviz, './miners/alpha/alpha_footprint.png')

##### Frequency

In [None]:
%%capture
parameters = {
    pn_visualizer.Variants.FREQUENCY.value.Parameters.FORMAT:'png'
}

gviz = pn_visualizer.apply(net_a, im, fm, parameters=parameters, variant=pn_visualizer.Variants.FREQUENCY, log=log)
pn_visualizer.save(gviz, './miners/alpha/petri_net_frequency_alpha.png')

##### Performance
Átmenetek átlagos ideje

In [None]:
%%capture
parameters = {
    pn_visualizer.Variants.PERFORMANCE.value.Parameters.FORMAT:'png'

}
gviz = pn_visualizer.apply(net_a, im, fm, parameters=parameters, variant=pn_visualizer.Variants.PERFORMANCE, log=log)
pn_visualizer.save(gviz, './miners/alpha/petri_net_performance_alpha.png')

##### Evaluation

In [None]:
%%capture
# Four quality metrics for the alpha net; they feed the comparison heatmap later on.
# NOTE(review): uses the shared `im`/`fm` names - they must still hold the alpha
# miner's markings when this cell runs.
fitness_alpha = pm4py.fitness_token_based_replay(log, net_a, im, fm)
fit_alpha = fitness_alpha['log_fitness']
precision_alpha = pm4py.precision_token_based_replay(log, net_a, im, fm)
gen_alpha = generalization_evaluator.apply(log, net_a, im, fm)
simp_alpha = simplicity_evaluator.apply(net_a)

#### Inductive

In [None]:
# Inductive miner: Petri net plus two markings, saved as an image.
# NOTE(review): this overwrites the `im`/`fm` produced by the alpha discovery cell.
net_i, im, fm = pm4py.discover_petri_net_inductive(log)
pm4py.save_vis_petri_net(net_i, im, fm, './miners/inductive/petri_net_inductive.png')

##### Business Process Model and Notation (BPMN)
Workflow ábrázolása diagramon, amely gyakran tartalmaz:
- activity
- döntési pont
- nyilak (folyamat iránya)

In [None]:
# Discover a BPMN model with the inductive miner and save its rendering.
bpmn_graph = pm4py.discover_bpmn_inductive(
    log,
    activity_key='concept:name',
    case_id_key='case:concept:name',
    timestamp_key='time:timestamp',
)
gviz = pm4py.visualization.bpmn.visualizer.apply(bpmn_graph)
pm4py.visualization.bpmn.visualizer.save(gviz, './miners/inductive/BPMN.png')

##### Evaluation

In [None]:
%%capture
# Four quality metrics for the inductive net; they feed the comparison heatmap later on.
# NOTE(review): uses the shared `im`/`fm` names - they must still hold the inductive
# miner's markings when this cell runs.
fitness_ind = pm4py.fitness_token_based_replay(log, net_i, im, fm)
fit_ind = fitness_ind['log_fitness']
precision_ind = pm4py.precision_token_based_replay(log, net_i, im, fm)
gen_ind = generalization_evaluator.apply(log, net_i, im, fm)
simp_ind = simplicity_evaluator.apply(net_i)

#### Heuristic

In [None]:
# Heuristics miner (dependency threshold 0.7): Petri net plus two markings, saved as an image.
# NOTE(review): this overwrites the `im`/`fm` produced by the previous discovery cells.
net_h, im, fm = pm4py.discover_petri_net_heuristics(log, dependency_threshold=0.7)
pm4py.save_vis_petri_net(net_h, im, fm, './miners/heuristic/petri_net_heuristic.png')

In [None]:
# Heuristics net at the chosen dependency threshold; the threshold is part of the file name.
dth = 0.99
heu_net = pm4py.discover_heuristics_net(log, dependency_threshold=dth)
pm4py.save_vis_heuristics_net(heu_net, f'./miners/heuristic/heunet_dth_{dth}.png')

##### Minimum activity count
Kiveszi azokat az activity-ket, amelyek a határértéknél kevesebbszer szerepelnek.

In [None]:
# Heuristics net with a minimum activity count; the count is part of the file name.
mac = 800
heu_net_mac = heuristics_miner.apply_heu(
    log,
    parameters={heuristics_miner.Variants.CLASSIC.value.Parameters.MIN_ACT_COUNT: mac},
)
gviz = hn_visualizer.apply(heu_net_mac, parameters={'format': 'png'})
hn_visualizer.save(gviz, f'./miners/heuristic/heunet_mac_{mac}.png')

##### Minimum directly follows
Két aktivitás között minimum hány kapcsolatnak kell lennie, hogy belekerüljön a gráfba.

In [None]:
# Heuristics net with a minimum directly-follows occurrence count; the count is part of the file name.
mdfg = 200
heu_net_mdfg = heuristics_miner.apply_heu(
    log,
    parameters={heuristics_miner.Variants.CLASSIC.value.Parameters.MIN_DFG_OCCURRENCES: mdfg},
)
gviz = hn_visualizer.apply(heu_net_mdfg, parameters={'format': 'png'})
hn_visualizer.save(gviz, f'./miners/heuristic/heunet_mdfg_{mdfg}.png')

##### Evaluation

In [None]:
%%capture
# Four quality metrics for the heuristic net; they feed the comparison heatmap later on.
# NOTE(review): uses the shared `im`/`fm` names - they must still hold the heuristics
# miner's markings when this cell runs.
fitness_heu = pm4py.fitness_token_based_replay(log, net_h, im, fm)
fit_heu = fitness_heu['log_fitness']
precision_heu = pm4py.precision_token_based_replay(log, net_h, im, fm)
gen_heu = generalization_evaluator.apply(log, net_h, im, fm)
simp_heu = simplicity_evaluator.apply(net_h)

### Evaluation comparison

In [None]:
# Collect the four metrics of each miner (rounded to 3 decimals) into one float
# table - one column per miner, one row per metric - and save it as a heatmap.
metric_names = ['fitness', 'precision', 'generalization', 'simplicity']
raw_scores = {
    'Alpha': (fit_alpha, precision_alpha, gen_alpha, simp_alpha),
    'Inductive': (fit_ind, precision_ind, gen_ind, simp_ind),
    'Heuristic': (fit_heu, precision_heu, gen_heu, simp_heu),
}

ev_comp = pd.DataFrame(
    {miner: [round(score, 3) for score in scores] for miner, scores in raw_scores.items()},
    index=metric_names,
    dtype=float,
)

plt.clf()
sns.heatmap(ev_comp, cmap='Greens', robust=True, annot=True, fmt='.2f')
plt.savefig('./miners/evaluation_comp.png', format='png', dpi=300, bbox_inches='tight')

### Frequent itemset and sequential pattern mining

In [None]:
# Recode the activity names as integers (1-based, in the order given by heu_net.activities).
cnames = heu_net.activities
codes = list(range(1, len(cnames) + 1))

name_code_dict = dict(zip(cnames, codes))
code_name_dict = dict(zip(codes, cnames))

# Work on an explicit copy so that adding a column can never write into `log`.
filt_log = log.loc[:, ['case:concept:name', 'concept:name']].copy()

# Look the code up row by row in positional order. Unlike `.at[i, ...]` over
# range(len(...)), this does not depend on the frame having a 0..n-1 index.
# An activity name without a code still raises KeyError.
filt_log['Code'] = [name_code_dict[name] for name in filt_log['concept:name']]

In [None]:
# Extract one sequence of activity codes per trace.
# Case ids in order of first appearance, without duplicates.
traces = list(dict.fromkeys(log['case:concept:name'].values))

# Group once instead of re-filtering the whole frame for every trace.
# sort=False keeps the groups in first-appearance order.
codes_by_case = filt_log.groupby('case:concept:name', sort=False)['Code'].agg(list)

# Look up by the string form of the case id; an id with no matching rows
# yields an empty sequence.
sequences = [codes_by_case.get(str(trace), []) for trace in traces]

In [None]:
'''Save the trace list'''
# One CSV row per trace; newline='' leaves line endings to the csv module.
with open('sequences.csv', 'w', newline='') as csv_file:
    csv.writer(csv_file).writerows(sequences)

In [None]:
# Convert the CSV sequence list to the SPMF input format.
filename_in_spmf = 'sequences.csv'
filename_out_spmf = 'sequences_spmf.csv'
# Argument list instead of a shell string: no quoting problems with file names.
command = [
    'java', '-jar', 'spmf.jar', 'run',
    'Convert_a_sequence_database_to_SPMF_format',
    filename_in_spmf, filename_out_spmf,
    'CSV_INTEGER', '100000',
]
# check=True raises if the java process exits with a non-zero status, instead of
# silently continuing with a missing output file as os.system() did.
subprocess.run(command, check=True)

In [None]:
# Frequent itemset mining (SPMF Apriori).
minsup_fi = 0.02
filename_in_fi = 'sequences_spmf.csv'
# round() rather than int(): int() truncates when minsup*100 lands just below a whole number.
filename_out_fi = f'fis_Apr_{round(minsup_fi * 100)}.csv'
# Argument list instead of a shell string; every argument must be a string.
command = [
    'java', '-jar', 'spmf.jar', 'run', 'Apriori',
    filename_in_fi, filename_out_fi, str(minsup_fi),
]
# check=True raises if the java process exits with a non-zero status.
subprocess.run(command, check=True)

In [None]:
# Frequent closed itemset mining (SPMF AprioriClose).
minsup_fci = 0.02
filename_in_fci = 'sequences_spmf.csv'
# round() rather than int(): int() truncates when minsup*100 lands just below a whole number.
filename_out_fci = f'fis_AprC_{round(minsup_fci * 100)}.csv'
# Argument list instead of a shell string; every argument must be a string.
command = [
    'java', '-jar', 'spmf.jar', 'run', 'AprioriClose',
    filename_in_fci, filename_out_fci, str(minsup_fci),
]
# check=True raises if the java process exits with a non-zero status.
subprocess.run(command, check=True)

In [None]:
# Frequent sequential pattern mining (SPMF PrefixSpan).
minsup_fsp = 0.2
filename_in_fsp = 'sequences_spmf.csv'
# round() rather than int(): int() truncates when minsup*100 lands just below a whole number.
filename_out_fsp = f'fsp_PrefixSpan_{round(minsup_fsp * 100)}.csv'
# Argument list instead of a shell string; every argument must be a string.
command = [
    'java', '-jar', 'spmf.jar', 'run', 'PrefixSpan',
    filename_in_fsp, filename_out_fsp, str(minsup_fsp),
]
# check=True raises if the java process exits with a non-zero status.
subprocess.run(command, check=True)


In [None]:
# Frequent sequential pattern mining with pattern-length and gap settings (SPMF CM-SPAM).
minsup_fsp = 0.2
min_pat_l = 2
max_pat_l = 4
gap = 2
filename_in_fsp = 'sequences_spmf.csv'
# round() rather than int(): int() truncates when minsup*100 lands just below a whole number.
filename_out_fsp = (
    f'fsp_CMSPAN_{round(minsup_fsp * 100)}'
    f'_minp_{min_pat_l}_maxp_{max_pat_l}_gap_{gap}.csv'
)
# Argument list instead of a shell string; every argument must be a string.
# The '' entry is the empty argument that the shell command passed as "".
command = [
    'java', '-jar', 'spmf.jar', 'run', 'CM-SPAM',
    filename_in_fsp, filename_out_fsp,
    str(minsup_fsp), str(min_pat_l), str(max_pat_l),
    '', str(gap), 'true',
]
# check=True raises if the java process exits with a non-zero status.
subprocess.run(command, check=True)