In [1]:
import os
from collections import Counter

In [2]:
from pm4py.algo.discovery.dfg import factory as dfg_factory
from pm4py.objects.log.adapters.pandas import csv_import_adapter
from pm4py.algo.discovery.dfg.adapters.pandas import df_statistics
from pm4py import util as pmutil

from pandas import read_csv, concat
# Read the sepsis event log from its CSV export (relative path).
file = read_csv(
    os.path.join("files", "input_data", "sepsis.csv")
)

# Hand the frame to pm4py so the 'timestamp' column is converted; the
# returned frame is what later cells refer to as `log`.
# NOTE(review): the adapter may convert `file` in place as well - confirm.
log = csv_import_adapter.convert_timestamp_columns_in_df(
    file,
    timest_columns=['timestamp'],
)

In [3]:
# Display the loaded event log as this cell's output.
log

Unnamed: 0,case_id,activity,lifecycle,resource,timestamp,age,crp,diagnose,diagnosticartastrup,diagnosticblood,...,lacticacid,leucocytes,oligurie,sirscritheartrate,sirscritleucos,sirscrittachypnea,sirscrittemperature,sirscriteria2ormore,activity_instance_id,.order
0,A,ER Registration,complete,A,2014-10-22 11:15:41+00:00,85.0,,A,True,True,...,,,False,True,False,True,True,True,1,1
1,A,Leucocytes,complete,B,2014-10-22 11:27:00+00:00,,,,,,...,,9.6,,,,,,,2,2
2,A,CRP,complete,B,2014-10-22 11:27:00+00:00,,210.0,,,,...,,,,,,,,,3,3
3,A,LacticAcid,complete,B,2014-10-22 11:27:00+00:00,,,,,,...,2.2,,,,,,,,4,4
4,A,ER Triage,complete,C,2014-10-22 11:33:37+00:00,,,,,,...,,,,,,,,,5,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15209,KNA,CRP,complete,B,2014-12-16 07:00:00+00:00,,660.0,,,,...,,,,,,,,,15210,15210
15210,KNA,Release A,complete,E,2014-12-16 17:00:00+00:00,,,,,,...,,,,,,,,,15211,15211
15211,LNA,ER Registration,complete,L,2014-12-03 10:50:28+00:00,50.0,,,False,False,...,,,False,False,False,False,False,False,15212,15212
15212,LNA,ER Triage,complete,C,2014-12-03 10:54:19+00:00,,,,,,...,,,,,,,,,15213,15213


In [4]:
from pandas import DataFrame

def get_duplicate_edges(log, case_key='case_id', activity_key='activity'):
    """Count repeated directly-follows pairs within each case.

    For every case, consecutive rows (in the row order of ``log``) form
    ``(activity, next_activity)`` pairs. Each occurrence of a pair beyond its
    first occurrence *within the same case* counts as one duplicate. The
    per-case duplicate counts are summed over all cases.

    Args:
        log: DataFrame holding one event per row.
        case_key: Name of the column identifying the case (default
            ``'case_id'``).
        activity_key: Name of the column holding the activity label (default
            ``'activity'``).

    Returns:
        collections.Counter mapping ``(activity, next_activity)`` to the total
        number of duplicate occurrences. Pairs that never repeat inside a
        case are absent.
    """
    duplicate_edge_counter = Counter()
    for _, case_activities in log.groupby(case_key)[activity_key]:
        # Materialise the trace once instead of indexing the Series per row.
        trace = case_activities.tolist()
        pair_counts = Counter(zip(trace, trace[1:]))
        # Subtract one occurrence per distinct pair; Counter subtraction
        # drops entries whose count falls to zero, leaving only the repeats.
        duplicate_edge_counter += pair_counts - Counter(pair_counts.keys())

    return duplicate_edge_counter

In [5]:

# Collect, in row order, the first and the last activity seen for every case.
#   source_cases / source_list: case ids in order of first appearance and the
#                               activity of each case's first row.
#   source_dict: how many cases start with each activity.
#   sink_dict:   how many cases end with each activity.
source_cases = []
source_list = []
last_activity_by_case = {}
# Iterate the two columns in lockstep: this is positional, so it does not
# depend on the frame having a default 0..n-1 index.
for case_id, activity in zip(log['case_id'], log['activity']):
    # Dict membership is O(1); the dict doubles as the "already seen" set.
    if case_id not in last_activity_by_case:
        source_cases.append(case_id)
        source_list.append(activity)
    # Overwritten on every row, so the value left behind is the last activity.
    last_activity_by_case[case_id] = activity

source_dict = Counter(source_list)
sink_dict = Counter(last_activity_by_case.values())


In [6]:
from ntnu_process_mining import ProcessMap


# Group cases by variant, i.e. by their full activity sequence.
# The inner groupby collapses each case into a single string key; the outer
# groupby then groups the case ids that share the same key.
# The activity names are joined with the ASCII unit separator so that two
# different sequences cannot collapse into the same key (with an empty
# separator, ["AB", "C"] and ["A", "BC"] would both become "ABC").
activities = (
    file.groupby('case_id')
    .agg({'activity': lambda x: '\x1f'.join(x)})
    .groupby('activity')
)

# Widget that renders the edge list assigned to `process_map.value`.
process_map = ProcessMap()

def on_filter_change(change=None):
    """Rebuild the process-map edge list for the current filter value.

    Keeps only the cases whose variant group in ``activities`` contains more
    than ``process_map.filter`` cases, recomputes the directly-follows graph
    on that subset of ``log`` and stores the resulting list of edge dicts in
    ``process_map.value``.

    Args:
        change: Change notification supplied when the function is invoked as
            an observer. It is not read; the filter value is taken from
            ``process_map`` directly. Defaults to ``None`` so the function
            can also be called without arguments.
    """
    filter_value = process_map.filter

    # `activities` is indexed by case id, so the index of the filtered frame
    # is the set of case ids belonging to sufficiently frequent variants.
    case_ids = activities.filter(lambda x: len(x) > filter_value).index

    filtered_log = log[log['case_id'].isin(case_ids)]
    dfg_frequency, dfg_performance = df_statistics.get_dfg_graph(
        filtered_log,
        measure="both",
        activity_key='activity',
        timestamp_key='timestamp',
        case_id_glue='case_id',
    )

    # Edge frequency with repeats inside a single case removed.
    # NOTE(review): assumes pm4py pairs events in the same row order that
    # get_duplicate_edges uses - confirm.
    abs_dfg = Counter(dfg_frequency) - get_duplicate_edges(filtered_log)

    pm_edges = []
    sources = set()
    destinations = set()
    for (src, dst), weight in dfg_frequency.items():
        pm_edges.append({
            "from": src,
            "to": dst,
            "freq": weight,
            # A Counter yields 0 for keys dropped by the subtraction above.
            "abs_freq": abs_dfg[(src, dst)],
            # presumably seconds converted to minutes - TODO confirm the unit
            "perf": round(dfg_performance[(src, dst)]/60, 0),
        })
        sources.add(src)
        destinations.add(dst)

    # Attach the artificial SOURCE / SINK nodes to start / end activities
    # (taken from the notebook-level source_dict / sink_dict) that still
    # appear in the filtered graph.
    for key in source_dict:
        if key in sources:
            pm_edges.append({"from": "SOURCE", "to": key, "value": ""})
    for key in sink_dict:
        if key in destinations:
            pm_edges.append({"from": key, "to": "SINK", "value": ""})

    process_map.value = pm_edges

# Register on_filter_change as the observer for the widget's 'filter' attribute.
process_map.observe(on_filter_change, names='filter')

# Build the edge list once up front, then show the widget as the cell output.
on_filter_change()
process_map

ProcessMap(value=[{'from': 'ER Registration', 'to': 'ER Triage', 'freq': 35, 'abs_freq': 35, 'perf': 16.0}, {'…