In [1]:
import os
from collections import Counter

In [104]:
from pm4py.algo.discovery.dfg import factory as dfg_factory
from pm4py.objects.log.adapters.pandas import csv_import_adapter
from pm4py.algo.discovery.dfg.adapters.pandas import df_statistics
from pm4py import util as pmutil

from pandas import read_csv, concat
# Load the sepsis event log from files/input_data/sepsis.csv into a DataFrame.
file = read_csv(os.path.join("files","input_data","sepsis.csv"))

# This is needed because the sepsis dataset has some rows where case_id = NaN
# Every such row receives the same literal "N/A", so they all share one case id.
fixed_file = file.fillna(value={'case_id':"N/A"})

# Run pm4py's timestamp conversion on the 'timestamp' column.
# NOTE(review): presumably this parses the column into datetimes and may modify
# `fixed_file` in place as well as returning it -- confirm against the pm4py version in use.
log = csv_import_adapter.convert_timestamp_columns_in_df(fixed_file, timest_columns=[
    'timestamp'
])


In [105]:
def get_duplicate_edges_and_sources_and_sinks(log):
    """Summarise per-case edge repetitions and the first/last activity of every case.

    Rows are grouped by ``case_id`` and read in the order they appear in ``log``
    (no sorting by timestamp happens here).

    Args:
        log: DataFrame with at least the columns ``case_id`` and ``activity``.

    Returns:
        A 3-tuple of ``Counter`` objects:
        * repeated edges: for each ``(activity, next_activity)`` pair, how many
          occurrences beyond the first it has inside a single case, summed over
          all cases;
        * sources: how many cases start with each activity;
        * sinks: how many cases end with each activity.
    """
    repeated_edges = Counter()
    start_counts = Counter()
    end_counts = Counter()

    for _, case_events in log.groupby('case_id'):
        # Materialise the case's activity sequence once instead of re-indexing per step.
        sequence = case_events['activity'].tolist()
        start_counts[sequence[0]] += 1
        end_counts[sequence[-1]] += 1

        # Consecutive pairs form the directly-follows edges of this case.
        edge_counts = Counter(zip(sequence, sequence[1:]))
        # Keep only the surplus occurrences (count - 1) of edges seen more than once.
        repeated_edges += Counter({
            edge: seen - 1 for edge, seen in edge_counts.items() if seen > 1
        })

    return repeated_edges, start_counts, end_counts

In [108]:
from ntnu_process_mining import ProcessMap
import datetime
from pandas import concat

# One group per case: all events that share a case_id.
cases = log.groupby('case_id')
num_cases = len(cases)

# Collapse each case's activity sequence into a single "variant" key and group
# the cases by that key, so activities.size() gives the number of cases per variant.
# The sequence is joined with the ASCII unit separator rather than the empty
# string: without a separator, different sequences can produce the same key
# (e.g. ["AB", "C"] and ["A", "BC"] both become "ABC") and would be merged
# into one variant.
activities = cases.agg({'activity': lambda x: '\x1f'.join(x)}).groupby('activity')

# Widget instance that later code fills through `process_map.value`.
process_map = ProcessMap()

def _format_seconds(seconds):
    """Round a duration in seconds to whole seconds and render it via ``str(datetime.timedelta)``."""
    return str(datetime.timedelta(seconds=round(seconds, 0)))


def _make_edge(src, dst, freq, abs_freq, perf="0", perf_med="0"):
    """Build one edge record in the dict layout assigned to ``process_map.value``."""
    return {
        "from": src,
        "to": dst,
        "freq": freq,
        "abs_freq": abs_freq,
        "perf": perf,
        "perf_med": perf_med
    }


def on_filter_change(change=None):
    """Recompute the process-map edges for the current ``process_map.filter``.

    The filter is read as a percentage. Trace variants are taken from most to
    least frequent until their cumulative case count exceeds that percentage
    of all cases (so at least one variant is kept whenever any exist). The
    directly-follows graph of the retained cases is then computed, and the
    resulting edge list is assigned to ``process_map.value``.

    Args:
        change: Ignored. It is accepted so the function can be registered as
            an observer callback and also be called with no arguments.
            Defaults to ``None`` instead of a shared mutable ``{}``.

    Each edge is a dict with the keys ``from``, ``to``, ``freq``, ``abs_freq``,
    ``perf`` and ``perf_med``. Synthetic ``START``/``END`` edges are emitted for
    the first/last activity of every retained case with ``perf`` and
    ``perf_med`` set to ``"0"``.
    """
    filter_value = process_map.filter
    percentile = filter_value / 100

    # Variants ordered from most to least frequent.
    sorted_activities = activities.size().sort_values(ascending=False)

    traces_to_include = []
    cumulative = 0
    for trace, count in sorted_activities.items():
        traces_to_include.append(trace)
        cumulative += count
        # Stop once the selected variants cover more than the requested share of cases.
        if cumulative > num_cases * percentile:
            break

    # Case ids belonging to the selected variants (the index of each variant group).
    case_ids = concat([activities.get_group(trace).index.to_series() for trace in traces_to_include])

    filtered_log = log[log['case_id'].isin(case_ids)]

    # Column names shared by both pm4py DFG computations below.
    dfg_columns = dict(activity_key='activity',
                       timestamp_key='timestamp',
                       case_id_glue='case_id')

    dfg_frequency, dfg_performance = df_statistics.get_dfg_graph(filtered_log, measure="both",
                                                                 **dfg_columns)
    dfg_performance_med = df_statistics.get_dfg_graph(filtered_log, measure="performance",
                                                      perf_aggregation_key="median",
                                                      **dfg_columns)

    duplicate_edges, sources, sinks = get_duplicate_edges_and_sources_and_sinks(filtered_log)

    # Edge frequency with within-case repetitions removed. Counter subtraction
    # drops non-positive entries, and missing keys read back as 0.
    # NOTE(review): assumes dfg_frequency counts every occurrence of an edge -- confirm.
    abs_dfg = Counter(dfg_frequency) - duplicate_edges

    pm_edges = []

    for dst, freq in sources.items():
        pm_edges.append(_make_edge("START", dst, freq, freq))

    for (src, dst), freq in dfg_frequency.items():
        pm_edges.append(_make_edge(
            src,
            dst,
            freq,
            abs_dfg[(src, dst)],
            perf=_format_seconds(dfg_performance[(src, dst)]),
            perf_med=_format_seconds(dfg_performance_med[(src, dst)]),
        ))

    for src, freq in sinks.items():
        pm_edges.append(_make_edge(src, "END", freq, freq))

    process_map.value=pm_edges

# Register on_filter_change as the callback for changes to the widget's `filter` attribute.
process_map.observe(on_filter_change, names='filter')
# Initial filter value; on_filter_change divides it by 100 and uses it as a share of cases.
process_map.filter=10
# Explicit call so the edges are computed at least once.
# NOTE(review): if the assignment above already triggered the observer, this
# computes the same result a second time -- confirm ProcessMap's default filter.
on_filter_change()
# Last expression of the cell: shown as the cell output.
process_map

ProcessMap(filter=10, value=[{'from': 'START', 'to': 'ER Registration', 'freq': 114, 'abs_freq': 114, 'perf': …

In [3]:
import os
from ntnu_process_mining import ProcessMap

from pandas import read_csv, concat

# Load the sepsis event log from files/input_data/sepsis.csv into a DataFrame.
file = read_csv(os.path.join("files","input_data","sepsis.csv"))
# This is needed because the sepsis dataset has some rows where case_id = NaN
# Every such row receives the same literal "N/A", so they all share one case id.
df = file.fillna(value={'case_id':"N/A"})

# Construct the widget directly from the DataFrame; as the cell's last expression it is displayed.
# NOTE(review): no timestamp conversion is applied to `df` here -- presumably
# ProcessMap handles parsing itself; confirm.
ProcessMap(df)


ProcessMap(value=[{'from': 'START', 'to': 'ER Registration', 'freq': 264, 'abs_freq': 264, 'perf': '0', 'perf_…