## Notebook to build an enriched case log from an event log (my version)

Motivated and parts copied from a fresh approach to analyze process outcomes ... todo: ref

In [58]:
import pandas as pd
import pm4py

## Data loading

In [59]:
#INPUT_FILE_NAME = 'berlin-preprocessed-gesetzgebung-2006-2020_with_context_passed_bills.xes'
#OUTPUT_FILE_NAME = 'berlin-preprocessed-gesetzgebung-2006-2020_with_context_passed_bills.csv'

#INPUT_FILE_NAME = 'brandenburg-preprocessed-gesetzgebung-2006-2020_processed_with_context_passed_bills.xes'
#OUTPUT_FILE_NAME = 'brandenburg-preprocessed-gesetzgebung-2006-2020_processed_with_context_passed_bills.csv'

INPUT_FILE_NAME = 'baden-württemberg-preprocessed-gesetzgebung-gesetz-2006-2020_processed_with_context_passed_bills.xes'
OUTPUT_FILE_NAME = 'baden-württemberg-preprocessed-gesetzgebung-gesetz-2006-2020_processed_with_context_passed_bills.csv'

CASE_ID_COL = 'case:concept:name'
TIMESTAMP_COL = 'time:timestamp'
ACTIVITY_COL = 'concept:name'

df = pm4py.read_xes(INPUT_FILE_NAME)
df = df.sort_values(TIMESTAMP_COL, ignore_index=True)
print(len(df), 'rows read from', INPUT_FILE_NAME)
#df.info()
#df.head()

parsing log, completed traces ::   0%|          | 0/312 [00:00<?, ?it/s]

2603 rows read from baden-württemberg-preprocessed-gesetzgebung-gesetz-2006-2020_processed_with_context_passed_bills.xes


### Remove columns we do not care about

In [60]:
# remove columns that are not of interest at all
columns_to_drop = ['DBID', 'ReihNr', 'DokArt', 'DokArtL', 'lifecycle:transition', 
                   'Titel', 'DokNr', 'Sb', 'LokURL', 'HNr', 'Jg', 'NrInTyp', 
                   'DHerk', 'DHerkL', 'DokTyp', '@@index', 'DokDat', 'VkDat',
                   'case:variant', 'case:variant-index', 'case:creator',
                    'case:VSys',
                    'case:VTyp', 'case:VTypL',
                   'case:DHerk', 'case:DHerkL', 'case:Wp', 'case:NrInTyp',
                   'case:@@case_index']
df = df.drop(columns=columns_to_drop, errors='ignore')

# Case attributes

In [61]:
# Configure the number of attributes to be included
INCLUDE_DATA = False
INCLUDE_TIME = True
INCLUDE_DELAYS = True
INCLUDE_DATA_NOT_CONNECTED_TO_ACTIVITIES = True

In [62]:
case_index = {}
activity_list = []
log_starting_time = df.at[0, TIMESTAMP_COL] # this will be used to calculate the relative time
columns = df.columns.to_list()

for col in [CASE_ID_COL, ACTIVITY_COL, TIMESTAMP_COL]:
    columns.remove(col)
    
for index, row in df.iterrows():
    case_id = row[CASE_ID_COL]
    activity_name = row[ACTIVITY_COL]
    
    if not activity_name in activity_list:
        activity_list.append(activity_name)
        
    qualifier = activity_name + "."
    
    if not case_id in case_index:
        case_record = {'case_id': case_id, qualifier+'count': 1, 'event_count': 1}
        
        # add time related features
        if INCLUDE_TIME:
            case_record['start_time'] = row[TIMESTAMP_COL]
            case_record['start_time_rel'] = int((row[TIMESTAMP_COL]-log_starting_time).components.days)
            case_record['duration'] = 0
            case_record[qualifier + 'start'] = 0
        
        
        # add data related features
        for col in columns:
            if not pd.isna(row[col]):
                if INCLUDE_DATA:
                    case_record[qualifier + col] = row[col]
        
        for col in columns:
            if not pd.isna(row[col]):
                if INCLUDE_DATA_NOT_CONNECTED_TO_ACTIVITIES:
                    if (col.startswith('case:')):
                        print(col)
                        case_record[col] = row[col]
        
        # store the information so that it can be updated further 
        # when more events of the same case occur (else part)
        case_index[case_id] = case_record
    
    else:
        case_record = case_index[case_id]
        case_record['event_count'] += 1
        
        # if this activity type has already been seen for this case
        # note that it is the n-th occurrence of this activity type
        # and adjust the qualifier accordingly
        # else set the count to 1
        if qualifier + "start" in case_record:
            activity_count = case_record[qualifier + 'count'] + 1
            case_record[qualifier + 'count'] = activity_count
            qualifier = activity_name + ":" + str(activity_count) + "."
        else:
            case_record[qualifier + 'count'] = 1
        
        if INCLUDE_TIME:
            duration = row[TIMESTAMP_COL] - case_record['start_time']
            case_record[qualifier + 'start'] = int(duration.components.days)
            case_record['duration'] = int(duration.components.days)
        
        
        for col in columns:
            if not pd.isna(row[col]):
                if INCLUDE_DATA:
                    case_record[qualifier + col] = row[col]
        

out_df = pd.DataFrame([case_index[case_id] for case_id in case_index])

# fill nan counts with 0
for col in out_df.columns:
    if col.endswith('.count'):
        out_df[col] = out_df[col].fillna(0)
        
if INCLUDE_DELAYS: 
    first_activities = activity_list.copy()
    while len(first_activities) > 0:
        first_act = first_activities.pop()
        second_activities = first_activities.copy()
        while len(second_activities) > 0:
            second_act = second_activities.pop()
            column_name = first_act + ':' + second_act + '.delay'
            out_df[column_name] = out_df[second_act + '.start'] - out_df[first_act + '.start']

out_df.info(verbose=True)


case:DokTypLFirstDoc
case:VSysL
case:VorgangsDeskriptoren
case:start_month
case:start_weekday
case:author_first_activity
case:author_first_activity_count
case:WIP_during_start
case:pdf_bytes
case:pdf_word_count
case:squire
case:salary
case:staff
case:commDays
case:plenDays
case:af_score
case:is_passed_bill
case:is_election_year
case:yearly_frequency
case:yearly_cycle_time
case:yearly_arrival_rate
case:yearly_variants
case:author_first_activity_christlich-demokratische union (cdu)
case:author_first_activity_freie demokratische partei / demokratische volkspartei (fdp/dvp)
case:author_first_activity_landesregierung
case:author_first_activity_sozialdemokratische partei deutschlands (spd)
case:author_first_activity_nan
case:author_first_activity_ausschuss
case:author_first_activity_präsident des landtags
case:author_first_activity_bündnis 90/die grünen (grüne)
case:author_first_activity_mack, winfried (cdu)
case:author_first_activity_untersteller, franz (grüne)
case:author_first_activity_kü

  out_df[column_name] = out_df[second_act + '.start'] - out_df[first_act + '.start']
  out_df[column_name] = out_df[second_act + '.start'] - out_df[first_act + '.start']
  out_df[column_name] = out_df[second_act + '.start'] - out_df[first_act + '.start']
  out_df[column_name] = out_df[second_act + '.start'] - out_df[first_act + '.start']
  out_df[column_name] = out_df[second_act + '.start'] - out_df[first_act + '.start']
  out_df[column_name] = out_df[second_act + '.start'] - out_df[first_act + '.start']
  out_df[column_name] = out_df[second_act + '.start'] - out_df[first_act + '.start']
  out_df[column_name] = out_df[second_act + '.start'] - out_df[first_act + '.start']
  out_df[column_name] = out_df[second_act + '.start'] - out_df[first_act + '.start']
  out_df[column_name] = out_df[second_act + '.start'] - out_df[first_act + '.start']
  out_df[column_name] = out_df[second_act + '.start'] - out_df[first_act + '.start']
  out_df[column_name] = out_df[second_act + '.start'] - out_df[fi

## Write output

In [63]:
output_file_name = 'cases_' + OUTPUT_FILE_NAME
out_df.to_csv(output_file_name, index=False)
print(len(out_df), 'cases written to', output_file_name)

312 cases written to cases_baden-württemberg-preprocessed-gesetzgebung-gesetz-2006-2020_processed_with_context_passed_bills.csv
