## Pre-processing to make data more comparable

#### Load the event log (Brandenburg or Baden-Württemberg)

In [None]:
import pm4py
import pandas as pd
import ast  # To safely evaluate string representations of lists

ACTIVITY_COLUMN = 'concept:name'

# Base file name (without extension) of the pre-processed log for each state.
DATASETS = {
    "baden-württemberg": "baden-württemberg-preprocessed-gesetzgebung-gesetz-2006-2020",
    "brandenburg": "brandenburg-preprocessed-gesetzgebung-2006-2020",
}

# Single switch for the state to process. The file names and the state flags
# below are all derived from it, so they cannot get out of sync when switching.
STATE = "baden-württemberg"

INPUT_FILENAME = f"{DATASETS[STATE]}.xes"
OUTPUT_FILENAME = f"{DATASETS[STATE]}_processed.xes"

# State-specific pre-processing steps further down are guarded by these flags.
isBrandenburg = STATE == "brandenburg"
isBawue = STATE == "baden-württemberg"


# Load the XES event log; the rest of the notebook treats `df` as a pandas DataFrame.
df = pm4py.read_xes(INPUT_FILENAME)

parsing log, completed traces ::   0%|          | 0/1005 [00:00<?, ?it/s]

Next, split every event whose activity name is actually a list of several names into one event per name. For example, an event labelled `[Lesung, Sitzung]` is copied so that one copy is named "Lesung" and the other "Sitzung".

In [19]:
# Function to expand rows with list-like activity strings
def expand_activity_rows(df, activity_column=None):
    """Split events whose activity name is a stringified list into one event per name.

    A value such as "['Lesung', 'Sitzung']" in the activity column produces two
    rows that are identical except for the activity name ('Lesung' and
    'Sitzung'). Values that are not the string representation of a Python list
    are left untouched. A value that parses to an empty list ("[]") produces no
    row at all.

    Parameters
    ----------
    df : pandas.DataFrame
        Event log with one row per event.
    activity_column : str, optional
        Name of the column holding the activity name. Defaults to the
        module-level ACTIVITY_COLUMN.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns and column dtypes (apart from the
        activity column itself). Rows created from the same original event
        share that event's index label, so index labels may repeat.
    """
    if activity_column is None:
        activity_column = ACTIVITY_COLUMN

    def _activity_names(value):
        # ast.literal_eval only evaluates literals, so it is safe on arbitrary
        # strings. Anything it cannot parse (plain names such as "Lesung",
        # non-string values, unhashable dict/set members) is kept as-is.
        try:
            parsed = ast.literal_eval(value)
        except (ValueError, SyntaxError, TypeError):
            return [value]
        # Only real lists are split; other literals keep their original value.
        return parsed if isinstance(parsed, list) else [value]

    names_per_row = [_activity_names(value) for value in df[activity_column]]

    # Repeat each row position once per activity name. Selecting by position
    # keeps all other columns and their dtypes intact, and still returns the
    # full set of columns when the input frame is empty.
    positions = [pos for pos, names in enumerate(names_per_row) for _ in names]
    flat_names = [name for names in names_per_row for name in names]

    expanded = df.iloc[positions].copy()
    expanded[activity_column] = flat_names
    return expanded

# Replace the event log with its expanded version (one row per single activity name)
df = df.pipe(expand_activity_rows)


### Brandenburg: enumerate readings (Lesung)

In [20]:
# In the Brandenburg data the readings are not numbered - they are all just called "Lesung".
# To make the comparison with the other log easier, we number them per case.

# Function to enumerate "Lesung" activities
def modify_labels(group, activity_column=None):
    """Rename the "Lesung" events of one case to "1. Lesung", "2. Lesung", ...

    Numbers are assigned in the row order of `group`.
    NOTE(review): this presumably expects the events of a case to be in
    chronological order already - confirm against the loaded log.

    Parameters
    ----------
    group : pandas.DataFrame
        The events of a single case.
    activity_column : str, optional
        Name of the column holding the activity name. Defaults to the
        module-level ACTIVITY_COLUMN.

    Returns
    -------
    pandas.DataFrame
        `group` itself when it contains no "Lesung" event, otherwise a
        relabelled copy. The frame passed in is never modified.
    """
    if activity_column is None:
        activity_column = ACTIVITY_COLUMN

    # Identify rows where the activity is exactly "Lesung"
    lesung_mask = group[activity_column] == 'Lesung'
    lesung_count = int(lesung_mask.sum())
    if lesung_count == 0:
        return group

    # Work on a copy: functions handed to groupby must not mutate the group
    # they receive, pandas documents that as unsupported.
    numbered = group.copy()
    numbered.loc[lesung_mask, activity_column] = [
        f'{n}. Lesung' for n in range(1, lesung_count + 1)
    ]
    return numbered

# Number the readings case by case (only needed for the Brandenburg log)
if isBrandenburg:
    # Iterate over the groups explicitly instead of groupby(...).apply(...):
    # newer pandas releases deprecate (and then stop) passing the grouping column
    # to the applied function, which together with reset_index(drop=True) would
    # drop 'case:concept:name' from the result. Iterating always yields full rows.
    numbered_cases = [
        modify_labels(case_events)
        for _, case_events in df.groupby('case:concept:name')
    ]
    # pd.concat rejects an empty list, so fall back to an empty frame with the same columns.
    df = pd.concat(numbered_cases, ignore_index=True) if numbered_cases else df.iloc[0:0]
    # na=False: rows with a missing activity name must not break the boolean filter.
    print(df[df[ACTIVITY_COLUMN].str.contains('Lesung', na=False)].head(10)[ACTIVITY_COLUMN])

In [None]:
# Disabled step: the code below is wrapped in a string literal, so none of it runs.
# If enabled, it would remove every "," from the activity names and strip leading
# whitespace (intended for the Baden-Württemberg log).
'''
if isBawue:
    # since proM performance spectrum miner does not like it "," needs to be removed 
    df[ACTIVITY_COLUMN] = df[ACTIVITY_COLUMN].str.replace(',', '')
    df[ACTIVITY_COLUMN] = df[ACTIVITY_COLUMN].str.lstrip()
'''


In [22]:
# Export the pre-processed event log as XES to OUTPUT_FILENAME.
pm4py.write_xes(df, OUTPUT_FILENAME)

exporting log, completed traces ::   0%|          | 0/1005 [00:00<?, ?it/s]