In [12]:
# First define the following parameters in this cell
# Then run the entire notebook

from os import listdir
from os.path import isfile, join

## folder name where a list of xml files are stored
## these will be concatenated to one event log file for the given process type

## filename of xml input
folderPath = "./OriginalData/Berlin"


files = [join(folderPath, f) for f in listdir(folderPath) if isfile(join(folderPath, f))]

print(files)


## VTyp to take as process type
#vtyp = "Antrag"
vtyp = "Gesetz"

## output filename
outputFilename = './all-data-xes/berlin-gesetz' + '.xes'
#outputFilename = './WP16_BadenWürttemberg_' + vtyp + '.xes'
#outputFilename = './WP6_Brandenburg_' + vtyp + '.xes'




['./OriginalData/Berlin/pardok-wp16.xml', './OriginalData/Berlin/pardok-wp18.xml', './OriginalData/Berlin/pardok-wp19.xml', './OriginalData/Berlin/pardok-wp14.xml', './OriginalData/Berlin/pardok-wp17.xml', './OriginalData/Berlin/pardok-wp12.xml', './OriginalData/Berlin/pardok-wp15.xml', './OriginalData/Berlin/pardok-wp11.xml', './OriginalData/Berlin/pardok-wp13.xml']


In [13]:
# read data into dict
import pandas as pd
import xmltodict
import pprint
import pandas as pd
pp = pprint.PrettyPrinter(depth=4)

processes = []
for file in files:
    with open(file) as xml_file:
        data = xmltodict.parse(xml_file.read())
        dataVorgaenge = data["Export"]["Vorgang"] # this has the actual data
        processes.extend(dataVorgaenge) # append all processes to the list

In [14]:
# get all processes into a dataframe
processesDF = pd.DataFrame.from_dict(processes)

print(processesDF.keys())
print("\n")

groupedByType = processesDF.groupby(['VTyp'])
print(groupedByType['VTyp'].count())

Index(['VNr', 'VFunktion', 'VID', 'ReihNr', 'VTyp', 'VTypL', 'VSys', 'VSysL',
       'VIR', 'Nebeneintrag', 'Dokument'],
      dtype='object')


VTyp
Anfrage                75958
Antrag                 10180
Beschlussempfehlung       56
Debatte                14815
Gesetz                  2267
Wahl                     605
Name: VTyp, dtype: int64


In [15]:
docsOfVorgaenge = groupedByType.get_group(vtyp)["Dokument"]
vSysLOfVorgange = groupedByType.get_group(vtyp)["VSysL"]
vSysOfVorgange = groupedByType.get_group(vtyp)["VSys"]
vorgangNebeneintraege = groupedByType.get_group(vtyp)["Nebeneintrag"]

# for a Vorgang there are several Deskriptoren stored in several Nebeneintraege
# -> get that information per Vorgang
vorgangDeskriptoren = []
for nebeneintraege in vorgangNebeneintraege:
    if type(nebeneintraege) is not list:
        if nebeneintraege is not dict:
            deskriptoren = []
        else:
            deskriptoren = [nebeneintraege.get("Desk", None)]
    else:
        deskriptoren = [obj.get("Desk", None) for obj in nebeneintraege]

    vorgangDeskriptoren.append([x for x in deskriptoren if x is not None])

allDocs = []

# add trace id to each document
# so that the documents can then be single events that belong to a specific trace in an event log

for idx, vorgang in enumerate(docsOfVorgaenge):
    helperX = vorgang
    if type(vorgang) is dict:
        helperX = [vorgang]
        
    dokTypLOfFirstDoc = None
    for i, doc in enumerate(helperX):
        # add case id column to each document
        # call it case:concept:name as this is the name for pm4py transformation into XES format
        doc['case:concept:name'] = idx
        
        # if it is the first document in a Vorgang, then set the dokTypLOfFirstDoc - this could also be used to see what type of process a Vorgang is
        if (i == 0):
            dokTypLOfFirstDoc = doc.get("DokTypL", None)
        
        doc['case:DokTypLFirstDoc'] = dokTypLOfFirstDoc
            
        doc['case:VSys'] = vSysOfVorgange.iloc[idx]
        doc['case:VSysL'] = vSysLOfVorgange.iloc[idx]
        doc['case:VorgangsDeskriptoren'] = vorgangDeskriptoren[idx]

        # if there is no value for a key, replace it with "none"
        # or if a list value has none values
        for key, value in doc.items():
            if value is None: 
                doc.update({key: "none"})
            if type(value) is list:
                doc.update({key: [x if x is not None else "none" for x in value]})
        
        # for grouping reasons also sort all other entries that can be sorted
        doc_sortedEntries = {key: sorted(value) if type(value) is list else value for key, value in doc.items()}
        
        # now add this doc as a row 
        allDocs.append(doc)



# turn docs into data frame, so each row is a document now
df = pd.DataFrame(allDocs)
#print(df.keys())

# turn date string into date time object
df["DokDat"] = pd.to_datetime(df['DokDat'], format='%d.%m.%Y')
#print(df["DokDat"])

In [16]:
# use pm4py to create XES file
import pm4py
from pm4py.objects.conversion.log import converter as log_converter

#eventlog = df.copy(deep=True)
#eventlog.rename(columns={'DokDat': 'time:timestamp', 'DokTypL': 'concept:name'}, inplace=True)

event_log = pm4py.format_dataframe(df, case_id='case:concept:name', activity_key='DokTypL', timestamp_key='DokDat')
#start_activities = pm4py.get_start_activities(event_log)
#end_activities = pm4py.get_end_activities(event_log)
#print("Start activities: {}\nEnd activities: {}".format(start_activities, end_activities))

pm4py.write_xes(event_log, outputFilename)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[constants.CASE_CONCEPT_NAME] = df[constants.CASE_CONCEPT_NAME].astype("string")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[xes_constants.DEFAULT_NAME_KEY] = df[xes_constants.DEFAULT_NAME_KEY].astype("string")


exporting log, completed traces ::   0%|          | 0/2267 [00:00<?, ?it/s]