## SO FAR
- possono esistere più processi nello stesso XML 

In [8]:
import xml.etree.ElementTree as ET
import pandas as pd
import re

In [9]:
def extract_bpmn_info(ns, file):
    """
    Extract the tasks, gateways, start/end events and sequence flows from a BPMN file.

    Parameters
    ----------
    ns : str
        Namespace prefix in ElementTree ``{uri}`` form; it is prepended to every
        tag name that is matched.
    file : str or file object
        Anything accepted by ``xml.etree.ElementTree.parse``.

    Returns
    -------
    tuple of four lists ``(tasks, gateways, start_end, edges)``
        Each list holds the attribute mapping (``element.attrib``) of the
        matching elements, in document order.
    """
    task_tags = {
        ns + name
        for name in (
            'task', 'serviceTask', 'userTask', 'sendTask', 'receiveTask',
            'manualTask', 'businessRuleTask', 'scriptTask',
        )
    }
    gateway_tags = {ns + 'exclusiveGateway', ns + 'parallelGateway'}
    event_tags = {ns + 'startEvent', ns + 'endEvent'}
    flow_tag = ns + 'sequenceFlow'

    # parse the xml file
    root = ET.parse(file).getroot()

    # lists to store attributes of tasks, gateways, start/end events and flows
    start_end = []
    tasks = []
    gateways = []
    edges = []

    # Select the <process> children by tag instead of by position: a positional
    # index breaks as soon as the root has a different number or order of
    # children, and a single document may contain several processes.
    for process in root.findall(ns + 'process'):
        for child in process:
            if child.tag in task_tags:
                tasks.append(child.attrib)
            elif child.tag in gateway_tags:
                gateways.append(child.attrib)
            elif child.tag in event_tags:
                start_end.append(child.attrib)
            elif child.tag == flow_tag:
                edges.append(child.attrib)
    return tasks, gateways, start_end, edges
            


In [10]:
def add_name(df, tasks, gateways, start_end):
    """
    Add the names of the source and target nodes to the edge dataframe.

    Parameters
    ----------
    df : pandas.DataFrame
        Edge table with ``sourceRef`` and ``targetRef`` columns holding node ids.
    tasks, gateways, start_end : list of mappings
        Node attribute mappings; each must have an ``id`` key and may have a
        ``name`` key.

    Returns
    -------
    pandas.DataFrame
        A new frame with ``sourceName`` and ``targetName`` columns appended.
        A name is missing (None/NaN) when the referenced id is unknown or the
        node has no ``name`` attribute. An empty input is returned as an
        unchanged copy. The input frame is not modified.
    """
    # Map node id -> node name. `.get` is used because a node is not guaranteed
    # to carry a `name` attribute; indexing with ['name'] would raise KeyError.
    id_to_name = {item['id']: item.get('name') for item in tasks + gateways + start_end}

    # Nothing to annotate: hand back a copy without touching the columns.
    if df.empty:
        return df.copy()

    # Build both columns in one pass each instead of a row-wise apply.
    return df.assign(
        sourceName=[id_to_name.get(ref) for ref in df['sourceRef']],
        targetName=[id_to_name.get(ref) for ref in df['targetRef']],
    )


In [11]:
def add_brackets(df):
    """
    Add opening/closing bracket columns for the source and target node of each edge.

    The node kind is inferred from the node id (``sourceRef`` / ``targetRef``):

    * id contains "task" (any case)     -> ``[`` and ``]``
    * id contains "gateway" (any case)  -> ``{`` and ``}``
    * id contains "start" or "end"      -> ``((`` and ``))``
    * anything else                     -> None for both brackets

    Parameters
    ----------
    df : pandas.DataFrame
        Edge table with string ``sourceRef`` and ``targetRef`` columns.

    Returns
    -------
    pandas.DataFrame
        A new frame with ``sourceOpen``, ``sourceClose``, ``targetOpen`` and
        ``targetClose`` columns appended. An empty input is returned as an
        unchanged copy. The input frame is not modified.
    """
    def brackets_for(ref):
        """Return the (open, close) bracket pair for one node id."""
        # Tasks are tested first: an id such as 'sendTask_...' also contains
        # 'end' and would otherwise be classified as an end event.
        if re.search('task', ref, re.IGNORECASE):
            return '[', ']'
        if re.search('gateway', ref, re.IGNORECASE):
            return '{', '}'
        if re.search('start|end', ref):
            return '((', '))'
        # Unrecognised id: keep the columns present but leave the value empty.
        return None, None

    if df.empty:
        return df.copy()

    source_pairs = [brackets_for(ref) for ref in df['sourceRef']]
    target_pairs = [brackets_for(ref) for ref in df['targetRef']]

    return df.assign(
        sourceOpen=[opening for opening, _ in source_pairs],
        sourceClose=[closing for _, closing in source_pairs],
        targetOpen=[opening for opening, _ in target_pairs],
        targetClose=[closing for _, closing in target_pairs],
    )

In [12]:
# Path of the BPMN file handed to extract_bpmn_info (relative path).
file = 'esteco_trial.bpmn'
# Namespace prefix in ElementTree "{uri}" form; extract_bpmn_info prepends it
# to every tag name it compares against.
ns = '{http://www.omg.org/spec/BPMN/20100524/MODEL}'

In [13]:
# Parse the BPMN file, then build the edge table in one chain: one row per
# sequence flow, enriched with node names and bracket columns.
tasks, gateways, start_end, edges = extract_bpmn_info(ns, file)
df = (
    pd.DataFrame(edges)
    .pipe(add_name, tasks, gateways, start_end)
    .pipe(add_brackets)
)
df

Unnamed: 0,id,name,sourceRef,targetRef,sourceName,targetName,sourceOpen,sourceClose,targetOpen,targetClose
0,sequenceFlows_500af155-4592-d7de-9fc6-ad9579f6...,Sequence Flow_31,startEvents_5c1ab15d-31dc-df54-03cd-ad2751a16ead,tasks_aa9180b2-3294-ee67-7047-386150c6ff6c,Start Event_26,Formulate industrial problem,((,)),[,]
1,sequenceFlows_9c10c84f-c35b-eef3-0cd2-b4514858...,Sequence Flow_74,tasks_756b276b-009a-d9c2-f495-d10bca1bf833,voltaTask_c9afd6c1-b50a-93d0-0286-df6f3592f8f5,Finalize modeling execution,Design of Experiments,[,],[,]
2,sequenceFlows_8779f83b-4f73-d109-661b-2bb8031f...,Sequence Flow_81,voltaTask_c9afd6c1-b50a-93d0-0286-df6f3592f8f5,tasks_db9ebf10-ab25-568e-bf83-6f6d373481dd,Design of Experiments,Translate results,[,],[,]
3,sequenceFlows_a6e6be39-2f75-9072-a04d-a72fff66...,Sequence Flow_89,tasks_db9ebf10-ab25-568e-bf83-6f6d373481dd,tasks_15d03a56-4f3a-5c0a-eeca-71b26c770e03,Translate results,Make decision,[,],[,]
4,sequenceFlows_55a84103-769b-5600-e7e5-549b3b6f...,Sequence Flow_95,tasks_15d03a56-4f3a-5c0a-eeca-71b26c770e03,endEvents_9aa6a45a-e22e-f9d1-69fb-6a8d04623bc7,Make decision,End Event_93,[,],((,))
5,sequenceFlows_d1a32e7a-8ccf-1d8b-0ee2-71bb81da...,Sequence Flow_117,tasks_aa9180b2-3294-ee67-7047-386150c6ff6c,tasks_3cce143e-1118-df9d-8902-fbd1ce56666e,Formulate industrial problem,Understand business and industrial case,[,],[,]
6,sequenceFlows_8dc362fa-d552-6ede-99d5-eea6a4e6...,Sequence Flow_131,tasks_3cce143e-1118-df9d-8902-fbd1ce56666e,tasks_80fe73b3-afce-ee10-de4a-9b463fca2805,Understand business and industrial case,Analysis of data available,[,],[,]
7,sequenceFlows_540b5e82-530f-1577-bbab-3d25adfc...,Sequence Flow_138,tasks_80fe73b3-afce-ee10-de4a-9b463fca2805,tasks_9cb90488-84e5-07a2-6251-b8167b3d32c0,Analysis of data available,Translate to modeling workflows,[,],[,]
8,sequenceFlows_a4e5d108-11c8-654e-b327-cd5597f9...,Sequence Flow_155,tasks_651badfc-dba4-a165-f822-9fd39491daff,tasks_21ecbcab-3cbe-8564-e0fb-f322ccd0c34f,Define modeling execution,Evaluate modeling execution,[,],[,]
9,sequenceFlows_41511c6a-9ee2-7e3d-211b-c312e0d2...,Sequence Flow_225,tasks_21ecbcab-3cbe-8564-e0fb-f322ccd0c34f,exclusiveGateways_27fca393-59bb-208d-ea27-0f9f...,Evaluate modeling execution,Approved?,[,],{,}


In [14]:
# Print one line per edge: source node, labelled arrow, target node.
# NOTE(review): the emitted text looks like Mermaid flowchart edge syntax - confirm.
for _, edge in df.iterrows():
    source_node = f"{edge['sourceRef']}{edge['sourceOpen']}{edge['sourceName']}{edge['sourceClose']}"
    target_node = f"{edge['targetRef']}{edge['targetOpen']}{edge['targetName']}{edge['targetClose']}"
    print(f"{source_node}-->|{edge['name']}|{target_node}")


startEvents_5c1ab15d-31dc-df54-03cd-ad2751a16ead((Start Event_26))-->|Sequence Flow_31|tasks_aa9180b2-3294-ee67-7047-386150c6ff6c[Formulate industrial problem]
tasks_756b276b-009a-d9c2-f495-d10bca1bf833[Finalize modeling execution]-->|Sequence Flow_74|voltaTask_c9afd6c1-b50a-93d0-0286-df6f3592f8f5[Design of Experiments]
voltaTask_c9afd6c1-b50a-93d0-0286-df6f3592f8f5[Design of Experiments]-->|Sequence Flow_81|tasks_db9ebf10-ab25-568e-bf83-6f6d373481dd[Translate results]
tasks_db9ebf10-ab25-568e-bf83-6f6d373481dd[Translate results]-->|Sequence Flow_89|tasks_15d03a56-4f3a-5c0a-eeca-71b26c770e03[Make decision]
tasks_15d03a56-4f3a-5c0a-eeca-71b26c770e03[Make decision]-->|Sequence Flow_95|endEvents_9aa6a45a-e22e-f9d1-69fb-6a8d04623bc7((End Event_93))
tasks_aa9180b2-3294-ee67-7047-386150c6ff6c[Formulate industrial problem]-->|Sequence Flow_117|tasks_3cce143e-1118-df9d-8902-fbd1ce56666e[Understand business and industrial case]
tasks_3cce143e-1118-df9d-8902-fbd1ce56666e[Understand business and 