# Random causal structure generator
Here the goal is to generate data based on known random DAGs.

The steps are as follows:
- generate random DAG (causal structure)
- generate data based on the DAG
- generate an event log

The generated data is used for the experimental results of the following [paper](https://arxiv.org/pdf/2108.07795.pdf):

Qafari MS, van der Aalst W. Feature Recommendation for Structural Equation Model Discovery in Process Mining. arXiv preprint arXiv:2108.07795. 2021 Aug 13.

In [1]:
import matplotlib.pyplot as plt
import networkx as nx
import random
import pandas as pd
from pm4py.objects.log.util import dataframe_utils
from pm4py.objects.conversion.log import converter as log_converter
from pm4py.objects.log.exporter.xes import exporter as xes_exporter
import datetime
from pathlib import Path
from copy import deepcopy
from pm4py.objects.log.obj import EventLog, Trace, Event
from datetime import datetime, timedelta
from operator import add

In [2]:
# Generate a random DAG with k nodes
def random_DAG(name, mapping):
    """Build a random weighted DAG and relabel its nodes through ``mapping``.

    A directed G(n, 0.5) graph over ``n = len(mapping)`` integer nodes is
    sampled repeatedly. Only edges ``(u, v)`` with ``u < v - 1`` are kept
    (so every edge points from a lower to a higher id and the result is
    acyclic), each with a random integer ``weight`` in [1, 10]. Sampling
    stops as soon as all ``n`` nodes are an endpoint of at least one kept
    edge. The graph is then relabelled with ``mapping`` and drawn to
    ``name + ".png"`` via ``save_and_visualize_image``.

    Args:
        name: Base name (without extension) of the PNG written for the graph.
        mapping: Dict from the integer node ids to the final node labels;
            expected to have the keys ``0 .. n-1`` (as ``create_mapping``
            produces).

    Returns:
        The relabelled ``networkx.DiGraph``.

    Raises:
        ValueError: If ``mapping`` has fewer than 4 entries. With the
            ``u < v - 1`` filter, 3 nodes allow only the edge (0, 2), so
            node 1 can never appear and the sampling loop would never end;
            with fewer nodes no edge is possible at all.
    """
    num_nodes = len(mapping)
    if num_nodes < 4:
        raise ValueError(
            "random_DAG needs at least 4 nodes, got " + str(num_nodes)
        )

    while True:
        G = nx.gnp_random_graph(num_nodes, 0.5, directed=True)
        DAG = nx.DiGraph(
            [(u, v, {'weight': random.randint(1, 10)}) for (u, v) in G.edges() if u < v - 1]
        )
        # Nodes only exist in DAG if they carry an edge; retry until none is missing.
        if DAG.number_of_nodes() == num_nodes:
            break

    DAG = nx.relabel_nodes(DAG, mapping)
    save_and_visualize_image(DAG, name)

    return DAG

def save_and_visualize_image(G,name):
    """Draw ``G`` with node labels and save the figure as ``name + ".png"``.

    The current matplotlib figure is cleared afterwards, so the next drawing
    does not pile up on top of this one.
    """
    nx.draw(G, with_labels=True)
    png_path = name + ".png"
    plt.savefig(png_path, format="PNG")
    plt.clf()
    

In [3]:
# visualize the graph
def visualize_graph(G):
    """Load ``G`` into a ``Network`` object and show it under the name "graph".

    NOTE(review): ``Network`` is not imported anywhere in the visible source,
    so calling this raises NameError as written. Presumably it is
    ``pyvis.network.Network`` -- confirm and import it before use.
    """
    viewer = Network(notebook=True)
    viewer.from_nx(G)
    viewer.show("graph")

In [4]:
# find the parents (direct predecessors) of node n
def ancestors(G, n):
    """Return the predecessor view ``G.pred[n]`` of node ``n``.

    Despite the name, only the direct parents of ``n`` are returned, not the
    transitive ancestors.
    """
    direct_parents = G.pred[n]
    return direct_parents

In [5]:
def rand_int_list(min_val, max_val, length):
    """Return ``length`` random integers drawn from ``[min_val, max_val]``.

    Both bounds are inclusive (``random.randint`` semantics). A ``length`` of
    zero or less yields an empty list.
    """
    return [random.randint(min_val, max_val) for _ in range(0, length)]

In the following cell, you can change the noise and coefficient intervals.

In [6]:
# generate the column of items for sources
mins = 0  # lower limit for the bounds of a source variable
maxs = 20  # upper limit for the bounds of a source variable
def generate_data_source_node(length):
    """Generate the data column of a source node (a node without parents).

    Two bounds are drawn from ``[mins, maxs]`` and redrawn until the first is
    strictly smaller than the second. The chosen bounds are printed and the
    column is filled with ``length`` random integers between them (inclusive).

    Returns:
        A list of ``length`` integers.
    """
    while True:
        low, high = rand_int_list(mins, maxs, 2)
        if low < high:
            break
    print(f"noise --> min {low}    max {high}")
    return rand_int_list(low, high, length)

# generate the column of items for non source features
# steps that need to be taken:
# 1- find the parents
# 2- generate noise list
# 3- generate coefficients
# 4- compute the linear equation
min_noise = -5
max_noise = 5
min_coeff = -5
max_coeff = 5
def generate_data_non_source_node(G, node, length, column_dict, coeffInfo):
    """Generate the data column of a node that has parents in ``G``.

    The column is a linear combination of the parents' columns plus noise:
    ``value = noise + sum(coeff_p * parent_p)``.

    Args:
        G: Graph providing ``predecessors(node)``.
        node: The node whose column is generated.
        length: Number of noise values to draw.
        column_dict: Already generated columns, keyed by node; must contain
            every predecessor of ``node``.
        coeffInfo: Text log; one ``"<parent> --> <node> : <coeff>"`` line is
            appended per parent and the result is printed. Because ``str`` is
            immutable and the text is not returned, the caller's ``coeffInfo``
            is NOT updated by this function.

    Returns:
        The list of generated values. With no parents this is the noise list
        itself.
    """
    # Draw the noise bounds until they form a non-empty range.
    while True:
        low, high = rand_int_list(min_noise, max_noise, 2)
        if low < high:
            break
    noise = rand_int_list(low, high, length)
    print(f"noise --> min {low}    max {high}")

    values = noise
    for parent in list(G.predecessors(node)):
        # Coefficients -1, 0 and 1 are rejected, so every parent has a
        # clearly visible effect.
        coeff = random.randint(min_coeff, max_coeff)
        while -1 <= coeff <= 1:
            coeff = random.randint(min_coeff, max_coeff)
        coeffInfo += f"{parent!s} --> {node!s} : {coeff}\n"
        values = [
            running + parent_value * coeff
            for running, parent_value in zip(values, column_dict[parent])
        ]
    print(coeffInfo)
    return values

In [7]:
# Generate n datasets
# n : the number of event logs to generate
# m : the number of features in each event log --> use an even number; each
#     half (m // 2 features) becomes its own random DAG, and random_DAG only
#     keeps edges (u, v) with u < v - 1, so each half needs at least 4 nodes
#     (m >= 8)
def generate_all_data(n, m, coeffInfo, length=1000):
    """Generate ``n`` datasets and export each as a TSV file and an XES log.

    Every dataset consists of two independent components with ``m // 2``
    features each (labelled ``A*`` and ``B*``), generated by
    ``generate_data`` and concatenated column-wise. For dataset ``i`` the
    table is written to ``data<i>.txt`` (tab separated) and the event log to
    ``eventlog_<i>.xes``.

    Args:
        n: Number of datasets / event logs to generate.
        m: Number of features per dataset (see the comment above).
        coeffInfo: Text log that is extended with section headers and passed
            on to ``generate_data``. ``str`` is immutable and nothing is
            returned, so the caller's variable is not updated.
        length: Number of rows per dataset (previously hard-coded to 1000).
    """
    for i in range(n):
        print("generate_all_data__" + str(i))
        fileName = "data" + str(i)
        # Use the dataset index here; the original wrote the literal 'i'.
        coeffInfo += 'Data ' + str(i) + '\n'
        coeffInfo += 'first component' + '\n'
        data1 = generate_data(length, fileName + 'part_1', create_mapping(m//2, 'A'), coeffInfo)
        coeffInfo += 'second component' + '\n'
        data2 = generate_data(length, fileName + 'part_2', create_mapping(m//2, 'B'), coeffInfo)
        data = pd.concat([data1, data2], axis=1, join="inner")
        # NOTE(review): `line_terminator` was renamed `lineterminator` in
        # pandas 1.5 and dropped in 2.0; kept as-is to match the pandas
        # version this notebook was written for.
        data.to_csv(fileName+'.txt', index=False, line_terminator= '\n', sep = '\t')
        xes_exporter.apply(convert_to_event_log(data), 'eventlog_'+ str(i) + '.xes')

In [8]:
# Generate one dataset with respect to the given DAG
def generate_data_one_DAG(G, length, coeffInfo):
    """Generate one data column per node of ``G`` and return them as a table.

    Nodes are processed in topological order, so the columns of all parents
    exist before a child is computed. Nodes without incoming edges get
    independent random data; all others are derived from their parents.

    Args:
        G: The DAG; node labels must be strings (they are concatenated into
            the progress message).
        length: Number of rows to generate.
        coeffInfo: Text log handed through to ``generate_data_non_source_node``.

    Returns:
        A ``pandas.DataFrame`` with one column per node.
    """
    columns = {}
    ordered_nodes = list(nx.topological_sort(G))
    for node in ordered_nodes:
        print("node " + node)
        if G.in_degree(node) == 0:
            columns[node] = generate_data_source_node(length)
        else:
            columns[node] = generate_data_non_source_node(G, node, length, columns, coeffInfo)
    by_node = pd.DataFrame.from_dict(columns, orient='index')
    return by_node.transpose()

In [9]:
def generate_data(length, file_name, mapping, coeffInfo):
    """Create a random DAG, generate a dataset for it and record both on disk.

    Args:
        length: Number of rows to generate.
        file_name: Name used for the output directory and files written by
            ``record_info``; the DAG picture is saved as ``G_<file_name>.png``.
        mapping: Node-id to label mapping passed to ``random_DAG``.
        coeffInfo: Text log handed through to ``generate_data_one_DAG``.

    Returns:
        The generated ``pandas.DataFrame`` with a fresh 0-based row index.
    """
    print("generate_data")
    G = random_DAG('G_'+ file_name, mapping)
    # The original appended to an empty frame with DataFrame.append(...,
    # ignore_index=True); that method no longer exists in pandas 2.x.
    # Resetting the index gives the same table on every pandas version.
    data = generate_data_one_DAG(G, length, coeffInfo).reset_index(drop=True)
    record_info(file_name, G, data)

    return data

In [10]:
# convert data set to event log
# convert to proper format 
# -- add activity name
# -- add timestamp
def change_format(data):
    """Convert a data table into a pm4py event log with two events per row.

    Every row of ``data`` becomes one case (the case id is the row position)
    with a "start" and an "end" event. Both events carry all attribute values
    of the row. Timestamps start at the current time and advance by 0.1 days
    after every event.

    Args:
        data: ``pandas.DataFrame`` with one row per case.

    Returns:
        The result of ``log_converter.apply`` on the tabular log.
    """
    # The file does `import datetime` and later `from datetime import
    # datetime, timedelta`, which rebinds `datetime` to the class, so the
    # original `datetime.datetime.now()` raised AttributeError. Importing
    # here makes the function independent of that ordering.
    from datetime import datetime, timedelta

    act_names = ["start", "end"]
    timestamp = datetime.now()
    log_tabular = []
    for case_id in range(len(data)):
        attributes = data.iloc[case_id].values.flatten().tolist()
        for act_name in act_names:
            log_tabular.append(attributes + [act_name, timestamp, case_id])
            timestamp = timestamp + timedelta(days=0.1)

    col_names = data.columns.tolist() + [
        'concept:name',
        'time:timestamp',
        'case:concept:name',
    ]
    log_tabular = pd.DataFrame(log_tabular, columns=col_names)
    log_tabular = dataframe_utils.convert_timestamp_columns_in_df(log_tabular)
    return log_converter.apply(log_tabular)

In [11]:
# n : number of features in the mapping (generate_all_data passes half of the features of an event log)
# name : feature name prefix (e.g., name = "A" and n = 5, then the feature names would be {A0, A1, A2, A3, A4})
def create_mapping(n, name):
    """Map the integers ``0 .. n-1`` to the labels ``name + "0"``, ``name + "1"``, ...

    Returns:
        A dict ``{i: name + str(i)}``; empty when ``n`` is zero or negative.
    """
    return {index: name + str(index) for index in range(0, n)}

In [12]:
# record the information of the generated event logs.
# data is recorded in a directory named after file_name
def ancestor_writer(G, file_name):
    """Write the direct predecessors of every node of ``G`` to a text file.

    The file ``<file_name>/<file_name>_ancestors.txt`` starts with the line
    ``"Ancestors: "`` followed, for each node, by one line with the node and
    one line with the string form of ``G.pred[node]``.

    Args:
        G: Graph providing ``nodes()`` and ``pred``.
        file_name: Name of the output directory and prefix of the file name.
    """
    out_path = Path(file_name + '/' + file_name + "_ancestors.txt")
    # The original assumed the directory already existed and failed with
    # FileNotFoundError when called on its own; create it on demand.
    out_path.parent.mkdir(parents=True, exist_ok=True)

    lines = ["Ancestors: \n"]
    for node in G.nodes():
        lines.append(str(node) + "\n")
        lines.append(str(G.pred[node]) + "\n")

    # `with` closes the file even if writing fails.
    with open(out_path, "w") as text_file:
        text_file.write("".join(lines))
    
def record_info(file_name, graph, data):
    """Store everything about one generated dataset in directory ``file_name``.

    The directory is created if necessary. Inside it, four files named after
    ``file_name`` are written: the edge list (``.txt``), a drawing of the
    graph (``.png``), the data table (``.csv``) and, through
    ``ancestor_writer``, the predecessor listing (``_ancestors.txt``).
    """
    Path(file_name).mkdir(parents=True, exist_ok=True)
    stem = file_name + '/' + file_name

    nx.write_edgelist(graph, stem + ".txt")

    nx.draw(graph, with_labels=True)
    plt.savefig(stem + ".png", format="PNG")
    plt.clf()  # start the next drawing from an empty figure

    data.to_csv(stem + ".csv", index=False, line_terminator= '\n')
    ancestor_writer(graph, file_name)

In [13]:
def convert_to_event_log(data):
    """Build an ``EventLog`` with one two-event trace per row of ``data``.

    Each trace holds an event "A" followed by an event "B". A single clock,
    started at the current time, runs through the whole log: it advances by
    300 seconds before every "A" and by 600 seconds before every "B". All
    column values of the row are stored as float trace attributes.

    Args:
        data: ``pandas.DataFrame`` whose values are convertible to ``float``.

    Returns:
        The populated ``EventLog``.
    """
    log = EventLog()
    clock = datetime.now()
    attribute_names = data.columns.values.tolist()
    for _, row in data.iterrows():
        trace = Trace()
        for activity, gap_seconds in (("A", 300), ("B", 600)):
            clock = clock + timedelta(seconds=gap_seconds)
            event = Event()
            event["concept:name"] = activity
            event["time:timestamp"] = clock
            trace.append(event)
        for att in attribute_names:
            # Python typing is required in the current release
            trace.attributes[att] = float(row[att])
        log.append(trace)
    return log

In [14]:
# generating 10 event logs each with 20 features (two components of 10)

coeffInfo = "coefficients:\n"
generate_all_data(10, 20, coeffInfo)

# NOTE: generate_all_data neither returns the text nor can it change this
# immutable str in place, so only the header line above ends up in the file.
with open("coeffInfo.txt", "w") as text_file:
    text_file.write(coeffInfo)

generate_all_data__0
generate_data
node A0
noise --> min 6    max 18
node A1
noise --> min 6    max 15
node A2
noise --> min 4    max 5
coefficients:
Data i
first component
A0 --> A2 : 3

node A3
noise --> min -5    max 5
coefficients:
Data i
first component
A0 --> A3 : -5
A1 --> A3 : 4

node A4
noise --> min -1    max 4
coefficients:
Data i
first component
A0 --> A4 : 5
A1 --> A4 : 5

node A7
noise --> min -1    max 0
coefficients:
Data i
first component
A2 --> A7 : 2

node A5
noise --> min -2    max 2
coefficients:
Data i
first component
A3 --> A5 : -4
A1 --> A5 : -5

node A6
noise --> min -3    max 3
coefficients:
Data i
first component
A3 --> A6 : 2

node A8
noise --> min -3    max 3
coefficients:
Data i
first component
A2 --> A8 : -3
A3 --> A8 : 2
A4 --> A8 : -2
A6 --> A8 : -3

node A9
noise --> min -4    max -2
coefficients:
Data i
first component
A0 --> A9 : -3
A2 --> A9 : -4
A3 --> A9 : 4
A4 --> A9 : -3
A1 --> A9 : -4
A5 --> A9 : -3
A6 --> A9 : -4

generate_data
node B0
noise -

exporting log, completed traces ::   0%|          | 0/1000 [00:00<?, ?it/s]

generate_all_data__1
generate_data
node A0
noise --> min 14    max 17
node A1
noise --> min 4    max 11
node A2
noise --> min 6    max 19
node A3
noise --> min -5    max 4
coefficients:
Data i
first component
second component
Data i
first component
A0 --> A3 : 5
A1 --> A3 : -5

node A4
noise --> min -5    max -3
coefficients:
Data i
first component
second component
Data i
first component
A0 --> A4 : -2
A1 --> A4 : -5

node A5
noise --> min -4    max -2
coefficients:
Data i
first component
second component
Data i
first component
A0 --> A5 : -5
A1 --> A5 : 3

node A7
noise --> min -1    max 5
coefficients:
Data i
first component
second component
Data i
first component
A2 --> A7 : 3

node A6
noise --> min 1    max 4
coefficients:
Data i
first component
second component
Data i
first component
A0 --> A6 : 2
A4 --> A6 : -4

node A8
noise --> min -5    max -1
coefficients:
Data i
first component
second component
Data i
first component
A0 --> A8 : -2
A5 --> A8 : 4
A6 --> A8 : 5
A2 --> A8 : -5


exporting log, completed traces ::   0%|          | 0/1000 [00:00<?, ?it/s]

generate_all_data__2
generate_data
node A0
noise --> min 8    max 9
node A1
noise --> min 15    max 18
node A3
noise --> min 13    max 14
node A4
noise --> min 11    max 12
node A2
noise --> min -2    max 1
coefficients:
Data i
first component
second component
Data i
first component
second component
Data i
first component
A0 --> A2 : 4

node A8
noise --> min -4    max 2
coefficients:
Data i
first component
second component
Data i
first component
second component
Data i
first component
A0 --> A8 : -5
A1 --> A8 : 2
A3 --> A8 : 3
A4 --> A8 : 5

node A5
noise --> min -2    max 4
coefficients:
Data i
first component
second component
Data i
first component
second component
Data i
first component
A2 --> A5 : -5
A3 --> A5 : -5

node A6
noise --> min -3    max 0
coefficients:
Data i
first component
second component
Data i
first component
second component
Data i
first component
A2 --> A6 : -3

node A7
noise --> min -5    max -3
coefficients:
Data i
first component
second component
Data i
first c

exporting log, completed traces ::   0%|          | 0/1000 [00:00<?, ?it/s]

generate_all_data__3
generate_data
node A0
noise --> min 4    max 9
node A1
noise --> min 13    max 19
node A2
noise --> min -4    max 5
coefficients:
Data i
first component
second component
Data i
first component
second component
Data i
first component
second component
Data i
first component
A0 --> A2 : -5

node A3
noise --> min -4    max 3
coefficients:
Data i
first component
second component
Data i
first component
second component
Data i
first component
second component
Data i
first component
A0 --> A3 : 4

node A4
noise --> min -2    max 5
coefficients:
Data i
first component
second component
Data i
first component
second component
Data i
first component
second component
Data i
first component
A0 --> A4 : 5
A2 --> A4 : 4

node A5
noise --> min 3    max 4
coefficients:
Data i
first component
second component
Data i
first component
second component
Data i
first component
second component
Data i
first component
A2 --> A5 : -4
A3 --> A5 : 4

node A7
noise --> min -5    max 4
coefficien

exporting log, completed traces ::   0%|          | 0/1000 [00:00<?, ?it/s]

generate_all_data__4
generate_data
node A0
noise --> min 8    max 20
node A1
noise --> min 5    max 6
node A2
noise --> min 2    max 3
coefficients:
Data i
first component
second component
Data i
first component
second component
Data i
first component
second component
Data i
first component
second component
Data i
first component
A0 --> A2 : -4

node A3
noise --> min -5    max -3
coefficients:
Data i
first component
second component
Data i
first component
second component
Data i
first component
second component
Data i
first component
second component
Data i
first component
A1 --> A3 : 2

node A4
noise --> min -5    max -1
coefficients:
Data i
first component
second component
Data i
first component
second component
Data i
first component
second component
Data i
first component
second component
Data i
first component
A0 --> A4 : -3
A1 --> A4 : 3

node A5
noise --> min -5    max 0
coefficients:
Data i
first component
second component
Data i
first component
second component
Data i
first co

exporting log, completed traces ::   0%|          | 0/1000 [00:00<?, ?it/s]

generate_all_data__5
generate_data
node A0
noise --> min 3    max 19
node A1
noise --> min 0    max 15
node A3
noise --> min 5    max 13
node A2
noise --> min -5    max 1
coefficients:
Data i
first component
second component
Data i
first component
second component
Data i
first component
second component
Data i
first component
second component
Data i
first component
second component
Data i
first component
A0 --> A2 : -3

node A6
noise --> min 0    max 1
coefficients:
Data i
first component
second component
Data i
first component
second component
Data i
first component
second component
Data i
first component
second component
Data i
first component
second component
Data i
first component
A3 --> A6 : 5

node A4
noise --> min -5    max 2
coefficients:
Data i
first component
second component
Data i
first component
second component
Data i
first component
second component
Data i
first component
second component
Data i
first component
second component
Data i
first component
A2 --> A4 : -4
A1 --

exporting log, completed traces ::   0%|          | 0/1000 [00:00<?, ?it/s]

generate_all_data__6
generate_data
node A0
noise --> min 4    max 19
node A1
noise --> min 8    max 17
node A2
noise --> min 2    max 7
node A3
noise --> min -1    max 2
coefficients:
Data i
first component
second component
Data i
first component
second component
Data i
first component
second component
Data i
first component
second component
Data i
first component
second component
Data i
first component
second component
Data i
first component
A0 --> A3 : -5

node A4
noise --> min 1    max 5
coefficients:
Data i
first component
second component
Data i
first component
second component
Data i
first component
second component
Data i
first component
second component
Data i
first component
second component
Data i
first component
second component
Data i
first component
A0 --> A4 : -4
A2 --> A4 : 5

node A5
noise --> min -5    max -4
coefficients:
Data i
first component
second component
Data i
first component
second component
Data i
first component
second component
Data i
first component
secon

exporting log, completed traces ::   0%|          | 0/1000 [00:00<?, ?it/s]

generate_all_data__7
generate_data
node A0
noise --> min 1    max 20
node A1
noise --> min 7    max 12
node A2
noise --> min 7    max 11
node A4
noise --> min -1    max 4
coefficients:
Data i
first component
second component
Data i
first component
second component
Data i
first component
second component
Data i
first component
second component
Data i
first component
second component
Data i
first component
second component
Data i
first component
second component
Data i
first component
A0 --> A4 : 2

node A3
noise --> min 2    max 5
coefficients:
Data i
first component
second component
Data i
first component
second component
Data i
first component
second component
Data i
first component
second component
Data i
first component
second component
Data i
first component
second component
Data i
first component
second component
Data i
first component
A1 --> A3 : -2

node A6
noise --> min -5    max -2
coefficients:
Data i
first component
second component
Data i
first component
second component
Da

exporting log, completed traces ::   0%|          | 0/1000 [00:00<?, ?it/s]

generate_all_data__8
generate_data
node A0
noise --> min 7    max 19
node A1
noise --> min 2    max 12
node A2
noise --> min 10    max 14
node A5
noise --> min -4    max 1
coefficients:
Data i
first component
second component
Data i
first component
second component
Data i
first component
second component
Data i
first component
second component
Data i
first component
second component
Data i
first component
second component
Data i
first component
second component
Data i
first component
second component
Data i
first component
A0 --> A5 : 2

node A3
noise --> min -5    max 5
coefficients:
Data i
first component
second component
Data i
first component
second component
Data i
first component
second component
Data i
first component
second component
Data i
first component
second component
Data i
first component
second component
Data i
first component
second component
Data i
first component
second component
Data i
first component
A0 --> A3 : -3
A1 --> A3 : 3

node A4
noise --> min -4    max -1


exporting log, completed traces ::   0%|          | 0/1000 [00:00<?, ?it/s]

generate_all_data__9
generate_data
node A0
noise --> min 0    max 16
node A1
noise --> min 8    max 19
node A2
noise --> min -4    max 4
coefficients:
Data i
first component
second component
Data i
first component
second component
Data i
first component
second component
Data i
first component
second component
Data i
first component
second component
Data i
first component
second component
Data i
first component
second component
Data i
first component
second component
Data i
first component
second component
Data i
first component
A0 --> A2 : -5

node A3
noise --> min 2    max 5
coefficients:
Data i
first component
second component
Data i
first component
second component
Data i
first component
second component
Data i
first component
second component
Data i
first component
second component
Data i
first component
second component
Data i
first component
second component
Data i
first component
second component
Data i
first component
second component
Data i
first component
A0 --> A3 : 5
A1 -->

exporting log, completed traces ::   0%|          | 0/1000 [00:00<?, ?it/s]

<Figure size 432x288 with 0 Axes>

# Test code

In [None]:
# test generate_data and record
# generate_data takes (length, file_name, mapping, coeffInfo); the former
# extra leading `num_nodes` argument made the call fail with a TypeError.
coeffInfo = "coefficients:\n"
mapping = create_mapping(5, "A")
data = generate_data(100, "test_generate_data", mapping, coeffInfo)
print(coeffInfo)

In [None]:
def rand_int_list(min_val, max_val, length):
    """Return ``length`` random integers from the inclusive range ``[min_val, max_val]``."""
    draws = (random.randint(min_val, max_val) for _ in range(0, length))
    return list(draws)

# Draw 100 integers between -5 and 5 and print how often each value occurs.
l = rand_int_list(-5, 5, 100)
from collections import Counter
print(Counter(l))


In [None]:
# test convert_to_event_log

# Two rows of three columns; convert_to_event_log turns each row into one trace.
data = [{"A1":1, "A2":2, "A3":3}, {"A1":4, "A2":5, "A3":6}]
generated_log = convert_to_event_log(pd.DataFrame(data))
print(generated_log)

In [None]:
import pm4py
# Export `generated_log` (built in the convert_to_event_log test cell) as XES.
# NOTE(review): the destination is a hard-coded absolute path inside one
# user's Windows profile; adjust it before running elsewhere.
pm4py.write_xes(generated_log, r"C:\Users\qafari\prova.xes")

In [None]:
# create a mapping
# random graph
# ancestor writer

mapping = create_mapping(5, 'A')
print(mapping)
name = "DAG"
G = random_DAG(name, mapping)
# random_DAG has already saved "<name>.png"; this call just redraws the same file.
save_and_visualize_image(G,name)
# ancestor_writer writes to "<name>/<name>_ancestors.txt"; nothing above
# creates that directory, so make sure it exists first.
Path(name).mkdir(parents=True, exist_ok=True)
ancestor_writer(G, name)


In [None]:
  print("predecessors: ")
        s = " "
        if len(a.keys()) > 0:
            print(a)
            for item in list(a.values()):
                s = str(item) + " "

In [None]:
# Smoke test: generate a source-node column with 10 values (the chosen bounds are printed).
generate_data_source_node(10)

In [None]:
# Small end-to-end run: one event log with 10 features (two components of 5).
coeffInfo = "coefficients:\n"
generate_all_data(1, 10, coeffInfo)

# NOTE: coeffInfo is not modified by the call above (str is immutable and
# nothing is returned), so only the header line is written.
with open("coeffInfo.txt", "w") as text_file:
    text_file.write(coeffInfo)