# Random causal structure generator
Here the goal is to generate data based on known random DAGs.

The steps are as follows:
- generate random DAG (causal structure)
- generate data based on the DAG
- generate an event log

The generated data is used for the experimental results of the following [paper](https://arxiv.org/pdf/2108.07795.pdf):

Qafari MS, van der Aalst W. Feature Recommendation for Structural Equation Model Discovery in Process Mining. arXiv preprint arXiv:2108.07795. 2021 Aug 13.

In [None]:
import matplotlib.pyplot as plt
import networkx as nx
import random
import pandas as pd
from pm4py.objects.log.util import dataframe_utils
from pm4py.objects.conversion.log import converter as log_converter
from pm4py.objects.log.exporter.xes import exporter as xes_exporter
import datetime
from pathlib import Path
from copy import deepcopy
from pm4py.objects.log.obj import EventLog, Trace, Event
from datetime import datetime, timedelta
from operator import add

In [None]:
# Generate a random DAG with one node per entry of `mapping`
def random_DAG(name, mapping):
    """Build a random weighted DAG whose nodes are relabelled through ``mapping``.

    A directed G(n, 0.5) random graph is drawn over ``len(mapping)`` integer
    nodes and only the edges ``(u, v)`` with ``u < v - 1`` are kept, each with
    a random integer weight in [1, 10]. Drawing is repeated until every node
    takes part in at least one kept edge. The result is relabelled with
    ``mapping``, drawn to ``<name>.png`` and returned.

    Args:
        name: Base name (without extension) of the image file to write.
        mapping: Dict from the integer node ids used during generation to the
            final node labels.

    Returns:
        The relabelled ``networkx.DiGraph``.

    Raises:
        ValueError: If ``mapping`` has fewer than 4 entries. With the
            ``u < v - 1`` filter, node 1 can only get an edge to a node with
            index >= 3, so smaller graphs can never cover all nodes and the
            retry loop would not terminate.
    """
    n_nodes = len(mapping)
    if n_nodes < 4:
        raise ValueError(
            "random_DAG needs a mapping with at least 4 nodes, got " + str(n_nodes)
        )

    while True:
        G = nx.gnp_random_graph(n_nodes, 0.5, directed=True)
        # Every kept edge goes from a lower to a strictly higher index, so the
        # graph is acyclic by construction; no separate acyclicity check is needed.
        DAG = nx.DiGraph(
            [(u, v, {'weight': random.randint(1, 10)}) for (u, v) in G.edges() if u < v - 1]
        )
        # Nodes without any kept edge are absent from DAG; retry until all appear.
        if DAG.number_of_nodes() == n_nodes:
            break

    DAG = nx.relabel_nodes(DAG, mapping)
    save_and_visualize_image(DAG, name)

    return DAG

def save_and_visualize_image(G,name):
    """Draw ``G`` with node labels and save the figure as ``<name>.png``.

    The current matplotlib figure is cleared afterwards so that the next
    drawing starts from an empty canvas.
    """
    nx.draw(G, with_labels=True)
    image_path = name + ".png"
    plt.savefig(image_path, format="PNG")
    plt.clf()
    

In [None]:
# visualize the graph
def visualize_graph(G):
    """Load the networkx graph ``G`` into a ``Network`` view and show it as "graph".

    NOTE(review): ``Network`` is not imported anywhere in the visible source;
    presumably it is ``pyvis.network.Network`` -- confirm and import it before
    calling this function.
    """
    viewer = Network(notebook=True)
    viewer.from_nx(G)
    viewer.show("graph")

In [None]:
# find the parents (direct predecessors) of node n
def ancestors(G, n):
    """Return the predecessor entry of node ``n``, i.e. ``G.pred[n]``.

    Despite the name, only the direct predecessors (parents) stored in
    ``G.pred`` are returned, not the transitive ancestors.
    """
    predecessor_table = G.pred
    return predecessor_table[n]

In [None]:
def rand_int_list(min_val, max_val, length):
    """Return ``length`` random integers drawn uniformly from [min_val, max_val].

    Both bounds are inclusive (``random.randint`` semantics). A ``length`` of
    zero or less yields an empty list.
    """
    return [random.randint(min_val, max_val) for _ in range(length)]

In the following cell, you can change the noise and coefficient intervals.

In [None]:
# generate the column of items for sources
mins = 0 # min_source_variable
maxs = 20 # max_source_variable
def generate_data_source_node(length):
    """Generate the value column of a source node (a node without parents).

    A random interval ``[low, high]`` with ``low < high`` is drawn from
    ``[mins, maxs]``; the column then consists of ``length`` random integers
    from that interval.
    """
    # Redraw the two bounds until they form a non-degenerate, ordered interval.
    while True:
        low, high = rand_int_list(mins, maxs, 2)
        if low < high:
            break
    return rand_int_list(low, high, length)

# generate the column of items for non source features
# steps that need to be taken:
# 1- find the parents
# 2- generate noise list
# 3- generate coefficients
# 4- compute the linear equation
min_noise = -5
max_noise = 5
min_coeff = -5
max_coeff = 5
def generate_data_non_source_node(G, node, length, column_dict, coeffInfo):
    """Generate the value column of a node that has parents in ``G``.

    The column is a noise vector plus, for every parent, the parent's column
    multiplied by a random integer coefficient.

    Args:
        G: Directed graph providing ``predecessors(node)``.
        node: The node whose column is generated.
        length: Number of values to generate.
        column_dict: Already generated columns, keyed by node; it must contain
            every parent of ``node``.
        coeffInfo: Text accumulator extended with ``+=`` by one
            "parent --> node : coeff" line per parent. When a ``str`` is passed
            the extension only rebinds the local name, so the caller does not
            see it.

    Returns:
        A list of ``length`` integers.
    """
    # Noise interval: redraw the bounds until low < high.
    low, high = rand_int_list(min_noise, max_noise, 2)
    while low >= high:
        low, high = rand_int_list(min_noise, max_noise, 2)
    values = rand_int_list(low, high, length)

    # Materialise the parents before any coefficient is drawn.
    for parent in list(G.predecessors(node)):
        # Coefficients in {-1, 0, 1} are rejected.
        coeff = random.randint(min_coeff, max_coeff)
        while -1 <= coeff <= 1:
            coeff = random.randint(min_coeff, max_coeff)
        coeffInfo += (str(parent) + " --> " + str(node) + " : " + str(coeff) + '\n')
        values = [
            total + parent_value * coeff
            for total, parent_value in zip(values, column_dict[parent])
        ]
    return values

In [None]:
# Generate n datasets
# n : the number of event logs to generate
# m : the number of features in each event log --> use an even number bigger than 2
def generate_all_data(n, m, coeffInfo):
    """Generate ``n`` datasets and export each as a text table and an XES event log.

    Every dataset is the column-wise concatenation of two independently
    generated components with ``m // 2`` features each (named with prefix
    'A' and 'B'), 1000 rows per component. For dataset ``i`` the table is
    written to ``data<i>.txt`` (tab separated) and the event log to
    ``eventlog_<i>.xes``.

    Args:
        n: Number of event logs to generate.
        m: Number of features per event log; each component gets ``m // 2``.
        coeffInfo: Text accumulator extended with ``+=`` by a header per
            dataset and component, and passed on to the generators. When a
            ``str`` is passed the additions stay local to each function,
            because strings are immutable.
    """
    for i in range(n):
        fileName = "data" + str(i)
        # Label the section with the dataset index (previously the literal 'i').
        coeffInfo += 'Data ' + str(i) + '\n'
        coeffInfo += 'first component' + '\n'
        data1 = generate_data(1000, fileName + 'part_1', create_mapping(m//2, 'A'), coeffInfo)
        coeffInfo += 'second component' + '\n'
        data2 = generate_data(1000, fileName + 'part_2', create_mapping(m//2, 'B'), coeffInfo)
        data = pd.concat([data1, data2], axis=1, join="inner")
        # NOTE(review): `line_terminator` is the pre-1.5 pandas spelling; newer
        # pandas names it `lineterminator` -- confirm the pinned pandas version.
        data.to_csv(fileName+'.txt', index=False, line_terminator= '\n', sep = '\t')
        xes_exporter.apply(convert_to_event_log(data), 'eventlog_'+ str(i) + '.xes')

In [None]:
# Generate one dataset with respect to the given DAG
def generate_data_one_DAG(G, length, coeffInfo):
    """Generate one dataset whose columns follow the causal structure ``G``.

    Nodes are visited in topological order, so the columns of all parents
    exist before a child column is computed. Nodes without incoming edges
    get a random source column; all others get a noisy linear combination of
    their parents' columns.

    Returns:
        A ``pandas.DataFrame`` with one column per node and ``length`` rows.
    """
    columns = {}
    ordered_nodes = list(nx.topological_sort(G))
    for node in ordered_nodes:
        if G.in_degree(node) == 0:
            columns[node] = generate_data_source_node(length)
        else:
            columns[node] = generate_data_non_source_node(G, node, length, columns, coeffInfo)
    # The dict is loaded with nodes as rows, then flipped so nodes become columns.
    by_node = pd.DataFrame.from_dict(columns, orient='index')
    return by_node.transpose()

In [None]:
def generate_data(length, file_name, mapping, coeffInfo):
    """Generate a random DAG over ``mapping`` and a dataset that follows it.

    The DAG image is saved under the name ``'G_' + file_name`` and the graph
    and data are recorded through ``record_info`` in a directory named
    ``file_name``.

    Args:
        length: Number of rows to generate.
        file_name: Base name used for the recorded artefacts.
        mapping: Node-id to feature-name mapping passed to ``random_DAG``.
        coeffInfo: Text accumulator passed on to the data generator.

    Returns:
        The generated ``pandas.DataFrame`` with a fresh 0..length-1 index.
    """
    G = random_DAG('G_' + file_name, mapping)
    generated = generate_data_one_DAG(G, length, coeffInfo)
    # `DataFrame.append` was deprecated in pandas 1.4 and removed in 2.0.
    # Appending to an empty frame with ignore_index=True only renumbers the
    # rows, which reset_index(drop=True) does on every pandas version.
    data = generated.reset_index(drop=True)
    record_info(file_name, G, data)

    return data

In [None]:
# convert data set to event log
# convert to proper format 
# -- add activity name
# -- add timestamp
def change_format(data):
    """Convert a tabular dataset into a pm4py event log.

    Every row of ``data`` becomes one case with two events, "start" and
    "end". Each event carries all feature values of the row plus the
    activity name (``concept:name``), a timestamp (``time:timestamp``) and
    the row position as case id (``case:concept:name``). Timestamps start at
    the current time and advance by 0.1 days per event.

    Args:
        data: ``pandas.DataFrame`` with one row per case.

    Returns:
        The object returned by ``log_converter.apply`` for the event table.
    """
    act_names = ["start", "end"]
    # This file imports `datetime` and `timedelta` as classes
    # (`from datetime import datetime, timedelta`), which rebinds the earlier
    # `import datetime`; `datetime.datetime.now()` therefore raised
    # AttributeError. Use the classes directly.
    timestamp = datetime.now()
    step = timedelta(days=0.1)
    log_tabular = []
    for case_id in range(len(data)):
        row_values = data.iloc[case_id].values.flatten().tolist()
        for act_name in act_names:
            log_tabular.append(row_values + [act_name, timestamp, case_id])
            timestamp = timestamp + step

    col_names = data.columns.tolist()
    col_names.extend(['concept:name', 'time:timestamp', 'case:concept:name'])
    log_tabular = pd.DataFrame(log_tabular, columns=col_names)
    log_tabular = dataframe_utils.convert_timestamp_columns_in_df(log_tabular)
    return log_converter.apply(log_tabular)

In [None]:
# n : the number of feature names to create (half the number of features of an event log)
# name : feature name prefix (e.g., name = "A" and n = 5, then the feature names would be {A0, A1, A2, A3, A4})
def create_mapping(n, name):
    """Map the integers 0..n-1 to feature names ``name + str(i)``.

    For example ``create_mapping(2, 'A')`` gives ``{0: 'A0', 1: 'A1'}``.
    """
    return {index: name + str(index) for index in range(n)}

In [None]:
# record the information of the generated eventlogs. 
# data is recorded in a directory named after the given file name
def ancestor_writer(G, file_name):
    """Write the predecessors of every node of ``G`` to a text file.

    The file ``<file_name>/<file_name>_ancestors.txt`` starts with the line
    "Ancestors: " and then holds, for each node, one line with the node and
    one line with ``str(G.pred[node])``.

    Args:
        G: Graph providing ``nodes()`` and ``pred``.
        file_name: Name of the (already existing) output directory, also used
            as prefix of the file name.

    Raises:
        OSError: If the file cannot be opened for writing, e.g. because the
            directory ``file_name`` does not exist.
    """
    dir_name = file_name + '/' + file_name + "_ancestors.txt"
    lines = ["Ancestors: \n"]
    for node in G.nodes():
        # str() keeps the output unchanged for string nodes and also accepts
        # non-string node labels.
        lines.append(str(node) + "\n")
        lines.append(str(G.pred[node]) + "\n")
    # The context manager closes the file even if writing fails.
    with open(dir_name, "w") as text_file:
        text_file.write("".join(lines))
    
def record_info(file_name, graph, data):
    """Store the graph and the generated data in the directory ``file_name``.

    The directory is created if needed and receives, all prefixed with
    ``file_name``: the edge list (``.txt``), a drawing of the graph
    (``.png``), the data (``.csv``) and the predecessor listing written by
    ``ancestor_writer``.
    """
    Path(file_name).mkdir(parents=True, exist_ok=True)
    # Common "<dir>/<prefix>" part shared by all artefact paths.
    prefix = file_name + '/' + file_name

    nx.write_edgelist(graph, prefix + ".txt")

    nx.draw(graph, with_labels=True)
    plt.savefig(prefix + ".png", format="PNG")
    plt.clf()

    # NOTE(review): `line_terminator` is the pre-1.5 pandas spelling; newer
    # pandas names it `lineterminator` -- confirm the pinned pandas version.
    data.to_csv(prefix + ".csv", index=False, line_terminator='\n')
    ancestor_writer(graph, file_name)

In [None]:
def convert_to_event_log(data):
    """Build an ``EventLog`` with one two-event trace per row of ``data``.

    Each trace holds an event "A" followed by an event "B"; the running
    clock starts at the current time and advances 300 seconds before "A" and
    600 seconds before "B". Every column of the row is stored as a float
    trace attribute.
    """
    log = EventLog()
    clock = datetime.now()
    attribute_names = data.columns.values.tolist()
    for _, row in data.iterrows():
        clock = clock + timedelta(seconds=300)
        first_event = Event()
        first_event["concept:name"] = "A"
        first_event["time:timestamp"] = clock

        clock = clock + timedelta(seconds=600)
        second_event = Event()
        second_event["concept:name"] = "B"
        second_event["time:timestamp"] = clock

        trace = Trace()
        trace.append(first_event)
        trace.append(second_event)
        for att in attribute_names:
            # Python typing is required in the current release
            trace.attributes[att] = float(row[att])
        log.append(trace)
    return log

In [None]:
# generating 10 event logs each with 20 features

# The generators extend `coeffInfo` with `+=`. A str would only be rebound
# inside each function and the report would stay at its header, so a list is
# used instead: `list += str` extends the same list object in place (one
# character per element), which makes every addition visible here. The pieces
# are joined back into one string when the report is written.
coeffInfo = ["coefficients:\n"]
generate_all_data(10, 20, coeffInfo)

with open("coeffInfo.txt", "w") as text_file:
    text_file.write("".join(coeffInfo))