In [9]:
import csv
import random

from FAdo.reex import *
from FAdo.conversions import *

from utils.random_nfa_generator import generate
from utils.heuristics import *

number_of_nfa_pairs = 10
minimization = True

def get_regex(gfa: GFA) -> list[RegExp]:
    """Return the distinct regular expressions obtained from ``gfa``.

    Six conversions are applied, each to a fresh ``gfa.dup()`` copy
    (presumably because the conversions modify the automaton they are
    given -- confirm against utils.heuristics).  The module-level
    ``minimization`` flag is forwarded to every conversion.

    Results are accumulated in a set, so results the set treats as
    duplicates are kept once and the order of the returned list is
    whatever the set yields.
    """
    # The call order is kept as-is in case the conversions share state
    # (for instance a random number generator).
    conversions = (
        lambda g: eliminate_randomly(g, minimization),
        lambda g: decompose(g, False, False, minimization),
        lambda g: eliminate_by_state_weight_heuristic(g, minimization),
        lambda g: decompose(g, True, False, minimization),
        lambda g: eliminate_by_repeated_state_weight_heuristic(g, minimization),
        lambda g: decompose(g, True, True, minimization),
    )
    distinct = {convert(gfa.dup()) for convert in conversions}
    return list(distinct)

def get_positive_rows(regex: list) -> list[tuple]:
    """Build label-1 rows for every unordered pair of entries in ``regex``.

    Each row is ``(str(a), str(b), 1)`` where ``a`` comes before ``b`` in
    ``regex``; an entry is never paired with itself.  With fewer than two
    entries the result is an empty list.
    """
    return [
        (str(earlier), str(later), 1)
        for position, earlier in enumerate(regex)
        for later in regex[position + 1:]
    ]

def get_negative_rows(first_regex: list, second_regex: list) -> list[tuple]:
    """Build label-0 rows for the cross product of the two regex lists.

    Each row is ``(str(a), str(b), 0)`` with ``a`` taken from
    ``first_regex`` and ``b`` from ``second_regex``.  Rows are ordered by
    ``first_regex`` first, then by ``second_regex``.  If either list is
    empty the result is an empty list.
    """
    return [
        (str(left), str(right), 0)
        for left in first_regex
        for right in second_regex
    ]

# Write labelled regex pairs to a CSV file: label 1 for two regexes derived
# from the same generated automaton, label 0 for regexes derived from two
# different automata.
with open("./pretrain_data.csv", "w", newline='') as fp:
    writer = csv.writer(fp)
    for i in range(number_of_nfa_pairs):
        # Arguments for generate(); their meaning is defined by
        # utils.random_nfa_generator.  randint(3, 3) always yields 3.
        n, k, d = random.randint(3, 3), 5, 0.1
        first_gfa = generate(n, k, d, 'in-memory')
        second_gfa = generate(n, k, d, 'in-memory')
        first_regex, second_regex = get_regex(first_gfa), get_regex(second_gfa)
        # Skip this pair when the two sides describe the same language;
        # otherwise the label-0 rows below would be mislabelled.  A skipped
        # pair is not replaced, so fewer than number_of_nfa_pairs pairs may
        # end up in the file.
        if first_regex[-1].compare(second_regex[-1]):
            continue
        # Positive cases: pairs drawn from within each automaton's regexes.
        writer.writerows(get_positive_rows(first_regex))
        writer.writerows(get_positive_rows(second_regex))
        # Negative cases: every regex of the first against every regex of
        # the second.
        writer.writerows(get_negative_rows(first_regex, second_regex))
        # Drop the references to this iteration's objects before the next one.
        del first_gfa, second_gfa, first_regex, second_regex
