<a href="https://colab.research.google.com/github/nadineelnaggar/NeSy_2021/blob/master/Dyck_Generator_Suzgun.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [36]:
import sys
import numpy as np
import torch
from collections import defaultdict
import random

In [37]:
# DyckLanguage.generate recurses once per nesting level, so lift the interpreter's cap.
sys.setrecursionlimit(5000)

# Supported bracket pairs; each entry is the opening character followed by its closer.
all_pairs = ['()', '[]', '{}', '<>', '+-', 'ab', 'xo']
# Flat alphabet: the characters of every pair, concatenated in pair order.
all_letters = ''.join(all_pairs)

# Code point of '0'; label characters are encoded as an offset from it.
init_ascii = ord('0')
print(all_letters)

()[]{}<>+-abxo


In [38]:
class DyckLanguage():
    """Random generator and encoder for Dyck words over the first `num_pairs` bracket pairs.

    Words are sampled from the probabilistic grammar
        S -> (_i S )_i   with probability p
           | S S         with probability q
           | empty       with probability 1 - (p + q)

    The class reads the module-level `all_pairs` (in `__init__`) and `init_ascii`
    (in `output_generator` / `lineToTensorSigmoid`). It deliberately does NOT read
    the module-level `all_letters`, so rebinding that name elsewhere in the notebook
    cannot change the indices this class produces.
    """

    def __init__(self, num_pairs, p, q):
        """Select the first `num_pairs` entries of `all_pairs` and store the grammar weights.

        Args:
            num_pairs: number of bracket pairs, between 1 and len(all_pairs).
            p: probability of the nesting rule S -> (_i S )_i.
            q: probability of the concatenation rule S -> S S.

        Raises:
            ValueError: if `num_pairs` is outside 1..len(all_pairs); previously this
                silently produced a `pair_num` that disagreed with the actual pairs.
        """
        if not 1 <= num_pairs <= len(all_pairs):
            raise ValueError(
                'num_pairs must be between 1 and {}, got {}'.format(len(all_pairs), num_pairs))

        self.pair_num = num_pairs
        self.pairs = all_pairs[:num_pairs]
        # Built from the selected pairs instead of slicing the mutable global alphabet.
        self.vocabulary = ''.join(self.pairs)
        self.n_letters = len(self.vocabulary)

        self.openpar = [pair[0] for pair in self.pairs]
        self.closepar = [pair[1] for pair in self.pairs]

        self.p = p
        self.q = q

    # returns the vocabulary
    def return_vocab(self):
        """Return the vocabulary string (openers and closers interleaved in pair order)."""
        return self.vocabulary

    # generate a sample
    def generate(self, current_size, max_size):
        """Recursively sample one word from the grammar.

        Args:
            current_size: number of characters already committed by enclosing calls.
            max_size: upper bound on length; the empty string is returned once
                `current_size` reaches it or when an expansion would exceed it.

        Returns:
            A balanced string of at most `max_size` characters (possibly empty).
        """
        if current_size >= max_size:
            return ''

        prob = random.random()
        # Grammar: S -> (_i S )_i with prob p | SS with prob q | empty with prob 1 - (p+q)
        if prob < self.p:
            # Same `random` module as the draw above, so one random.seed() call
            # makes the whole corpus reproducible.
            chosen_pair = random.choice(self.pairs)
            sample = chosen_pair[0] + self.generate(current_size + 2, max_size) + chosen_pair[1]
        elif prob < self.p + self.q:
            sample = self.generate(current_size, max_size) + self.generate(current_size, max_size)
        else:
            return ''

        # An expansion that overshoots the limit is discarded rather than truncated.
        return sample if len(sample) <= max_size else ''

    # generate 'num' number of samples
    def generate_list(self, num, min_size, max_size):
        """Draw `num` distinct samples whose length lies in [min_size, max_size].

        Returns:
            (samples, size_info): the samples in generation order, and a
            defaultdict mapping each length to the list of samples of that length.

        Raises:
            ValueError: if samples are requested with `min_size > max_size`, which
                could never be satisfied and previously looped forever.

        Note:
            The loop still does not terminate if fewer than `num` distinct words
            exist in the requested length range.
        """
        if num > 0 and min_size > max_size:
            raise ValueError(
                'min_size ({}) must not exceed max_size ({})'.format(min_size, max_size))

        arr = []
        seen = set()  # O(1) duplicate check; `arr` keeps the generation order
        size_info = defaultdict(list)
        while len(arr) < num:
            sample = self.generate(0, max_size)
            if len(sample) < min_size or sample in seen:
                continue
            seen.add(sample)
            arr.append(sample)
            size_info[len(sample)].append(sample)
            if len(arr) % 500 == 0:
                print('{} samples generated.'.format(len(arr)))

        return arr, size_info

    def output_generator(self, seq):
        """Encode, for every position of `seq`, which closing bracket is on top of the stack.

        Each output character is chr(init_ascii + 2**k), where k is the index (in
        `closepar`) of the closer currently expected, or chr(init_ascii) when the
        stack is empty.

        `seq` is assumed to be well-formed: any character that is not an opener pops
        the stack without being checked, and an unmatched closer raises IndexError.
        """
        output_chars = []
        stack = []

        for elt in seq:
            if elt in self.openpar:
                stack.append(self.closepar[self.openpar.index(elt)])
            else:
                stack.pop()

            # Only the top of the stack is encoded, so at most one bit is ever set.
            binary_code = 2 ** self.closepar.index(stack[-1]) if stack else 0
            output_chars.append(chr(binary_code + init_ascii))

        return ''.join(output_chars)

    def depth_counter(self, seq):
        """Return a (len(seq), pair_num) array of per-pair open-bracket counts after each character.

        Characters that are not openers are looked up in `closepar`; a character in
        neither list raises ValueError.
        """
        dyck_counter = np.zeros(self.pair_num)
        max_depth = np.zeros((len(seq), self.pair_num))
        for position, elt in enumerate(seq):
            if elt in self.openpar:
                dyck_counter[self.openpar.index(elt)] += 1
            else:
                dyck_counter[self.closepar.index(elt)] -= 1
            # Row assignment copies the running counts, so later updates do not alias.
            max_depth[position] = dyck_counter
        return max_depth

    def training_set_generator(self, num, min_size, max_size):
        """Generate `num` samples together with their `output_generator` encodings.

        Returns:
            (inputs, outputs, size_info) where `size_info` is the length -> samples
            mapping produced by `generate_list` (it is not a per-sample length list).
        """
        input_arr, input_size_arr = self.generate_list(num, min_size, max_size)
        output_arr = [self.output_generator(seq) for seq in input_arr]
        return input_arr, output_arr, input_size_arr

    # Find letter index within this language's vocabulary
    def letterToIndex(self, letter):
        """Return the column index of `letter` in the vocabulary, or -1 if it is absent.

        Callers that index a tensor with the result should pass vocabulary letters
        only: -1 would address the last column.
        """
        return self.vocabulary.find(letter)

    # Just for demonstration, turn a letter into a <1 x n_letters> Tensor
    def letterToTensor(self, letter):
        """One-hot encode a single vocabulary letter as a <1 x n_letters> tensor."""
        tensor = torch.zeros(1, self.n_letters)
        tensor[0][self.letterToIndex(letter)] = 1
        return tensor

    # Turn a line into a <line_length x 1 x n_letters>,
    # or an array of one-hot letter vectors
    def lineToTensor(self, line):
        """One-hot encode every letter of `line` as a <line_length x 1 x n_letters> tensor."""
        tensor = torch.zeros(len(line), 1, self.n_letters)
        for li, letter in enumerate(line):
            tensor[li][0][self.letterToIndex(letter)] = 1.0
        return tensor

    def lineToTensorSigmoid(self, line):
        """Decode a label string into a <line_length x n_letters> multi-hot tensor.

        Every row has all opener columns set to 1. The character's offset from
        `init_ascii` is then decomposed greedily into powers of two, highest first,
        and the closer column for each power that fits is set to 1. Characters at
        or below chr(init_ascii) set no closer column.
        """
        tensor = torch.zeros(len(line), self.n_letters)
        # The opener columns do not depend on the row, so resolve them once.
        open_columns = [self.letterToIndex(elt) for elt in self.openpar]
        for li, letter in enumerate(line):
            for column in open_columns:
                tensor[li][column] = 1.0

            binary_code = ord(letter) - init_ascii

            if binary_code > 0:
                for base in range(len(self.closepar) - 1, -1, -1):
                    if binary_code - (2 ** base) >= 0:
                        tensor[li][self.letterToIndex(self.closepar[base])] = 1.0
                        binary_code -= (2 ** base)
        return tensor

In [39]:
# Generation settings consumed by the cells below.
NUM_PAR = 1                 # number of bracket pairs handed to DyckLanguage
MIN_SIZE, MAX_SIZE = 2, 50  # length bounds for the training samples
P_VAL, Q_VAL = 0.5, 0.25    # grammar weights handed to DyckLanguage as p and q

In [40]:
# Build the language object and read back its vocabulary.
Dyck = DyckLanguage(NUM_PAR, P_VAL, Q_VAL)
# The module-level `all_letters` alphabet is intentionally left untouched here:
# it is defined in the setup cell as the full alphabet, and rebinding it to this
# instance's vocabulary would change what any code reading that global sees
# (for example a DyckLanguage created later with more pairs).
word_set = Dyck.return_vocab()
n_letters = vocab_size = len(word_set)
print(word_set)
print(n_letters)

()
2


In [44]:
# Number of samples in the training corpus
TRAINING_SIZE = 10000
# Number of samples in the test corpus
TEST_SIZE = 5000


def write_dataset(path, inputs, outputs):
    """Append one `input,output,length` row per sample to the text file at `path`.

    The third column is the length of the input string itself. The third value
    returned by `training_set_generator` is a length -> samples mapping, so
    indexing it by sample position does not yield that sample's length.
    """
    # NOTE: append mode is kept from the original cell, so re-running this cell
    # adds rows to an existing file instead of replacing it.
    with open(path, 'a') as f:
        for sample, target in zip(inputs, outputs):
            f.write('{},{},{}\n'.format(sample, target, len(sample)))


print('Loading data...')

training_input, training_output, training_input_lengths = Dyck.training_set_generator(TRAINING_SIZE, MIN_SIZE, MAX_SIZE)
print('training data generated, writing training set to document')
write_dataset('Dyck1_Dataset_Suzgun_train.txt', training_input, training_output)
print('train set written to document')

# Test samples are strictly longer than any training sample.
test_input, test_output, test_input_lengths = Dyck.training_set_generator(TEST_SIZE, MAX_SIZE + 2, 2 * MAX_SIZE)
print('test data generated, writing test set to document')
write_dataset('Dyck1_Dataset_Suzgun_test.txt', test_input, test_output)
print('test set written to document')


Loading data...
500 samples generated.
1000 samples generated.
1500 samples generated.
2000 samples generated.
2500 samples generated.
3000 samples generated.
3500 samples generated.
4000 samples generated.
4500 samples generated.
5000 samples generated.
5500 samples generated.
6000 samples generated.
6500 samples generated.
7000 samples generated.
7500 samples generated.
8000 samples generated.
8500 samples generated.
9000 samples generated.
9500 samples generated.
10000 samples generated.
training data generated, writing training set to document
train set written to document
test data generated, writing test set to document
500 samples generated.
1000 samples generated.
1500 samples generated.
2000 samples generated.
2500 samples generated.
3000 samples generated.
3500 samples generated.
4000 samples generated.
4500 samples generated.
5000 samples generated.
test set written to document


In [46]:
# Show one training input, its label string, and the size of the training corpus.
print(training_input[1], training_output[1], len(training_input), sep='\n')

(((((()))((())))))
111111111111111110
10000


In [47]:
# Show one test input, its label string, and the size of the test corpus.
print(test_input[5], test_output[5], len(test_input), sep='\n')

(()(((((()((()))())))))((((((((()))))))(((())))((((()))))())))
11111111111111111111111111111111111111111111111111111111111110
5000


In [48]:
# Character count of test sample 5.
print('{}'.format(len(test_input[5])))

62


In [49]:
# Label string for a small hand-written sample.
print(Dyck.output_generator(seq='(())()()'))

11101010


In [None]:
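# Unfinished draft of the export loop, kept for reference only; the
# dataset-writing cell above is the version that was actually run.
# with open('Dyck1_Dataset_Suzgun_train.txt', 'a') as f:
#   for i in range(len(training_input)):
#     f.write(training_input[i]+','+str(len(training_input[i]))+'\n')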