# seq2seqCalculator

## Import requirements

In [18]:
from random import seed
from random import randint

import numpy as np

from sklearn.preprocessing import LabelBinarizer 

## Data
One benefit of this task is that you don't need to download any data. You can generate it on your own! The input consists of the following: "number"+"sign"+"number", which then equals "number". The pluses stand for concatenation in this case. 
The seq2seq model doesn't care what symbols we use, we could use "+-*/" as symbols. 
<!-- We use a one-hot encoding for each symbol according to its assigned number. For numbers this is straightforward and we end up with something like {0:0, 1:1, 2:2, 3:3, 4:4, 5:5, 6:6, 7:7, 8:8, 9:9}. -->

In [2]:
def generate_equations(operators, dataset_size, min_value=0, max_value=10):
    '''Generate pairs of equations and solutions to them.

        Each equation has the form of two integers with an operator in between.
        Each solution is the result of the operation, truncated to an integer.

        operators: string or list of strings, allowed operators.
        dataset_size: an integer, number of equations to be generated.
        min_value: an integer, min value of each operand (inclusive).
        max_value: an integer, max value of each operand (inclusive).

        result: a list of tuples of strings (equation, solution).

        Division by zero is handled as follows: if "*" is one of the allowed
        operators, the "/" in the equation is swapped for "*"; otherwise the
        equation is drawn again, so that no symbol outside ``operators`` ever
        appears in the output.

        raises: ValueError if ``operators`` is empty (from ``randint``) or if
        no valid equation can be drawn after many consecutive attempts.
    '''
    max_redraws = 1000  # guard against e.g. operators="/" with min_value == max_value == 0
    last_operator_index = len(operators) - 1
    samples = []
    consecutive_redraws = 0

    while len(samples) < dataset_size:
        equation = (
            str(randint(min_value, max_value)) +
            operators[randint(0, last_operator_index)] +
            str(randint(min_value, max_value))
        )

        try:
            # NOTE: eval only ever sees a string built here from two integers and
            # one entry of `operators`; `operators` must come from trusted code.
            solution = str(int(eval(equation)))
        except ZeroDivisionError:
            if "/" in equation and "*" in operators:
                # x/0 state: fall back to a multiplication, which is an allowed operator.
                equation = equation.replace("/", "*")
                solution = str(int(eval(equation)))
            else:
                # "*" is not allowed (or the operator is not "/"): draw a new equation
                # instead of emitting a symbol the caller did not ask for.
                consecutive_redraws += 1
                if consecutive_redraws > max_redraws:
                    raise ValueError(
                        "could not generate an equation without division by zero"
                    ) from None
                continue

        consecutive_redraws = 0
        samples.append((equation, solution))

    return samples

In [5]:
## Test generate_equations
test = generate_equations("+-/*",1)
print(test)

[('7-8', '-1')]


## Prepare data for the neural network

The maximum input length is “length_nr + 1 + length_nr”, which is ```x_max_length``` in our case (for example 7). We would also like to have calculations that are shorter. This is entirely possible; however, a seq2seq model requires [check] inputs of the same length. So when a calculation is shorter than ```x_max_length```, we pad it on the left with spaces, e.g. “____2+2”.
We also need to pad the solutions to ```y_max_length```.

In [9]:
def padding_to_max(equations, x_max_length, y_max_length):
    '''Right-align every equation and solution to a fixed width.

    equations: a list of tuples of strings (equation, solution).
    x_max_length: target width of each equation string.
    y_max_length: target width of each solution string.

    return: a new list of tuples (equation, solution), each string left-padded
            with spaces up to its target width. Strings that are already at
            least that wide are returned unchanged.
    '''
    # Build the format specs once; ">" right-aligns and pads with spaces.
    equation_spec = f'>{x_max_length}'
    solution_spec = f'>{y_max_length}'
    return [
        (format(equation, equation_spec), format(solution, solution_spec))
        for equation, solution in equations
    ]

In [10]:
def to_ids(equations, word2id):
    '''Replace every character of each equation and solution by its id.

    equations: a list of tuples of strings (equation, solution).
    word2id: mapping from a single character to its integer id.

    return: a list of tuples (equation_ids, solution_ids), each a list of ids
            in the same order as the characters of the input string.
            A character missing from word2id raises KeyError.
    '''
    def encode(text):
        # One lookup per character, keeping the original order.
        return [word2id[char] for char in text]

    return [(encode(equation), encode(solution)) for equation, solution in equations]

# Demo: pad the (equation, solution) strings, then map every character to its id.
# NOTE(review): `data`, `x_max_length` and `y_max_length` are not defined anywhere
# above this cell in the visible notebook -- presumably left over from an earlier
# session; confirm before running the notebook top to bottom.
d = padding_to_max(data, x_max_length, y_max_length)
print(d[0])  # first padded (equation, solution) pair
d = to_ids(d, word2id)  # `d` is rebound to the id-encoded pairs
print(d[0])  # the same pair as lists of ids

('6743/2380', '       2')
([6, 7, 4, 3, 12, 2, 3, 8, 0], [14, 14, 14, 14, 14, 14, 14, 2])


In [13]:
def one_hot(equations, labels):
    '''One-hot encode id sequences with a LabelBinarizer fitted on `labels`.

    equations: iterable of (equation_ids, solution_ids) pairs.
    labels: the complete set of ids the binarizer is fitted on.

    return: a tuple (x, y) of two lists; x holds the binarized equations and
            y the binarized solutions, in input order.
    '''
    binarizer = LabelBinarizer()
    binarizer.fit(labels)
    encode = binarizer.transform

    # Encode each pair in a single pass, then split into inputs and targets.
    encoded_pairs = [(encode(equation), encode(solution)) for equation, solution in equations]
    x = [pair[0] for pair in encoded_pairs]
    y = [pair[1] for pair in encoded_pairs]
    return x, y

In [None]:
# invert encoding
def invert(seq, alphabet):
    '''Decode a sequence of one-hot (or score) rows back into a string.

    seq: iterable of rows; the position of the largest value in each row
         selects the symbol.
    alphabet: ordered symbols; position i is the symbol for index i.

    return: the decoded string, one character per row of `seq`.
    '''
    int_to_char = dict(enumerate(alphabet))
    # Use numpy's argmax through the `np` alias the file imports; a bare
    # `argmax` name is never imported and raised NameError.
    return ''.join(int_to_char[int(np.argmax(pattern))] for pattern in seq)



In [19]:
def find_max_lengths(equations):
    '''Return the longest equation length and the longest solution length.

    equations: iterable of (equation, solution) pairs supporting len().

    return: a tuple (x_max_length, y_max_length); (0, 0) for empty input.
    '''
    # Measure every pair once, then take the maximum of each column.
    lengths = [(len(equation), len(solution)) for equation, solution in equations]
    longest_equation = max((pair[0] for pair in lengths), default=0)
    longest_solution = max((pair[1] for pair in lengths), default=0)
    return longest_equation, longest_solution


def generate_data(n_samples, allowed_operators, alphabet, word2id, id2word):
    '''Generate a one-hot encoded dataset of equations and their solutions.

    n_samples: number of (equation, solution) pairs to generate.
    allowed_operators: operators passed on to generate_equations.
    alphabet: accepted for interface symmetry; not used by this function.
    word2id: mapping from character to id, used to encode the strings.
    id2word: mapping from id to character; its keys are the one-hot labels.

    return: a tuple (x, y) of numpy arrays built from the one-hot encoded
            equations and solutions. Operands are drawn from 0..100.
    '''
    # 1. Raw string pairs.
    raw_pairs = generate_equations(allowed_operators, n_samples, min_value=0, max_value=100)

    # 2. Pad every string to the longest equation / solution in this batch.
    widths = find_max_lengths(raw_pairs)
    padded_pairs = padding_to_max(raw_pairs, *widths)

    # 3. Characters -> ids -> one-hot rows.
    encoded_pairs = to_ids(padded_pairs, word2id)
    inputs, targets = one_hot(encoded_pairs, list(id2word.keys()))

    return np.array(inputs), np.array(targets)


In [20]:
# Experiment configuration: which operators to sample and how many pairs to draw.
allowed_operators = '+-'
n_samples = 1000

# Vocabulary: the ten digits, the operators, and a trailing space used for padding.
alphabet = '0123456789' + allowed_operators + ' '

# Lookup tables in both directions (symbol -> id and id -> symbol).
word2id = dict(zip(alphabet, range(len(alphabet))))
id2word = {index: symbol for symbol, index in word2id.items()}

x, y = generate_data(n_samples, allowed_operators, alphabet, word2id, id2word)

In [22]:
# Inspect the generated arrays and check that decoding the one-hot rows
# gives back readable strings.
print("shape of X", x.shape)
print("shape of y", y.shape)
print("X[0]:")
print(x[0])
print("y[0]")
print(y[0])

# The input array is bound to lowercase `x`; there is no `X` in scope.
print("invert X[0]", invert(x[0], alphabet) )
print("invert y[0]", invert(y[0], alphabet) )

shape of X (1000, 6, 13)
shape of y (1000, 3, 13)
X[0]:
[[0 0 0 0 0 0 0 0 0 0 0 0 1]
 [0 0 1 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 1 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 1 0]
 [0 0 0 0 0 0 0 1 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 1 0 0 0]]
y[0]
[[0 0 0 0 0 0 0 0 0 0 0 1 0]
 [0 0 0 0 0 1 0 0 0 0 0 0 0]
 [0 0 0 0 1 0 0 0 0 0 0 0 0]]


NameError: name 'invert' is not defined