# seq2seqCalculator

## Import requirements

In [96]:
from random import seed
from random import randint

import numpy as np

from sklearn.preprocessing import LabelBinarizer 


from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import TimeDistributed
from keras.layers import RepeatVector

from keras.callbacks import ModelCheckpoint
import tensorflow as tf


import os

## Data
One benefit of this task is that you don't need to download any data. You can generate it on your own! The input consists of the following: "number"+"sign"+"number", which then equals "number". The pluses stand for concatenation in this case. 
The seq2seq model doesn't care which symbols we use as operators; for example, we could use any of "+-*/".
<!-- We use a one-hot encoding for each symbol according to its assigned number. For numbers this is straightforward and we end up with something like {0:0, 1:1, 2:2, 3:3, 4:4, 5:5, 6:6, 7:7, 8:8, 9:9}. -->

In [97]:
def generate_equations(operators, dataset_size, min_value=0, max_value=10):
    '''Generate pairs of equations and solutions to them.

        Each equation has the form of two integers with an operator in
        between. Each solution is the integer result of the operation;
        a division result is converted with int(), i.e. its fractional
        part is dropped.

        A division by zero is never emitted: when the divisor is drawn as
        0, the operator of that sample is swapped for "*".

        operators: string or list of one-character strings, allowed
            operators. Supported symbols are "+", "-", "*" and "/".
        dataset_size: an integer, number of equations to be generated.
        min_value: an integer, min value of each operand.
        max_value: an integer, max value of each operand.

        result: a list of tuples of strings (equation, solution).

        raises: ValueError if an operator is not supported, or if
            operators is empty while samples are requested.
    '''
    # Local import so the function does not depend on a module-level import.
    import operator as op

    # Explicit arithmetic instead of eval() on a generated string.
    operations = {'+': op.add, '-': op.sub, '*': op.mul, '/': op.truediv}

    unsupported = [symbol for symbol in operators if symbol not in operations]
    if unsupported:
        raise ValueError(f'unsupported operators: {unsupported!r}')
    if dataset_size > 0 and not operators:
        raise ValueError('operators must not be empty')

    last_operator_index = len(operators) - 1
    samples = []

    for _ in range(dataset_size):
        # The draw order (left operand, operator, right operand) is fixed so
        # that a seeded random generator reproduces the same equations.
        left = randint(min_value, max_value)
        symbol = operators[randint(0, last_operator_index)]
        right = randint(min_value, max_value)

        if symbol == '/' and right == 0:  ## handle x/0 state
            symbol = '*'

        equation = f'{left}{symbol}{right}'
        solution = str(int(operations[symbol](left, right)))
        samples.append((equation, solution))

    return samples

In [73]:
## Test generate_equations
# Draw a single sample using all four operators and display it.
test = generate_equations(operators="+-/*", dataset_size=1)
print(test)

[('1*5', '5')]


## Prepare data for the neural network

The maximum input length is “length_nr + 1 + length_nr”, which is ```x_max_length``` in our case (for example 7). We would also like to have calculations that are shorter. This is entirely possible; however, the model expects all inputs to have the same length. So when a calculation is shorter than ```x_max_length```, we fill it up from the left with spaces, e.g. “____2+2”.
We also need to pad the solutions to ```y_max_length```.

In [98]:
def padding_to_max(equations, x_max_length, y_max_length):
    '''Right-align every equation and solution to a fixed width.

    Strings shorter than the target width are filled with spaces on the
    left; strings that already reach the width are left unchanged.

    equations: a list of tuples of strings (equation, solution).
    x_max_length: target width of the equation strings.
    y_max_length: target width of the solution strings.

    return: a list of tuples of strings (equation, solution), padded.
    '''
    # ">" right-aligns the value within the given width.
    equation_spec = f'>{x_max_length}'
    solution_spec = f'>{y_max_length}'
    return [
        (format(equation, equation_spec), format(solution, solution_spec))
        for equation, solution in equations
    ]

In [75]:
def to_ids(equations, word2id):
    '''Map every character of each equation and solution to its id.

    equations: a list of tuples of strings (equation, solution).
    word2id: mapping from a single character to its integer id.

    return: a list of tuples (equation_ids, solution_ids), each a list
        holding one id per character.
    '''
    def encode(text):
        return [word2id[symbol] for symbol in text]

    return [(encode(equation), encode(solution)) for equation, solution in equations]

In [5]:
def one_hot(equations, labels):
    '''Binarize the id sequences of equations and solutions.

    A single LabelBinarizer is fitted on labels and then applied to every
    equation and every solution.

    equations: a list of tuples (equation_ids, solution_ids).
    labels: the label values the binarizer is fitted on.

    return: two lists (x, y) holding the transformed equations and the
        transformed solutions, in input order.
    '''
    binarizer = LabelBinarizer()
    binarizer.fit(labels)

    encoded_pairs = [
        (binarizer.transform(equation), binarizer.transform(solution))
        for equation, solution in equations
    ]
    x = [encoded_equation for encoded_equation, _ in encoded_pairs]
    y = [encoded_solution for _, encoded_solution in encoded_pairs]
    return x, y

In [6]:
# invert encoding
def invert(seq, id2word):
    '''Decode a sequence of score vectors back into a string.

    For every vector the position of its largest entry is looked up in
    id2word, and the resulting symbols are concatenated.

    seq: iterable of vectors (e.g. one-hot rows or model outputs).
    id2word: lookup from position to symbol; anything indexable by the
        argmax result works.

    return: the decoded string.
    '''
    return ''.join(id2word[np.argmax(pattern)] for pattern in seq)

In [119]:
def generate_data(n_samples, allowed_operators, alphabet, word2id, id2word, x_max_length=7, y_max_length=4, min_value=0, max_value=100):
    '''Generate a batch of encoded equations and solutions.

    Pipeline: random equations -> left-padding to fixed widths ->
    character ids -> one_hot() encoding -> numpy arrays.

    n_samples: number of equations to generate.
    allowed_operators: operators the equations may use.
    alphabet: accepted for the callers' convenience; not read here.
    word2id: mapping from character to id.
    id2word: mapping from id to character; its keys are the labels the
        encoder is fitted on.
    x_max_length: padded width of an equation.
    y_max_length: padded width of a solution.
    min_value, max_value: bounds of each operand.

    return: tuple (x, y) of numpy arrays with the encoded equations and
        the encoded solutions.
    '''
    equations = generate_equations(allowed_operators, n_samples, min_value=min_value, max_value=max_value)
    padded = padding_to_max(equations, x_max_length, y_max_length)
    as_ids = to_ids(padded, word2id)
    encoded_x, encoded_y = one_hot(as_ids, list(id2word.keys()))
    return np.array(encoded_x), np.array(encoded_y)


### Setup variables

In this section we set the number of samples, the allowed operators and the alphabet:

In [129]:
# Fixed seed so the generated equations are reproducible.
seed(1)

n_samples = 1000
allowed_operators = '+-'
alphabet = '0123456789' + allowed_operators + ' ' ## space for padding

# Lookup tables: symbol -> id, and the inverse id -> symbol.
word2id = {symbol: index for index, symbol in enumerate(alphabet)}
id2word = {index: symbol for symbol, index in word2id.items()}

# Padded widths of an equation / a solution, and the operand bounds.
x_max_length = 7
y_max_length = 4
min_value = 0
max_value = 100

x, y = generate_data(
    n_samples,
    allowed_operators,
    alphabet,
    word2id,
    id2word,
    x_max_length=x_max_length,
    y_max_length=y_max_length,
    min_value=min_value,
    max_value=max_value,
)

Test the functions:

In [130]:
# Array shapes of the encoded equations and solutions.
print("shape of X", x.shape)
print("shape of y", y.shape)

# First sample in encoded form; label and array go on separate lines.
print("X[0]:", x[0], sep="\n")
print("y[0]", y[0], sep="\n")

# The same sample decoded back to text.
first_equation = invert(x[0], id2word)
first_solution = invert(y[0], id2word)
print("invert X[0]", first_equation)
print("invert y[0]", first_solution)

shape of X (1000, 7, 13)
shape of y (1000, 4, 13)
X[0]:
[[0 0 0 0 0 0 0 0 0 0 0 0 1]
 [0 0 0 0 0 0 0 0 0 0 0 0 1]
 [0 1 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 1 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 1 0 0]
 [0 0 0 1 0 0 0 0 0 0 0 0 0]
 [0 0 1 0 0 0 0 0 0 0 0 0 0]]
y[0]
[[0 0 0 0 0 0 0 0 0 0 0 0 1]
 [0 0 0 0 0 0 0 0 0 0 0 0 1]
 [0 0 0 0 1 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 1 0 0 0]]
invert X[0]   17+32
invert y[0]   49


## Model

In [138]:
# define LSTM configuration
# n_batch is passed as batch_size to fit/predict; n_epoch is the number of
# training rounds run by the training loop.
n_batch, n_epoch = 10, 50
# one output unit per symbol of the alphabet
n_chars = len(alphabet)

In [137]:
# create LSTM
def build_model(x_max_length, y_max_length, n_chars, encoder_units=100, decoder_units=50):
    '''Build and compile the encoder-decoder LSTM.

    The first LSTM reads the whole padded equation, RepeatVector repeats
    its output once per output position, the second LSTM returns one
    vector per position, and a time-distributed softmax layer scores
    every symbol at each position.

    x_max_length: number of time steps of the input (padded equation).
    y_max_length: number of time steps of the output (padded solution).
    n_chars: number of symbols; width of the input rows and of the
        softmax output.
    encoder_units: units of the first LSTM (default 100).
    decoder_units: units of the second LSTM (default 50).

    return: the compiled Sequential model.
    '''
    model = Sequential()
    model.add(LSTM(encoder_units, input_shape=(x_max_length, n_chars)))
    model.add(RepeatVector(y_max_length))
    model.add(LSTM(decoder_units, return_sequences=True))
    model.add(TimeDistributed(Dense(n_chars, activation='softmax')))
    # Each output step is a softmax over the symbols and the targets are
    # one-hot rows, so categorical cross-entropy is the matching loss
    # (mean squared error was used before).
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model


### Configure checkpoints

Use a ```tf.keras.callbacks.ModelCheckpoint``` to ensure that checkpoints are saved during training:


In [133]:
# Directory where the checkpoints will be saved
checkpoint_dir = './models/training_checkpoints'

# Name of the checkpoint files; "{epoch}" is a placeholder in the file name
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

# Callback handed to model.fit; only the weights are written
checkpoint_callback = ModelCheckpoint(filepath=checkpoint_prefix, save_weights_only=True)

Train the model:

In [None]:
# build the LSTM that is going to be trained
model = build_model(x_max_length, y_max_length, n_chars)

# summary() prints the layer table itself and returns None, so wrapping it
# in print() would only add a stray "None" line
model.summary()

In [None]:
for i in range(n_epoch):
    # draw a fresh random data set for every round
    x, y = generate_data(n_samples, allowed_operators, alphabet, word2id, id2word, x_max_length=x_max_length, y_max_length=y_max_length, 
                    min_value=min_value, max_value=max_value)
    print(i)
    # Train exactly one epoch, but tell Keras it is epoch i + 1. With a plain
    # epochs=1 every call restarts at epoch 1, so "ckpt_{epoch}" always
    # resolves to ckpt_1 and each round overwrites the previous checkpoint.
    model.fit(x, y, initial_epoch=i, epochs=i + 1, batch_size=n_batch, callbacks=[checkpoint_callback])

In [None]:
# look up the most recently saved checkpoint in checkpoint_dir
tf.train.latest_checkpoint(checkpoint_dir)

In [136]:
# rebuild the network and restore the most recently saved weights
model = build_model(x_max_length, y_max_length, n_chars)

model.load_weights(tf.train.latest_checkpoint(checkpoint_dir))

# evaluate on some new patterns
x, y = generate_data(n_samples, allowed_operators, alphabet, word2id, id2word, x_max_length=x_max_length, y_max_length=y_max_length, 
                    min_value=min_value, max_value=max_value)

result = model.predict(x, batch_size=n_batch, verbose=0)

# decode targets and predictions back to strings; id2word is used for all
# three decodings, and the loop variables no longer shadow the array x
expected = [invert(target, id2word) for target in y]
predicted = [invert(scores, id2word) for scores in result]

# show some examples (at most 20, fewer if the evaluation set is smaller)
for i in range(min(20, len(x))):
    print('%s Expected=%s, Predicted=%s' % (invert(x[i], id2word),expected[i], predicted[i]))

  70+59 Expected= 129, Predicted= 127
  84+39 Expected= 123, Predicted= 114
  69+24 Expected=  93, Predicted=  91
  20-48 Expected= -28, Predicted= -22
  21-30 Expected=  -9, Predicted= -11
   3+59 Expected=  62, Predicted=  77
  52-24 Expected=  28, Predicted=  30
  64-69 Expected=  -5, Predicted=   1
  22-27 Expected=  -5, Predicted=  11
  10-58 Expected= -48, Predicted= -42
 100-67 Expected=  33, Predicted=  20
  47-89 Expected= -42, Predicted= -33
  53+43 Expected=  96, Predicted=  94
  82+43 Expected= 125, Predicted= 125
  47+24 Expected=  71, Predicted=  77
  52+77 Expected= 129, Predicted= 147
  71+57 Expected= 128, Predicted= 127
  13+58 Expected=  71, Predicted=  77
  95+94 Expected= 189, Predicted= 151
  21+10 Expected=  31, Predicted=  66
