# seq2seqCalculator

## Import requirements

In [66]:
from random import seed
from random import randint

import numpy as np

from sklearn.preprocessing import LabelBinarizer 


from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import TimeDistributed
from keras.layers import RepeatVector

## Data
One benefit of this task is that you don't need to download any data: you can generate it yourself! Each input has the form "number"+"sign"+"number" (the pluses here denote string concatenation), and the target output is the "number" that the expression evaluates to.
The seq2seq model doesn't care which symbols we use; for the sign we could use any of the operators in "+-*/".
<!-- We use a one-hot encoding for each symbol according to its assigned number. For numbers this is straightforward and we end up with something like {0:0, 1:1, 2:2, 3:3, 4:4, 5:5, 6:6, 7:7, 8: -->

In [43]:
def generate_equations(operators, dataset_size, min_value=0, max_value=10):
    '''Build random (equation, solution) string pairs.

        Every equation is "<int><operator><int>": both operands are drawn
        with randint from [min_value, max_value] (inclusive) and the operator
        is picked at a random index of `operators`.
        The solution is the value of the equation, truncated to an integer
        and rendered as a string.

        operators: sequence of operator strings (e.g. "+-*" or ["+", "-"]).
        dataset_size: an integer, number of pairs to produce.
        min_value: an integer, smallest value an operand may take.
        max_value: an integer, largest value an operand may take.

        result: a list of tuples of strings (equation, solution).
    '''
    last_operator_index = len(operators) - 1
    pairs = []

    for _ in range(dataset_size):
        # The draw order (left operand, operator, right operand) fixes the
        # sequence that a given random seed produces.
        left = str(randint(min_value, max_value))
        operator = operators[randint(0, last_operator_index)]
        right = str(randint(min_value, max_value))
        equation = left + operator + right

        try:
            value = eval(equation)
        except ZeroDivisionError:
            # x/0 cannot be evaluated: turn the division into a
            # multiplication and keep the rewritten equation.
            equation = equation.replace("/", "*")
            value = eval(equation)

        pairs.append((equation, str(int(value))))

    return pairs

In [44]:
## Test generate_equations: draw a single pair using all four operators
## (including "/") and print the resulting [(equation, solution)] list.
test = generate_equations("+-/*",1)
print(test)

[('5*1', '5')]


## Prepare data for the neural network

The maximum input length is “length_nr + 1 + length_nr”, which we call `x_max_length` (for example 7). We would also like to have calculations that are shorter. This is entirely possible; however, the model below is built with a fixed input shape, so all inputs must have the same length. So when a calculation is shorter than `x_max_length`, we fill it up from the left with spaces, e.g. “____2+2”.
We also need to pad the solutions to `y_max_length` in the same way.

In [45]:
def padding_to_max(equations, x_max_length, y_max_length):
    '''Right-align every equation and solution by left-padding with spaces.

    Strings that are already at least as long as the requested width are
    returned unchanged (nothing is truncated).

    equations: a list of tuples of strings (equation, solution).
    x_max_length: width to which each equation is padded.
    y_max_length: width to which each solution is padded.

    return: a list of tuples of strings (equation, solution), padded to the
        given widths.
    '''
    # '>' right-aligns within the given width; the default fill is a space.
    equation_spec = f'>{x_max_length}'
    solution_spec = f'>{y_max_length}'
    return [
        (format(equation, equation_spec), format(solution, solution_spec))
        for equation, solution in equations
    ]

In [46]:
def to_ids(equations, word2id):
    '''Replace every character of each pair by its integer id.

    equations: a list of tuples of strings (equation, solution).
    word2id: mapping from a single character to its id; a character that is
        missing from the mapping raises KeyError.

    return: a list of tuples (equation_ids, solution_ids), each a list of ids.
    '''
    def encode(text):
        # One id per character, in the original order.
        return [word2id[symbol] for symbol in text]

    return [(encode(equation), encode(solution)) for equation, solution in equations]

In [47]:
def one_hot(equations, labels):
    '''Binarize the id sequences of every pair with a LabelBinarizer.

    equations: a list of tuples (equation_ids, solution_ids).
    labels: all ids the binarizer is fitted on.

    return: two lists (x, y) holding the transformed equations and the
        transformed solutions, in the order of `equations`.
    '''
    binarizer = LabelBinarizer().fit(labels)
    # Encode both halves of each pair in a single pass over the input.
    encoded = [
        (binarizer.transform(equation), binarizer.transform(solution))
        for equation, solution in equations
    ]
    x = [pair[0] for pair in encoded]
    y = [pair[1] for pair in encoded]
    return x, y

In [48]:
# invert encoding
def invert(seq, id2word):
    '''Decode a sequence of score vectors back into a string.

    seq: iterable of vectors; the position of each vector's largest entry
        (np.argmax) is taken as the symbol id.
    id2word: object indexed by that id to obtain the symbol.

    return: the decoded symbols joined into one string.
    '''
    return ''.join(id2word[np.argmax(vector)] for vector in seq)

In [71]:
def find_max_lengths(equations):
    '''Return the longest equation length and the longest solution length.

    equations: iterable of (equation, solution) pairs.

    return: a tuple (x_max_length, y_max_length); (0, 0) when there are no
        pairs.
    '''
    longest_equation = 0
    longest_solution = 0
    for equation, solution in equations:
        if len(equation) > longest_equation:
            longest_equation = len(equation)
        if len(solution) > longest_solution:
            longest_solution = len(solution)
    return longest_equation, longest_solution


def generate_data(n_samples, allowed_operators, alphabet, word2id, id2word,
                  max_equation_length=None, max_solution_length=None):
    '''Generate a fresh batch of encoded (equation, solution) arrays.

    Pipeline: random equations with operands in [0, 10] -> left-padding with
    spaces -> characters to ids -> one-hot encoding -> numpy arrays.

    n_samples: number of equations to generate.
    allowed_operators: operators to draw from (passed to generate_equations).
    alphabet: not used by this function; kept so existing calls keep working.
    word2id: mapping from character to id.
    id2word: mapping from id to character; its keys are the labels the
        one-hot encoder is fitted on.
    max_equation_length: width the equations are padded to. When None, the
        module-level `x_max_length` is used (the previous implicit behaviour).
    max_solution_length: width the solutions are padded to. When None, the
        module-level `y_max_length` is used.

    return: a tuple (x, y) of numpy arrays with the encoded equations and the
        encoded solutions.
    '''
    # generate pairs
    data = generate_equations(allowed_operators, n_samples, min_value=0, max_value=10)

    # The padding widths used to be read silently from module globals; they
    # can now be passed explicitly, with the globals kept as the fallback so
    # existing five-argument calls behave exactly as before.
    if max_equation_length is None:
        max_equation_length = x_max_length
    if max_solution_length is None:
        max_solution_length = y_max_length

    ## padding to max
    data = padding_to_max(data, max_equation_length, max_solution_length)

    ## string to indices
    data = to_ids(data, word2id)
    # one hot encoding
    x, y = one_hot(data, list(id2word.keys()))

    return np.array(x), np.array(y)


In [89]:
# Operators the generated equations may contain ("/" is left out here).
allowed_operators = '+-*'
# Number of equations generated per call to generate_data.
n_samples = 1000
# Every symbol that can occur in a padded equation or solution.
alphabet = '0123456789' + allowed_operators + ' ' ## space for padding
# Fix the random seed so the generated equations are reproducible.
seed(1)

# A symbol's id is its position in `alphabet`; id2word is the inverse mapping.
word2id = {symbol:i for i, symbol in enumerate(alphabet)}
id2word = {i:symbol for symbol, i in word2id.items()}
# Fixed widths for padded equations / solutions; generate_data reads these
# module-level names, so they must be set before it is called.
x_max_length, y_max_length = 7, 3
x, y = generate_data(n_samples, allowed_operators, alphabet, word2id, id2word)

In [90]:
# Sanity-check the encoded data: print the array shapes, the raw one-hot
# matrices of the first sample, and the strings they decode back to.
print("shape of X", x.shape)
print("shape of y", y.shape)
print("X[0]:")
print(x[0])
print("y[0]")
print(y[0])
print("invert X[0]", invert(x[0], id2word) )
print("invert y[0]", invert(y[0], id2word) )

shape of X (1000, 7, 14)
shape of y (1000, 3, 14)
X[0]:
[[0 0 0 0 0 0 0 0 0 0 0 0 0 1]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 1]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 1]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 1]
 [0 0 1 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 1 0]
 [0 1 0 0 0 0 0 0 0 0 0 0 0 0]]
y[0]
[[0 0 0 0 0 0 0 0 0 0 0 0 0 1]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 1]
 [0 0 1 0 0 0 0 0 0 0 0 0 0 0]]
invert X[0]     2*1
invert y[0]   2


## Model

In [91]:
# define LSTM configuration
# mini-batch size, used for both fitting and prediction
n_batch = 10
# number of training rounds; each round trains on a freshly generated dataset
n_epoch = 100
# size of the symbol vocabulary = width of every one-hot vector
n_chars = len(alphabet)

In [93]:
# create LSTM
model = Sequential()
# Encoder: reads the padded equation (x_max_length steps of n_chars-wide
# one-hot vectors) and summarises it in a single 100-dimensional vector.
model.add(LSTM(100, input_shape=(x_max_length, n_chars)))
# Present that vector once for every output position.
model.add(RepeatVector(y_max_length))
# Decoder: emits one 50-dimensional state per output position.
model.add(LSTM(50, return_sequences=True))
# Per-position softmax over the symbol vocabulary.
model.add(TimeDistributed(Dense(n_chars, activation='softmax')))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
model.summary()

Model: "sequential_8"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_10 (LSTM)              (None, 100)               46000     
                                                                 
 repeat_vector_5 (RepeatVect  (None, 3, 100)           0         
 or)                                                             
                                                                 
 lstm_11 (LSTM)              (None, 3, 50)             30200     
                                                                 
 time_distributed_5 (TimeDis  (None, 3, 14)            714       
 tributed)                                                       
                                                                 
Total params: 76,914
Trainable params: 76,914
Non-trainable params: 0
_________________________________________________________________
None


In [None]:
# train LSTM
for i in range(n_epoch):
    # Draw a brand-new random dataset every round instead of re-using one
    # fixed training set.
    x, y = generate_data(n_samples, allowed_operators, alphabet, word2id, id2word)    
    # Progress indicator: index of the current round.
    print(i)
    # A single pass over the freshly generated data.
    model.fit(x, y, epochs=1, batch_size=n_batch)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89


90
91
92
93
94
95


In [88]:
# evaluate on some new patterns
x, y = generate_data(n_samples, allowed_operators, alphabet, word2id, id2word)

result = model.predict(x, batch_size=n_batch, verbose=0)
# Decode targets and predictions back to strings, using the same id -> symbol
# mapping as for the inputs. Loop variables are named so they do not reuse the
# name of the data array `x`.
expected = [invert(target, id2word) for target in y]
predicted = [invert(prediction, id2word) for prediction in result]
# show some examples (at most 20, fewer if the batch is smaller)
for i in range(min(20, len(x))):
    print('%s Expected=%s, Predicted=%s' % (invert(x[i], id2word), expected[i], predicted[i]))

    0+8 Expected=  8, Predicted=  8
    1+8 Expected=  9, Predicted=  9
    6+8 Expected= 14, Predicted= 14
    7-4 Expected=  3, Predicted=  3
   10+2 Expected= 12, Predicted= 12
    0+2 Expected=  2, Predicted=  2
    1+9 Expected= 10, Predicted= 10
    3+6 Expected=  9, Predicted=  9
    7-7 Expected=  0, Predicted=  0
   10-9 Expected=  1, Predicted=  1
    8+1 Expected=  9, Predicted=  9
    9+6 Expected= 15, Predicted= 15
    4+2 Expected=  6, Predicted=  6
    5-1 Expected=  4, Predicted=  4
    4-7 Expected= -3, Predicted= -3
    2+0 Expected=  2, Predicted=  2
   10-4 Expected=  6, Predicted=  6
    9+1 Expected= 10, Predicted= 10
   10+4 Expected= 14, Predicted= 14
    9+0 Expected=  9, Predicted=  9
