## Concatenate all files

```bash
$ cd path/to/train-easy/
$ find . -name '*.txt' -exec cat {} \; > ../../../interim/train-easy_all.txt
```

## Load libraries

In [None]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import pandas as pd
import numpy as np
from pathlib import Path
import glob
import pickle
import json
import matplotlib.pyplot as plt
from lstm import LSTM_S2S
from metrics import exact_match_metric_index
from callbacks import NValidationSetsCallback, GradientLogger
import sys
sys.path.append('../../')
from src.models.generator import DataGeneratorAttention

from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Bidirectional, Layer, Input, Dense, LSTM, Embedding, Activation, dot, concatenate, TimeDistributed
from tensorflow.keras import backend as K
from tensorflow.keras.models import Model
from tensorflow.compat.v1.keras.layers import CuDNNLSTM
print(tf.__version__)
print("GPU Available: ", tf.test.is_gpu_available())

W0802 13:37:37.702202 4617815488 deprecation_wrapper.py:118] From /Users/lewtun/git/deep-math/notebooks/baselines/lstm.py:7: The name tf.keras.layers.CuDNNLSTM is deprecated. Please use tf.compat.v1.keras.layers.CuDNNLSTM instead.



2.0.0-beta1
GPU Available:  False


## Load settings

In [3]:
settings_path = Path('../../settings/settings_local.json')

In [4]:
# Parse the JSON settings file into the dict that configures the rest of the notebook.
with settings_path.open('r') as settings_file:
    settings_dict = json.load(settings_file)

In [5]:
settings_dict

{'math_module': 'numbers__round_number',
 'train_level': 'easy',
 'batch_size': 1024,
 'thinking_steps': 0,
 'epochs': 1,
 'latent_dim': 256,
 'save_path': '/artifacts/',
 'data_path': '../../data/raw/v1.0/'}

## Load data

Start with batching a single file before tackling the whole dataset.

In [6]:
raw_path = Path(settings_dict['data_path'])
!ls {raw_path}

[1m[36mextrapolate[m[m  [1m[36minterpolate[m[m  [1m[36mtrain-easy[m[m   [1m[36mtrain-hard[m[m   [1m[36mtrain-medium[m[m


In [7]:
interpolate_path = raw_path/'interpolate'
!ls {interpolate_path} | head -5

algebra__linear_1d.txt
algebra__linear_1d_composed.txt
algebra__linear_2d.txt
algebra__linear_2d_composed.txt
algebra__polynomial_roots.txt


In [8]:
extrapolate_path = raw_path/'extrapolate'
!ls {extrapolate_path} | head -5

algebra__polynomial_roots_big.txt
arithmetic__add_or_sub_big.txt
arithmetic__add_sub_multiple_longer.txt
arithmetic__div_big.txt
arithmetic__mixed_longer.txt


In [9]:
train_easy_path = raw_path/'train-easy/'
!ls {train_easy_path} | head -5

algebra__linear_1d.txt
algebra__linear_1d_composed.txt
algebra__linear_2d.txt
algebra__linear_2d_composed.txt
algebra__polynomial_roots.txt


In [10]:
def concatenate_texts(path, pattern):
    """Load question/answer pairs from every text file matching a prefix.

    Each file is read as alternating lines: a question on one line and its
    answer on the next.

    Args:
        path: Directory (a ``pathlib.Path``) searched with
            ``path.glob('<pattern>*.txt')``.
        pattern: Prefix of the file names to load. It may include a
            sub-directory component, e.g. ``'train-easy/algebra'``.

    Returns:
        A tuple ``(input_texts, target_texts)`` of two equal-length lists.
        ``input_texts`` holds the question strings unchanged; every entry of
        ``target_texts`` is the answer with a tab character prepended and a
        newline character appended.
    """
    # Sort so the sample order does not depend on the directory-listing order
    # of the underlying file system.
    file_paths = sorted(path.glob('{}*.txt'.format(pattern)))

    input_texts = []
    target_texts = []

    for file_path in file_paths:
        with open(str(file_path), 'r', encoding='utf-8') as f:
            lines = f.read().split('\n')

        # A trailing newline produces one empty final element. Drop only that
        # element, so a file without a trailing newline keeps its last answer.
        if lines and lines[-1] == '':
            lines.pop()

        # zip() pairs each question with its answer and ignores a dangling
        # final question, which keeps the two lists aligned.
        for question, answer in zip(lines[0::2], lines[1::2]):
            input_texts.append(question)
            target_texts.append('\t' + answer + '\n')

    return input_texts, target_texts

### Data settings

In [11]:
math_module = settings_dict["math_module"]
train_level = settings_dict["train_level"]

In [12]:
# Each split name maps to (directory to search, file-name prefix); the pair is
# passed on to concatenate_texts(path, pattern) when the data is loaded.
train_pattern = 'train-' + train_level + '/' + math_module
datasets = dict(
    train=(raw_path, train_pattern),
    interpolate=(interpolate_path, math_module),
    extrapolate=(extrapolate_path, math_module),
)

In [13]:
%%time

# Questions and answers for every split, keyed by split name.
input_texts = {}
target_texts = {}

for split_name, (split_dir, file_prefix) in datasets.items():
    questions, answers = concatenate_texts(split_dir, file_prefix)
    input_texts[split_name] = questions
    target_texts[split_name] = answers
    print('Length of set {} is {}'.format(split_name, len(questions)))

Length of set train is 1333332
Length of set interpolate is 20000
Length of set extrapolate is 10000
CPU times: user 617 ms, sys: 240 ms, total: 857 ms
Wall time: 864 ms


**Sample:**

In [14]:
# Show one randomly chosen training pair as a sanity check.
# randint's lower bound is inclusive and defaults to 0, so every sample
# (including the first) can be drawn; the previous lower bound of 1 skipped
# index 0 and failed on a one-element list.
random_idx = np.random.randint(len(input_texts['train']))
print('INPUT:', input_texts['train'][random_idx])
# strip() removes the tab/newline wrapped around the stored answer.
print('OUTPUT:', target_texts['train'][random_idx].strip())

INPUT: Round 0.000001671 to 7 dps.
OUTPUT: 0.0000017


Concatenate texts to get text metrics (max length, number of unique tokens, etc.):

In [15]:
# Flatten the per-split lists (in dict order) into one list of questions and
# one list of answers.
all_input_texts = []
for split_questions in input_texts.values():
    all_input_texts.extend(split_questions)

all_target_texts = []
for split_answers in target_texts.values():
    all_target_texts.extend(split_answers)

In [16]:
input_characters = set(''.join(all_input_texts))
target_characters = set(''.join(all_target_texts))

In [17]:
# Sets are unordered, so sort each vocabulary to get a stable character order.
input_characters = sorted(input_characters)
target_characters = sorted(target_characters)

# Vocabulary sizes and the longest question / answer across all splits.
num_encoder_tokens = len(input_characters)
num_decoder_tokens = len(target_characters)
max_encoder_seq_length = max(map(len, all_input_texts))
max_decoder_seq_length = max(map(len, all_target_texts))

for label, value in (
    ('Number of samples:', len(all_input_texts)),
    ('Number of unique input tokens:', num_encoder_tokens),
    ('Number of unique output tokens:', num_decoder_tokens),
    ('Max sequence length for inputs:', max_encoder_seq_length),
    ('Max sequence length for outputs:', max_decoder_seq_length),
):
    print(label, value)

Number of samples: 1363332
Number of unique input tokens: 51
Number of unique output tokens: 14
Max sequence length for inputs: 160
Max sequence length for outputs: 16


### Delete the concatenated texts to release memory

In [18]:
del all_input_texts
del all_target_texts

## Create train test splits

In [19]:
# Hold out 20% of the training questions/answers for validation; the fixed
# random_state makes the split repeatable.
train_valid_split = train_test_split(
    input_texts['train'],
    target_texts['train'],
    test_size=0.2,
    random_state=42,
)
input_texts_train, input_texts_valid, target_texts_train, target_texts_valid = train_valid_split

In [20]:
print('Number of training samples:', len(input_texts_train))

Number of training samples: 1066665


In [21]:
print('Number of validation samples:', len(input_texts_valid))

Number of validation samples: 266667


## Process text

### Vectorise the text
Before training, we need to map strings to a numerical representation. Create two lookup tables: one mapping question characters to numbers, and another mapping answer characters to numbers.

In [22]:
# Lookup tables from each unique character to its integer index; indices start
# at 0 and follow the order of the character lists.
input_token_index = {char: idx for idx, char in enumerate(input_characters)}
target_token_index = {char: idx for idx, char in enumerate(target_characters)}

## Create Keras data generators

In [23]:
# Parameters
params = {'batch_size': settings_dict["batch_size"],
          'max_encoder_seq_length': max_encoder_seq_length,
          'max_decoder_seq_length': max_decoder_seq_length,
          'num_encoder_tokens': num_encoder_tokens,
          'num_decoder_tokens': num_decoder_tokens,
          'input_token_index': input_token_index,
          'target_token_index': target_token_index,
          'num_thinking_steps': settings_dict["thinking_steps"]
         }

In [24]:
training_generator = DataGeneratorAttention(input_texts=input_texts_train, target_texts=target_texts_train, **params)
validation_generator = DataGeneratorAttention(input_texts=input_texts_valid, target_texts=target_texts_valid, **params)
interpolate_generator = DataGeneratorAttention(input_texts=input_texts['interpolate'], target_texts=target_texts['interpolate'], **params)
extrapolate_generator = DataGeneratorAttention(input_texts=input_texts['extrapolate'], target_texts=target_texts['extrapolate'], **params)

In [25]:
example_idx = 0
# example_input_batch, example_target_batch = training_generator[example_idx][0][0][:,:,0], training_generator[example_idx][0][1][:,:,0]
# example_input_batch.shape, example_target_batch.shape

In [26]:
training_generator[example_idx][0][0]

array([[22., 30., 45., ...,  0.,  0.,  0.],
       [25., 33., 26., ...,  0.,  0.,  0.],
       [23., 40., 46., ...,  0.,  0.,  0.],
       ...,
       [25., 33., 26., ...,  0.,  0.,  0.],
       [25., 33., 26., ...,  0.,  0.,  0.],
       [23., 40., 46., ...,  0.,  0.,  0.]], dtype=float32)

In [27]:
print(training_generator[example_idx][0][0].shape)
print(training_generator[example_idx][0][1].shape)
print(training_generator[example_idx][1].shape)

(1024, 160)
(1024, 16)
(1024, 16, 14)


In [40]:
encoder_inputs = Input(shape=(max_encoder_seq_length,))
decoder_inputs = Input(shape=(max_decoder_seq_length,))

In [48]:
decoder_inputs.shape

TensorShape([None, 16])

In [45]:
# Embed the question characters. mask_zero=True makes Keras treat index 0 as
# padding, which is why the embedding has one more row than the vocabulary.
encoder = Embedding(num_encoder_tokens + 1, 64, input_length=max_encoder_seq_length, mask_zero=True)(encoder_inputs)
print(encoder)
# Use the configured latent size instead of a hard-coded 256 so that the
# 'latent_dim' setting actually controls the model width.
# return_sequences=True keeps every time step, which the attention layers need.
encoder_outputs = LSTM(settings_dict['latent_dim'], return_sequences=True, unroll=True)(encoder)
print(encoder_outputs)

Tensor("embedding_4/Identity:0", shape=(None, 160, 64), dtype=float32)
Tensor("lstm_4/Identity:0", shape=(None, 160, 256), dtype=float32)


In [53]:
# Encoder output at the final time step; it seeds the decoder's initial state.
encoder_last = encoder_outputs[:, -1, :]
# Pin the static shape, taking the feature size from the encoder output rather
# than repeating the literal 256, so this stays correct if the width changes.
encoder_last.set_shape([None, int(encoder_outputs.shape[-1])])
print('encoder', encoder_last)

encoder Tensor("strided_slice_3:0", shape=(None, 256), dtype=float32)


In [56]:
# Embed the answer characters. mask_zero=True makes Keras treat index 0 as
# padding, which is why the embedding has one more row than the vocabulary.
decoder = Embedding(num_decoder_tokens + 1, 64, input_length=max_decoder_seq_length, mask_zero=True)(decoder_inputs)
print(decoder)
# The decoder LSTM must have as many units as the state it is initialised with,
# so derive the width from encoder_last instead of repeating the literal 256.
decoder_units = int(encoder_last.shape[-1])
# The same encoder vector is used for both the hidden and the cell state.
decoder_outputs = LSTM(decoder_units, return_sequences=True, unroll=True)(decoder, initial_state=[encoder_last, encoder_last])

print('decoder', decoder_outputs)

Tensor("embedding_11/Identity:0", shape=(None, 16, 64), dtype=float32)
decoder Tensor("lstm_11/Identity:0", shape=(None, 16, 256), dtype=float32)


In [58]:
# Attention head on top of the encoder/decoder LSTM outputs.
# NOTE(review): "the paper" is not named anywhere in this notebook; the equation
# numbers look like Luong et al. (2015), "Effective Approaches to
# Attention-based Neural Machine Translation" -- confirm and cite it.
#
# Equation (7) with the 'dot' score from Section 3.1 of the paper: the score of
# every (decoder step, encoder step) pair is the dot product of their outputs.
# A Softmax activation layer is reused here instead of a hand-written tensor op.
attention = dot([decoder_outputs, encoder_outputs], axes=[2, 2])
print('attention', attention)
# Normalise the scores over the encoder time axis (the last axis).
# NOTE(review): no mask is applied here, so padded encoder positions appear to
# take part in the softmax -- confirm whether that is intended.
attention = Activation('softmax', name='attention')(attention)
print('attention', attention)

# Context vector per decoder step: attention-weighted sum of encoder outputs.
context = dot([attention, encoder_outputs], axes=[2,1])
print('context', context)

# Join context and decoder output along the feature axis for each decoder step.
decoder_combined_context = concatenate([context, decoder_outputs])
print('decoder_combined_context', decoder_combined_context)

# Extra weight + tanh layer as described in equation (5) of the paper,
# applied independently at every decoder step (hidden size hard-coded to 64).
output = TimeDistributed(Dense(64, activation="tanh"))(decoder_combined_context)
print('output 1',output)
# Per-step probability distribution over the answer characters.
output = TimeDistributed(Dense(num_decoder_tokens, activation="softmax"))(output)
print('output', output)

attention Tensor("dot_4/Identity:0", shape=(None, 16, 160), dtype=float32)
attention Tensor("attention_2/Identity:0", shape=(None, 16, 160), dtype=float32)
context Tensor("dot_5/Identity:0", shape=(None, 16, 256), dtype=float32)
decoder_combined_context Tensor("concatenate_2/Identity:0", shape=(None, 16, 512), dtype=float32)
output 1 Tensor("time_distributed_4/Identity:0", shape=(None, 16, 64), dtype=float32)
output Tensor("time_distributed_5/Identity:0", shape=(None, 16, 14), dtype=float32)


In [33]:
valid_dict = {
    'validation':validation_generator,
    'interpolation': interpolate_generator,
    'extrapolation': extrapolate_generator
}

In [34]:
history = NValidationSetsCallback(valid_dict)
gradient = GradientLogger(live_metrics=['loss', 'exact_match_metric_index'], live_gaps=10)

In [35]:
epochs = settings_dict['epochs']  # Number of epochs to train for.
latent_dim = settings_dict['latent_dim']  # Latent dimensionality of the encoding space.

In [36]:
model = Model(inputs=[encoder_inputs, decoder_inputs], outputs=[output])

In [37]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            [(None, 16)]         0                                            
__________________________________________________________________________________________________
input_1 (InputLayer)            [(None, 160)]        0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 16, 64)       960         input_2[0][0]                    
__________________________________________________________________________________________________
embedding (Embedding)           (None, 160, 64)      3328        input_1[0][0]                    
______________________________________________________________________________________________

In [38]:
# Adam with gradient-norm clipping. 'learning_rate' is the current argument
# name; 'lr' is only kept by Keras as a deprecated alias.
adam = Adam(learning_rate=6e-4, beta_1=0.9, beta_2=0.995, epsilon=1e-9, decay=0.0, amsgrad=False, clipnorm=0.1)

# Targets are one-hot encoded per decoder step, hence categorical cross-entropy.
model.compile(optimizer=adam, loss='categorical_crossentropy', metrics=[exact_match_metric_index])

In [39]:
# Train on batches produced by training_generator for the configured number
# of epochs.
# - No validation_data is passed; the validation / interpolation / extrapolation
#   generators were handed to NValidationSetsCallback (`history`) instead.
#   NOTE(review): presumably that callback evaluates them itself -- verify in
#   callbacks.py.
# - verbose=0 silences the Keras progress output, so whatever appears below the
#   cell comes from the callbacks.
# - Multiprocessing is left disabled (see the commented-out arguments).
print('start training...')
train_hist = model.fit_generator(training_generator,
                                 epochs=epochs,
                                 #use_multiprocessing=True, workers=8,
                                 callbacks=[history, gradient],
                                 verbose=0,
                                )

start training...
{"chart": "live_loss", "axis": "batch"}
{"chart": "live_exact_match_metric_index", "axis": "batch"}
{"chart": "loss", "axis": "epoch"}
{"chart": "exact_match_metric_index", "axis": "epoch"}
{"chart": "live_loss", "y": 5.4358835, "x": 10}
{"chart": "live_exact_match_metric_index", "y": 0.6963021, "x": 10}
{"chart": "live_loss", "y": 4.521158, "x": 20}
{"chart": "live_exact_match_metric_index", "y": 0.69636446, "x": 20}
{"chart": "live_loss", "y": 4.378515, "x": 30}
{"chart": "live_exact_match_metric_index", "y": 0.6968709, "x": 30}
{"chart": "live_loss", "y": 4.3312016, "x": 40}
{"chart": "live_exact_match_metric_index", "y": 0.6967309, "x": 40}
{"chart": "live_loss", "y": 4.27552, "x": 50}
{"chart": "live_exact_match_metric_index", "y": 0.6966175, "x": 50}
{"chart": "live_loss", "y": 4.26984, "x": 60}
{"chart": "live_exact_match_metric_index", "y": 0.69640714, "x": 60}
{"chart": "live_loss", "y": 3.9274971, "x": 70}
{"chart": "live_exact_match_metric_index", "y": 0.69

KeyboardInterrupt: 