## Concatenate all files

```bash
$ cd path/to/train-easy/
$ find -name '*.txt' -exec cat {} \; > ../../../interim/train-easy_all.txt
```

## Load libraries

In [25]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [26]:
import os
import pandas as pd
import numpy as np
from pathlib import Path
import glob
import pickle
import json
import matplotlib.pyplot as plt
from lstm import LSTM_Simple
from metrics import exact_match_metric
from callbacks import NValidationSetsCallback, GradientLogger
from generator import DataGenerator, DataGeneratorSeq

from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.optimizers import Adam
print(tf.__version__)
print("GPU Available: ", tf.test.is_gpu_available())

2.0.0-alpha0
GPU Available:  True


## Load settings

In [27]:
settings_path = Path('../../settings/settings.json')

In [28]:
with open(str(settings_path), 'r') as file:
    settings_dict = json.load(file)

In [29]:
settings_dict

{'batch_size': 1024,
 'data_path': '/storage/git/deep-math/data/raw/v1.0/',
 'epochs': 1,
 'latent_dim': 2048,
 'math_module': 'arithmetic',
 'save_path': '/artifacts/',
 'thinking_steps': 16,
 'train_level': 'easy'}

## Load data

Start with batching a single file before tackling the whole dataset.

In [30]:
raw_path = Path(settings_dict['data_path'])
!ls {raw_path}

extrapolate  interpolate  train-easy  train-hard  train-medium


In [31]:
interpolate_path = raw_path/'interpolate'
!ls {interpolate_path} | head -5

algebra__linear_1d.txt
algebra__linear_1d_composed.txt
algebra__linear_2d.txt
algebra__linear_2d_composed.txt
algebra__polynomial_roots.txt


In [32]:
extrapolate_path = raw_path/'extrapolate'
!ls {extrapolate_path} | head -5

algebra__polynomial_roots_big.txt
arithmetic__add_or_sub_big.txt
arithmetic__add_sub_multiple_longer.txt
arithmetic__div_big.txt
arithmetic__mixed_longer.txt


In [33]:
train_easy_path = raw_path/'train-easy/'
!ls {train_easy_path} | head -5

algebra__linear_1d.txt
algebra__linear_1d_composed.txt
algebra__linear_2d.txt
algebra__linear_2d_composed.txt
algebra__polynomial_roots.txt


In [34]:
def concatenate_texts(path, pattern):
    file_paths = list(path.glob('{}*.txt'.format(pattern)))
    
    input_texts = []
    target_texts = []

    for file_path in file_paths:
        with open(str(file_path), 'r', encoding='utf-8') as f:
            lines = f.read().split('\n')[:-1]

        input_texts.extend(lines[0::2])
        target_texts.extend(['\t' + target_text + '\n' for target_text in lines[1::2]])
        
    return input_texts, target_texts

### Data settings

In [35]:
math_module = settings_dict["math_module"]
train_level = settings_dict["train_level"]

In [36]:
datasets = {
    'train':(raw_path, 'train-' + train_level + '/' + math_module),
    'interpolate':(interpolate_path, math_module),
    'extrapolate':(extrapolate_path, math_module)
           }

In [37]:
%%time

input_texts = {}
target_texts = {}

for k, v in datasets.items():
    input_texts[k], target_texts[k] = concatenate_texts(v[0], v[1])
    print('Length of set {} is {}'.format(k, len(input_texts[k])))

Length of set interpolate is 90000
Length of set train is 5999994
Length of set extrapolate is 60000
CPU times: user 2.53 s, sys: 816 ms, total: 3.35 s
Wall time: 3.39 s


**Sample:**

In [38]:
print('INPUT:', input_texts['train'][42])
print('OUTPUT:', target_texts['train'][42].strip())

INPUT: What is 2 - (1 + -5) - 11?
OUTPUT: -5


Concatenate texts to get text metrics (max length, number of unique tokens, etc.):

In [39]:
all_input_texts = sum(input_texts.values(), [])
all_target_texts = sum(target_texts.values(), [])

In [40]:
input_characters = set(''.join(all_input_texts))
target_characters = set(''.join(all_target_texts))

In [41]:
tokens = sorted(list(input_characters | target_characters))
num_tokens = len(tokens)
max_seq_length  = max([len(txt_in) + len(txt_out) for txt_in, txt_out in zip(all_input_texts,all_target_texts)])

print('Number of samples:', len(all_input_texts))
print('number of tokens:', num_tokens)
print('max sequence length:', max_seq_length)

Number of samples: 6149994
number of tokens: 56
max sequence length: 188


### Delete all texts to realease memory

In [42]:
del all_input_texts
del all_target_texts

## Create train test splits

In [43]:
input_texts_train, input_texts_valid, target_texts_train, target_texts_valid = train_test_split(input_texts['train'], target_texts['train'], test_size=0.2, random_state=42)

In [44]:
print('Number of training samples:', len(input_texts_train))

Number of training samples: 4799995


In [45]:
print('Number of validation samples:', len(input_texts_valid))

Number of validation samples: 1199999


## Process text

### Vectorise the text
Before training, we need to map strings to a numerical representation. Create two lookup tables: one mapping question characters to numbers, and another for answer characters to number.

In [46]:
# Creating a mapping from unique characters to indices
token_index = dict([(char, i) for i, char in enumerate(tokens)])


In [47]:
token_index

{'\t': 0,
 '\n': 1,
 ' ': 2,
 '(': 3,
 ')': 4,
 '*': 5,
 '+': 6,
 ',': 7,
 '-': 8,
 '.': 9,
 '/': 10,
 '0': 11,
 '1': 12,
 '2': 13,
 '3': 14,
 '4': 15,
 '5': 16,
 '6': 17,
 '7': 18,
 '8': 19,
 '9': 20,
 '?': 21,
 'A': 22,
 'C': 23,
 'D': 24,
 'E': 25,
 'I': 26,
 'M': 27,
 'P': 28,
 'S': 29,
 'T': 30,
 'W': 31,
 'a': 32,
 'b': 33,
 'c': 34,
 'd': 35,
 'e': 36,
 'f': 37,
 'g': 38,
 'h': 39,
 'i': 40,
 'k': 41,
 'l': 42,
 'm': 43,
 'n': 44,
 'o': 45,
 'p': 46,
 'q': 47,
 'r': 48,
 's': 49,
 't': 50,
 'u': 51,
 'v': 52,
 'w': 53,
 'x': 54,
 'y': 55}

## Create keras data generator

In [48]:
# Parameters
params = {'batch_size': settings_dict["batch_size"],
          'max_seq_length': max_seq_length,
          'num_tokens': num_tokens,
          'token_index': token_index,
          'num_thinking_steps': settings_dict["thinking_steps"]
         }

In [49]:
training_generator = DataGeneratorSeq(input_texts=input_texts_train, target_texts=target_texts_train, **params)
validation_generator = DataGeneratorSeq(input_texts=input_texts_valid, target_texts=target_texts_valid, **params)
interpolate_generator = DataGeneratorSeq(input_texts=input_texts['interpolate'], target_texts=target_texts['interpolate'], **params)
extrapolate_generator = DataGeneratorSeq(input_texts=input_texts['extrapolate'], target_texts=target_texts['extrapolate'], **params)

In [50]:
a,b = training_generator.__getitem__(1)

## Train model

In [51]:
valid_dict = {
    'validation':validation_generator,
    'interpolation': interpolate_generator,
    'extrapolation': extrapolate_generator
}

In [52]:
history = NValidationSetsCallback(valid_dict)
gradient = GradientLogger(live_metrics=['loss', 'exact_match_metric'], live_gaps=10)

In [53]:
epochs = settings_dict['epochs']  # Number of epochs to train for.
latent_dim = settings_dict['latent_dim']  # Latent dimensionality of the encoding space.

In [54]:
lstm = LSTM_Simple(num_tokens, latent_dim)

In [55]:
model = lstm.get_model()

In [56]:
model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, None, 56)]        0         
_________________________________________________________________
cu_dnnlstm (CuDNNLSTM)       [(None, None, 2048), (Non 17252352  
_________________________________________________________________
dense (Dense)                (None, None, 56)          114744    
Total params: 17,367,096
Trainable params: 17,367,096
Non-trainable params: 0
_________________________________________________________________


In [58]:
adam = Adam(lr=6e-4, beta_1=0.9, beta_2=0.995, epsilon=1e-9, decay=0.0, amsgrad=False, clipnorm=0.1)

model.compile(optimizer=adam, loss='categorical_crossentropy', metrics=[exact_match_metric])
print('start training...')
train_hist = model.fit_generator(training_generator,
                                 epochs=epochs,
                                 callbacks=[history],#, gradient],
                                 verbose=1,
                                )

start training...
  22/4687 [..............................] - ETA: 3:24:50 - loss: 0.0863 - exact_match_metric: 0.0017

KeyboardInterrupt: 

In [1]:
plt.plot(train_hist.history['loss'],color='C0', label='train')
plt.plot(train_hist.history['validation_loss'], color='C0', label='valid', linestyle='--')
plt.plot(train_hist.history['extrapolation_loss'], color='C1', label='extra',)
plt.plot(train_hist.history['interpolation_loss'], color='C2', label='inter')

plt.xlabel('epochs')
plt.ylabel('loss')
plt.legend(loc='best')
plt.ylim([0,1])
plt.grid(True, linestyle='--')
plt.tight_layout()
plt.savefig(settings_dict['save_path'] + 'losses.png', dpi=300)

SyntaxError: invalid syntax (<ipython-input-1-aabe21b7fb28>, line 13)

In [None]:
plt.plot(train_hist.history['exact_match_metric'],color='C0', label='train')
plt.plot(train_hist.history['validation_exact_match_metric'], color='C0', label='valid', linestyle='--')
plt.plot(train_hist.history['extrapolation_exact_match_metric'], color='C1', label='extra',)
plt.plot(train_hist.history['interpolation_exact_match_metric'], color='C2', label='inter')

plt.xlabel('epochs')
plt.ylabel('exact match metric')
plt.legend(loc='best')
plt.ylim([0,1])
plt.grid(True, linestyle='--')
plt.tight_layout()
plt.savefig(settings_dict['save_path'] + 'metrics.png', dpi=300)

In [50]:
with open(settings_dict['save_path']+'experiments_output.pkl','wb') as file:
    pickle.dump(train_hist.history, file)

In [53]:
model.save(settings_dict['save_path']+'this_model.model')

In [78]:
with open(settings_dict['save_path']+'settings.json','w') as file:
    json.dump(settings_dict, file)

FileNotFoundError: [Errno 2] No such file or directory: '../../artifacts/settings.json'