## Concatenate all files

```bash
$ cd path/to/train-easy/
$ find -name '*.txt' -exec cat {} \; > ../../../interim/train-easy_all.txt
```

## Load libraries

In [1]:
%load_ext autoreload
%autoreload 2

In [635]:
import os
import pandas as pd
import numpy as np
from pathlib import Path
import glob
import pickle
import json
import matplotlib.pyplot as plt
from lstm import LSTM_S2S
from metrics import exact_match_metric
from callbacks import NValidationSetsCallback, GradientLogger
from generator import DataGenerator

from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Bidirectional, Layer, Input, Dense, LSTM, Embedding, Activation, dot, concatenate, TimeDistributed
from tensorflow.keras import backend as K
from tensorflow.keras.models import Model
from tensorflow.compat.v1.keras.layers import CuDNNLSTM
print(tf.__version__)
print("GPU Available: ", tf.test.is_gpu_available())

2.0.0-beta1
GPU Available:  False


## Load settings

In [6]:
settings_path = Path('../../settings/settings_local.json')

In [19]:
with open(str(settings_path), 'r') as file:
    settings_dict = json.load(file)

In [20]:
settings_dict

{'math_module': 'numbers__round_number',
 'train_level': 'easy',
 'batch_size': 1024,
 'thinking_steps': 0,
 'epochs': 1,
 'latent_dim': 256,
 'save_path': '/artifacts/',
 'data_path': '../../data/raw/v1.0/'}

## Load data

Start with batching a single file before tackling the whole dataset.

In [21]:
raw_path = Path(settings_dict['data_path'])
!ls {raw_path}

[1m[36mextrapolate[m[m  [1m[36minterpolate[m[m  [1m[36mtrain-easy[m[m   [1m[36mtrain-hard[m[m   [1m[36mtrain-medium[m[m


In [22]:
interpolate_path = raw_path/'interpolate'
!ls {interpolate_path} | head -5

algebra__linear_1d.txt
algebra__linear_1d_composed.txt
algebra__linear_2d.txt
algebra__linear_2d_composed.txt
algebra__polynomial_roots.txt


In [23]:
extrapolate_path = raw_path/'extrapolate'
!ls {extrapolate_path} | head -5

algebra__polynomial_roots_big.txt
arithmetic__add_or_sub_big.txt
arithmetic__add_sub_multiple_longer.txt
arithmetic__div_big.txt
arithmetic__mixed_longer.txt


In [24]:
train_easy_path = raw_path/'train-easy/'
!ls {train_easy_path} | head -5

algebra__linear_1d.txt
algebra__linear_1d_composed.txt
algebra__linear_2d.txt
algebra__linear_2d_composed.txt
algebra__polynomial_roots.txt


In [25]:
def concatenate_texts(path, pattern):
    file_paths = list(path.glob('{}*.txt'.format(pattern)))
    
    input_texts = []
    target_texts = []

    for file_path in file_paths:
        with open(str(file_path), 'r', encoding='utf-8') as f:
            lines = f.read().split('\n')[:-1]

        input_texts.extend(lines[0::2])
        target_texts.extend(['\t' + target_text + '\n' for target_text in lines[1::2]])
        
    return input_texts, target_texts

### Data settings

In [26]:
math_module = settings_dict["math_module"]
train_level = settings_dict["train_level"]

In [27]:
datasets = {
    'train':(raw_path, 'train-' + train_level + '/' + math_module),
    'interpolate':(interpolate_path, math_module),
    'extrapolate':(extrapolate_path, math_module)
           }

In [28]:
%%time

input_texts = {}
target_texts = {}

for k, v in datasets.items():
    input_texts[k], target_texts[k] = concatenate_texts(v[0], v[1])
    print('Length of set {} is {}'.format(k, len(input_texts[k])))

Length of set train is 1333332
Length of set interpolate is 20000
Length of set extrapolate is 10000
CPU times: user 589 ms, sys: 199 ms, total: 788 ms
Wall time: 797 ms


**Sample:**

In [30]:
random_idx = np.random.randint(1, len(input_texts['train']))
print('INPUT:', input_texts['train'][random_idx])
print('OUTPUT:', target_texts['train'][random_idx].strip())

INPUT: Let n(x) = 53*x + 4. Let m be n(4). What is m rounded to the nearest ten?
OUTPUT: 220


Concatenate texts to get text metrics (max length, number of unique tokens, etc.):

In [31]:
all_input_texts = sum(input_texts.values(), [])
all_target_texts = sum(target_texts.values(), [])

In [32]:
input_characters = set(''.join(all_input_texts))
target_characters = set(''.join(all_target_texts))

In [33]:
input_characters = sorted(list(input_characters))
target_characters = sorted(list(target_characters))
num_encoder_tokens = len(input_characters)
num_decoder_tokens = len(target_characters)
max_encoder_seq_length = max([len(txt) for txt in all_input_texts])
max_decoder_seq_length = max([len(txt) for txt in all_target_texts])

print('Number of samples:', len(all_input_texts))
print('Number of unique input tokens:', num_encoder_tokens)
print('Number of unique output tokens:', num_decoder_tokens)
print('Max sequence length for inputs:', max_encoder_seq_length)
print('Max sequence length for outputs:', max_decoder_seq_length)

Number of samples: 1363332
Number of unique input tokens: 51
Number of unique output tokens: 14
Max sequence length for inputs: 160
Max sequence length for outputs: 16


### Delete all texts to realease memory

In [34]:
del all_input_texts
del all_target_texts

## Create train test splits

In [35]:
input_texts_train, input_texts_valid, target_texts_train, target_texts_valid = train_test_split(input_texts['train'], target_texts['train'], test_size=0.2, random_state=42)

In [36]:
print('Number of training samples:', len(input_texts_train))

Number of training samples: 1066665


In [37]:
print('Number of validation samples:', len(input_texts_valid))

Number of validation samples: 266667


## Process text

### Vectorise the text
Before training, we need to map strings to a numerical representation. Create two lookup tables: one mapping question characters to numbers, and another for answer characters to number.

In [38]:
# Creating a mapping from unique characters to indices
input_token_index = dict([(char, i) for i, char in enumerate(input_characters)])
target_token_index = dict([(char, i) for i, char in enumerate(target_characters)])

## Create keras data generator

In [40]:
# Parameters
params = {'batch_size': settings_dict["batch_size"],
          'max_encoder_seq_length': max_encoder_seq_length,
          'max_decoder_seq_length': max_decoder_seq_length,
          'num_encoder_tokens': num_encoder_tokens,
          'num_decoder_tokens': num_decoder_tokens,
          'input_token_index': input_token_index,
          'target_token_index': target_token_index,
          'num_thinking_steps': settings_dict["thinking_steps"]
         }

In [41]:
training_generator = DataGenerator(input_texts=input_texts_train, target_texts=target_texts_train, **params)
validation_generator = DataGenerator(input_texts=input_texts_valid, target_texts=target_texts_valid, **params)
interpolate_generator = DataGenerator(input_texts=input_texts['interpolate'], target_texts=target_texts['interpolate'], **params)
extrapolate_generator = DataGenerator(input_texts=input_texts['extrapolate'], target_texts=target_texts['extrapolate'], **params)

In [204]:
example_idx = 0
example_input_batch, example_target_batch = training_generator[example_idx][0][0][:,:,0], training_generator[example_idx][0][1][:,:,0]
example_input_batch.shape, example_target_batch.shape

((1024, 160), (1024, 16))

In [760]:
training_generator[example_idx][0][0].shape

(1024, 160, 51)

## Train model

In [78]:
class Encoder(tf.keras.Model):
    def __init__(self, vocab_size, enc_units, batch_sz):
        super(Encoder, self).__init__()
        self.batch_sz = batch_sz
        self.enc_units = enc_units
        self.vocab_size = vocab_size
#         self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.lstm = LSTM(self.enc_units, return_sequences=True, return_state=True)
    
    def call(self, x):
#         x = self.embedding(x)
        output, state_h, state_c = self.lstm(x)
        return output, state_h, state_c
    
    def initialize_hidden_state(self):
        return tf.zeros((self.batch_sz, self.enc_units))

    def get_placeholder(self):
        return self.lstm(Input(shape=(None, self.enc_units)))

In [79]:
encoder = Encoder(num_encoder_tokens, 256, 0)

In [80]:
# sample input
sample_hidden = encoder.initialize_hidden_state()
sample_output, sample_h, sample_c = encoder.get_placeholder()
print ('Encoder output shape: (batch size, sequence length, units) {}'.format(sample_output.shape))
print ('Encoder Hidden state shape: (batch size, units) {}'.format(sample_hidden.shape))

Encoder output shape: (batch size, sequence length, units) (None, None, 256)
Encoder Hidden state shape: (batch size, units) (0, 256)


In [81]:
class BahdanauAttention(tf.keras.Model):

    def __init__(self, units):
        super(BahdanauAttention, self).__init__()
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)

    def call(self, query, values):

    # hidden shape == (batch_size, hidden size)
    # hidden_with_time_axis shape == (batch_size, 1, hidden size)
    # we are doing this to perform addition to calculate the score

        hidden_with_time_axis = tf.expand_dims(query, 1)

    # score shape == (batch_size, max_length, 1)
    # we get 1 at the last axis because we are applying score to self.V
    # the shape of the tensor before applying self.V is (batch_size, max_length, units)

        score = self.V(tf.nn.tanh(self.W1(values)
                       + self.W2(hidden_with_time_axis)))

    # attention_weights shape == (batch_size, max_length, 1)

        attention_weights = tf.nn.softmax(score, axis=1)

    # context_vector shape after sum == (batch_size, hidden_size)

        context_vector = attention_weights * values
        context_vector = tf.reduce_sum(context_vector, axis=1)

        return (context_vector, attention_weights)

In [82]:
attention_layer = BahdanauAttention(10)
attention_result, attention_weights = attention_layer(sample_hidden, sample_output)

print("Attention result shape: (batch size, units) {}".format(attention_result.shape))
print("Attention weights shape: (batch_size, sequence_length, 1) {}".format(attention_weights.shape))

Attention result shape: (batch size, units) (None, 256)
Attention weights shape: (batch_size, sequence_length, 1) (None, None, 1)


In [112]:
class Decoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, dec_units):
        super(Decoder, self).__init__()
        self.dec_units = dec_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.lstm = LSTM(self.dec_units,
                                       return_sequences=True,
                                       return_state=True,
                                       )
        self.fc = tf.keras.layers.Dense(vocab_size)

        # used for attention
        self.attention = BahdanauAttention(self.dec_units)

    def call(self, x, hidden, enc_output):
        # enc_output shape == (batch_size, max_length, hidden_size)
        context_vector, attention_weights = self.attention(hidden, enc_output)

        # x shape after passing through embedding == (batch_size, 1, embedding_dim)
        x = self.embedding(x)

        # x shape after concatenation == (batch_size, 1, embedding_dim + hidden_size)
        x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)

        # passing the concatenated vector to the GRU
        output, _, _ = self.lstm(x)

        # output shape == (batch_size * 1, hidden_size)
        output = tf.reshape(output, (-1, output.shape[2]))

        # output shape == (batch_size, vocab)
        x = self.fc(output)

        return x, attention_weights

In [114]:
decoder = Decoder(num_decoder_tokens, 256, 256)

sample_decoder_output, _ = decoder(tf.random.uniform((1024, 1)),
                                      sample_hidden, sample_output)

print ('Decoder output shape: (batch_size, vocab size) {}'.format(sample_decoder_output.shape))

Decoder output shape: (batch_size, vocab size) (1024, 14)


In [117]:
class LstmWithAttention:
    def __init__(self, num_encoder_tokens, num_decoder_tokens, latent_dim):
        self.num_encoder_tokens = num_encoder_tokens
        self.num_decoder_tokens = num_decoder_tokens
        self.latent_dim = latent_dim

    def get_model(self):
        # Define an input sequence and process it.
        self.encoder_inputs = Input(shape=(None, self.num_encoder_tokens))
        # Use CuDNNLSTM if running on GPU
        if tf.test.is_gpu_available():
            encoder = CuDNNLSTM(self.latent_dim, return_state=True)
        else:
            encoder = LSTM(self.latent_dim, return_state=True)
        encoder_outputs, state_h, state_c = encoder(self.encoder_inputs)
        # We discard `encoder_outputs` and only keep the states.
        self.encoder_states = [state_h, state_c]

        # Set up the decoder, using `encoder_states` as initial state.
        self.decoder_inputs = Input(shape=(None, self.num_decoder_tokens))
        # We set up our decoder to return full output sequences,
        # and to return internal states as well. We don't use the
        # return states in the training model, but we will use them in inference.
        if tf.test.is_gpu_available():
            self.decoder_lstm = CuDNNLSTM(
                self.latent_dim, return_sequences=True, return_state=True
            )
        else:
            self.decoder_lstm = Decoder(self.num_decoder_tokens, self.latent_dim, self.latent_dim)
        decoder_outputs, _, _ = self.decoder_lstm(
            self.decoder_inputs, self.encoder_states, encoder_outputs
        )
        
        # Attention layer
        self.attention = BahdanauAttention(self.latent_dim)
        context_vector, attention_weights = self.attention(self.encoder_states, encoder_outputs)
        
        self.decoder_dense = Dense(self.num_decoder_tokens, activation="softmax")
        decoder_outputs = self.decoder_dense(decoder_outputs)

        return Model([self.encoder_inputs, self.decoder_inputs], decoder_outputs)

In [162]:
class Encoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, enc_units):
        super(Encoder, self).__init__()
        self.enc_units = enc_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.lstm = tf.keras.layers.LSTM(self.enc_units,
                                       return_sequences=True,
                                       return_state=True,
                                       recurrent_initializer='glorot_uniform')

    def call(self, x):
        x = self.embedding(x)
        output, state, _ = self.lstm(x)
        return output, state

In [163]:
embedding_dim = 256
units = 256

In [205]:
encoder = Encoder(num_encoder_tokens, embedding_dim, units)

# sample input
sample_output, sample_hidden = encoder(example_input_batch)
print ('Encoder output shape: (batch_size, sequence length, units) {}'.format(sample_output.shape))
print ('Encoder Hidden state shape: (batch size, units) {}'.format(sample_hidden.shape))

Encoder output shape: (batch_size, sequence length, units) (1024, 160, 256)
Encoder Hidden state shape: (batch size, units) (1024, 256)


In [167]:
class BahdanauAttention(tf.keras.Model):
    def __init__(self, units):
        super(BahdanauAttention, self).__init__()
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)

    def call(self, query, values):
        # hidden shape == (batch_size, hidden size)
        # hidden_with_time_axis shape == (batch_size, 1, hidden size)
        # we are doing this to perform addition to calculate the score
        hidden_with_time_axis = tf.expand_dims(query, 1)

        # score shape == (batch_size, max_length, 1)
        # we get 1 at the last axis because we are applying score to self.V
        # the shape of the tensor before applying self.V is (batch_size, max_length, units)
        score = self.V(tf.nn.tanh(
            self.W1(values) + self.W2(hidden_with_time_axis)))

        # attention_weights shape == (batch_size, max_length, 1)
        attention_weights = tf.nn.softmax(score, axis=1)

        # context_vector shape after sum == (batch_size, hidden_size)
        context_vector = attention_weights * values
        context_vector = tf.reduce_sum(context_vector, axis=1)

        return context_vector, attention_weights

In [168]:
attention_layer = BahdanauAttention(10)
attention_result, attention_weights = attention_layer(sample_hidden, sample_output)

print("Attention result shape: (batch size, units) {}".format(attention_result.shape))
print("Attention weights shape: (batch_size, sequence_length, 1) {}".format(attention_weights.shape))

Attention result shape: (batch size, units) (1024, 256)
Attention weights shape: (batch_size, sequence_length, 1) (1024, 160, 1)


In [414]:
class Decoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, dec_units):
        super(Decoder, self).__init__()
        self.dec_units = dec_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.lstm = tf.keras.layers.LSTM(self.dec_units,
                                       return_sequences=True,
                                       return_state=True,
                                       recurrent_initializer='glorot_uniform')
        self.fc = tf.keras.layers.Dense(vocab_size)

        # used for attention
        self.attention = BahdanauAttention(self.dec_units)

    def call(self, x, hidden, enc_output):
        # enc_output shape == (batch_size, max_length, hidden_size)
        context_vector, attention_weights = self.attention(hidden, enc_output)
        print(context_vector.shape)

        # x shape after passing through embedding == (batch_size, 1, embedding_dim)
        x = self.embedding(x)

        # x shape after concatenation == (batch_size, 1, embedding_dim + hidden_size)
        x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)

        # passing the concatenated vector to the GRU
        output, state, _ = self.lstm(x)

        # output shape == (batch_size * 1, hidden_size)
        output = tf.reshape(output, (-1, output.shape[2]))

        # output shape == (batch_size, vocab)
        x = self.fc(output)

        return x, state, attention_weights

In [415]:
decoder = Decoder(num_decoder_tokens, embedding_dim, units)

sample_decoder_output, _, _ = decoder(tf.random.uniform((1024, 14)),
                                      sample_hidden, sample_output)

print ('Decoder output shape: (batch_size, vocab size) {}'.format(sample_decoder_output.shape))

(1024, 256)


InvalidArgumentError: ConcatOp : Dimensions of inputs should match: shape[0] = [1024,1,256] vs. shape[1] = [1024,14,256] [Op:ConcatV2] name: concat

In [395]:
class LstmWithAttention:
    def __init__(self, num_encoder_tokens, num_decoder_tokens, latent_dim, embedding_dim):
        self.num_encoder_tokens = num_encoder_tokens
        self.num_decoder_tokens = num_decoder_tokens
        self.latent_dim = latent_dim
        self.embedding_dim = embedding_dim

    def get_model(self):
        # Define an input sequence and process it.
        self.encoder_inputs = Input(shape=(max_encoder_seq_length))
        encoder = Encoder(self.num_encoder_tokens, self.latent_dim, self.embedding_dim)
        encoder_outputs, state_h = encoder(self.encoder_inputs)
        # We discard `encoder_outputs` and only keep the states.
#         self.encoder_states = [state_h, state_c]

        # Set up the decoder, using `encoder_states` as initial state.
        self.decoder_inputs = Input(shape=(1,))
        # We set up our decoder to return full output sequences,
        # and to return internal states as well. We don't use the
        # return states in the training model, but we will use them in inference.
        decoder = Decoder(self.num_decoder_tokens, self.embedding_dim, self.latent_dim)
        decoder_outputs, _, _ = decoder(self.decoder_inputs, state_h, encoder_outputs)
        
#         self.decoder_dense = Dense(self.num_decoder_tokens, activation="softmax")
#         decoder_outputs = self.decoder_dense(decoder_outputs)
        
        print('encoder_inputs shape:', self.encoder_inputs.shape)
        print('decoder_inputs shape:', self.decoder_inputs.shape)
        print('decoder_ouptut shape:', decoder_outputs.shape)

        return Model([self.encoder_inputs, self.decoder_inputs], decoder_outputs)

In [825]:
class LstmWithAttention:
    def __init__(self, num_encoder_tokens, num_decoder_tokens, latent_dim, embedding_dim):
        self.num_encoder_tokens = num_encoder_tokens
        self.num_decoder_tokens = num_decoder_tokens
        self.latent_dim = latent_dim
        self.embedding_dim = embedding_dim

    def get_model(self):
        # Define an input sequence and process it.
        self.encoder_inputs = Input(shape=(num_encoder_tokens,))
        x = self.encoder_inputs
        x = Embedding(self.num_encoder_tokens, self.embedding_dim, input_length=max_encoder_seq_length, mask_zero=True)(self.encoder_inputs)
        encoder = LSTM(self.latent_dim, return_sequences=True, return_state=True)
        encoder_outputs, state_h, state_c = encoder(x)
        # We discard `encoder_outputs` and only keep the states.
        self.encoder_states = [state_h, state_c]

        print('encoder', encoder_outputs)
        
        self.decoder_inputs = Input(shape=(num_decoder_tokens,))
        y = self.decoder_inputs
        y = Embedding(self.num_decoder_tokens, self.embedding_dim, input_length=max_decoder_seq_length, mask_zero=True)(self.decoder_inputs)
        decoder = LSTM(self.latent_dim, return_sequences=True, return_state=True)
        decoder_outputs, _, _ = decoder(y, initial_state=self.encoder_states)
        
        print('decoder', decoder_outputs)
        
        # Equation (7) with 'dot' score from Section 3.1 in the paper.
        # Note that we reuse Softmax-activation layer instead of writing tensor calculation
        attention = dot([decoder_outputs, encoder_outputs], axes=[2, 2])
        attention = Activation('softmax', name='attention')(attention)
        print('attention', attention)
        
        context = dot([attention, encoder_outputs], axes=[2,1])
        print('context', context)
        
        decoder_combined_context = concatenate([context, decoder_outputs])
        print('decoder_combined_context', decoder_combined_context)

        # Has another weight + tanh layer as described in equation (5) of the paper
        output = TimeDistributed(Dense(self.latent_dim, activation="tanh"))(decoder_combined_context)
        print('output 1',output)
        output = TimeDistributed(Dense(self.num_decoder_tokens, activation="softmax"))(output)
        print('output', output)
#         print(tf.reshape(output, (-1, output.shape[2])))
#         output = tf.reshape(output, (-1, output.shape[2]))
        
        print(self.encoder_inputs.shape, self.decoder_inputs.shape, output.shape)
        return Model([self.encoder_inputs, self.decoder_inputs], output)

In [826]:
valid_dict = {
    'validation':validation_generator,
    'interpolation': interpolate_generator,
    'extrapolation': extrapolate_generator
}

In [827]:
history = NValidationSetsCallback(valid_dict)
gradient = GradientLogger(live_metrics=['loss', 'exact_match_metric'], live_gaps=10)

In [828]:
epochs = settings_dict['epochs']  # Number of epochs to train for.
latent_dim = settings_dict['latent_dim']  # Latent dimensionality of the encoding space.

In [829]:
lstm = LstmWithAttention(num_encoder_tokens, num_decoder_tokens, latent_dim, embedding_dim)

In [830]:
model = lstm.get_model()

encoder Tensor("lstm_213/Identity:0", shape=(None, 51, 256), dtype=float32)
decoder Tensor("lstm_214/Identity:0", shape=(None, 14, 256), dtype=float32)
attention Tensor("attention_24/Identity:0", shape=(None, 14, 51), dtype=float32)
context Tensor("dot_48/Identity:0", shape=(None, 14, 256), dtype=float32)
decoder_combined_context Tensor("concatenate_22/Identity:0", shape=(None, 14, 512), dtype=float32)
output 1 Tensor("time_distributed_41/Identity:0", shape=(None, 14, 256), dtype=float32)
output Tensor("time_distributed_42/Identity:0", shape=(None, 14, 14), dtype=float32)
(None, 51) (None, 14) (None, 14, 14)


In [833]:
model.summary()

Model: "model_30"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_192 (InputLayer)          [(None, 51)]         0                                            
__________________________________________________________________________________________________
input_193 (InputLayer)          [(None, 14)]         0                                            
__________________________________________________________________________________________________
embedding_202 (Embedding)       (None, 51, 256)      13056       input_192[0][0]                  
__________________________________________________________________________________________________
embedding_203 (Embedding)       (None, 14, 256)      3584        input_193[0][0]                  
___________________________________________________________________________________________

In [831]:
adam = Adam(lr=6e-4, beta_1=0.9, beta_2=0.995, epsilon=1e-9, decay=0.0, amsgrad=False, clipnorm=0.1)

model.compile(optimizer=adam, loss='categorical_crossentropy', metrics=[exact_match_metric])

In [832]:
print('start training...')
train_hist = model.fit_generator(training_generator,
                                 epochs=epochs,
                                 #use_multiprocessing=True, workers=8,
                                 callbacks=[history, gradient],
                                 verbose=0,
                                )

start training...
{"chart": "live_loss", "axis": "batch"}
{"chart": "live_exact_match_metric", "axis": "batch"}
{"chart": "loss", "axis": "epoch"}
{"chart": "exact_match_metric", "axis": "epoch"}


ValueError: Error when checking input: expected input_192 to have 2 dimensions, but got array with shape (1024, 160, 51)

In [1]:
plt.plot(train_hist.history['loss'],color='C0', label='train')
plt.plot(train_hist.history['validation_loss'], color='C0', label='valid', linestyle='--')
plt.plot(train_hist.history['extrapolation_loss'], color='C1', label='extra',)
plt.plot(train_hist.history['interpolation_loss'], color='C2', label='inter')

plt.xlabel('epochs')
plt.ylabel('loss')
plt.legend(loc='best')
plt.ylim([0,1])
plt.grid(True, linestyle='--')
plt.tight_layout()
plt.savefig(settings_dict['save_path'] + 'losses.png', dpi=300)

SyntaxError: invalid syntax (<ipython-input-1-aabe21b7fb28>, line 13)

In [None]:
plt.plot(train_hist.history['exact_match_metric'],color='C0', label='train')
plt.plot(train_hist.history['validation_exact_match_metric'], color='C0', label='valid', linestyle='--')
plt.plot(train_hist.history['extrapolation_exact_match_metric'], color='C1', label='extra',)
plt.plot(train_hist.history['interpolation_exact_match_metric'], color='C2', label='inter')

plt.xlabel('epochs')
plt.ylabel('exact match metric')
plt.legend(loc='best')
plt.ylim([0,1])
plt.grid(True, linestyle='--')
plt.tight_layout()
plt.savefig(settings_dict['save_path'] + 'metrics.png', dpi=300)

In [50]:
with open(settings_dict['save_path']+'experiments_output.pkl','wb') as file:
    pickle.dump(train_hist.history, file)

In [53]:
model.save(settings_dict['save_path']+'this_model.model')

In [78]:
with open(settings_dict['save_path']+'settings.json','w') as file:
    json.dump(settings_dict, file)

FileNotFoundError: [Errno 2] No such file or directory: '../../artifacts/settings.json'