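Load the dataset. Each line holds an English phrase followed by its Spanish translation, so swap the order of each pair to get Spanish → English samples. Pairs whose Spanish phrase is 45 characters or longer are dropped. Also add `\t` and `\n` as the start and stop tokens in the target sequences.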

In [None]:
# Markers wrapped around every target (English) sequence so the decoder knows
# where a translation starts and where it should stop.
start_token = "\t"
stop_token = "\n"

# Each non-blank line is split on tabs. Only the first two fields (English,
# Spanish) are used, so lines that carry extra tab-separated columns after the
# two phrases (for example an attribution column) no longer break unpacking.
# A non-blank line with fewer than two fields still raises ValueError.
with open("data/spa.txt", "r", encoding="utf-8") as f:
    rows = [line.strip().split("\t") for line in f if len(line.strip()) > 0]

# Swap to (Spanish, English) order, wrap the English side in the start/stop
# tokens, and keep only pairs whose Spanish phrase is shorter than 45 characters.
samples = [(es, start_token + en + stop_token)
           for en, es in (row[:2] for row in rows) if len(es) < 45]

In [None]:
# Number of sentence pairs that survived the length filter.
len(samples)

In [None]:
# Peek at the first two (Spanish, English) pairs; the English side carries the start/stop tokens.
print(samples[:2])

Split data into train and validation sets.

In [None]:
from sklearn.model_selection import train_test_split

# 80% of the pairs go to training, the rest to validation; the fixed random_state keeps the split repeatable.
train_samples, valid_samples = train_test_split(samples, train_size=.8, random_state=42)

In [None]:
# Number of pairs in the training split.
len(train_samples)

In [None]:
# Number of pairs in the validation split.
len(valid_samples)

Determine the training vocabulary. Those are the only tokens you can trust the model will know how to handle. 

In [None]:
# The vocabularies are the distinct characters found on each side of the
# training pairs: inputs on the left, targets on the right.
in_vocab = {ch for source_text, _ in train_samples for ch in source_text}
out_vocab = {ch for _, target_text in train_samples for ch in target_text}

in_vocab_size, out_vocab_size = len(in_vocab), len(out_vocab)
print("Input vocab size:", in_vocab_size)
print("Output vocab size:", out_vocab_size)

In [None]:
# Every character of the input vocabulary, in sorted order.
print(sorted(in_vocab))

In [None]:
# Every character of the output vocabulary, in sorted order.
print(sorted(out_vocab))

Go through the validation set and remove any characters that are not present in the training vocabulary.

In [None]:
# Rebuild every validation pair keeping only characters that belong to the
# matching training vocabulary; unknown characters are simply dropped.
valid_samples = [
    ("".join(ch for ch in source_text if ch in in_vocab),
     "".join(ch for ch in target_text if ch in out_vocab))
    for source_text, target_text in valid_samples
]

Build sequence-to-sequence model.

In [None]:
import keras
from keras.layers import Dense, Input, LSTM, Masking
from keras.models import Model

In [None]:
latent_dim = 256  # number of units in each LSTM (size of its h and c state vectors)

# Encoder input: one-hot encoded characters, sequence length left open (None).
encoder_in = Input(shape=(None, in_vocab_size), name="encoder_in")
# Masking with its default settings; per the Keras docs this skips timesteps whose
# features all equal 0, i.e. the zero padding left in the batch arrays.
encoder_mask = Masking(name="encoder_mask")(encoder_in)
encoder_lstm = LSTM(latent_dim, return_state=True, recurrent_dropout=0.3, name="encoder_lstm")
# The LSTM output itself is discarded; only the final h and c states are kept
# and later handed to the decoder as its initial state.
_, encoder_h, encoder_c = encoder_lstm(encoder_mask)

In [None]:
# Decoder input: one-hot encoded target characters, sequence length left open (None).
decoder_in = Input(shape=(None, out_vocab_size), name="decoder_in")

decoder_mask = Masking(name="decoder_mask")(decoder_in)
# return_sequences=True gives one output per timestep; return_state=True also
# exposes h and c, which the inference decoder built later in the notebook needs.
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True,
                    dropout=0.2, recurrent_dropout=0.3, name="decoder_lstm")
# Decoding starts from the encoder's final states; the decoder's own final
# states are not needed while training.
decoder_lstm_out, _, _ = decoder_lstm(decoder_mask, initial_state=[encoder_h, encoder_c])
# Softmax over the output vocabulary at every timestep.
decoder_dense = Dense(out_vocab_size, activation="softmax", name="decoder_out")
decoder_out = decoder_dense(decoder_lstm_out)

In [None]:
# Training model: takes the encoder and decoder inputs and produces the
# decoder's per-timestep softmax output.
seq2seq_model = Model([encoder_in, decoder_in], decoder_out)
seq2seq_model.compile(optimizer="rmsprop", loss="categorical_crossentropy")

In [None]:
# Print an overview of the training model's layers.
seq2seq_model.summary()

Create maps to convert characters to and from ints. 

In [None]:
# Indices follow the sorted order of each vocabulary, so the mapping is the
# same on every run for a given vocabulary.
in_token2int = {char: index for index, char in enumerate(sorted(in_vocab))}
out_int2token = dict(enumerate(sorted(out_vocab)))
out_token2int = {char: index for index, char in out_int2token.items()}

Create helper functions for one-hot encoding sequences for use with the model.

In [None]:
import numpy as np

def make_batch_storage(batch_size, in_seq_len, out_seq_len):
    """Allocate zero-filled float32 arrays for one batch of one-hot sequences.

    Args:
        batch_size: Number of samples in the batch.
        in_seq_len: Number of timesteps reserved for each encoder input.
        out_seq_len: Number of timesteps reserved for each decoder sequence.

    Returns:
        A tuple ``(enc_in_seqs, dec_in_seqs, dec_out_seqs)``. The first array
        has shape ``(batch_size, in_seq_len, in_vocab_size)``; the other two
        have shape ``(batch_size, out_seq_len, out_vocab_size)``.
    """
    def zeros(seq_len, vocab_size):
        # One (batch, time, vocab) block of zeros.
        return np.zeros((batch_size, seq_len, vocab_size), dtype=np.float32)

    return (
        zeros(in_seq_len, in_vocab_size),
        zeros(out_seq_len, out_vocab_size),
        zeros(out_seq_len, out_vocab_size),
    )

In [None]:
def encode_batch(samples):
    """One-hot encode a batch of (input, output) text pairs.

    Every array is sized to the longest sequence in the batch; shorter
    sequences leave their trailing timesteps as all zeros.

    Args:
        samples: Sequence of ``(in_seq, out_seq)`` pairs. Every character must
            be a key of ``in_token2int`` / ``out_token2int`` respectively,
            otherwise a ``KeyError`` is raised.

    Returns:
        A tuple ``(enc_in_seqs, dec_in_seqs, dec_out_seqs)``:
        the encoded inputs, the encoded outputs, and the encoded outputs
        shifted one step earlier (i.e. without their first token).
    """
    def mark(storage, row, tokens, token2int):
        # Set the one-hot bit for each token of one sequence.
        for step, token in enumerate(tokens):
            storage[row, step, token2int[token]] = 1

    longest_in = max(len(source) for source, _ in samples)
    longest_out = max(len(target) for _, target in samples)

    enc_in_seqs, dec_in_seqs, dec_out_seqs = make_batch_storage(
        len(samples), longest_in, longest_out)

    for row, (source, target) in enumerate(samples):
        mark(enc_in_seqs, row, source, in_token2int)
        mark(dec_in_seqs, row, target, out_token2int)
        # The expected output at step t is the decoder input at step t + 1.
        mark(dec_out_seqs, row, target[1:], out_token2int)

    return enc_in_seqs, dec_in_seqs, dec_out_seqs

Train model

In [None]:
from seq2seq_util import Seq2SeqBatchGenerator

batch_size = 64
# Seq2SeqBatchGenerator is defined in the local seq2seq_util module (not shown here).
# It is handed the samples, the batch size and encode_batch as the encoding function.
train_generator = Seq2SeqBatchGenerator(train_samples, batch_size, encode_batch)
valid_generator = Seq2SeqBatchGenerator(valid_samples, batch_size, encode_batch)

In [None]:
from keras.callbacks import EarlyStopping

# Stop when val_loss has not improved for 5 epochs and restore the best weights seen.
early_stopping = EarlyStopping(monitor="val_loss", patience=5, restore_best_weights=True)

# epochs=500 is only an upper bound; the early-stopping callback can end training sooner.
seq2seq_model.fit_generator(train_generator, epochs=500,
                            validation_data=valid_generator,
                            callbacks=[early_stopping])

Create encoder/decoder models for inference

In [None]:
# Inference encoder: reuses the trained encoder graph and outputs its final [h, c] states.
inf_encoder = Model(encoder_in, [encoder_h, encoder_c])

In [None]:
# Print an overview of the inference encoder's layers.
inf_encoder.summary()

In [None]:
# During inference the decoder's state is supplied explicitly through these inputs.
inf_dec_h_in = Input(shape=(latent_dim,), name="decoder_h_in")
inf_dec_c_in = Input(shape=(latent_dim,), name="decoder_c_in")

# Reuse the trained decoder LSTM. It is applied to decoder_in directly, so the
# decoder_mask layer used in the training graph is not part of this model.
inf_dec_lstm_out, inf_dec_h_out, inf_dec_c_out = decoder_lstm(
    decoder_in, initial_state=[inf_dec_h_in, inf_dec_c_in])

inf_dec_out = decoder_dense(inf_dec_lstm_out)

# Outputs: the softmax predictions plus the updated h and c states, which the
# caller feeds back in on the next decoding step.
inf_decoder = Model(
    [decoder_in, inf_dec_h_in, inf_dec_c_in],
    [inf_dec_out, inf_dec_h_out, inf_dec_c_out])

In [None]:
# Print an overview of the inference decoder's layers.
inf_decoder.summary()

Test trained model on the first 100 samples from both the training and validation sets.

In [None]:
# Longest target sequence in the whole dataset (start/stop tokens included);
# translate_sequence uses it as a hard cap on generated text length.
max_out_seq_len = max(len(target_text) for _, target_text in samples)
print("Max output length: ", max_out_seq_len)

# Vocabulary indices of the two special tokens.
start_token_idx, stop_token_idx = (
    out_token2int[start_token], out_token2int[stop_token])

In [None]:
def translate_sequence(one_hot_seq, encoder, decoder):
    """Greedily decode a translation for one encoded input sequence.

    Args:
        one_hot_seq: Encoded input passed straight to ``encoder.predict``.
        encoder: Model whose ``predict`` returns the list of initial decoder
            states.
        decoder: Model whose ``predict`` takes ``[token] + states`` and returns
            ``(probabilities, h, c)``.

    Returns:
        The decoded text. Decoding ends when the stop token is predicted (it is
        not added to the text) or once the text reaches ``max_out_seq_len``
        characters.
    """
    states = encoder.predict(one_hot_seq)

    # Single-timestep buffer reused on every step; exactly one bit is set
    # while predicting and cleared again afterwards.
    step_input = np.zeros((1, 1, out_vocab_size), dtype=np.float32)

    translated_text = ''
    current_idx = start_token_idx
    while True:
        step_input[0, 0, current_idx] = 1
        probabilities, h, c = decoder.predict([step_input] + states)
        states = [h, c]
        step_input[0, 0, current_idx] = 0

        # Greedy choice: most probable token at the last timestep.
        current_idx = np.argmax(probabilities[0, -1, :])

        if current_idx == stop_token_idx:
            break

        translated_text += out_int2token[current_idx]
        if len(translated_text) >= max_out_seq_len:
            break

    return translated_text

Validation samples:

In [None]:
from seq2seq_util import test_predictions

# Run the seq2seq_util helper on the first 100 validation pairs with the inference models.
test_predictions(valid_samples[:100], inf_encoder, inf_decoder, encode_batch, translate_sequence)

Training samples:

In [None]:
# Same check on the first 100 training pairs.
test_predictions(train_samples[:100], inf_encoder, inf_decoder, encode_batch, translate_sequence)

Export model in Core ML format.

In [None]:
# Rebuild the encoder for export: same input shape and LSTM size as the trained
# encoder, but without the Masking layer and without recurrent dropout.
coreml_enc_in = Input(shape=(None, in_vocab_size), name="encoder_in")
coreml_enc_lstm = LSTM(latent_dim, return_state=True, name="encoder_lstm")
# Here the LSTM output is kept as the model output and the returned states are dropped.
coreml_enc_out, _, _ = coreml_enc_lstm(coreml_enc_in)

coreml_encoder_model = Model(coreml_enc_in, coreml_enc_out)
# NOTE(review): presumably the coremltools Keras converter reads `output_layers`,
# which this Keras version only provides as `_output_layers` — confirm.
coreml_encoder_model.output_layers = coreml_encoder_model._output_layers

# Transfer the trained weights to the export model through a weights file.
inf_encoder.save_weights("Es2EnCharEncoderWeights.h5")
coreml_encoder_model.load_weights("Es2EnCharEncoderWeights.h5")

In [None]:
import coremltools

# Convert the export encoder to Core ML, naming its input "encodedSeq" and its
# output "ignored", then write it to disk.
coreml_encoder = coremltools.converters.keras.convert(
    coreml_encoder_model,
    input_names="encodedSeq",
    output_names="ignored")

coreml_encoder.save("Es2EnCharEncoder.mlmodel")

In [None]:
# Rebuild the decoder for export: LSTM followed by the softmax layer, without
# Masking or dropout and without explicit initial-state inputs.
coreml_dec_in = Input(shape=(None, out_vocab_size))

coreml_dec_lstm = LSTM(latent_dim, return_sequences=True, return_state=True, name="decoder_lstm")
coreml_dec_lstm_out, _, _ = coreml_dec_lstm(coreml_dec_in)
coreml_dec_dense = Dense(out_vocab_size, activation="softmax")
coreml_dec_out = coreml_dec_dense(coreml_dec_lstm_out)

coreml_decoder_model = Model(coreml_dec_in, coreml_dec_out)
# NOTE(review): presumably the coremltools Keras converter reads `output_layers`,
# which this Keras version only provides as `_output_layers` — confirm.
coreml_decoder_model.output_layers = coreml_decoder_model._output_layers

# Transfer the trained weights to the export model through a weights file.
inf_decoder.save_weights("Es2EnCharDecoderWeights.h5")
coreml_decoder_model.load_weights("Es2EnCharDecoderWeights.h5")

In [None]:
# Convert the export decoder to Core ML, naming its input "encodedChar" and its
# output "nextCharProbs", then write it to disk.
coreml_decoder = coremltools.converters.keras.convert(
    coreml_decoder_model,
    input_names="encodedChar",
    output_names="nextCharProbs")

coreml_decoder.save("Es2EnCharDecoder.mlmodel")

Convert weights to 16-bit floats. This shouldn't hurt the model's accuracy much, if at all, and it reduces the app's download size.

In [None]:
def convert_to_fp16(mlmodel_filename):
    """Save a copy of a Core ML model spec with its weights converted to fp16.

    The copy is written to the same path with ``16Bit`` inserted before the
    ``.mlmodel`` extension, e.g. ``Foo.mlmodel`` -> ``Foo16Bit.mlmodel``. If
    the given name does not end in ``.mlmodel``, ``16Bit.mlmodel`` is appended
    to the full name instead.

    Args:
        mlmodel_filename: Path of the model spec file to load.
    """
    extension = ".mlmodel"
    # Strip the extension only when it is really there; slicing off
    # len(".mlmodel") characters unconditionally would mangle other names.
    if mlmodel_filename.endswith(extension):
        basename = mlmodel_filename[:-len(extension)]
    else:
        basename = mlmodel_filename
    spec = coremltools.utils.load_spec(mlmodel_filename)
    spec_16bit = \
      coremltools.utils.convert_neural_network_spec_weights_to_fp16(spec)
    coremltools.utils.save_spec(spec_16bit, f"{basename}16Bit.mlmodel")

In [None]:
# Write fp16 copies of both exported models (saved with a "16Bit" suffix in the file name).
convert_to_fp16("Es2EnCharEncoder.mlmodel")
convert_to_fp16("Es2EnCharDecoder.mlmodel")

Save the maps so you can transform text to and from ints. You'll need them later in the iOS app.

In [None]:
import json

# Persist both lookup tables as JSON: Spanish character -> index for encoding
# inputs, and index -> English character for decoding outputs.
for json_path, token_map in (("esCharToInt.json", in_token2int),
                             ("intToEnChar.json", out_int2token)):
    with open(json_path, "w") as f:
        json.dump(token_map, f)