In [1]:
import numpy as np
from sklearn.model_selection import train_test_split
from keras.models import Model
from keras.layers import Input, LSTM, Dense, Embedding
from nltk.tokenize import word_tokenize
from string import punctuation
from sklearn.model_selection import train_test_split
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping, ModelCheckpoint

# Loading

In [2]:
with open('spa.txt', 'r', encoding='utf8') as f:
    text = f.readlines()

In [3]:
english = []
spanish = []
for line in text:
    line = line.split('\t')
    english.append(line[0])
    spanish.append(line[1] )

In [4]:
x_train, x_test, y_train, y_test = train_test_split(english, spanish, test_size=0.1, random_state=123)

In [5]:
# Preprocess Text
def preprocess(line):
    line = ''.join([char for char in line if char not in punctuation + '¿'])
    return word_tokenize(line.lower())
    
eng_train, spa_train, eng_lens, spa_lens = [], [], [], []

for eng_line, spa_line in zip(x_train, y_train):
    spa_line = ['START'] + preprocess(spa_line) + ['END']
    eng_line = preprocess(eng_line)
    eng_train.append(eng_line)
    spa_train.append(spa_line)
    
    # For finding longest spanish and english sequences
    eng_lens.append(len(eng_line))
    spa_lens.append(len(spa_line))
    
max_len_eng = max(eng_lens)
max_len_spa = max(spa_lens)

# Create token lookup dictionaries

In [6]:
def make_tok_lookup(corpus):
    flat_corpus = []
    for line in corpus:
        flat_corpus.extend(line)
        
    vocab = list(set(flat_corpus))
    tok2idx = dict([(tok, idx) for idx, tok in enumerate(vocab, start=2)])
    tok2idx['PAD'] = 0
    tok2idx['OOV'] = 1
    
    return tok2idx

tok2idx_eng = make_tok_lookup(eng_train)
idx2tok_eng = dict([(key, word) for word, key in tok2idx_eng.items()])
vocab_size_eng = len(tok2idx_eng)

tok2idx_spa = make_tok_lookup(spa_train)
idx2tok_spa = dict([(key, word) for word, key in tok2idx_spa.items()])
vocab_size_spa = len(tok2idx_spa)

# Create Encoder Inputs, Decoder Inputs, Targets

In [7]:
encoder_inputs = np.zeros((len(eng_train), max_len_eng))

decoder_inputs = np.zeros((len(spa_train), max_len_spa))

decoder_targets = np.zeros((len(spa_train), max_len_spa))

In [8]:
for i, sequence in enumerate(eng_train):
    for j, tok in enumerate(sequence):
        encoder_inputs[i][j] = tok2idx_eng[tok]
        
for i, sequence in enumerate(spa_train):
    for j, tok in enumerate(sequence):
        decoder_inputs[i][j] = tok2idx_spa[tok]
    # Targets sequences are decode inputs shifted by 1 (excluding START token)
    # ex. if decode input is 'START my name is patrick', target is 'name is patrick'
        if j > 0:
            decoder_targets[i][j-1] = tok2idx_spa[tok]

In [9]:
decoder_targets.shape

(125111, 70)

# Build model

## Training Model

In [10]:
hidden_size = 128
embed_dim = 100

# Encoder (encodes english sentence)
enc_inputs = Input(shape=(None,), name='encoder_inputs')
enc_embedding = Embedding(vocab_size_eng, embed_dim, name='encoder_embedding')
x = enc_embedding(enc_inputs)
enc_lstm = LSTM(hidden_size, return_state=True, name='encoder_lstm')
_, enc_h, enc_c = enc_lstm(x)

# Decoder (using LSTM initialized with encoder hidden states, predict next character in sequence)
# Note, at training stage, the decoder sees the actual character in the sequence, too
dec_inputs = Input(shape=(None,), name='decoder_inputs')
dec_embedding = Embedding(vocab_size_spa, embed_dim, name='decoder_embedding')
x = dec_embedding(dec_inputs)
dec_lstm = LSTM(hidden_size, return_sequences=True, return_state=True, name='decoder_lstm')
# initialize decoder LSTM with states from the encoded sequence
dec_outputs, _, _ = dec_lstm(x, initial_state=[enc_h, enc_c])
# Predict the next token
dec_classifier = Dense(vocab_size_spa, activation='softmax', name='decoder_classifier')
outputs =  dec_classifier(dec_outputs)


model = Model([enc_inputs, dec_inputs], outputs)

model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 encoder_inputs (InputLayer)    [(None, None)]       0           []                               
                                                                                                  
 decoder_inputs (InputLayer)    [(None, None)]       0           []                               
                                                                                                  
 encoder_embedding (Embedding)  (None, None, 100)    1394700     ['encoder_inputs[0][0]']         
                                                                                                  
 decoder_embedding (Embedding)  (None, None, 100)    2710800     ['decoder_inputs[0][0]']         
                                                                                              

In [11]:
es = EarlyStopping(
    monitor='val_loss',
    min_delta=0.001,
    patience=5,
    restore_best_weights=False
)

mc = ModelCheckpoint(
    filepath='eng_to_spa.h5',
    monitor='val_loss',
    save_best_only=True
)

model.compile(optimizer=Adam(), loss='sparse_categorical_crossentropy', metrics='accuracy')
model.fit([encoder_inputs, decoder_inputs], decoder_targets, validation_split=0.1, epochs=50, callbacks=[es, mc])

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50

KeyboardInterrupt: 

# Inference

In [12]:
# Inference encoder (input sequence --> LSTM states)
encoder = Model(enc_inputs, [enc_h, enc_c])

# Decoder inference (encoder LSTM states --> predictions)
dec_input_h = Input(shape=(hidden_size,), name='inf_dec_h_input')
dec_input_c = Input(shape=(hidden_size,), name='inf_dec_c_input')

x = dec_embedding(dec_inputs)
dec_outputs, dec_h, dec_c = dec_lstm(x, initial_state=[dec_input_h, dec_input_c])
outputs = dec_classifier(dec_outputs)

decoder = Model([dec_inputs] + [dec_input_h, dec_input_c],
                [outputs] + [dec_h, dec_c])

print('ENCODER MODEL'.center(100, '*'))
print(encoder.summary(), '\n\n')

print('DECODER MODEL'.center(100, '*'))
print(decoder.summary())

*******************************************ENCODER MODEL********************************************
Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 encoder_inputs (InputLayer)  [(None, None)]           0         
                                                                 
 encoder_embedding (Embeddin  (None, None, 100)        1394700   
 g)                                                              
                                                                 
 encoder_lstm (LSTM)         [(None, 128),             117248    
                              (None, 128),                       
                              (None, 128)]                       
                                                                 
Total params: 1,511,948
Trainable params: 1,511,948
Non-trainable params: 0
_________________________________________________________________
None 


******

In [13]:
def eng_to_vec(sentence):
    '''
    Vectorizes an english sentence
    '''
    no_punct = ''.join([char for char in sentence if char not in punctuation])
    no_punct = no_punct.lower()
    tokenized = word_tokenize(no_punct)
    vector = np.zeros((1, max_len_eng))
    
    for i, word in enumerate(tokenized):
        idx = tok2idx_eng.get(word)
        # Map OOV words to OOV index
        if word is None:
            idx = 1
        vector[0][i] = idx
        
    return vector
    
print([idx2tok_eng[idx] for idx in eng_to_vec('this is a test')[0]])

['this', 'is', 'a', 'test', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD']


In [14]:
def vec_to_spa(vector):
    '''
    Converts a vector into a spanish string
    Params, vector shape: (sequence length)
    '''
    output = ' '.join([idx2tok_spa[int(idx)] for idx in vector])
    return output

def translate(sentence):
    # Vectorize and encode sentence, calculate initial hidden state
    vector = eng_to_vec(sentence)
    encoder_states = encoder.predict(vector, verbose=0)
    h, c = encoder_states[0], encoder_states[1]    
    
    # Initialize output sequence
    output_sequence = np.asarray([tok2idx_spa['START']])
    output_sequence = np.expand_dims(output_sequence, axis=0)
    seq_len = 0
    output_vector = []
    end = False
    
    while not end:
        # Predict next token 
        outputs, h, c = decoder.predict([output_sequence, h, c], verbose=0) # (output is (1, sequence length, vocab_size_spa))
        
        # Get next token using argmax (naive solution, constraint on performance)
        next_tok_idx = np.argmax(outputs[0, -1, :]) # second index here means we want highestt probability of next word in sequence

        # Update output seqwuence
        output_sequence = np.zeros((1, 1))
        output_sequence[0][0] = next_tok_idx
        
        # Check if sequence is over
        seq_len += 1
        if seq_len == max_len_spa or next_tok_idx == tok2idx_spa['END']:
            end = True
        else:
            output_vector.append(next_tok_idx)
    output_string = vec_to_spa(output_vector)
    return output_string
tests = ['hello', 'hi', 'i am going to the store', 'how are you']

for i,test in enumerate(x_test[:5]):
    print('English:', test)
    print('Predicted Spanish:', translate(test))
    print('True Spanish:', y_test[i], '\n')

English: It's up to you to make a choice.
Predicted Spanish: es necesario que te pongas loco
True Spanish: Depende de ti tomar una decisión. 

English: Since he says so, it must be true.
Predicted Spanish: eso lo hace decir que te diga
True Spanish: Ya que él lo dice, debe ser verdad. 

English: The peace talks failed once again.
Predicted Spanish: las cosas hacen nada que lo que perder
True Spanish: Los diálogos de paz fracasaron otra vez. 

English: Tom is doing very well.
Predicted Spanish: tom está muy bien
True Spanish: A Tom le va muy bien. 

English: I'm ready to vote.
Predicted Spanish: estoy listo para votar
True Spanish: Estoy preparado para votar. 

