# English to French using Neural Machine Translation

This example was taken from the wonderful Cutting Edge Deep Learning for Coders course as taught by Jeremy Howard http://course.fast.ai/part2.html The course is now live and I encourage you to check it out.

In [1]:
%matplotlib inline
import importlib
#import sutils; importlib.reload(sutils)
from sutils import *

import keras
import gensim
import re
import pickle
import keras.backend as K

from keras import initializers
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model, Sequential
from keras.layers import *
from keras.optimizers import Adam
from keras.callbacks import ModelCheckpoint, Callback, ReduceLROnPlateau, LearningRateScheduler, EarlyStopping, TensorBoard
from keras.callbacks import LambdaCallback


from recurrentshop import *
import seq2seq
from seq2seq.models import AttentionSeq2Seq,SimpleSeq2Seq, Seq2Seq

import tensorflow as tf
from keras_tqdm import TQDMNotebookCallback

Using TensorFlow backend.


In [2]:
print(keras.__version__)
print(tf.__version__)

2.0.8
1.3.0


In [3]:
from gensim.models import word2vec

In [4]:
#limit_gpu_mem()

In [5]:
path = '/data/TensorFlowTalks/neural_translation_en_de/'
dpath = '/data/TensorFlowTalks/neural_translation_en_de/translate/'

### Set up Regex and tokenize for use later

In [6]:
re_mult_space = re.compile(r"  *")
re_mw_punc = re.compile(r"(\w[’'])(\w)")
re_punc = re.compile("([\"().,;:/_?!—])")
re_apos = re.compile(r"(\w)'s\b")


def simple_toks(sent):
    sent = re_apos.sub(r"\1 's", sent)
    sent = re_mw_punc.sub(r"\1 \2", sent)
    sent = re_punc.sub(r" \1 ", sent).replace('-', ' ')
    sent = re_mult_space.sub(' ', sent)
    return sent.lower().split()

## Load the PreProcessed data

Here we load all the data 

In [9]:
data = load(dpath+'nmt_data.pkl')
look_ups = load(dpath+'look_ups.pkl')
fr_train = data['fr_train']
fr_test = data['fr_test']
en_train = data['en_train']
en_test = data['en_test']
en_w2id = look_ups['en_w2id']
fr_vocab = look_ups['fr_vocab']
en_vocab = look_ups['en_vocab']
en_embs = look_ups['en_embs']
fr_embs = look_ups['fr_embs']

questions = load(dpath+'questions.pkl')
print(questions[10])
en_qs, fr_qs = zip(*questions)

('I will fight.', 'Ich werde kämpfen.')


In [10]:
# for running model test on small set of data
#fr_train = fr_train[:5000]
#en_train = fr_train[:5000]

fr_train.shape

(96277, 30)

In [11]:
en_train.shape

(96277, 30)

## Model

#### Create some Keras Callbacks to handle early stopping and Learning Rate changes

In [12]:
reduce_LR = ReduceLROnPlateau(monitor='val_loss',factor = 0.5, patience=0,cooldown=1, min_lr = 0.00001)
early_stopping = EarlyStopping(monitor='val_loss',min_delta=0,patience=4,verbose=0,mode='auto')

import math

# learning rate schedule for dropping every 10 epochs
def LRDropping(epoch):
    initial_lrate = 0.001
    drop = 0.9
    epochs_drop = 3.0
    lrate = initial_lrate * math.pow(drop, math.floor((1+epoch)/epochs_drop))
    return lrate

# try at manual setting of LR for Epochs
def fixed_dropping(epoch):
    if epoch < 2: 
        lrate = 0.01
    elif epoch < 4: 
        lrate = 0.001
    elif epoch < 7: 
        lrate = 0.0005
    else:
        lrate = 0.0001
    print(lrate)
    return lrate

LRDrop = LearningRateScheduler(fixed_dropping)

In [13]:
# creating different sets of Params to easily import into the model at train time
params = {'verbose': 1, 'callbacks': [TQDMNotebookCallback(),reduce_LR,early_stopping]}
params2 = {'verbose': 1, 'callbacks': [LRDrop,TQDMNotebookCallback(),reduce_LR,early_stopping]}
params3 = {'verbose': 1, 'callbacks': [LRDrop,TQDMNotebookCallback(),reduce_LR,early_stopping]}

#### Set some parameters for the model

In [14]:
lr = 1e-3
maxlen = 30
dim_en_vec = 100
n_en_vec = 400000
dim_fr_vec = 200

vocab_size = len(fr_vocab) #the output vocab # embeddings.shape[0]
embedding_size = 100 #The english inputs embeddings embeddings.shape[1]

In [15]:
fr_wgts = [fr_embs.T, np.zeros((len(fr_vocab,)))]

### The model itself

In [16]:
# Test different settings:

# - only two LSTM's and one TimeDistributed
# - LSTM instead of Bidirectional
# - only one timeDistributed
# - without weights for german embeddings
# - categorical_crossentropy instead of sparse



# while training implement my preprocessing into jupyter notebook and try my prepro with this standard model

In [17]:
# out model but sparse_categorical_crossentropy
inp = Input((maxlen,))
x = Embedding(len(en_vocab), dim_en_vec, input_length=maxlen,
              weights=[en_embs], trainable=False)(inp)
x = LSTM(128, return_sequences=True)(x)
x = LSTM(128, return_sequences=True)(x)
x = TimeDistributed(Dense(len(fr_vocab)))(x)
x = Activation('softmax')(x)

model = Model(inp, x)
model.compile('adam', 'sparse_categorical_crossentropy')

In [21]:
# Base Model big

lr = 1e-3
maxlen = 100
dim_en_vec = 200
n_en_vec = 400000
dim_fr_vec = 200

inp = Input((maxlen,))
x = Embedding(40002, dim_en_vec, input_length=maxlen,
              weights=[en_embs], trainable=False)(inp)
x = Bidirectional(LSTM(128, return_sequences=True))(x)
x = Bidirectional(LSTM(128, return_sequences=True))(x)
x = LSTM(128, return_sequences=True)(x)
x = TimeDistributed(Dense(dim_fr_vec))(x)
x = TimeDistributed(Dense(40002, weights=fr_wgts))(x)
x = Activation('softmax')(x)

model = Model(inp, x)
model.compile('adam', 'sparse_categorical_crossentropy')

In [None]:
# Base Model
inp = Input((maxlen,))
x = Embedding(len(en_vocab), dim_en_vec, input_length=maxlen,
              weights=[en_embs], trainable=False)(inp)
x = Bidirectional(LSTM(128, return_sequences=True))(x)
x = Bidirectional(LSTM(128, return_sequences=True))(x)
x = LSTM(128, return_sequences=True)(x)
x = TimeDistributed(Dense(dim_fr_vec))(x)
x = TimeDistributed(Dense(len(fr_vocab), weights=fr_wgts))(x)
x = Activation('softmax')(x)

model = Model(inp, x)
model.compile('adam', 'sparse_categorical_crossentropy')

In [22]:
K.set_value(model.optimizer.lr, lr)

In [23]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 100)               0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 100, 200)          8000400   
_________________________________________________________________
bidirectional_1 (Bidirection (None, 100, 256)          336896    
_________________________________________________________________
bidirectional_2 (Bidirection (None, 100, 256)          394240    
_________________________________________________________________
lstm_5 (LSTM)                (None, 100, 128)          197120    
_________________________________________________________________
time_distributed_2 (TimeDist (None, 100, 200)          25800     
_________________________________________________________________
time_distributed_3 (TimeDist (None, 100, 40002)        8040402   
__________

In [18]:
hist=model.fit(en_train, np.expand_dims(fr_train,axis=-1), batch_size=64, epochs=20, **params, 
               validation_data=[en_test, np.expand_dims(fr_test,axis=-1)])

Train on 96277 samples, validate on 10698 samples


Epoch 1/20


Epoch 2/20


Epoch 3/20


Epoch 4/20


Epoch 5/20


Epoch 6/20


Epoch 7/20


Epoch 8/20


Epoch 9/20


Epoch 10/20


Epoch 11/20


Epoch 12/20


Epoch 13/20


Epoch 14/20


Epoch 15/20


Epoch 16/20


Epoch 17/20


Epoch 18/20


Epoch 19/20


Epoch 20/20



In [19]:
plot_train(hist)

NameError: name 'plt' is not defined

In [20]:
weight_identifier = "trans_testing_basic_like_ours_but_sparse_categorical_crossentropy"
model.save_weights(dpath + weight_identifier + '.h5')

In [21]:
model.load_weights(dpath + weight_identifier + '.h5')

In [24]:
model.load_weights('model_checkpoint.000-1.585.hdf5')

## Testing

In [25]:
def sent2ids(sent):
    sent = simple_toks(sent)
    ids = [en_w2id[t] for t in sent]
    return pad_sequences([ids], maxlen, padding="post", truncating="post")

In [26]:
def en2fr(sent): 
    ids = sent2ids(sent)
    tr_ids = np.argmax(model.predict(ids), axis=-1)
    return ' '.join(fr_vocab[i] for i in tr_ids[0] if i>0)

In [27]:
en2fr("what is the size of canada?")


'was ist die <UNK> <UNK> ? ?'

In [47]:
en2fr("what is the size of australia?")


'was ist die <UNK> <UNK>'

In [48]:
en2fr("What is light?")

'was ist ?'

In [38]:
print(questions[0][0])
print(questions[0][1])
en2fr(questions[0][0])

However, what guarantee would we have that they would not use monopoly power to price these books above the range of ordinary citizens?
Welche Garantie hätten wir jedoch, dass das Unternehmen seine Monopolstellung nicht dazu ausnutzen würde, die Preise dieser Bücher über dem festzulegen, was normale Bürger sich leisten können?


'aber wir wir , , , , dass dass nicht nicht nicht , , , die die die die'

In [64]:
en2fr("hello mr president")

'hallo präsident präsident'

In [27]:
print(fr_qs[50000])
en2fr("Why is the Arctic ozone layer thicker than the Antarctic ozone layer?")


Gibt es noch etwas anderes, das ich über Tom wissen sollte?


KeyError: 'antarctic'

In [28]:
print(qs[9])
en2fr("Which province is the most populated?")

NameError: name 'qs' is not defined

In [29]:
en2fr("Who are we?")

'wer wem wir ?'

In [30]:
print(fr_qs[3])
en2fr("What would we do without it?")

Du scheinst wirklich ein Bierfreund zu sein.


'was würdest wir das ohne ?'

## Predict validation data

In [None]:
import os

source_file = "/data/wrapper/PA_BA/DataSets/DE_EN_(tatoeba)_validation_english_only.txt"
if os.path.exists(source_file) is False:
    exit("source file does not exists")

source_sentences = open(source_file, encoding='UTF-8').read().split('\n')
print(len(source_sentences))

translated_sentences = []
for sent in source_sentences:
    translated_sentences.append(en2fr(sent))
    break
print(translated_sentences)

In [None]:
out_file = os.path.join(os.path.abspath(os.path.join(source_file, os.pardir)), weight_identifier + ".pred")
with(open(out_file, 'w')) as file:
    for sent in translated_sentences:
        file.write(sent + '\n')