# Neural Translation Model

In [22]:
from pickle import load
import numpy
from numpy import argmax
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.utils.vis_utils import plot_model
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dense
from keras.layers import Embedding
from keras.layers import RepeatVector
from keras.layers import TimeDistributed
from keras.callbacks import ModelCheckpoint
from keras.models import load_model
from nltk.translate.bleu_score import corpus_bleu

In [8]:
#data loaded
def load_clean_sentences(filename):
    return load(open(filename,'rb'))



dataset = load_clean_sentences('english-german-both.pkl')
train = load_clean_sentences('english-german-train.pkl')
test = load_clean_sentences('english-german-test.pkl')

'''
Keras Tokenize class is used to map words to integers, as needed for modeling
Separate tokenizer for English and German sequences.
'''

def create_tokenizer(lines):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(lines)
    return tokenizer

#max sentence length
def max_length(lines):
    return max(len(line.split()) for line in lines)


#english tokenizer
eng_tokenizer = create_tokenizer(dataset[:,0])
eng_vocab_size = len(eng_tokenizer.word_index) + 1
eng_length = max_length(dataset[:,0])
print('English Vocabulary Size: %d' % eng_vocab_size)
print('English Max Length: %d' % (eng_length))

#german tokenizer
ger_tokenizer = create_tokenizer(dataset[:,1])
ger_vocab_size = len(ger_tokenizer.word_index) + 1
ger_length = max_length(dataset[:,1])
print('German Vocabulary Size: %d' % ger_vocab_size)
print('German Max Length: %d' % (ger_length))

English Vocabulary Size: 7058
English Max Length: 1
German Vocabulary Size: 9054
German Max Length: 1


In [9]:
'''
Each input and output sequence must be encoded to integers and padded to the maximum phrase length. 
This is because we will use a word embedding for the input sequences and one hot encode the output sequences
The function below named encode_sequences() will perform these operations and return the result
'''

#encode and pad sequences
def encode_sequences(tokenizer, length, lines):
    X = tokenizer.texts_to_sequences(lines)
    X = pad_sequences(X, maxlen = length,padding="post")
    return X

'''
The o/p seq needs to be one-hot encoded.
Model wil predict the probability of each word in the vocabulary as o/p
'''

#one hot encode the target sequence
def encode_output(sequences,vocab_size):
    ylist = list()
    for sequence in sequences:
        encoded = to_categorical(sequence,num_classes=vocab_size)
        ylist.append(encoded)
    y = numpy.array(ylist)
    y = y.reshape(sequences.shape[0],sequences.shape[1],vocab_size)
    return y


#prepare training data
trainX = encode_sequences(ger_tokenizer, ger_length, train[:,1])
trainY = encode_sequences(eng_tokenizer,eng_length,train[:,0])
trainY = encode_output(trainY,eng_vocab_size)

#prepare test data
testX = encode_sequences(ger_tokenizer,ger_length, test[:,1])
testY = encode_sequences(eng_tokenizer,eng_length, test[:,0])
testY = encode_output(testY,eng_vocab_size)


In [10]:
trainX

array([[ 751],
       [ 752],
       [ 753],
       ...,
       [8205],
       [8206],
       [8207]], dtype=int32)

In [12]:
testX.shape

(1000, 1)

In [13]:
testY.shape

(1000, 1, 7058)

### Define the model
LSTM encoder-decoder model is used
The i/p sequence is encoded by a front-end model called the encoder then decode word by word by a backend model called the decoder.



In [16]:
# define NMT model
def define_model(src_vocab, tar_vocab, src_timesteps, tar_timesteps, n_units):
	model = Sequential()
	model.add(Embedding(src_vocab, n_units, input_length=src_timesteps, mask_zero=True))
	model.add(LSTM(n_units))
	model.add(RepeatVector(tar_timesteps))
	model.add(LSTM(n_units, return_sequences=True))
	model.add(TimeDistributed(Dense(tar_vocab, activation='softmax')))
	return model

# define model
model = define_model(ger_vocab_size, eng_vocab_size, ger_length, eng_length, 256)
model.compile(optimizer='adam', loss='categorical_crossentropy')
# summarize defined model
print(model.summary())
plot_model(model, to_file='model.png', show_shapes=True)

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 1, 256)            2317824   
_________________________________________________________________
lstm_3 (LSTM)                (None, 256)               525312    
_________________________________________________________________
repeat_vector_2 (RepeatVecto (None, 1, 256)            0         
_________________________________________________________________
lstm_4 (LSTM)                (None, 1, 256)            525312    
_________________________________________________________________
time_distributed_2 (TimeDist (None, 1, 7058)           1813906   
Total params: 5,182,354
Trainable params: 5,182,354
Non-trainable params: 0
_________________________________________________________________
None


OSError: `pydot` failed to call GraphViz.Please install GraphViz (https://www.graphviz.org/) and ensure that its executables are in the $PATH.

In [17]:
#fit model
filename = "model.h5"
checkpoint = ModelCheckpoint(filename,monitor="val_loss",verbose=1,save_best_only=True,mode='min')
model.fit(trainX,trainY,epochs=30,batch_size=64,validation_data=(testX,testY),callbacks=[checkpoint],verbose=2)



Train on 9000 samples, validate on 1000 samples
Epoch 1/30
 - 17s - loss: 8.8729 - val_loss: 8.9087

Epoch 00001: val_loss improved from inf to 8.90874, saving model to model.h5
Epoch 2/30
 - 13s - loss: 8.7917 - val_loss: 8.8430

Epoch 00002: val_loss improved from 8.90874 to 8.84296, saving model to model.h5
Epoch 3/30
 - 12s - loss: 8.4378 - val_loss: 9.1307

Epoch 00003: val_loss did not improve from 8.84296
Epoch 4/30
 - 16s - loss: 8.0324 - val_loss: 9.3555

Epoch 00004: val_loss did not improve from 8.84296
Epoch 5/30
 - 13s - loss: 7.7040 - val_loss: 9.6040

Epoch 00005: val_loss did not improve from 8.84296
Epoch 6/30
 - 13s - loss: 7.4624 - val_loss: 9.6977

Epoch 00006: val_loss did not improve from 8.84296
Epoch 7/30
 - 15s - loss: 7.1617 - val_loss: 10.0520

Epoch 00007: val_loss did not improve from 8.84296
Epoch 8/30
 - 13s - loss: 6.8085 - val_loss: 10.8070

Epoch 00008: val_loss did not improve from 8.84296
Epoch 9/30
 - 15s - loss: 6.3983 - val_loss: 11.5967

Epoch 00

<keras.callbacks.callbacks.History at 0x7f08fcdc5eb0>

## Evaluate

In [23]:
# load a clean dataset
def load_clean_sentences(filename):
	return load(open(filename, 'rb'))
 
# fit a tokenizer
def create_tokenizer(lines):
	tokenizer = Tokenizer()
	tokenizer.fit_on_texts(lines)
	return tokenizer
 
# max sentence length
def max_length(lines):
	return max(len(line.split()) for line in lines)
 
# encode and pad sequences
def encode_sequences(tokenizer, length, lines):
	# integer encode sequences
	X = tokenizer.texts_to_sequences(lines)
	# pad sequences with 0 values
	X = pad_sequences(X, maxlen=length, padding='post')
	return X
 
# map an integer to a word
def word_for_id(integer, tokenizer):
	for word, index in tokenizer.word_index.items():
		if index == integer:
			return word
	return None
 
# generate target given source sequence
def predict_sequence(model, tokenizer, source):
	prediction = model.predict(source, verbose=0)[0]
	integers = [argmax(vector) for vector in prediction]
	target = list()
	for i in integers:
		word = word_for_id(i, tokenizer)
		if word is None:
			break
		target.append(word)
	return ' '.join(target)
 
# evaluate the skill of the model
def evaluate_model(model, tokenizer, sources, raw_dataset):
	actual, predicted = list(), list()
	for i, source in enumerate(sources):
		# translate encoded source text
		source = source.reshape((1, source.shape[0]))
		translation = predict_sequence(model, eng_tokenizer, source)
		raw_target, raw_src = raw_dataset[i]
		if i < 10:
			print('src=[%s], target=[%s], predicted=[%s]' % (raw_src, raw_target, translation))
		actual.append([raw_target.split()])
		predicted.append(translation.split())
	# calculate BLEU score
	print('BLEU-1: %f' % corpus_bleu(actual, predicted, weights=(1.0, 0, 0, 0)))
	print('BLEU-2: %f' % corpus_bleu(actual, predicted, weights=(0.5, 0.5, 0, 0)))
	print('BLEU-3: %f' % corpus_bleu(actual, predicted, weights=(0.3, 0.3, 0.3, 0)))
	print('BLEU-4: %f' % corpus_bleu(actual, predicted, weights=(0.25, 0.25, 0.25, 0.25)))
 
# load datasets
dataset = load_clean_sentences('english-german-both.pkl')
train = load_clean_sentences('english-german-train.pkl')
test = load_clean_sentences('english-german-test.pkl')
# prepare english tokenizer
eng_tokenizer = create_tokenizer(dataset[:, 0])
eng_vocab_size = len(eng_tokenizer.word_index) + 1
eng_length = max_length(dataset[:, 0])
# prepare german tokenizer
ger_tokenizer = create_tokenizer(dataset[:, 1])
ger_vocab_size = len(ger_tokenizer.word_index) + 1
ger_length = max_length(dataset[:, 1])
# prepare data
trainX = encode_sequences(ger_tokenizer, ger_length, train[:, 1])
testX = encode_sequences(ger_tokenizer, ger_length, test[:, 1])
 
# load model
model = load_model('model.h5')
# test on some training sequences
print('train')
evaluate_model(model, eng_tokenizer, trainX, train)
# test on some test sequences
print('test')
evaluate_model(model, eng_tokenizer, testX, test)

train
src=[uberlassesihm], target=[lethimdoit], predicted=[goaway]
src=[lassensieunsloslegen], target=[letsbegin], predicted=[goaway]
src=[ichwarverwirrt], target=[iwasconfused], predicted=[sweetdreams]
src=[machdenmundauf], target=[openyourmouth], predicted=[nobodycalled]
src=[woisttom], target=[wherestom], predicted=[beatit]
src=[ichsagtenichts], target=[isaidnothing], predicted=[imatourist]
src=[nimmtom], target=[taketom], predicted=[whydidyoucry]
src=[sagtesallen], target=[telleverybody], predicted=[goaway]
src=[ichlebehier], target=[ilivehere], predicted=[whydidyoucry]
src=[wirkaufeneinecd], target=[webuycds], predicted=[goaway]


The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


BLEU-1: 0.003889
BLEU-2: 0.000000
BLEU-3: 0.000000
BLEU-4: 0.000000
test
src=[uberlassdasmir], target=[letmedoit], predicted=[whydidyoucry]
src=[gibobacht], target=[becareful], predicted=[goaway]
src=[ichhabedichgehort], target=[iheardyou], predicted=[whydidyoucry]
src=[ihrseidunschuldig], target=[youreinnocent], predicted=[whydidyoucry]
src=[kannstdumirfolgen], target=[doyoufollow], predicted=[whydidyoucry]
src=[kommnichtherein], target=[keepout], predicted=[goaway]
src=[bistdugrozugig], target=[areyougenerous], predicted=[hewasenglish]
src=[fangnocheinmalan], target=[startagain], predicted=[hewasenglish]
src=[lassessein], target=[letitbe], predicted=[beatit]
src=[woistpapa], target=[wheresdaddy], predicted=[tomsupstairs]
BLEU-1: 0.003000
BLEU-2: 0.000000
BLEU-3: 0.000000
BLEU-4: 0.000000
