# Import the essential libraries 

In [1]:
import re
from pickle import dump
from unicodedata import normalize
from numpy import array
import string

# load the dataset

In [2]:
# load doc into memory
def load_doc(filename):
    # open the file as read only
    
    file = open(filename, mode='rt', encoding='utf-8')
    # read all text
    text = file.read()
    # close the file
    file.close()
    return text

filename = 'first dataset.txt'
doc = load_doc(filename)

In [3]:
# split a loaded document into sentences
def to_pairs(doc):
    #to seprate lines
    lines = doc.strip().split('\n')
    #to seprate english and german phrases in each line
    pairs = [line.split('\t') for line in  lines]
    return pairs

# split into english-german pairs
pairs = to_pairs(doc) 

In [4]:
pairs[0:5]

[['Hi.', 'Hallo!'],
 ['Hi.', 'Grüß Gott!'],
 ['Run!', 'Lauf!'],
 ['Wow!', 'Potzdonner!'],
 ['Wow!', 'Donnerwetter!']]

In [5]:
# clean a list of lines
def clean_pairs(lines):
    cleaned = list()
    # prepare regex for char filtering: we use it as a func. to remove unprintable chars in each phrase
    re_print = re.compile('[^%s]' % re.escape(string.printable))
    # prepare translation table for removing punctuation,
    #the keys will be ASCII code of every punctuations and the values are None: it will work as a mapper
    table = str.maketrans('', '', string.punctuation)
    for pair in lines:
        clean_pair = list()
        for line in pair:
            # tokenize on white space
            line = line.split()
            # convert to lowercase
            line = [word.lower() for word in line]
            # remove punctuation from each token(word): map each punctuation to ''
            line = [word.translate(table) for word in line]
            # remove non-printable chars form each token(word)
            line = [re_print.sub('', w) for w in line]
            # remove tokens with numbers in them
            line = [word for word in line if word.isalpha()]
            # store as string : put back the words to a sentnce
            clean_pair.append(' '.join(line))
        cleaned.append(clean_pair)
    return array(cleaned)

# clean sentences
clean_pairs = clean_pairs(pairs)

In [6]:
clean_pairs[177]

array(['am i fat', 'bin ich dick'], dtype='<U367')

In [7]:
pairs[177]

['Am I fat?', 'Bin ich dick?']

In [8]:
# save a list of clean sentences to file
def save_clean_data(sentences, filename):
    dump(sentences, open(filename, 'wb'))
    print('Saved: %s' % filename)

# save clean pairs to file
save_clean_data(clean_pairs, 'english-german.pkl')

Saved: english-german.pkl


In [9]:
# To check
for i in range(10):
    print('[%s] => [%s]' % (clean_pairs[i,1], clean_pairs[i,0]))

[hallo] => [hi]
[gr gott] => [hi]
[lauf] => [run]
[potzdonner] => [wow]
[donnerwetter] => [wow]
[feuer] => [fire]
[hilfe] => [help]
[zu hlf] => [help]
[stopp] => [stop]
[warte] => [wait]


# Split Dataset

In [10]:
clean_pairs.shape

(152820, 2)

In [11]:
from pickle import load
from pickle import dump
from numpy.random import rand
from numpy.random import shuffle
  
# create dataset
raw_dataset = clean_pairs
 
# reduce dataset size
n_sentences = 20000
dataset = raw_dataset[:n_sentences, :]
# random shuffle
shuffle(dataset)
# split into train/test
train, test = dataset[:18500], dataset[18500:]

# save
save_clean_data(dataset, 'english-german-both.pkl')
save_clean_data(train, 'english-german-train.pkl')
save_clean_data(test, 'english-german-test.pkl')

Saved: english-german-both.pkl
Saved: english-german-train.pkl
Saved: english-german-test.pkl



# Prepare inputs of the model

In [12]:
from numpy import array
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

# fit a tokenizer
def create_tokenizer(lines):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(lines)
    return tokenizer
 
# find the maximum length of phrases (in both germany and english)
def max_length(lines):
    return max(len(line.split()) for line in lines)
 
# encode and pad sequences
def encode_sequences(tokenizer, length, lines):
    #integer encode sequences: map words to integers
    X = tokenizer.texts_to_sequences(lines)
    #pad sequences with 0 values: make all the sequences' length the same by adding 0 at the end of the shorter ones
    X = pad_sequences(X, maxlen=length, padding='post')
    return X
 
# one hot encode target sequence
#This is because the model will predict the probability of each word in the vocabulary as output.
def encode_output(sequences, vocab_size):
    ylist = list()
    for sequence in sequences:
        #each word will be one hot encoded and convert to a vector of eng/ger vocab size: 
        #for each phrase we will have (eng/ger length * eng/ger vocab size) vector
        encoded = to_categorical(sequence, num_classes=vocab_size)
        ylist.append(encoded)
    y = array(ylist)
    y = y.reshape(sequences.shape[0], sequences.shape[1], vocab_size)
    return y

In [13]:
# prepare english tokenizer
#it will map each word to an integer
eng_tokenizer = create_tokenizer(dataset[:, 0])

#count the unique integers to find out the vocabulary size
eng_vocab_size = len(eng_tokenizer.word_index) + 1

#find the maximum length of phrases in English
eng_length = max_length(dataset[:, 0])

print('English Vocabulary Size: %d' % eng_vocab_size)
print('English Max Length: %d' % (eng_length))

# prepare german tokenizer : do all the above for German phrases
ger_tokenizer = create_tokenizer(dataset[:, 1])
ger_vocab_size = len(ger_tokenizer.word_index) + 1
ger_length = max_length(dataset[:, 1])

print('German Vocabulary Size: %d' % ger_vocab_size)
print('German Max Length: %d' % (ger_length))

English Vocabulary Size: 3816
English Max Length: 6
German Vocabulary Size: 6240
German Max Length: 10


In [14]:
# prepare training data
trainX = encode_sequences(ger_tokenizer, ger_length, train[:, 1])
trainY = encode_sequences(eng_tokenizer, eng_length, train[:, 0])
trainY = encode_output(trainY, eng_vocab_size)
# prepare test data
testX = encode_sequences(ger_tokenizer, ger_length, test[:, 1])
testY = encode_sequences(eng_tokenizer, eng_length, test[:, 0])
testY = encode_output(testY, eng_vocab_size)

In [24]:
trainX[0]

array([ 10,  30,   5, 575, 156,   0,   0,   0,   0,   0])

In [17]:
from keras.utils.vis_utils import plot_model
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dense
from keras.layers import Embedding
from keras.layers import RepeatVector
from keras.layers import TimeDistributed
from keras.callbacks import ModelCheckpoint
 

# define NMT model
def define_model(src_vocab, tar_vocab, src_timesteps, tar_timesteps, n_units):
    model = Sequential()
    model.add(Embedding(src_vocab, n_units, input_length=src_timesteps, mask_zero=True))
    model.add(LSTM(n_units))
    model.add(RepeatVector(tar_timesteps))
    model.add(LSTM(n_units, return_sequences=True))
    model.add(TimeDistributed(Dense(tar_vocab, activation='softmax')))
    return model

# define model
model = define_model(ger_vocab_size, eng_vocab_size, ger_length, eng_length, 512)
model.compile(optimizer='adam', loss='categorical_crossentropy')
# summarize defined model
print(model.summary())
#you should run in colab to get the picture of model structure
plot_model(model, to_file='model.png', show_shapes=True)

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 10, 512)           3194880   
_________________________________________________________________
lstm_2 (LSTM)                (None, 512)               2099200   
_________________________________________________________________
repeat_vector_1 (RepeatVecto (None, 6, 512)            0         
_________________________________________________________________
lstm_3 (LSTM)                (None, 6, 512)            2099200   
_________________________________________________________________
time_distributed_1 (TimeDist (None, 6, 3816)           1957608   
Total params: 9,350,888
Trainable params: 9,350,888
Non-trainable params: 0
_________________________________________________________________
None
('Failed to import pydot. You must `pip install pydot` and install graphviz (https://graphviz.gitlab.io/d

# Define model

In [21]:
from keras.utils.vis_utils import plot_model
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dense
from keras.layers import Embedding
from keras.layers import RepeatVector
from keras.layers import TimeDistributed
from keras.callbacks import ModelCheckpoint
 

# define NMT model
def define_model(src_vocab, tar_vocab, src_timesteps, tar_timesteps, n_units):
    model = Sequential()
    model.add(Embedding(src_vocab, n_units, input_length=src_timesteps, mask_zero=True))
    model.add(LSTM(n_units))
    model.add(RepeatVector(tar_timesteps))
    model.add(LSTM(n_units, return_sequences=True))
    model.add(TimeDistributed(Dense(tar_vocab, activation='softmax')))
    return model

# define model
model = define_model(ger_vocab_size, eng_vocab_size, ger_length, eng_length, 512)
model.compile(optimizer='adam', loss='categorical_crossentropy')
# summarize defined model
print(model.summary())
plot_model(model, to_file='model.png', show_shapes=True)
# fit model
filename = 'model.h5'
checkpoint = ModelCheckpoint(filename, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
model.fit(trainX, trainY, epochs=35, batch_size=64, validation_data=(testX, testY), callbacks=[checkpoint], verbose=1)

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 10, 512)           3194880   
_________________________________________________________________
lstm (LSTM)                  (None, 512)               2099200   
_________________________________________________________________
repeat_vector (RepeatVector) (None, 6, 512)            0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 6, 512)            2099200   
_________________________________________________________________
time_distributed (TimeDistri (None, 6, 3816)           1957608   
Total params: 9,350,888
Trainable params: 9,350,888
Non-trainable params: 0
_________________________________________________________________
None
('Failed to import pydot. You must `pip install pydot` and install graphviz (https://graphviz.gitlab.io/dow

<tensorflow.python.keras.callbacks.History at 0x23f5ed9d8e0>

# Model evaluation

In [22]:
from pickle import load
from numpy import array
from numpy import argmax
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import load_model
from nltk.translate.bleu_score import corpus_bleu

# map an integer to a word
def word_for_id(integer, tokenizer):
    for word, index in tokenizer.word_index.items():
        if index == integer:
            return word
    return None
 
# generate target given source sequence
def predict_sequence(model, tokenizer, source):
    prediction = model.predict(source, verbose=0)[0]
    integers = [argmax(vector) for vector in prediction]
    target = list()
    for i in integers:
        word = word_for_id(i, tokenizer)
        if word is None:
            break
        target.append(word)
    return ' '.join(target)
 
# evaluate the skill of the model
def evaluate_model(model, tokenizer, sources, raw_dataset):
    actual, predicted = list(), list()
    for i, source in enumerate(sources):
        # translate encoded source text
        source = source.reshape((1, source.shape[0]))
        translation = predict_sequence(model, eng_tokenizer, source)
        raw_target, raw_src = raw_dataset[i,:2]
        if i < 10:
            print('src=[%s], target=[%s], predicted=[%s]' % (raw_src, raw_target, translation))
        actual.append([raw_target.split()])
        predicted.append(translation.split())
    # calculate BLEU score
    print('BLEU-1: %f' % corpus_bleu(actual, predicted, weights=(1.0, 0, 0, 0)))
    print('BLEU-2: %f' % corpus_bleu(actual, predicted, weights=(0.5, 0.5, 0, 0)))
    print('BLEU-3: %f' % corpus_bleu(actual, predicted, weights=(0.3, 0.3, 0.3, 0)))
    print('BLEU-4: %f' % corpus_bleu(actual, predicted, weights=(0.25, 0.25, 0.25, 0.25)))

# prepare english tokenizer
eng_tokenizer = create_tokenizer(dataset[:, 0])
eng_vocab_size = len(eng_tokenizer.word_index) + 1
eng_length = max_length(dataset[:, 0])
# prepare german tokenizer
ger_tokenizer = create_tokenizer(dataset[:, 1])
ger_vocab_size = len(ger_tokenizer.word_index) + 1
ger_length = max_length(dataset[:, 1])
# prepare data
trainX = encode_sequences(ger_tokenizer, ger_length, train[:, 1])
testX = encode_sequences(ger_tokenizer, ger_length, test[:, 1])
 
# load model
model = load_model('model.h5')
# test on some training sequences
print('train')
evaluate_model(model, eng_tokenizer, trainX, train)
# test on some test sequences
print('test')
evaluate_model(model, eng_tokenizer, testX, test)

train
src=[wir haben das spiel verloren], target=[we lost the game], predicted=[we lost the game]
src=[ich bin schtig], target=[im addicted], predicted=[im addicted]
src=[er schien ehrlich zu sein], target=[he appeared honest], predicted=[he appeared honest]
src=[tom ist starrkpfig], target=[tom is stubborn], predicted=[tom is stubborn]
src=[ich bin lehrerin], target=[i am a teacher], predicted=[im am teacher teacher]
src=[ich halte das nicht aus], target=[i cant stand it], predicted=[i cant stand that]
src=[wo ist der rest], target=[wheres the rest], predicted=[wheres the rest]
src=[tom wird schreien], target=[tom will scream], predicted=[tom will scream]
src=[niemand glaubt mir], target=[no one believes me], predicted=[nobody one believes me]
src=[tom wurde gewaltttig], target=[tom became violent], predicted=[tom became violent]
BLEU-1: 0.903378
BLEU-2: 0.861825
BLEU-3: 0.818883
BLEU-4: 0.658737
test
src=[entspannen sie sich bitte], target=[please relax], predicted=[please have]
src=

# Hyperparameter tuning

## n_unit

In [40]:
# n_units in embedding layer
for i in [256, 128, 64, 512]:
    # define model
    model = define_model(ger_vocab_size, eng_vocab_size, ger_length, eng_length, i)
    model.compile(optimizer='adam', loss='categorical_crossentropy')
    # summarize defined model
    print(model.summary())
    plot_model(model, to_file='model.png', show_shapes=True)
    # fit model
    filename = 'model.h5'
    checkpoint = ModelCheckpoint(filename, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
    model.fit(trainX, trainY, epochs=35, batch_size=64, validation_data=(testX, testY), callbacks=[checkpoint], verbose=1)

    # load model
    model = load_model('model.h5')
    # test on some training sequences
    print('train')
    evaluate_model(model, eng_tokenizer, trainX, train)
    # test on some test sequences
    print('test')
    evaluate_model(model, eng_tokenizer, testX, test)

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 10, 256)           1597440   
_________________________________________________________________
lstm_2 (LSTM)                (None, 256)               525312    
_________________________________________________________________
repeat_vector_1 (RepeatVecto (None, 6, 256)            0         
_________________________________________________________________
lstm_3 (LSTM)                (None, 6, 256)            525312    
_________________________________________________________________
time_distributed_1 (TimeDist (None, 6, 3816)           980712    
Total params: 3,628,776
Trainable params: 3,628,776
Non-trainable params: 0
_________________________________________________________________
None
('Failed to import pydot. You must `pip install pydot` and install graphviz (https://graphviz.gitlab.io/d

BLEU-1: 0.891970
BLEU-2: 0.843942
BLEU-3: 0.798830
BLEU-4: 0.635490
test
src=[das ist meine tasche], target=[this is my bag], predicted=[this bag my bag]
src=[hast du kinder], target=[do you have kids], predicted=[do you have kids]
src=[seid ihr noch da], target=[are you still there], predicted=[are you still there]
src=[da sind wir], target=[here we are], predicted=[here we are]
src=[tom ist innovativ], target=[tom is innovative], predicted=[tom is devious]
src=[sie ist aufgeschlossen], target=[shes openminded], predicted=[she is]
src=[ich versuche es noch mal], target=[ill try again], predicted=[i try it it]
src=[wo ist meine uhr], target=[where is my watch], predicted=[wheres is my clock]
src=[ich brauche einen job], target=[i need a job], predicted=[i need a job]
src=[das wusste ich schon], target=[i already knew that], predicted=[i already knew that]
BLEU-1: 0.587024
BLEU-2: 0.468558
BLEU-3: 0.410816
BLEU-4: 0.266693
Model: "sequential_2"
__________________________________________


Epoch 00034: val_loss did not improve from 1.77976
Epoch 35/35

Epoch 00035: val_loss improved from 1.77976 to 1.76920, saving model to model.h5
train
src=[vermissen sie boston], target=[do you miss boston], predicted=[do you like boston]
src=[kann ich ihn nach hause mitnehmen], target=[can i take it home], predicted=[can i take home home]
src=[fhlst du dich krank], target=[do you feel sick], predicted=[do you feeling]
src=[sie spricht schnell], target=[she talks quickly], predicted=[she speaks quickly]
src=[ich werde mit ihm schimpfen], target=[ill scold him], predicted=[ill scold him]
src=[sie ist nicht arm], target=[she isnt poor], predicted=[she not poor]
src=[lass uns jetzt gehen], target=[lets go now], predicted=[lets go now]
src=[was ist so lustig], target=[whats so funny], predicted=[what so funny]
src=[tom war eiferschtig], target=[tom was jealous], predicted=[tom was jealous]
src=[mach das licht aus], target=[turn off the light], predicted=[turn off the light]
BLEU-1: 0.8263


Epoch 00021: val_loss did not improve from 1.57912
Epoch 22/35

Epoch 00022: val_loss did not improve from 1.57912
Epoch 23/35

Epoch 00023: val_loss did not improve from 1.57912
Epoch 24/35

Epoch 00024: val_loss did not improve from 1.57912
Epoch 25/35

Epoch 00025: val_loss did not improve from 1.57912
Epoch 26/35

Epoch 00026: val_loss did not improve from 1.57912
Epoch 27/35

Epoch 00027: val_loss did not improve from 1.57912
Epoch 28/35

Epoch 00028: val_loss did not improve from 1.57912
Epoch 29/35

Epoch 00029: val_loss did not improve from 1.57912
Epoch 30/35

Epoch 00030: val_loss did not improve from 1.57912
Epoch 31/35

Epoch 00031: val_loss did not improve from 1.57912
Epoch 32/35

Epoch 00032: val_loss did not improve from 1.57912
Epoch 33/35

Epoch 00033: val_loss did not improve from 1.57912
Epoch 34/35

Epoch 00034: val_loss did not improve from 1.57912
Epoch 35/35

Epoch 00035: val_loss did not improve from 1.57912
train
src=[vermissen sie boston], target=[do you mis