<a href="https://colab.research.google.com/github/omkarwazulkar/NaturalLanguageProcessing/blob/main/Neural_Machine_Translation_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [24]:
import string
import re
from pickle import dump
from unicodedata import normalize
from numpy import array

In [25]:
# Loading Doc into Memory
def load_doc(filename):
  file = open(filename, mode='rt', encoding='utf-8')
  text = file.read()
  file.close()
  return text

In [26]:
# Split Doc into Sentences
def to_pairs(doc):
  lines = doc.strip().split('\n')
  pairs = [line.split('\t') for line in lines]
  return pairs

# **Cleaning Text**

In [27]:
# Cleaning List
def clean_pairs(lines):
  cleaned = list()
  re_punc = re.compile('[%s]' % re.escape(string.punctuation))
  re_print = re.compile('[^%s]' % re.escape(string.printable))
  for pair in lines:
    clean_pair = list()
    for line in pair:
      line = normalize('NFD', line).encode('ascii', 'ignore')
      line = line.decode('UTF-8')
      line = line.split()
      line = [word.lower() for word in line]
      line = [re_punc.sub('', w) for w in line]
      line = [re_print.sub('', w) for w in line]
      line = [word for word in line if word.isalpha()]
      clean_pair.append(' '.join(line))
    cleaned.append(clean_pair)
  return array(cleaned)

In [28]:
# Saving Clean Sentences to File
def save_clean_data(sentences, filename):
  dump(sentences, open(filename, 'wb'))
  print('Saved: %s' % filename)

In [29]:
# Load Dataset
filename = '/content/drive/MyDrive/Deu-Eng/deu.txt'
doc = load_doc(filename)

In [30]:
# German - English Pairs Split
pairs = to_pairs(doc)

In [31]:
# Clean Sentences
clean_pairs = clean_pairs(pairs)

In [32]:
# Save Clean Pairs to File
save_clean_data(clean_pairs, 'english-german.pkl')
for i in range(100):
  print('[%s] => [%s]' % (clean_pairs[i,0], clean_pairs[i,1]))

Saved: english-german.pkl
[go] => [geh]
[hi] => [hallo]
[hi] => [gru gott]
[run] => [lauf]
[run] => [lauf]
[wow] => [potzdonner]
[wow] => [donnerwetter]
[duck] => [kopf runter]
[fire] => [feuer]
[help] => [hilfe]
[help] => [zu hulf]
[stay] => [bleib]
[stop] => [stopp]
[stop] => [anhalten]
[wait] => [warte]
[wait] => [warte]
[begin] => [fang an]
[do it] => [mache es]
[do it] => [tue es]
[go on] => [mach weiter]
[hello] => [hallo]
[hello] => [sers]
[hurry] => [beeil dich]
[hurry] => [schnell]
[i hid] => [ich versteckte mich]
[i hid] => [ich habe mich versteckt]
[i ran] => [ich rannte]
[i see] => [ich verstehe]
[i see] => [aha]
[i try] => [ich versuche es]
[i try] => [ich probiere es]
[i won] => [ich hab gewonnen]
[i won] => [ich habe gewonnen]
[i won] => [ich habe gewonnen]
[oh no] => [oh nein]
[relax] => [entspann dich]
[shoot] => [feuer]
[shoot] => [schie]
[smile] => [lacheln]
[sorry] => [entschuldigung]
[ask me] => [frag mich]
[ask me] => [fragt mich]
[ask me] => [fragen sie mich]
[at

# **Splitting Text**

In [33]:
from pickle import load
from pickle import dump
from numpy.random import shuffle


In [34]:
# Load Clean Data
def load_clean_sentences(filename):
  return load(open(filename, 'rb'))

In [35]:
# Save Clean Data
def save_clean_data(sentences, filename):
  dump(sentences, open(filename, 'wb'))
  print('Saved: %s' % filename)

In [37]:
# Load Dataset
raw_dataset = load_clean_sentences('english-german.pkl')

In [38]:
# Reduce Data Size
n_sentences = 10000
dataset = raw_dataset[:n_sentences, :]

In [39]:
# Random Shuffle
shuffle(dataset)

In [40]:
# Split into Train/Test
train, test = dataset[:9000], dataset[9000:]

In [41]:
# Saving
save_clean_data(dataset, 'English-German-Both.pkl')
save_clean_data(train, 'English-German-Train.pkl')
save_clean_data(test, 'English-German-Test.pkl')

Saved: English-German-Both.pkl
Saved: English-German-Train.pkl
Saved: English-German-Test.pkl


# **Training Neural Translation Model**

In [42]:
from pickle import load
from numpy import array
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from keras.utils.vis_utils import plot_model
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dense
from keras.layers import Embedding
from keras.layers import RepeatVector
from keras.layers import TimeDistributed
from keras.callbacks import ModelCheckpoint

In [43]:
# Load Clean Data
def load_clean_sentences(filename):
  return load(open(filename, 'rb'))

In [44]:
# Fitting Tokenizer
def create_tokenizer(lines):
  tokenizer = Tokenizer()
  tokenizer.fit_on_texts(lines)
  return tokenizer

In [45]:
# Maximum Length of Sentence
def max_length(lines):
  return max(len(line.split()) for line in lines)

In [46]:
# Encode and Pad Sequences
def encode_sequences(tokenizer, length, lines):
  X = tokenizer.texts_to_sequences(lines)
  X = pad_sequences(X, maxlen=length, padding='post')
  return X

In [47]:
# One-Hot Encode Sequence
def encode_output(sequences, vocab_size):
  ylist = list()
  for sequence in sequences:
    encoded = to_categorical(sequence, num_classes=vocab_size)
    ylist.append(encoded)
  y = array(ylist)
  y = y.reshape(sequences.shape[0], sequences.shape[1], vocab_size)
  return y

In [48]:
# Define NMT Model
def define_model(src_vocab, tar_vocab, src_timesteps, tar_timesteps, n_units):
  model = Sequential()
  model.add(Embedding(src_vocab, n_units, input_length=src_timesteps, mask_zero=True))
  model.add(LSTM(n_units))
  model.add(RepeatVector(tar_timesteps))
  model.add(LSTM(n_units, return_sequences=True))
  model.add(TimeDistributed(Dense(tar_vocab, activation='softmax')))
  #  Compiling
  model.compile(optimizer='adam', loss='categorical_crossentropy')
  # Summary
  model.summary()
  plot_model(model, to_file='model.png', show_shapes=True)
  return model

In [89]:
# Load Data
dataset = load_clean_sentences('English-German-Both.pkl')
train = load_clean_sentences('English-German-Train.pkl')
test = load_clean_sentences('English-German-Test.pkl')
print(test)

[['is this normal' 'ist das normal'
  'ccby france attribution tatoebaorg hybrid pfirsichbaeumchen']
 ['lets go' 'lass uns losgehen'
  'ccby france attribution tatoebaorg ck pfirsichbaeumchen']
 ['tom is slim' 'tom ist schlank'
  'ccby france attribution tatoebaorg ck luiaard']
 ...
 ['toms smoking' 'tom raucht'
  'ccby france attribution tatoebaorg ck pfirsichbaeumchen']
 ['thats silly' 'das ist dumm'
  'ccby france attribution tatoebaorg ck zaghawa']
 ['im a prisoner' 'ich bin eine gefangene'
  'ccby france attribution tatoebaorg ck pfirsichbaeumchen']]


In [53]:
# Prepare English Eokenizer
eng_tokenizer = create_tokenizer(dataset[:, 0])
eng_vocab_size = len(eng_tokenizer.word_index) + 1
eng_length = max_length(dataset[:, 0])
print('English Vocabulary Size: %d' % eng_vocab_size)
print('English Max Length: %d' % (eng_length))

English Vocabulary Size: 2171
English Max Length: 5


In [54]:
# Prepare German Tokenizer
ger_tokenizer = create_tokenizer(dataset[:, 1])
ger_vocab_size = len(ger_tokenizer.word_index) + 1
ger_length = max_length(dataset[:, 1])
print('German Vocabulary Size: %d' % ger_vocab_size)
print('German Max Length: %d' % (ger_length))

German Vocabulary Size: 3533
German Max Length: 9


In [55]:
# Prepare Training Data
trainX = encode_sequences(ger_tokenizer, ger_length, train[:, 1])
trainY = encode_sequences(eng_tokenizer, eng_length, train[:, 0])
trainY = encode_output(trainY, eng_vocab_size)

In [56]:
# Prepare Validation Data
testX = encode_sequences(ger_tokenizer, ger_length, test[:, 1])
testY = encode_sequences(eng_tokenizer, eng_length, test[:, 0])
testY = encode_output(testY, eng_vocab_size)

In [57]:
# Define Model
model = define_model(ger_vocab_size, eng_vocab_size, ger_length, eng_length, 256)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 9, 256)            904448    
                                                                 
 lstm (LSTM)                 (None, 256)               525312    
                                                                 
 repeat_vector (RepeatVector  (None, 5, 256)           0         
 )                                                               
                                                                 
 lstm_1 (LSTM)               (None, 5, 256)            525312    
                                                                 
 time_distributed (TimeDistr  (None, 5, 2171)          557947    
 ibuted)                                                         
                                                                 
Total params: 2,513,019
Trainable params: 2,513,019
Non-

In [58]:
# Fit Model
checkpoint = ModelCheckpoint('model.h5', monitor='val_loss', verbose=1,
save_best_only=True, mode='min')

model.fit(trainX, trainY, epochs=30, batch_size=64, validation_data=(testX, testY),
          callbacks=[checkpoint], verbose=2)

Epoch 1/30

Epoch 00001: val_loss improved from inf to 3.33502, saving model to model.h5
141/141 - 33s - loss: 4.1070 - val_loss: 3.3350 - 33s/epoch - 235ms/step
Epoch 2/30

Epoch 00002: val_loss improved from 3.33502 to 3.18507, saving model to model.h5
141/141 - 25s - loss: 3.1974 - val_loss: 3.1851 - 25s/epoch - 179ms/step
Epoch 3/30

Epoch 00003: val_loss improved from 3.18507 to 3.05784, saving model to model.h5
141/141 - 27s - loss: 3.0506 - val_loss: 3.0578 - 27s/epoch - 190ms/step
Epoch 4/30

Epoch 00004: val_loss improved from 3.05784 to 2.95784, saving model to model.h5
141/141 - 26s - loss: 2.8929 - val_loss: 2.9578 - 26s/epoch - 186ms/step
Epoch 5/30

Epoch 00005: val_loss improved from 2.95784 to 2.85460, saving model to model.h5
141/141 - 26s - loss: 2.7637 - val_loss: 2.8546 - 26s/epoch - 185ms/step
Epoch 6/30

Epoch 00006: val_loss improved from 2.85460 to 2.72091, saving model to model.h5
141/141 - 24s - loss: 2.6176 - val_loss: 2.7209 - 24s/epoch - 174ms/step
Epoch 7/

<keras.callbacks.History at 0x7f50d2fd7fd0>

# **Evaluating Neural Tranlation Model**

In [60]:
from pickle import load
from numpy import argmax
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import load_model
from nltk.translate.bleu_score import SmoothingFunction

In [61]:
# Load a Clean Data
def load_clean_sentences(filename):
  return load(open(filename, 'rb'))

In [62]:
# Fitting Tokenizer
def create_tokenizer(lines):
  tokenizer = Tokenizer()
  tokenizer.fit_on_texts(lines)
  return tokenizer

In [63]:
# Maximum Length of Sentence
def max_length(lines):
  return max(len(line.split()) for line in lines)

In [64]:
# Encode and Pad Sequences
def encode_sequences(tokenizer, length, lines):
  X = tokenizer.texts_to_sequences(lines)
  X = pad_sequences(X, maxlen=length, padding='post')
  return X

In [65]:
# Map Integer to Word
def word_for_id(integer, tokenizer):
  for word, index in tokenizer.word_index.items():
    if index == integer:
      return word
  return None

In [66]:
# Generate Target
def predict_sequence(model, tokenizer, source):
  prediction = model.predict(source, verbose=0)[0]
  integers = [argmax(vector) for vector in prediction]
  target = list()
  for i in integers:
    word = word_for_id(i, tokenizer)
    if word is None:
        break
    target.append(word)
  return ' '.join(target)

In [90]:
# Evaluating Model
def evaluate_model(model, sources, raw_dataset):
  actual, predicted = list(), list()
  for i, source in enumerate(sources):
    source = source.reshape((1, source.shape[0]))
    translation = predict_sequence(model, eng_tokenizer, source)
    raw_target, raw_src = raw_dataset[i, :-1]
    if i < 10:
      print('src=[%s], target=[%s], predicted=[%s]' % (raw_src, raw_target, translation))
    actual.append(raw_target.split())
    predicted.append(translation.split())
  print('BLEU-1: %f' % corpus_bleu(actual, predicted, weights=(1.0, 0, 0, 0)))
  print('BLEU-2: %f' % corpus_bleu(actual, predicted, weights=(0.5, 0.5, 0, 0)))
  print('BLEU-3: %f' % corpus_bleu(actual, predicted, weights=(0.3, 0.3, 0.3, 0)))
  print('BLEU-4: %f' % corpus_bleu(actual, predicted, weights=(0.25, 0.25, 0.25, 0.25)))

In [None]:
# Load Datas
dataset = load_clean_sentences('English-German-Both.pkl')
train = load_clean_sentences('English-German-Train.pkl')
test = load_clean_sentences('English-German-Test.pkl')

In [70]:
# Prepare English Tokenizer
eng_tokenizer = create_tokenizer(dataset[:, 0])
eng_vocab_size = len(eng_tokenizer.word_index) + 1
eng_length = max_length(dataset[:, 0])

In [71]:
# Prepare German Eokenizer
ger_tokenizer = create_tokenizer(dataset[:, 1])
ger_vocab_size = len(ger_tokenizer.word_index) + 1
ger_length = max_length(dataset[:, 1])

In [83]:
# Prepare Data
trainX = encode_sequences(ger_tokenizer, ger_length, train[:, 1])
print(type(trainX))
testX = encode_sequences(ger_tokenizer, ger_length, test[:, 1])

<class 'numpy.ndarray'>


In [73]:
# Load Model
model = load_model('model.h5')

In [91]:
# Test on Training Sequences
print('train')
evaluate_model(model, trainX, train)

train
src=[ich zahle spater], target=[ill pay later], predicted=[ill pay later]
src=[ich bin ein student], target=[i am a student], predicted=[i am a student]
src=[war tom hier], target=[was tom here], predicted=[was tom here]
src=[konnen wir gehen], target=[can we go], predicted=[can we go]
src=[tom ist gro], target=[tom is tall], predicted=[tom is tall]
src=[das war schwer], target=[this was hard], predicted=[that was hard]
src=[nur tom ist gegangen], target=[only tom left], predicted=[bring tom in]
src=[ich gehe jetzt], target=[im going now], predicted=[im going now]
src=[wartet noch ein bisschen], target=[wait a bit], predicted=[wait a bit]
src=[seien sie diskret], target=[be discreet], predicted=[be discreet]


Corpus/Sentence contains 0 counts of 2-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().


BLEU-1: 0.085428
BLEU-2: 0.277811
BLEU-3: 0.445254
BLEU-4: 0.500983


In [92]:
# Test on Test sequences
print('test')
evaluate_model(model, testX, test)

test
src=[ist das normal], target=[is this normal], predicted=[is that normal]
src=[lass uns losgehen], target=[lets go], predicted=[lets go]
src=[tom ist schlank], target=[tom is slim], predicted=[tom is crafty]
src=[haben sie gewahlt], target=[did you vote], predicted=[did you vote]
src=[er skatet], target=[he is skating], predicted=[he disappeared]
src=[tom hat geschwitzt], target=[tom sweated], predicted=[tom was fired]
src=[ich komm schon klar], target=[ill be fine], predicted=[i will you]
src=[sonst noch etwas], target=[anything else], predicted=[some you]
src=[er kommt zu spat], target=[he is late], predicted=[its is big]
src=[wer stand], target=[who stood], predicted=[who stayed]


Corpus/Sentence contains 0 counts of 2-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().


BLEU-1: 0.079367
BLEU-2: 0.265860
BLEU-3: 0.431180
BLEU-4: 0.486586
