# **import** **libraries**

In [2]:
import string
import re
import numpy as np
import pandas as pd
from pickle import dump
from unicodedata import normalize
from pickle import load
from numpy import array

# load file

In [3]:
def load_doc(filename):
  file=open(filename, mode='rt', encoding='utf-8')
  text=file.read()
  file.close()
  return text

# Data Preprocessing

In [4]:
def to_pairs(doc):
  lines=doc.strip().split('\n')
  pairs=[line.split('\t') for line in lines]
  return pairs


def clean_pairs(lines):
  cleaned = list()
  re_print = re.compile('[^%s]' % re.escape(string.printable))
  table = str.maketrans('', '', string.punctuation)
  for pair in lines:
    clean_pair = list()
    for line in pair:
      line = normalize('NFD', line).encode('ascii', 'ignore')
      line = line.decode('UTF-8')
      line = line.split()
      line=[word.lower() for word in line]
      line = [word.translate(table) for word in line]
      line = [re_print.sub('', w) for w in line]
      line = [word for word in line if word.isalpha()]
      clean_pair.append(' '.join(line))
    cleaned.append(clean_pair)
  return array(cleaned)

def save_clean_data(sentences, filename):
  dump(sentences, open(filename, 'wb'))
  print('Saved: %s' % filename)

# loading the German–English dataset

In [6]:
filename='/content/deu.txt'
doc=load_doc(filename)
pairs=to_pairs(doc)
clean_pairs=clean_pairs(pairs)
save_clean_data(clean_pairs, 'english-german.pkl')

for i in range(100):
  print('[%s] => [%s]' % (clean_pairs[i,0], clean_pairs[i,1]))


Saved: english-german.pkl
[go] => [geh]
[hi] => [hallo]
[hi] => [gru gott]
[run] => [lauf]
[run] => [lauf]
[wow] => [potzdonner]
[wow] => [donnerwetter]
[duck] => [kopf runter]
[fire] => [feuer]
[help] => [hilfe]
[help] => [zu hulf]
[hide] => [versteck dich]
[hide] => [versteckt euch]
[stay] => [bleib]
[stop] => [stopp]
[stop] => [anhalten]
[wait] => [warte]
[wait] => [warte]
[begin] => [fang an]
[do it] => [mache es]
[do it] => [tue es]
[go on] => [mach weiter]
[hello] => [hallo]
[hello] => [sers]
[hello] => [hallo]
[hurry] => [beeil dich]
[hurry] => [schnell]
[i hid] => [ich versteckte mich]
[i hid] => [ich habe mich versteckt]
[i ran] => [ich rannte]
[i see] => [ich verstehe]
[i see] => [aha]
[i try] => [ich versuche es]
[i try] => [ich probiere es]
[i won] => [ich hab gewonnen]
[i won] => [ich habe gewonnen]
[i won] => [ich habe gewonnen]
[oh no] => [oh nein]
[relax] => [entspann dich]
[shoot] => [feuer]
[shoot] => [schie]
[smile] => [lacheln]
[sorry] => [entschuldigung]
[ask me] =

# Prepare Dataset

In [7]:
from pickle import load
from pickle import dump
from numpy.random import rand
from numpy.random import shuffle

def load_clean_sentences(filename):
  return load(open(filename, 'rb'))

def save_clean_data(sentences, filename):
  dump(sentences, open(filename, 'wb'))
  print('Saved: %s' % filename)

raw_dataset=load_clean_sentences('/content/english-german.pkl')
n_sentences=15000
dataset=raw_dataset[:n_sentences,:]
shuffle(dataset)

train,test=dataset[:12000],dataset[12000:]


save_clean_data(dataset, '/content/english-german-both.pkl')
save_clean_data(train, '/content/english-german-train.pkl')
save_clean_data(test, '/content/english-german-test.pkl')

Saved: /content/english-german-both.pkl
Saved: /content/english-german-train.pkl
Saved: /content/english-german-test.pkl


In [8]:
# Load the saved data
loaded_data = load_clean_sentences('/content/english-german-both.pkl')

# Display a sample of the loaded data
print("Sample of the combined dataset:")
display(loaded_data[:5])

# Load and display a sample of the training data
loaded_train_data = load_clean_sentences('/content/english-german-train.pkl')
print("\nSample of the training dataset:")
display(loaded_train_data[:5])

# Load and display a sample of the test data
loaded_test_data = load_clean_sentences('/content/english-german-test.pkl')
print("\nSample of the test dataset:")
display(loaded_test_data[:5])

Sample of the combined dataset:


array([['this is my dad', 'das hier ist mein papa',
        'ccby france attribution tatoebaorg sethlang blundainte'],
       ['i was joking', 'ich habe spa gemacht',
        'ccby france attribution tatoebaorg ck wolfgangth'],
       ['they slept', 'sie haben geschlafen',
        'ccby france attribution tatoebaorg disconostalgia mhr'],
       ['go to your room', 'gehe auf dein zimmer',
        'ccby france attribution tatoebaorg ck pfirsichbaeumchen'],
       ['come along', 'kommt mit',
        'ccby france attribution tatoebaorg ck mikemolto']], dtype='<U72')


Sample of the training dataset:


array([['this is my dad', 'das hier ist mein papa',
        'ccby france attribution tatoebaorg sethlang blundainte'],
       ['i was joking', 'ich habe spa gemacht',
        'ccby france attribution tatoebaorg ck wolfgangth'],
       ['they slept', 'sie haben geschlafen',
        'ccby france attribution tatoebaorg disconostalgia mhr'],
       ['go to your room', 'gehe auf dein zimmer',
        'ccby france attribution tatoebaorg ck pfirsichbaeumchen'],
       ['come along', 'kommt mit',
        'ccby france attribution tatoebaorg ck mikemolto']], dtype='<U72')


Sample of the test dataset:


array([['visit us', 'besuchen sie uns',
        'ccby france attribution tatoebaorg marijnkp oskar'],
       ['im eating', 'ich esse gerade',
        'ccby france attribution tatoebaorg vortarulo'],
       ['forget it', 'vergiss es',
        'ccby france attribution tatoebaorg eldad fingerhut'],
       ['im happy too', 'ich bin auch glucklich',
        'ccby france attribution tatoebaorg ck manfredo'],
       ['how strange', 'wie seltsam',
        'ccby france attribution tatoebaorg ck hansadler']], dtype='<U72')

In [9]:
!pip install tensorflow



# import libraries

In [10]:
from pickle import load
from numpy import array
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.utils import plot_model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import RepeatVector
from tensorflow.keras.layers import TimeDistributed
from tensorflow.keras.callbacks import ModelCheckpoint

# Data Preprocessing  for Translation Model

In [11]:
def load_clean_sentences(filename):
  return load(open(filename, 'rb'))

def create_tokenizer(lines):
  tokenizer=Tokenizer()
  tokenizer.fit_on_texts(lines)
  return tokenizer

def max_length(lines):
  return max(len(line.split()) for line in lines)

def encode_sequences(tokenizer, length, lines):
  X=tokenizer.texts_to_sequences(lines)
  X=pad_sequences(X, maxlen=length, padding='post')
  return X

def encode_output(sequences, vocab_size):
  ylist=[]
  for sequence in sequences:
    encoded=to_categorical(sequence, num_classes=vocab_size)
    ylist.append(encoded)
  y=array(ylist)
  y=y.reshape(sequences.shape[0], sequences.shape[1], vocab_size)
  return y

In [12]:
def define_model(src_vocab_size, tar_vocab_size, src_timesteps, tar_timesteps, n_units):
    model = Sequential()
    model.add(Embedding(src_vocab_size, n_units, input_length=src_timesteps, mask_zero=True))
    model.add(LSTM(n_units))
    model.add(RepeatVector(tar_timesteps))
    model.add(LSTM(n_units, return_sequences=True))
    model.add(TimeDistributed(Dense(tar_vocab_size, activation='softmax')))
    return model


In [13]:
dataset = load_clean_sentences('/content/english-german-both.pkl')
train = load_clean_sentences('/content/english-german-train.pkl')
test = load_clean_sentences('/content/english-german-test.pkl')

eng_tokenizer = create_tokenizer(dataset[:, 0])
eng_vocab_size = len(eng_tokenizer.word_index) + 1
eng_length = max_length(dataset[:, 0])

print('English Vocabulary Size: %d' % eng_vocab_size)
print('English Max Length: %d' % (eng_length))

ger_tokenizer = create_tokenizer(dataset[:, 1])
ger_vocab_size = len(ger_tokenizer.word_index) + 1
ger_length = max_length(dataset[:, 1])

print('German Vocabulary Size: %d' % ger_vocab_size)
print('German Max Length: %d' % (ger_length))

trainX = encode_sequences(ger_tokenizer, ger_length, train[:, 1])
trainY = encode_sequences(eng_tokenizer, eng_length, train[:, 0])
trainY = encode_output(trainY, eng_vocab_size)

testX = encode_sequences(ger_tokenizer, ger_length, test[:, 1])
testY = encode_sequences(eng_tokenizer, eng_length, test[:, 0])
testY = encode_output(testY, eng_vocab_size)

English Vocabulary Size: 2884
English Max Length: 5
German Vocabulary Size: 4625
German Max Length: 10


# Seq2Seq LSTM model for translation

In [14]:
model=define_model(ger_vocab_size, eng_vocab_size, ger_length, eng_length, 512) # Increased units
model.build((None, ger_length)) # Explicitly build the model
model.compile(optimizer='adam', loss='categorical_crossentropy')

print(model.summary())

plot_model(model, to_file='model.png', show_shapes=True)

filename = 'model.h5'
checkpoint = ModelCheckpoint(filename, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
model.fit(trainX, trainY, epochs=50, batch_size=64, validation_split=0.2, callbacks=[checkpoint], verbose=2) # Increased epochs



None
Epoch 1/50

Epoch 1: val_loss improved from inf to 3.39040, saving model to model.h5




150/150 - 10s - 67ms/step - loss: 3.9835 - val_loss: 3.3904
Epoch 2/50

Epoch 2: val_loss improved from 3.39040 to 3.18269, saving model to model.h5




150/150 - 6s - 40ms/step - loss: 3.2333 - val_loss: 3.1827
Epoch 3/50

Epoch 3: val_loss improved from 3.18269 to 2.91839, saving model to model.h5




150/150 - 5s - 31ms/step - loss: 2.9309 - val_loss: 2.9184
Epoch 4/50

Epoch 4: val_loss improved from 2.91839 to 2.69663, saving model to model.h5




150/150 - 5s - 34ms/step - loss: 2.6295 - val_loss: 2.6966
Epoch 5/50

Epoch 5: val_loss improved from 2.69663 to 2.54051, saving model to model.h5




150/150 - 2s - 17ms/step - loss: 2.3594 - val_loss: 2.5405
Epoch 6/50

Epoch 6: val_loss improved from 2.54051 to 2.40698, saving model to model.h5




150/150 - 3s - 18ms/step - loss: 2.1076 - val_loss: 2.4070
Epoch 7/50

Epoch 7: val_loss improved from 2.40698 to 2.29405, saving model to model.h5




150/150 - 5s - 36ms/step - loss: 1.8898 - val_loss: 2.2941
Epoch 8/50

Epoch 8: val_loss improved from 2.29405 to 2.19502, saving model to model.h5




150/150 - 5s - 33ms/step - loss: 1.6830 - val_loss: 2.1950
Epoch 9/50

Epoch 9: val_loss improved from 2.19502 to 2.13637, saving model to model.h5




150/150 - 3s - 17ms/step - loss: 1.4936 - val_loss: 2.1364
Epoch 10/50

Epoch 10: val_loss improved from 2.13637 to 2.06923, saving model to model.h5




150/150 - 3s - 17ms/step - loss: 1.3174 - val_loss: 2.0692
Epoch 11/50

Epoch 11: val_loss improved from 2.06923 to 2.01600, saving model to model.h5




150/150 - 3s - 19ms/step - loss: 1.1495 - val_loss: 2.0160
Epoch 12/50

Epoch 12: val_loss improved from 2.01600 to 1.98166, saving model to model.h5




150/150 - 5s - 33ms/step - loss: 0.9923 - val_loss: 1.9817
Epoch 13/50

Epoch 13: val_loss improved from 1.98166 to 1.93748, saving model to model.h5




150/150 - 5s - 34ms/step - loss: 0.8491 - val_loss: 1.9375
Epoch 14/50

Epoch 14: val_loss improved from 1.93748 to 1.91094, saving model to model.h5




150/150 - 5s - 33ms/step - loss: 0.7237 - val_loss: 1.9109
Epoch 15/50

Epoch 15: val_loss improved from 1.91094 to 1.88856, saving model to model.h5




150/150 - 5s - 34ms/step - loss: 0.6111 - val_loss: 1.8886
Epoch 16/50

Epoch 16: val_loss improved from 1.88856 to 1.86727, saving model to model.h5




150/150 - 5s - 35ms/step - loss: 0.5114 - val_loss: 1.8673
Epoch 17/50

Epoch 17: val_loss improved from 1.86727 to 1.85831, saving model to model.h5




150/150 - 5s - 33ms/step - loss: 0.4299 - val_loss: 1.8583
Epoch 18/50

Epoch 18: val_loss did not improve from 1.85831
150/150 - 5s - 34ms/step - loss: 0.3649 - val_loss: 1.8592
Epoch 19/50

Epoch 19: val_loss improved from 1.85831 to 1.85231, saving model to model.h5




150/150 - 5s - 34ms/step - loss: 0.3093 - val_loss: 1.8523
Epoch 20/50

Epoch 20: val_loss did not improve from 1.85231
150/150 - 6s - 38ms/step - loss: 0.2633 - val_loss: 1.8582
Epoch 21/50

Epoch 21: val_loss did not improve from 1.85231
150/150 - 5s - 31ms/step - loss: 0.2262 - val_loss: 1.8764
Epoch 22/50

Epoch 22: val_loss did not improve from 1.85231
150/150 - 3s - 18ms/step - loss: 0.1987 - val_loss: 1.8683
Epoch 23/50

Epoch 23: val_loss did not improve from 1.85231
150/150 - 5s - 34ms/step - loss: 0.1750 - val_loss: 1.8846
Epoch 24/50

Epoch 24: val_loss did not improve from 1.85231
150/150 - 5s - 32ms/step - loss: 0.1591 - val_loss: 1.8879
Epoch 25/50

Epoch 25: val_loss did not improve from 1.85231
150/150 - 2s - 16ms/step - loss: 0.1447 - val_loss: 1.8983
Epoch 26/50

Epoch 26: val_loss did not improve from 1.85231
150/150 - 3s - 17ms/step - loss: 0.1347 - val_loss: 1.9106
Epoch 27/50

Epoch 27: val_loss did not improve from 1.85231
150/150 - 3s - 18ms/step - loss: 0.1261 

<keras.src.callbacks.history.History at 0x780b548542f0>

In [15]:
from pickle import load
from numpy import array
from numpy import argmax
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import load_model
from nltk.translate.bleu_score import corpus_bleu


In [16]:
def load_clean_sentences(filename):
  return load(open(filename, 'rb'))

def create_tokenizer(lines):
  tokenizer=Tokenizer()
  tokenizer.fit_on_texts(lines)
  return tokenizer

def encode_sequences(tokenizer,length, lines):
  X=tokenizer.texts_to_sequences(lines)
  X=pad_sequences(X, maxlen=length, padding='post')
  return X

def word_for_id(integer, tokenizer):
  for word, index in tokenizer.word_index.items():
    if index == integer:
      return word
  return None

def predict_sequence(model, tokenizer, source):
  prediction=model.predict(source, verbose=0)[0]
  integers=[argmax(vector) for vector in prediction]
  target = list()
  for i in integers:
    word = word_for_id(i, tokenizer)
    if word is None:
      break
    target.append(word)
  return ' '.join(target)

def evaluate_model(model, tokenizer, sources, raw_dataset):
  actual, predicted = list(), list()
  for i, source in enumerate(sources):
    source = source.reshape((1, source.shape[0]))
    translation = predict_sequence(model, eng_tokenizer, source)
    raw_target, raw_src, _ = raw_dataset[i] # Modified to unpack three values and ignore the third
    if i < 10:
      print('src=[%s], target=[%s], predicted=[%s]' % (raw_src, raw_target, translation))
    actual.append([raw_target.split()])
    predicted.append(translation.split())
  print('BLEU-1: %f' % corpus_bleu(actual, predicted, weights=(1.0, 0, 0, 0)))
  print('BLEU-2: %f' % corpus_bleu(actual, predicted, weights=(0.5, 0.5, 0, 0)))
  print('BLEU-3: %f' % corpus_bleu(actual, predicted, weights=(0.3, 0.3, 0.3, 0)))
  print('BLEU-4: %f' % corpus_bleu(actual, predicted, weights=(0.25, 0.25, 0.25, 0.25)))

In [17]:
dataset = load_clean_sentences('/content/english-german-both.pkl')
train = load_clean_sentences('/content/english-german-train.pkl')
test = load_clean_sentences('/content/english-german-test.pkl')

eng_tokenizer = create_tokenizer(dataset[:, 0])
eng_vocab_size = len(eng_tokenizer.word_index) + 1
eng_length = max_length(dataset[:, 0])

ger_tokenizer = create_tokenizer(dataset[:, 1])
ger_vocab_size = len(ger_tokenizer.word_index) + 1
ger_length = max_length(dataset[:, 1])

trainX = encode_sequences(ger_tokenizer, ger_length, train[:, 1])
testX = encode_sequences(ger_tokenizer, ger_length, test[:, 1])

model = load_model('/content/model.h5')


