<a href="https://colab.research.google.com/github/namita0210/german_to_english_machine_translation/blob/main/German_to_English_Machine_Translation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [26]:
import re
import string
from numpy import array
from pickle import dump
from unicodedata import normalize
from pickle import load
from pickle import dump
from numpy.random import rand
from numpy.random import shuffle
from pickle import load
from numpy import array
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dense
from keras.layers import Embedding
from keras.layers import RepeatVector
from keras.layers import TimeDistributed
from keras.callbacks import ModelCheckpoint

In [27]:
from google.colab import drive
drive.mount('/content/drive')
data = '/content/drive/MyDrive/deu.txt'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [28]:
#function to load the file and preserve the unicode german characters
def load_file(filename):
  file = open(filename , 'r', encoding='utf-8')
  text = file.read()
  file.close()
  return text

In [29]:
text=load_file(data)
print(text[:99])

Hi.	Hallo!
Hi.	Grüß Gott!
Run!	Lauf!
Wow!	Potzdonner!
Wow!	Donnerwetter!
Fire!	Feuer!
Help!	Hilfe!



In [30]:
#split the text by phrases
def to_phrase(doc):
  lines = doc.strip().split('\n')
  phrases =[line.split('\t') for line in lines]
  return phrases

In [31]:
phrases=to_phrase(text)
print(phrases[:3])
print(type(phrases))

[['Hi.', 'Hallo!'], ['Hi.', 'Grüß Gott!'], ['Run!', 'Lauf!']]
<class 'list'>


In [32]:
string.printable

'0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~ \t\n\r\x0b\x0c'

In [34]:
def clean(lines):
    cleaned = []
    re_print = re.compile('[^%s]' % re.escape(string.printable))
    table = str.maketrans('','', string.punctuation)

    for l in lines:
        clean_pair=[]
        for x in l :
            x = normalize('NFD',x).encode('ascii','ignore')
            x = x.decode('UTF-8')
            x = x.split()
            x = [word.lower() for word in x]
            x = [word.translate(table) for word in x]
            x = [re_print.sub('' , word) for word in x]
            x = [word for word in x if word.isalpha()]
            clean_pair.append(' '.join(x))
        cleaned.append(clean_pair)
    return array(cleaned)

In [35]:
# save a list of clean sentences to file
def save_clean_data(sentences, filename):
  dump(sentences, open(filename, 'wb'))
  print('Saved: %s' % filename)

In [37]:
doc = load_file(filename=data)
pairs = to_phrase(doc)
clean_pairs = clean(pairs)

In [38]:
clean_pairs

array([['hi', 'hallo'],
       ['hi', 'gru gott'],
       ['run', 'lauf'],
       ...,
       ['if someone who doesnt know your background says that you sound like a native speaker it means they probably noticed something about your speaking that made them realize you werent a native speaker in other words you dont really sound like a native speaker',
        'wenn jemand der deine herkunft nicht kennt sagt dass du wie ein muttersprachler sprichst bedeutet das dass man wahrscheinlich etwas an deiner sprechweise bemerkt hat das erkennen lie dass du kein muttersprachler bist mit anderen worten du horst dich nicht wirklich wie ein muttersprachler an'],
       ['if someone who doesnt know your background says that you sound like a native speaker it means they probably noticed something about your speaking that made them realize you werent a native speaker in other words you dont really sound like a native speaker',
        'wenn jemand fremdes dir sagt dass du dich wie ein muttersprachler 

In [40]:
save_clean_data(clean_pairs, 'ENGLISH-GERMAN.pkl') # Run Only Once.

Saved: ENGLISH-GERMAN.pkl


In [41]:
#train-test-split

def load_clean_sentences(filename):
 return load(open(filename, 'rb'))

raw_dataset = load_clean_sentences('ENGLISH-GERMAN.pkl')

In [45]:
n_sentences = 10000
dataset = raw_dataset[:n_sentences, :]
shuffle(dataset)
train, test = dataset[:9000], dataset[9000:]
save_clean_data(dataset, 'EG-BOTH.pkl')
save_clean_data(train, 'EG-TRAIN.pkl')
save_clean_data(test, 'EG-TEST.pkl')

Saved: EG-BOTH.pkl
Saved: EG-TRAIN.pkl
Saved: EG-TEST.pkl


In [46]:
#Train Neural Translation model
#Start by loading the train - test data stored in pkl files
# Step 7 : Load the pkl files

dataset = load_clean_sentences('EG-BOTH.pkl')
train = load_clean_sentences('EG-TRAIN.pkl')
test = load_clean_sentences('EG-TEST.pkl')

In [47]:
#We can use the Keras Tokenize class to map words to integers, as needed for modeling.
#We will use separate tokenizer for the English sequences and the German sequences.
#The function below-named create_tokenizer() will train a tokenizer on a list of phrases
#Step 8 : Create tokenizer
def create_tokenizer(lines):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(lines)
    return tokenizer

In [48]:
# Find the length of longest sequence in the list of phrases
# max sentence length
def max_length(lines):
 return max(len(line.split()) for line in lines)

In [49]:
# prepare english tokenizer
eng_tokenizer = create_tokenizer(dataset[:, 0])
eng_vocab_size = len(eng_tokenizer.word_index) + 1
eng_length = max_length(dataset[:, 0])
print('English Vocabulary Size: %d' % eng_vocab_size)
print('English Max Length: %d' % (eng_length))

English Vocabulary Size: 2404
English Max Length: 5


In [50]:
# prepare german tokenizer
ger_tokenizer = create_tokenizer(dataset[:, 1])
ger_vocab_size = len(ger_tokenizer.word_index) + 1
ger_length = max_length(dataset[:, 0])
print('German Vocabulary Size: %d' % ger_vocab_size)
print('German Max Length: %d' % (ger_length))

German Vocabulary Size: 3856
German Max Length: 5
