In [114]:
import numpy as np 
import pandas as pd 
import os
import re
import matplotlib.pyplot as plt
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, GRU, LSTM, Bidirectional,RepeatVector,TimeDistributed
from keras.layers.embeddings import Embedding
from keras.initializers import Constant
from keras.callbacks import ModelCheckpoint
from keras.models import load_model

In [71]:
data = pd.read_csv('deu.txt', sep="\t", header=None)
data.columns = ["english", "german", "garbage"]

In [72]:
data.head()

Unnamed: 0,english,german,garbage
0,Go.,Geh.,CC-BY 2.0 (France) Attribution: tatoeba.org #2...
1,Hi.,Hallo!,CC-BY 2.0 (France) Attribution: tatoeba.org #5...
2,Hi.,Grüß Gott!,CC-BY 2.0 (France) Attribution: tatoeba.org #5...
3,Run!,Lauf!,CC-BY 2.0 (France) Attribution: tatoeba.org #9...
4,Run.,Lauf!,CC-BY 2.0 (France) Attribution: tatoeba.org #4...


In [73]:
data.drop('garbage', axis=1, inplace=True)

In [74]:
data.head()

Unnamed: 0,english,german
0,Go.,Geh.
1,Hi.,Hallo!
2,Hi.,Grüß Gott!
3,Run!,Lauf!
4,Run.,Lauf!


In [75]:
def clean_text(text):
    text = text.lower()
    
    pattern = re.compile('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    text = pattern.sub('', text)
    text = " ".join(filter(lambda x:x[0]!='@', text.split()))
    emoji = re.compile("["
                           u"\U0001F600-\U0001FFFF"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    
    text = emoji.sub(r'', text)
    text = text.lower()
    text = re.sub(r"i'm", "i am", text)
    text = re.sub(r"he's", "he is", text)
    text = re.sub(r"she's", "she is", text)
    text = re.sub(r"that's", "that is", text)        
    text = re.sub(r"what's", "what is", text)
    text = re.sub(r"where's", "where is", text) 
    text = re.sub(r"\'ll", " will", text)  
    text = re.sub(r"\'ve", " have", text)  
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"\'d", " would", text)
    text = re.sub(r"\'ve", " have", text)
    text = re.sub(r"won't", "will not", text)
    text = re.sub(r"don't", "do not", text)
    text = re.sub(r"did't", "did not", text)
    text = re.sub(r"can't", "can not", text)
    text = re.sub(r"it's", "it is", text)
    text = re.sub(r"couldn't", "could not", text)
    text = re.sub(r"have't", "have not", text)
    text = re.sub(r"[,.\"\'!@#$%^&*(){}?/;`~:<>+=-]", "", text)
    return text

In [76]:
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

In [77]:
def CleanTokenize(df):
    sentences = list()
    lines = df["english"].values.tolist()

    for line in lines:
        line = clean_text(line)
        # tokenize the text
        tokens = word_tokenize(line)
        # remove puntuations
        table = str.maketrans('', '', string.punctuation)
        stripped = [w.translate(table) for w in tokens]
        # remove non alphabetic characters
        words = [word for word in stripped if word.isalpha()]
        stop_words = set(stopwords.words("english"))
        # remove stop words
        words = [w for w in words if not w in stop_words]
        sentences.append(words)
    return sentences

In [78]:
eng_sentences = CleanTokenize(data)
eng_sentences[0:10]

[['go'],
 ['hi'],
 ['hi'],
 ['run'],
 ['run'],
 ['wow'],
 ['wow'],
 ['fire'],
 ['help'],
 ['help']]

In [79]:
from pickle import dump

In [80]:
dump(eng_sentences, open('english.pkl', 'wb'))

In [81]:
def clean_text_german(text):
    text = text.lower()
    
    pattern = re.compile('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    text = pattern.sub('', text)
    text = " ".join(filter(lambda x:x[0]!='@', text.split()))
    emoji = re.compile("["
                           u"\U0001F600-\U0001FFFF"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    
    text = emoji.sub(r'', text)
    text = text.lower()
    text = re.sub(r"[,.\"\'!@#$%^&*(){}?/;`~:<>+=-]", "", text)
    return text

In [82]:
def CleanTokenize_german(df):
    sentences = list()
    lines = df["german"].values.tolist()

    for line in lines:
        line = clean_text_german(line)
        # tokenize the text
        tokens = word_tokenize(line)
        # remove puntuations
        table = str.maketrans('', '', string.punctuation)
        stripped = [w.translate(table) for w in tokens]
        # remove non alphabetic characters
        words = [word for word in stripped if word.isalpha()]
        sentences.append(words)
    return sentences

In [83]:
ger_sentences = CleanTokenize_german(data)
ger_sentences[0:10]

[['geh'],
 ['hallo'],
 ['grüß', 'gott'],
 ['lauf'],
 ['lauf'],
 ['potzdonner'],
 ['donnerwetter'],
 ['feuer'],
 ['hilfe'],
 ['zu', 'hülf']]

In [84]:
dump(eng_sentences, open('german.pkl', 'wb'))

In [85]:
from pickle import load

In [86]:
max(len(line) for line in eng_sentences)

44

In [87]:
max(len(line) for line in ger_sentences)

76

In [88]:
max_length = 44

tokenizer_eng = Tokenizer()
tokenizer_eng.fit_on_texts(eng_sentences)
eng_sequences = tokenizer_eng.texts_to_sequences(eng_sentences)

word_index_eng = tokenizer_eng.word_index
print("unique tokens - ",len(word_index_eng))
vocab_size = len(tokenizer_eng.word_index) + 1
print('vocab size -', vocab_size)

eng_pad = pad_sequences(eng_sequences, maxlen=max_length, padding='post')

unique tokens -  34239
vocab size - 15862


In [89]:
eng_pad.shape

(208486, 44)

In [90]:
eng_pad[170253]

array([680,  47,   3,  10,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0])

In [91]:
max_length = 76

tokenizer_ger = Tokenizer()
tokenizer_ger.fit_on_texts(ger_sentences)
ger_sequences = tokenizer_ger.texts_to_sequences(ger_sentences)

word_index_ger = tokenizer_ger.word_index
print("unique tokens - ",len(word_index))
vocab_size = len(tokenizer_ger.word_index) + 1
print('vocab size -', vocab_size)

ger_pad = pad_sequences(ger_sequences, maxlen=max_length, padding='post')

unique tokens -  34239
vocab size - 34240


In [92]:
ger_pad.shape

(208486, 76)

In [93]:
ger_pad[170253]

array([   1,   18,   22, 1591, 1214,  871,    8,   61,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0])

In [148]:
indices = np.arange(eng_pad.shape[0])
np.random.shuffle(indices)
eng_pad = eng_pad[indices]
ger_pad = ger_pad[indices]

num_validation_samples = int(0.2 * ger_pad.shape[0])

X_train_pad = eng_pad[:-num_validation_samples]
y_train = ger_pad[:-num_validation_samples]
y_train = np.asarray(y_train, dtype=np.float32)
print(type(y_train))
y_train = np.expand_dims(y_train, axis=2)
X_test_pad = eng_pad[-num_validation_samples:]
y_test = ger_pad[-num_validation_samples:]
y_test = np.asarray(y_test, dtype=np.float32)
print(type(y_test))
y_test = np.expand_dims(y_test, axis=2)
y_test.shape

<class 'numpy.ndarray'>
<class 'numpy.ndarray'>


(41697, 76, 1)

In [149]:
X_train_pad.shape

(166789, 44)

In [150]:
y_train.shape

(166789, 76, 1)

In [151]:
X_test_pad.shape

(41697, 44)

In [152]:
y_test.shape

(41697, 76, 1)

In [99]:
embeddings_index = {}
embedding_dim = 300
GLOVE_DIR = "O:\Glove"
f = open(os.path.join(GLOVE_DIR, 'glove.6B.300d.txt'), encoding = "utf-8")
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))

Found 400000 word vectors.


In [107]:
embedding_matrix = np.zeros((len(word_index_eng) + 1, embedding_dim))
c = 0
for word, i in word_index_eng.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        c+=1
        embedding_matrix[i] = embedding_vector
print(c)

15309


In [108]:
embedding_matrix.shape

(15862, 300)

In [109]:
print(embeddings_index.get("hello"))

[-3.3712e-01 -2.1691e-01 -6.6365e-03 -4.1625e-01 -1.2555e+00 -2.8466e-02
 -7.2195e-01 -5.2887e-01  7.2085e-03  3.1997e-01  2.9425e-02 -1.3236e-02
  4.3511e-01  2.5716e-01  3.8995e-01 -1.1968e-01  1.5035e-01  4.4762e-01
  2.8407e-01  4.9339e-01  6.2826e-01  2.2888e-01 -4.0385e-01  2.7364e-02
  7.3679e-03  1.3995e-01  2.3346e-01  6.8122e-02  4.8422e-01 -1.9578e-02
 -5.4751e-01 -5.4983e-01 -3.4091e-02  8.0017e-03 -4.3065e-01 -1.8969e-02
 -8.5670e-02 -8.1123e-01 -2.1080e-01  3.7784e-01 -3.5046e-01  1.3684e-01
 -5.5661e-01  1.6835e-01 -2.2952e-01 -1.6184e-01  6.7345e-01 -4.6597e-01
 -3.1834e-02 -2.6037e-01 -1.7797e-01  1.9436e-02  1.0727e-01  6.6534e-01
 -3.4836e-01  4.7833e-02  1.6440e-01  1.4088e-01  1.9204e-01 -3.5009e-01
  2.6236e-01  1.7626e-01 -3.1367e-01  1.1709e-01  2.0378e-01  6.1775e-01
  4.9075e-01 -7.5210e-02 -1.1815e-01  1.8685e-01  4.0679e-01  2.8319e-01
 -1.6290e-01  3.8388e-02  4.3794e-01  8.8224e-02  5.9046e-01 -5.3515e-02
  3.8819e-02  1.8202e-01 -2.7599e-01  3.9474e-01 -2

In [110]:
print(embedding_matrix[word_index_eng.get("hello")])

[-3.37119997e-01 -2.16910005e-01 -6.63649989e-03 -4.16249990e-01
 -1.25549996e+00 -2.84659993e-02 -7.21949995e-01 -5.28869987e-01
  7.20850006e-03  3.19970012e-01  2.94250008e-02 -1.32360002e-02
  4.35110003e-01  2.57160008e-01  3.89950007e-01 -1.19680002e-01
  1.50350004e-01  4.47620004e-01  2.84069985e-01  4.93389994e-01
  6.28260016e-01  2.28880003e-01 -4.03849989e-01  2.73640007e-02
  7.36790011e-03  1.39950007e-01  2.33459994e-01  6.81219995e-02
  4.84219998e-01 -1.95780005e-02 -5.47510028e-01 -5.49830019e-01
 -3.40909995e-02  8.00170004e-03 -4.30649996e-01 -1.89689994e-02
 -8.56700018e-02 -8.11230004e-01 -2.10800007e-01  3.77840012e-01
 -3.50459993e-01  1.36840001e-01 -5.56609988e-01  1.68349996e-01
 -2.29519993e-01 -1.61840007e-01  6.73449993e-01 -4.65970010e-01
 -3.18339989e-02 -2.60369986e-01 -1.77970007e-01  1.94359999e-02
  1.07270002e-01  6.65340006e-01 -3.48360002e-01  4.78329994e-02
  1.64399996e-01  1.40880004e-01  1.92039996e-01 -3.50089997e-01
  2.62360007e-01  1.76259

In [121]:
model = Sequential()
model.add(Embedding(len(word_index_eng) + 1,
                            embedding_dim,
                            weights=[embedding_matrix],
                            input_length=44,
                            trainable=False))
model.add(Bidirectional(LSTM(1024, dropout=0.2, recurrent_dropout=0.25,return_sequences=True),merge_mode="sum"))
model.add(LSTM(512, dropout=0.2, recurrent_dropout=0.25))
model.add(RepeatVector(76))
model.add(LSTM(1024, dropout=0.2, recurrent_dropout=0.25, return_sequences=True))
model.add(TimeDistributed(Dense(34240, activation='softmax')))

In [122]:
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['acc'])

print(model.summary())

Model: "sequential_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_7 (Embedding)      (None, 44, 300)           4758600   
_________________________________________________________________
bidirectional_5 (Bidirection (None, 44, 1024)          10854400  
_________________________________________________________________
lstm_16 (LSTM)               (None, 512)               3147776   
_________________________________________________________________
repeat_vector_5 (RepeatVecto (None, 76, 512)           0         
_________________________________________________________________
lstm_17 (LSTM)               (None, 76, 1024)          6295552   
_________________________________________________________________
time_distributed_4 (TimeDist (None, 76, 34240)         35096000  
Total params: 60,152,328
Trainable params: 55,393,728
Non-trainable params: 4,758,600
__________________________________

In [None]:
checkpoint = ModelCheckpoint('model-{epoch:03d}.model',monitor='val_loss',verbose=0,save_best_only=True,mode='auto')

history = model.fit(X_train_pad, y_train, batch_size=32, epochs=30, callbacks=[checkpoint], validation_split=0.2)

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where

Train on 133431 samples, validate on 33358 samples
Epoch 1/30
   416/133431 [..............................] - ETA: 620:15:30 - loss: 4.2749 - acc: 0.8438