In [1]:
import string
import re
from numpy import array, argmax, random, take
import pandas as pd
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding, Bidirectional, RepeatVector, TimeDistributed
from keras.preprocessing.text import Tokenizer
from keras.callbacks import ModelCheckpoint
from keras.preprocessing.sequence import pad_sequences
from keras.models import load_model
from keras import optimizers
import matplotlib.pyplot as plt
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Raise pandas' display.max_colwidth option to 200 for the frames shown in this notebook.
pd.set_option('display.max_colwidth' , 200)

In [2]:
def read_text(filename):
    """Read a UTF-8 text file and return its whole content as one string.

    Parameters
    ----------
    filename : str or path-like
        Path of the file to read.

    Returns
    -------
    str
        The complete file content.

    Raises
    ------
    OSError
        If the file cannot be opened.
    UnicodeDecodeError
        If the content is not valid UTF-8.
    """
    # The context manager closes the handle even when read() raises,
    # which the manual open()/close() pair did not guarantee.
    with open(filename, mode='rt', encoding='utf-8') as file:
        return file.read()

In [3]:
# split a text into sentences
def to_lines(text):
    """Split raw text into one list of tab-separated fields per line.

    Leading and trailing whitespace of the whole text is stripped first,
    then the text is cut on newlines and every line is cut on tabs.
    Returns a list of lists of strings.
    """
    rows = []
    for line in text.strip().split('\n'):
        rows.append(line.split('\t'))
    return rows

In [4]:
# Load the raw corpus text and split it into per-line lists of tab-separated fields.
data = read_text("../input/deutch-to-english/deu.txt")
deu_eng = to_lines(data)
# NOTE(review): array() only yields a regular 2-D array if every line has the
# same number of tab-separated fields — confirm for this file.
deu_eng = array(deu_eng)

In [5]:
# Keep only the first 50,000 rows (all columns) of the corpus array.
deu_eng =deu_eng[:50000,:]

In [6]:
# Display the (already sliced) sentence-pair array as a sanity check of the load step.
deu_eng

In [7]:
# Whitespace-separated word count of every sentence:
# column 0 feeds eng_1, column 1 feeds deu_1.
eng_1 = [len(sentence.split()) for sentence in deu_eng[:,0]]
deu_1 = [len(sentence.split()) for sentence in deu_eng[:,1]]

In [11]:
# Histogram of sentence lengths (in words) for both columns.
# The frame is named length_df here because the next line reads that name;
# the previous spelling ("lenght_df") left it undefined.
length_df = pd.DataFrame({'eng':eng_1, 'deu':deu_1})
length_df.hist(bins = 30)
# A module-level `return` is a SyntaxError, so the cell ends with the plot.
plt.show()

In [None]:
# function to build a tokenizer
def tokenization(lines):
    """Create a ``Tokenizer``, fit it on ``lines`` and return the fitted object.

    ``lines`` is passed unchanged to ``Tokenizer.fit_on_texts``.
    """
    fitted = Tokenizer()
    fitted.fit_on_texts(lines)
    return fitted

In [None]:
# prepare English tokenizer (fitted on column 0 of the corpus)
eng_tokenizer = tokenization(deu_eng[:,0])
# Vocabulary size is the number of known words plus one.
# NOTE(review): the +1 presumably reserves index 0, which the padding step uses as filler — confirm.
eng_vocab_size = len(eng_tokenizer.word_index) + 1

# Number of output timesteps handed to build_model for the English side.
eng_length = 8
print ('English Vocabulary Size: %d' % eng_vocab_size)

In [None]:
# prepare German (Deutsch) tokenizer
# Fit on column 1, the German sentences. Column 0 holds the English sentences
# and is what the English tokenizer is fitted on; fitting this tokenizer on
# column 0 as well would produce a second English vocabulary.
deu_tokenizer = tokenization(deu_eng[:,1])
# Vocabulary size is the number of known words plus one.
# NOTE(review): the +1 presumably reserves index 0, which the padding step uses as filler — confirm.
deu_vocab_size = len(deu_tokenizer.word_index) + 1

# Number of input timesteps handed to build_model for the German side.
deu_length = 8
print ('Deutch Vocabulary Size: %d' % deu_vocab_size)

In [None]:
# encode and pad sequences
def encode_sequences(tokenizer, length, lines):
    """Integer-encode ``lines`` and pad or truncate every sequence to ``length``.

    Parameters
    ----------
    tokenizer : fitted tokenizer exposing ``texts_to_sequences``
    length : int
        Value passed as ``maxlen`` to ``pad_sequences``.
    lines : iterable of str
        Sentences to encode.

    Returns
    -------
    The padded result of ``pad_sequences`` (padding added at the end, ``padding='post'``).
    """
    # integer encode sequences
    seq = tokenizer.texts_to_sequences(lines)
    # pad sequences with 0 values
    seq = pad_sequences(seq, maxlen=length, padding='post')
    return seq

In [None]:
# NOTE(review): this import sits mid-notebook rather than in the import cell at the top.
from sklearn.model_selection import train_test_split
# Hold out 20% of the sentence pairs as `test`; random_state is fixed so the split is repeatable.
train, test = train_test_split(deu_eng, test_size=0.2, random_state =12)

In [None]:
# prepare training data
# Model input: column 1 of the training pairs, encoded with the German tokenizer.
trainX = encode_sequences(deu_tokenizer, deu_length, train[:, 1])
# Model target: column 0 (English) must use the English tokenizer and length,
# because the model's output layer is sized with eng_vocab_size / eng_length
# and the predictions are decoded with eng_tokenizer.
trainY = encode_sequences(eng_tokenizer, eng_length, train[:, 0])

In [None]:
# prepare validation data
# Encode the held-out `test` pairs (not `train`): the predictions made from
# testX are later placed next to test[:,0], so both must come from the same rows.
testX = encode_sequences(deu_tokenizer, deu_length, test[:, 1])
# English targets use the English tokenizer and length, matching the model output.
testY = encode_sequences(eng_tokenizer, eng_length, test[:, 0])

In [None]:
# build NMT model
def build_model(in_vocab,out_vocab, in_timesteps, out_timesteps, units):
    """Assemble the sequence-to-sequence network as a ``Sequential`` stack.

    Layers, in order: an ``Embedding`` over ``in_vocab`` ids (zero masked),
    an ``LSTM`` that returns its final state, a ``RepeatVector`` that repeats
    that state ``out_timesteps`` times, a second ``LSTM`` returning the full
    sequence, and a softmax ``Dense`` layer over ``out_vocab`` classes.
    The returned model is not compiled.
    """
    stack = [
        Embedding(in_vocab, units, input_length=in_timesteps, mask_zero=True),
        LSTM(units),
        RepeatVector(out_timesteps),
        LSTM(units, return_sequences=True),
        Dense(out_vocab, activation='softmax'),
    ]
    net = Sequential()
    for layer in stack:
        net.add(layer)
    return net
    

In [None]:
# Build the network: German vocabulary/length on the input side, English on the output side, 512 units.
model = build_model(deu_vocab_size, eng_vocab_size, deu_length, eng_length, 512)
# NOTE(review): newer Keras releases name this argument `learning_rate`; `lr` matches
# the older Keras API this notebook appears to target — confirm against the installed version.
rms = optimizers.RMSprop(lr=0.001)
# NOTE(review): the sparse loss presumably expects integer class ids as targets (not one-hot) — confirm.
model.compile(optimizer=rms, loss='sparse_categorical_crossentropy')


In [None]:
# Path the best model is written to.
# NOTE(review): newer Keras releases may require a `.keras`/`.h5` suffix here — confirm.
filename = 'model.h1.21_mayurm'
# Save a checkpoint only when the validation loss reaches a new minimum.
checkpoint = ModelCheckpoint(filename, monitor='val_loss', verbose=1, save_best_only=True,mode='min')

# Targets are reshaped to (samples, timesteps, 1) before being passed to fit().
# The keyword is `epochs`; `epoch` is not accepted by fit().
history = model.fit(trainX, trainY.reshape(trainY.shape[0],trainY.shape[1],1),
                   epochs=5, batch_size=512,
                   validation_split = 0.2,
                   callbacks=[checkpoint], verbose=1)

In [12]:
# Training vs. validation loss per epoch, as recorded by fit().
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
# Label the figure so it can be read without the surrounding code.
plt.title('Training history')
plt.xlabel('epoch')
plt.ylabel('loss')
plt.legend(['train','validation'])
plt.show()


In [None]:
# Reload the checkpoint written during training.
model = load_model('model.h1.21_mayurm')
# The Embedding layer takes 2-D input (samples, timesteps), so testX is passed
# in that shape (the previous expression referenced an undefined name `shape`).
# argmax over the last (vocabulary) axis of the softmax output yields one class
# id per timestep; this is what predict_classes returned, and it also works on
# Keras releases where predict_classes is no longer available.
preds = argmax(model.predict(testX.reshape((testX.shape[0], testX.shape[1]))), axis=-1)

In [None]:
def get_word(n, tokenizer):
    """Return the word whose index in ``tokenizer.word_index`` equals ``n``.

    Parameters
    ----------
    n : int
        Index to look up.
    tokenizer : object with a ``word_index`` mapping of word -> index

    Returns
    -------
    str or None
        The matching word, or ``None`` when no word has index ``n``.
    """
    for word, index in tokenizer.word_index.items():
        if index == n:
            return word
    # Reached only after every entry was checked without a match. With the
    # return inside the loop, the search stopped after the first entry.
    return None

In [None]:
# convert predictions into text (English)
# Build the index -> word table once, so every predicted id is a dictionary
# lookup instead of a scan over the whole vocabulary.
index_to_word = {index: word for word, index in eng_tokenizer.word_index.items()}

preds_text = []
for sequence in preds:
    words = []
    previous = None
    for position, token_id in enumerate(sequence):
        # int() turns a numpy integer into a plain int key; ids without a
        # word (such as the padding id 0) map to None.
        word = index_to_word.get(int(token_id))
        # Emit a blank for an unknown id, or for a word that merely repeats
        # the word predicted at the previous position.
        if word is None or (position > 0 and word == previous):
            words.append('')
        else:
            words.append(word)
        previous = word
    # One output string per predicted sentence.
    preds_text.append(' '.join(words))

In [None]:
# Reference English sentences next to the decoded predictions. The frame is
# named pred_df because the following cells read it under that name.
pred_df = pd.DataFrame({'actual' : test[:,0], 'predicted' : preds_text})

In [None]:
# Same display.max_colwidth setting as in the first cell of the notebook, repeated before showing results.
pd.set_option('display.max_colwidth', 200)

In [None]:
# First 15 rows of the actual-vs-predicted frame.
pred_df.head(15)

In [13]:
# Last 15 rows of the actual-vs-predicted frame.
pred_df.tail(15)