In [2]:
#!wget  http://www.manythings.org/anki/fra-eng.zip


In [1]:
import re
import string
from numpy import array, argmax, random, take
import pandas as pd
import tensorflow
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, RepeatVector
from keras.callbacks import ModelCheckpoint
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.models import load_model
from keras import optimizers
#import matplotlib.pyplot as plt
#%matplotlib inline
pd.set_option('display.max_colwidth', 200)



In [1]:
data_path = 'fra.txt' 
with open(data_path, 'r', encoding='utf-8') as f:
    lines = f.read()
    
#lines

In [3]:
def to_line(text):
    sents = text.strip().split('\n')
    sents = [i.split('\t') for i in sents]
    return sents

In [4]:
fra_eng = to_line(lines)
fra_eng[:5]

[['Go.',
  'Va !',
  'CC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #1158250 (Wittydev)'],
 ['Go.',
  'Marche.',
  'CC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #8090732 (Micsmithel)'],
 ['Go.',
  'En route !',
  'CC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #8267435 (felix63)'],
 ['Go.',
  'Bouge !',
  'CC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #9022935 (Micsmithel)'],
 ['Hi.',
  'Salut !',
  'CC-BY 2.0 (France) Attribution: tatoeba.org #538123 (CM) & #509819 (Aiji)']]

In [5]:
fra_eng = array(fra_eng)
fra_eng[:5]

array([['Go.', 'Va !',
        'CC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #1158250 (Wittydev)'],
       ['Go.', 'Marche.',
        'CC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #8090732 (Micsmithel)'],
       ['Go.', 'En route !',
        'CC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #8267435 (felix63)'],
       ['Go.', 'Bouge !',
        'CC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #9022935 (Micsmithel)'],
       ['Hi.', 'Salut !',
        'CC-BY 2.0 (France) Attribution: tatoeba.org #538123 (CM) & #509819 (Aiji)']],
      dtype='<U349')

In [6]:
fra_eng.shape

(208906, 3)

In [7]:
fra_eng = fra_eng[:90000, :]  #here we are taking only 90000 for easyness
fra_eng = fra_eng[:,[0,1]]   #3rd col s not imp so we are eleminating it
fra_eng[:5]


array([['Go.', 'Va !'],
       ['Go.', 'Marche.'],
       ['Go.', 'En route !'],
       ['Go.', 'Bouge !'],
       ['Hi.', 'Salut !']], dtype='<U349')

In [9]:
#data cleaning

fra_eng[:,0] = [s.translate(str.maketrans('', '', string.punctuation)) for s in fra_eng[:,0]]
fra_eng[:,1] = [s.translate(str.maketrans('', '', string.punctuation)) for s in fra_eng[:,1]]

fra_eng

array([['Go', 'Va '],
       ['Go', 'Marche'],
       ['Go', 'En route '],
       ...,
       ['Have you ever shot anybody', 'Avezvous déjà tiré sur quelquun '],
       ['Have you finished dressing', 'Astu fini de thabiller '],
       ['Have you finished dressing', 'Avezvous fini de vous habiller ']],
      dtype='<U349')

In [13]:
# to lower case
for i in range(len(fra_eng)):
    fra_eng[i,0] = fra_eng[i,0].lower()
    fra_eng[i,1] = fra_eng[i,1].lower()
    
fra_eng

array([['go', 'va '],
       ['go', 'marche'],
       ['go', 'en route '],
       ...,
       ['have you ever shot anybody', 'avezvous déjà tiré sur quelquun '],
       ['have you finished dressing', 'astu fini de thabiller '],
       ['have you finished dressing', 'avezvous fini de vous habiller ']],
      dtype='<U349')

In [14]:
#function to build a tokenizer

def tokenization(line):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(lines)
    return tokenizer

#prepare english tokenizer

eng_tokenizer = tokenization(fra_eng[:, 0])
eng_vocab_size = len(eng_tokenizer.word_index) + 1

eng_length = 8
print("Engish vocab size is ", eng_vocab_size)

Engish vocab size is  72


In [15]:
fra_tokenizer = tokenization(fra_eng[:, 1])
fra_vocab_size = len(fra_tokenizer.word_index) + 1

fra_length = 8
print("Frech vocab size is ", fra_vocab_size)

Frech vocab size is  72


In [19]:
#encode and pad sequence, padding to max sent len as above
def encode_sequences(tokenizer, length, line):
    # integer encode sequence
    seq = tokenizer.texts_to_sequences(lines)
    # pad sequences with 0 values
    seq = pad_sequences(seq, maxlen = length, padding = 'post' )
    return seq

In [20]:
from sklearn.model_selection import train_test_split
train, test = train_test_split(fra_eng, test_size=0.2, random_state=12)

In [21]:
#prepare traning data
trainX = encode_sequences(fra_tokenizer, fra_length, train[:, 1])
trainY = encode_sequences(eng_tokenizer, eng_length, train[:, 0])


#prepare traning data
testX = encode_sequences(fra_tokenizer, fra_length, test[:, 1])
testY = encode_sequences(eng_tokenizer, eng_length, test[:, 0])

In [24]:
#Build model
def define_model(in_vocab, out_vocab, in_timesteps, out_timesteps, units):
        model = Sequential()
        model.add(Embedding(in_vocab, units, input_length = in_timesteps, mask_zero = True))
        model.add(LSTM(units))
        model.add(RepeatVector(out_timesteps))
        model.add(LSTM(units, return_sequences = True))
        model.add(Dense(out_vocab, activation = 'softmax'))
        return model

In [26]:
#We are uisng RMSprop optimizer good choice in RNN

#modelcompilation

model = define_model(fra_vocab_size, eng_vocab_size, fra_length, eng_length, 512)
rms = optimizers.RMSprop(lr=0.001)
model.compile(optimizer=rms, loss = 'sparse_categorical_crossentropy')

In [27]:
#train model
history = model.fit(trainX, trainY.reshape(trainY.shape[0], trainY.shape[1], 1), epochs = 10, batch_size=512, validation_split = 0.2)

Epoch 1/10
 1081/48930 [..............................] - ETA: 23:46:01 - loss: 0.0649

KeyboardInterrupt: 

In [None]:
preds = model.predict_classes(testX,reshape((testX.shape[0], testX.shape[1])))

In [None]:
preds

In [None]:
def get_word(n, tokenizer):
    for word, index in tokenizer.word_index.items():
        if index == n:
            return word
    return None

In [None]:
#convert prediction into sentence
preds_text = []
for i in preds:
    temp = []
    for j in range(len(i)):
        t = get_word(i[j], eng_tokenizer)
        if j > 0:
            if (t == get_word(i[j-1], eng_tokenizer)) or (t == None):
                temp.append('')
            else:
                temp.append(t)
        else:
            if (t == None):
                temp.append('')
            else:
                temp.append(t)
                
    preds_text.append(' '.join(temp))

In [None]:
pred_df = pd.DataFrame({'actual' : test[:, 0], 'predicted' : preds_text })

In [None]:
pred_df.sample(15)