In [1]:
import os, sys

from keras.models import Model
from keras.layers import Input, LSTM, GRU, Dense, Embedding
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
import numpy as np
#import matplotlib.pyplot as plt



In [33]:
#Set the value for following parameter
BATCH_SIZE = 64
EPOCHS = 2
LSTM_NODES =256
NUM_SENTENCES = 10000
MAX_SENTENCE_LENGTH = 50
MAX_NUM_WORDS = 20000
EMBEDDING_SIZE = 100

In [34]:
##Load the data set
input_sentences = []
output_sentences = []
output_sentences_inputs = []

count = 0
for line in open(r'E:\Pycharm_Projects\tf_prac\Encoder,Decoder,Seq2Seq\eng-fra\fra.txt', encoding="utf-8"):
    count += 1

    if count > NUM_SENTENCES:
        break

    if '\t' not in line:
        continue
    
    
    line.rstrip().split('\t')
    input_sentence  = line.split('\t')[0]
    output = line.split('\t')[1]

    output_sentence = output + ' <eos>'
    output_sentence_input = '<sos> ' + output

    input_sentences.append(input_sentence)
    output_sentences.append(output_sentence)
    output_sentences_inputs.append(output_sentence_input)

print("num samples input:", len(input_sentences))
print("num samples output:", len(output_sentences))
print("num samples output input:", len(output_sentences_inputs))

num samples input: 10000
num samples output: 10000
num samples output input: 10000


In [35]:
#now randomly print sentence
print(input_sentences[172])
print(output_sentences[172])
print(output_sentences_inputs[172])

Be fair.
Sois sincère. <eos>
<sos> Sois sincère.


In [36]:
## Tokenization

input_tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)

input_tokenizer.fit_on_texts(input_sentences)
input_integer_seq = input_tokenizer.texts_to_sequences(input_sentences) #token to seq of interger



word2idx_inputs = input_tokenizer.word_index  #unique integer assigned to each word
print('Total unique words in the input: %s' % len(word2idx_inputs))

max_input_len = max(len(sen) for sen in input_integer_seq)
print("Length of longest sentence in input: %g" % max_input_len)

Total unique words in the input: 2014
Length of longest sentence in input: 4


In [37]:
word2idx_inputs.items()

dict_items([('i', 1), ('tom', 2), ('it', 3), ('you', 4), ("i'm", 5), ('is', 6), ('a', 7), ('me', 8), ('we', 9), ('go', 10), ("you're", 11), ('are', 12), ('was', 13), ("it's", 14), ('be', 15), ('he', 16), ('this', 17), ('get', 18), ("we're", 19), ('up', 20), ('can', 21), ('that', 22), ("don't", 23), ('do', 24), ('take', 25), ('they', 26), ("i'll", 27), ('come', 28), ('to', 29), ('let', 30), ("that's", 31), ('have', 32), ('the', 33), ('keep', 34), ('here', 35), ('no', 36), ('stay', 37), ('not', 38), ('out', 39), ('in', 40), ("they're", 41), ('did', 42), ('who', 43), ('stop', 44), ('got', 45), ('him', 46), ("tom's", 47), ('need', 48), ('my', 49), ('she', 50), ('am', 51), ('us', 52), ('love', 53), ('lost', 54), ('like', 55), ('help', 56), ('what', 57), ("he's", 58), ("let's", 59), ('home', 60), ('on', 61), ('look', 62), ('all', 63), ('back', 64), ('see', 65), ('saw', 66), ('how', 67), ('try', 68), ('must', 69), ('one', 70), ('down', 71), ('please', 72), ('away', 73), ('feel', 74), ('busy',

In [38]:
## it is output similar as input

output_tokenizer = Tokenizer(num_words=MAX_NUM_WORDS, filters='')
output_tokenizer.fit_on_texts(output_sentences + output_sentences_inputs)
output_integer_seq = output_tokenizer.texts_to_sequences(output_sentences)
output_input_integer_seq = output_tokenizer.texts_to_sequences(output_sentences_inputs)

word2idx_outputs = output_tokenizer.word_index
print('Total unique words in the output: %s' % len(word2idx_outputs))

num_words_output = len(word2idx_outputs) + 1
max_out_len = max(len(sen) for sen in output_integer_seq)
print("Length of longest sentence in the output: %g" % max_out_len)

Total unique words in the output: 5641
Length of longest sentence in the output: 11


In [39]:
##padding the input and the output is that text sentences can be of varying length, however LSTM (the algorithm that we are going to train our model) expects input instances with the same length
encoder_input_sequences = pad_sequences(input_integer_seq, maxlen=max_input_len)
print("encoder_input_sequences.shape:", encoder_input_sequences.shape)
print("encoder_input_sequences[172]:", encoder_input_sequences[172])

encoder_input_sequences.shape: (10000, 4)
encoder_input_sequences[172]: [  0   0  15 196]


In [40]:
print(word2idx_inputs["i'm"])
print(word2idx_inputs["ill"])

5
313


In [51]:
## same for output
decoder_input_sequences = pad_sequences(output_input_integer_seq, maxlen=max_out_len, padding='post')
print("decoder_input_sequences.shape:", decoder_input_sequences.shape)
print("decoder_input_sequences[172]:", decoder_input_sequences[172])
decoder_output_sequences = pad_sequences(output_integer_seq, maxlen=max_out_len, padding='post')
#print(decoder_output_sequences)

decoder_input_sequences.shape: (10000, 11)
decoder_input_sequences[172]: [  2  45 604   0   0   0   0   0   0   0   0]
[[  44    4    1 ...    0    0    0]
 [ 420    1    0 ...    0    0    0]
 [  22  330    4 ...    0    0    0]
 ...
 [ 165  132  106 ...    0    0    0]
 [  11  165 2394 ...    0    0    0]
 [  11  165 5641 ...    0    0    0]]


In [42]:
print(word2idx_outputs["<sos>"])
print(word2idx_outputs["je"])
print(word2idx_outputs["suis"])
print(word2idx_outputs["malade."])

2
3
6
137


In [43]:
##Create word Embedding for I/P i.e. English, we are using GLoVe embedding, for french we will be using custom embedding
from numpy import array
from numpy import asarray
from numpy import zeros

embeddings_dictionary = dict()

glove_file = open(r'E:\Pycharm_Projects\tf_prac\glove.6B\glove.6B.100d.txt', encoding="utf8")

for line in glove_file:
    records = line.split()
    word = records[0]
    vector_dimensions = asarray(records[1:], dtype='float32')
    embeddings_dictionary[word] = vector_dimensions
glove_file.close()

In [44]:
num_words = min(MAX_NUM_WORDS, len(word2idx_inputs) + 1)
print(num_words)

embedding_matrix = zeros((num_words, EMBEDDING_SIZE))
print(embedding_matrix)
for word, index in word2idx_inputs.items():
    embedding_vector = embeddings_dictionary.get(word)
    if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector

2015
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [45]:
print(embeddings_dictionary["ill"])

[ 0.12648    0.1366     0.22192   -0.025204  -0.7197     0.66147
  0.48509    0.057223   0.13829   -0.26375   -0.23647    0.74349
  0.46737   -0.462      0.20031   -0.26302    0.093948  -0.61756
 -0.28213    0.1353     0.28213    0.21813    0.16418    0.22547
 -0.98945    0.29624   -0.62476   -0.29535    0.21534    0.92274
  0.38388    0.55744   -0.14628   -0.15674   -0.51941    0.25629
 -0.0079678  0.12998   -0.029192   0.20868   -0.55127    0.075353
  0.44746   -0.71046    0.75562    0.010378   0.095229   0.16673
  0.22073   -0.46562   -0.10199   -0.80386    0.45162    0.45183
  0.19869   -1.6571     0.7584    -0.40298    0.82426   -0.386
  0.0039546  0.61318    0.02701   -0.3308    -0.095652  -0.082164
  0.7858     0.13394   -0.32715   -0.31371   -0.20247   -0.73001
 -0.49343    0.56445    0.61038    0.36777   -0.070182   0.44859
 -0.61774   -0.18849    0.65592    0.44797   -0.10469    0.62512
 -1.9474    -0.60622    0.073874   0.50013   -1.1278    -0.42066
 -0.37322   -0.50538    0

In [46]:
print(embedding_matrix[539])

[ 0.16237999 -0.3373     -0.24415    -0.036053    0.12717     0.41297001
  0.15265     0.23246001 -0.42772999 -0.71423     0.69331998  0.16859999
  0.39746001  0.14218999 -0.31621    -0.72087997 -0.16003001  0.17733
 -0.14495    -0.26379001  0.67311001  0.90779001  0.52047002 -0.13696
 -0.31428999 -0.71758002 -0.14982    -0.70389003  0.44791001  0.073398
 -0.033242    0.14199001  0.34531     0.65033001  0.010394    0.94755
  0.058567   -0.0047846   0.17679    -0.34283999 -0.31970999 -0.37520999
  0.75291997 -0.1035      0.45971    -0.36877999  0.11353     0.53530997
  1.04229999 -1.10889995  0.19391    -0.69931     0.11326     0.90522999
  0.12302    -1.81009996 -0.28229001 -0.42172     0.71376997  0.70547998
 -0.068874    0.31896999 -0.54651999 -0.13504     0.17066     0.24518
 -0.12982     0.053308   -0.41339001  0.47496    -0.21569     0.0032189
  0.17123    -0.063068   -0.1067      0.26807001 -0.12807    -0.21364
  0.19554    -0.24145     0.050128    0.2931     -0.66754001  0.13753

In [47]:
#creare the embedding layer for I/P
embedding_layer = Embedding(num_words, EMBEDDING_SIZE, weights=[embedding_matrix], input_length=max_input_len)

In [48]:
decoder_targets_one_hot = np.zeros((
        len(input_sentences),
        max_out_len,
        num_words_output
    ),
    dtype='float32'
)
print(decoder_targets_one_hot.shape)

(10000, 11, 5642)


In [52]:
for i, d in enumerate(decoder_output_sequences):
    for t, word in enumerate(d):
        decoder_targets_one_hot[i, t, word] = 1

In [53]:
#Next, we need to create the encoder and decoders. The input to the encoder will be the sentence in English and the output will be the hidden state and cell state of the LSTM.
encoder_inputs_placeholder = Input(shape=(max_input_len,))
x = embedding_layer(encoder_inputs_placeholder)
encoder = LSTM(LSTM_NODES, return_state=True)

encoder_outputs, h, c = encoder(x)
encoder_states = [h, c]

In [54]:
#The next step is to define the decoder. The decoder will have two inputs: the hidden state and cell state from the encoder and the input sentence
decoder_inputs_placeholder = Input(shape=(max_out_len,))

decoder_embedding = Embedding(num_words_output, LSTM_NODES)
decoder_inputs_x = decoder_embedding(decoder_inputs_placeholder)

decoder_lstm = LSTM(LSTM_NODES, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_inputs_x, initial_state=encoder_states)

In [55]:
#Finally, the output from the decoder LSTM is passed through a dense layer to predict decoder outputs
decoder_dense = Dense(num_words_output, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

In [57]:
#The next step is to compile the model:
model = Model([encoder_inputs_placeholder,
  decoder_inputs_placeholder], decoder_outputs)
model.compile(
    optimizer='rmsprop',
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

In [58]:
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 4)]          0           []                               
                                                                                                  
 input_2 (InputLayer)           [(None, 11)]         0           []                               
                                                                                                  
 embedding_1 (Embedding)        (None, 4, 100)       201500      ['input_1[0][0]']                
                                                                                                  
 embedding_2 (Embedding)        (None, 11, 256)      1444352     ['input_2[0][0]']                
                                                                                            

In [60]:
r = model.fit(
    [encoder_input_sequences, decoder_input_sequences],
    decoder_targets_one_hot,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_split=0.1,
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [61]:
encoder_model = Model(encoder_inputs_placeholder, encoder_states)

In [62]:
decoder_state_input_h = Input(shape=(LSTM_NODES,))
decoder_state_input_c = Input(shape=(LSTM_NODES,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

In [63]:
decoder_inputs_single = Input(shape=(1,))
decoder_inputs_single_x = decoder_embedding(decoder_inputs_single)

In [64]:
decoder_outputs, h, c = decoder_lstm(decoder_inputs_single_x, initial_state=decoder_states_inputs)

In [65]:
decoder_states = [h, c]
decoder_outputs = decoder_dense(decoder_outputs)

In [66]:
decoder_model = Model(
    [decoder_inputs_single] + decoder_states_inputs,
    [decoder_outputs] + decoder_states
)

In [67]:
from keras.utils import plot_model
plot_model(decoder_model, to_file='model_plot_dec.png', show_shapes=True, show_layer_names=True)

You must install pydot (`pip install pydot`) and install graphviz (see instructions at https://graphviz.gitlab.io/download/) for plot_model to work.


In [68]:
idx2word_input = {v:k for k, v in word2idx_inputs.items()}
idx2word_target = {v:k for k, v in word2idx_outputs.items()}

In [69]:
def translate_sentence(input_seq):
    states_value = encoder_model.predict(input_seq)
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = word2idx_outputs['<sos>']
    eos = word2idx_outputs['<eos>']
    output_sentence = []

    for _ in range(max_out_len):
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)
        idx = np.argmax(output_tokens[0, 0, :])

        if eos == idx:
            break

        word = ''

        if idx > 0:
            word = idx2word_target[idx]
            output_sentence.append(word)

        target_seq[0, 0] = idx
        states_value = [h, c]

    return ' '.join(output_sentence)

In [70]:
i = np.random.choice(len(input_sentences))
input_seq = encoder_input_sequences[i:i+1]
translation = translate_sentence(input_seq)
print('-')
print('Input:', input_sentences[i])
print('Response:', translation)

-
Input: I'm relaxed.
Response: je suis en train !
