In [3]:
import os
import sys
import pandas as pd
import numpy as np

from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, LSTM

In [4]:
file_path = "C:\\Users\\suresha.bc\\Desktop\\github files AIML\\dataset\\fra-eng\\fra.txt"

In [5]:
#Read the file
with open(file_path, 'r', encoding='UTF-8') as f:
    lines = f.read().split("\n")

In [29]:
lines[:3]

['Go.\tVa !\tCC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #1158250 (Wittydev)',
 'Hi.\tSalut !\tCC-BY 2.0 (France) Attribution: tatoeba.org #538123 (CM) & #509819 (Aiji)',
 'Hi.\tSalut.\tCC-BY 2.0 (France) Attribution: tatoeba.org #538123 (CM) & #4320462 (gillux)']

In [30]:
len(lines)# there are 179905 records in the file

179905

In [31]:
latent_dim = 256
ephoc = 30
num_samples = 6000 # number of samples to train
batch_size = 64

In [42]:
input_texts = []
target_texts = []
input_char = set()
target_char = set()

for line in lines[: 10000]: # I am considering only 10000 samples from 122550 records
    input_text , target_text , _ = line.split('\t')
    target_text = '\t' + target_text + '\n'  # using first word as \t and End of Sequece as \n
    
    input_texts.append(input_text)
    target_texts.append(target_text)
    
    for char in input_text:
        if char not in input_char:
            input_char.add(char)
    for char in target_text:
        if char not in target_char:
            target_char.add(char)

In [43]:
input_chars = sorted(list(input_char))
target_chars = sorted(list(target_char))
num_encoder_token = len(input_chars)
num_decoder_token = len(target_chars)
max_encoder_seq_len = max([len(i) for i in input_texts])
max_decoder_seq_len = max([len(i) for i in target_texts])

In [44]:
print("num_encoder_token :" , num_encoder_token)
print("num_decoder_token :" , num_decoder_token)
print("max_encoder_seq_len :" , max_encoder_seq_len)
print("max_decoder_seq_len :" , max_decoder_seq_len)

num_encoder_token : 71
num_decoder_token : 94
max_encoder_seq_len : 15
max_decoder_seq_len : 59


In [64]:
#Indexing the tokens
input_token_index = dict([(index, char) for char ,index in enumerate(input_chars)])
target_token_index = dict([(index, char) for char ,index in enumerate(target_chars)])

In [67]:
#Create encoder & decoder arrays 
encoder_input_data = np.zeros( (len(input_texts) , max_encoder_seq_len , num_encoder_token ) , dtype='float32' )

decoder_input_data = np.zeros( (len(input_texts) , max_decoder_seq_len , num_decoder_token ) , dtype='float32' )

decoder_target_data = np.zeros( (len(input_texts) , max_decoder_seq_len , num_decoder_token ) , dtype='float32' )

In [72]:
encoder_input_data.shape, decoder_input_data.shape , decoder_target_data.shape

((10000, 15, 71), (10000, 59, 94), (10000, 59, 94))

In [74]:
for i , (input_text, target_text) in enumerate(zip(input_texts,target_texts)):
    for t, char in enumerate(input_text):
        # one hot representation of input charecters.
        encoder_input_data[i,t,input_token_index[char]] = 1. 
    encoder_input_data[i , t+1:, input_token_index[' ']] = 1.
    for t, char in enumerate(target_text):
        # one hot representation of input charecters.
        decoder_input_data[i,t,target_token_index[char]] = 1.
          
        if t>0:                              
            # Decoder target will be ahead of one timestamp (Since we have tab appended)
            decoder_target_data[i, t-1, target_token_index[char]] = 1.
            
    decoder_input_data[i, t+1:, target_token_index[' ']] = 1.
    decoder_target_data[i, t: , target_token_index[' ']] = 1.

In [113]:
encoder_inputs = Input(shape=(None, num_encoder_token))
encoder = LSTM(latent_dim,return_state=True)
encoder_output , state_h, state_c = encoder(encoder_inputs)
encoder_states = [state_h, state_c]

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


In [114]:
decoder_inputs = Input(shape=(None , num_decoder_token))
decoder_lstm = LSTM(latent_dim,return_sequences=True , return_state=True )
decoder_output , _ ,_ = decoder_lstm(decoder_inputs, initial_state=encoder_states)

In [115]:
decoder_dense = Dense(num_decoder_token, activation='softmax' )
decoder_outputs = decoder_dense(decoder_output)

In [116]:
model = Model([encoder_inputs ,decoder_inputs] , decoder_outputs )
model.compile(optimizer='rmsprop' , loss= 'categorical_crossentropy',metrics=['accuracy'])

In [118]:
model.fit([encoder_input_data , decoder_input_data] , decoder_target_data , 
          batch_size=batch_size , 
          epochs=ephoc, 
          validation_split=0.2
         )

Train on 8000 samples, validate on 2000 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<tensorflow.python.keras.callbacks.History at 0x273e116ba08>

In [121]:

encoder_model = Model(encoder_inputs, encoder_states)

decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_input = [ decoder_state_input_h , decoder_state_input_c ]

decoder_outputs , state_h , state_c = decoder_lstm(decoder_inputs, initial_state=decoder_states_input)

decoder_state = [state_h , state_c]
decoder_outputs = decoder_dense(decoder_outputs)
decoder_model = Model( 
                    [decoder_inputs] + decoder_states_input, 
                    [decoder_outputs] + decoder_state )

reverse_input_char_index = dict( (i, char) for char, i in input_token_index.items() )
reverse_target_char_index = dict( (i, char) for char, i in target_token_index.items() )

In [122]:
def decode_sequence(input_seq):
    states_value = encoder_model.predict(input_seq)
    
    target_seq = np.zeros((1,1,num_decoder_token))
    
    target_seq[0, 0, target_token_index['\t']] = 1.
    
    stop_condition = False
    decoded_sentence = ''
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value )
        
        sampled_token_index = np.argmax(output_tokens[0 ,-1 , :])
        sampled_char = reverse_target_char_index[sampled_token_index]
        decoded_sentence += sampled_char
        
        if (sampled_char == '\n' or len(decoded_sentence) > max_decoder_seq_len):
            stop_condition = True
            
        taeget_seq = np.zeros((1, 1, num_decoder_token))
        target_seq[0, 0, sampled_token_index] = 1.
        
        states_value = [h, c]
    
    return decoded_sentence

In [123]:
for seq_index in range(100):
    input_seq = encoder_input_data[seq_index: seq_index + 1]
    decoded_seq = decode_sequence(input_seq) 
    
    print('-')
    print('Input Sentense:' , input_texts[seq_index] )
    print('Decoded Sentense:' , decoded_seq )

-
Input Sentense: Go.
Decoded Sentense: Attrraiverrrrrrrrrss-ttttt-ttttttttttttttttttttttttttttttttt
-
Input Sentense: Hi.
Decoded Sentense: Saluitt.. 

-
Input Sentense: Hi.
Decoded Sentense: Saluitt.. 

-
Input Sentense: Run!
Decoded Sentense: Trouiiisssrrrrrrrrrerrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrr
-
Input Sentense: Run!
Decoded Sentense: Trouiiisssrrrrrrrrrerrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrr
-
Input Sentense: Who?
Decoded Sentense: Qui llennn         ss sssssssssssssss ssssssssssssssssssssss
-
Input Sentense: Wow!
Decoded Sentense: Bonnnnnnnsôttereeeeeeeeee tttsssssssssssssssssssssssssssssss
-
Input Sentense: Fire!
Decoded Sentense: Attrre-virrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrr
-
Input Sentense: Help!
Decoded Sentense: Gorrûllèrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrr
-
Input Sentense: Jump.
Decoded Sentense: Suyye.. 

-
Input Sentense: Stop!
Decoded Sentense: Attrraviiiiiirrrrrrrerrrrrrrrrrttttttttttttttttttttdssss-eee
-
Input Sentense: Stop!
De

-
Input Sentense: Be calm.
Decoded Sentense: Soymiez nn nnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnn
-
Input Sentense: Be calm.
Decoded Sentense: Soymiez nn nnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnn
-
Input Sentense: Be cool.
Decoded Sentense: Soymiez znnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnn
-
Input Sentense: Be fair.
Decoded Sentense: Soyez zs-ccccunnn.n.............................ffi.........
-
Input Sentense: Be fair.
Decoded Sentense: Soyez zs-ccccunnn.n.............................ffi.........
-
Input Sentense: Be fair.
Decoded Sentense: Soyez zs-ccccunnn.n.............................ffi.........
-
Input Sentense: Be fair.
Decoded Sentense: Soyez zs-ccccunnn.n.............................ffi.........
-
Input Sentense: Be fair.
Decoded Sentense: Soyez zs-ccccunnn.n.............................ffi.........
-
Input Sentense: Be fair.
Decoded Sentense: Soyez zs-ccccunnn.n.............................ffi.........
-
Input Sentense: Be kind.
Decoded Sentense: T