In [1]:
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers , activations , models , preprocessing , utils
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

In [2]:
df=pd.read_csv('cleaned_data.csv')

In [3]:
df.head()

Unnamed: 0,eng,french
0,tom found that,Tom a trouvé ça
1,begin,Commence
2,we all wished for peace,Nous souhaitions toutes la paix
3,i already called him,Je lai déjà appelé
4,youre very resourceful,Vous êtes pleines de ressources


# Prepare Input data for the Encoder :- the input data to encoder which are preprocessed by following steps

Tokenize the English Sentences from eng.
Determine the maximum length of English sentence it will be used for padding.
Determine the Vocabulary size for english.

In [4]:
eng_lines = list()
for eng_data in df.eng:
    eng_lines.append(eng_data)

tokenizer = preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(eng_lines)
tokenized_eng_lines = tokenizer.texts_to_sequences(eng_lines)

length_list = []
for token_seq in tokenized_eng_lines:
    length_list.append(len(token_seq))
    
max_input_length = np.array(length_list).max()
print(f"English max length is {format(max_input_length)}")

padded_eng_lines = preprocessing.sequence.pad_sequences(tokenized_eng_lines,maxlen = max_input_length,padding='post')
encoder_input_data = np.array(padded_eng_lines)
print(f"Input data Shape : {format(encoder_input_data.shape)}")

eng_word_dict = tokenizer.word_index
num_eng_tokens = len(eng_word_dict) + 1
print(f"No.of Encoder tokens : {format(num_eng_tokens)}")

English max length is 47
Input data Shape : (8000, 47)
No.of Encoder tokens : 4328


# Prepare Input data for Decoder
The decoder model will be fed with processed french lines. The steps are same as above

In [5]:
french_lines = []
for french_data in df.french:
    french_lines.append('_START_ '+french_data+'_END_')
    
tokenizer = preprocessing.text.Tokenizer()
tokenizer.fit_on_texts( french_lines ) 
tokenized_french_lines = tokenizer.texts_to_sequences( french_lines ) 

length_list = []
for token_seq in tokenized_french_lines:
    length_list.append( len( token_seq ))
max_output_length = np.array( length_list ).max()
print( f"French max length is {format( max_output_length )}")

padded_french_lines = preprocessing.sequence.pad_sequences( tokenized_french_lines , maxlen=max_output_length, padding='post' )
decoder_input_data = np.array( padded_french_lines )
print( f"Output data Shape : {format( decoder_input_data.shape )}")

french_word_dict = tokenizer.word_index
num_french_tokens = len( french_word_dict )+1
print( f"Number of Decoder tokens : {format( num_french_tokens)}")

French max length is 53
Output data Shape : (8000, 53)
Number of Decoder tokens : 6850


# Prepare Target Data for decoder we modify it like:-
We remove start token
Convert the padded french lines *One Possible result may look like:- ['START','Some random text','END'] -> ['Some random text','END']



In [6]:
decoder_target_data = []
for token_seq in tokenized_french_lines:
    decoder_target_data.append(token_seq[1:])
    
padded_french_lines = preprocessing.sequence.pad_sequences(decoder_target_data , maxlen=max_output_length, padding='post' )
onehot_french_lines = utils.to_categorical( padded_french_lines , num_french_tokens )
decoder_target_data = np.array( onehot_french_lines )
print( f"Decoder target data shape -> {format( decoder_target_data.shape )}")

Decoder target data shape -> (8000, 53, 6850)


# Lets Define Our Encoder-Decoder Model
The model is LSTM based and configuration follows as:-

No of input layers -> 2(one for encoded input data and another for decoded)

Embedding layer will be used for fix sized dense vectors.

LSTM will be used as mentioned above

Working :-

encoder input data -> Embedding layer

Output of Embedding layer goes to LSTM which produces 2 state h and c which are encoded states

These states (h and c) are set in the LSTM cell of the decoder

The decoder input data comes in through Embedding layer

The Embedding goes in LSTM to produce sequences

In [7]:

encoder_inputs = tf.keras.layers.Input(shape=( None , ))
encoder_embedding = tf.keras.layers.Embedding( num_eng_tokens, 256 , mask_zero=True ) (encoder_inputs)
encoder_outputs , state_h , state_c = tf.keras.layers.LSTM( 128 , return_state=True  )( encoder_embedding )
encoder_states = [ state_h , state_c ]

decoder_inputs = tf.keras.layers.Input(shape=( None ,  ))
decoder_embedding = tf.keras.layers.Embedding( num_french_tokens, 256 , mask_zero=True) (decoder_inputs)
decoder_lstm = tf.keras.layers.LSTM( 128 , return_state=True , return_sequences=True)
decoder_outputs , _ , _ = decoder_lstm ( decoder_embedding , initial_state=encoder_states )
decoder_dense = tf.keras.layers.Dense( num_french_tokens , activation=tf.keras.activations.softmax ) 
output = decoder_dense ( decoder_outputs )

model = tf.keras.models.Model([encoder_inputs, decoder_inputs], output )
model.compile(optimizer=tf.keras.optimizers.RMSprop(), loss='categorical_crossentropy')

model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, None, 256)    1107968     input_1[0][0]                    
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, None, 256)    1753600     input_2[0][0]                    
______________________________________________________________________________________________

 Now Let us Train our Model with 50 epochs and RMSprop optimizer and categorical crossentropy loss function.¶

In [8]:
model.fit([encoder_input_data , decoder_input_data], decoder_target_data, batch_size=250, epochs=40 ) 
model.save( 'model.h5' )

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40



Lets define Inference¶
We will create a inference which will help predicting translation

In [9]:
def make_inference_models():
    
    encoder_model = tf.keras.models.Model(encoder_inputs, encoder_states)
    
    decoder_state_input_h = tf.keras.layers.Input(shape=( 128 ,))
    decoder_state_input_c = tf.keras.layers.Input(shape=( 128 ,))
    
    decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
    
    decoder_outputs, state_h, state_c = decoder_lstm(
        decoder_embedding , initial_state=decoder_states_inputs)
    decoder_states = [state_h, state_c]
    decoder_outputs = decoder_dense(decoder_outputs)
    decoder_model = tf.keras.models.Model(
        [decoder_inputs] + decoder_states_inputs,
        [decoder_outputs] + decoder_states)
    
    return encoder_model , decoder_model

# Some Translation¶
First we take english sentences and predict the state values using encoder model (see inference)
We set the state value in the decoder's LSTM
Then we generate a sequence which contain start token
We input the sequence in the dec model
We replace the start token
We carry out the above steps again and again

In [10]:
def str_to_tokens( sentence : str ):
    words = sentence.lower().split()
    tokens_list = list()
    for word in words:
        tokens_list.append( eng_word_dict[ word ] ) 
    return preprocessing.sequence.pad_sequences( [tokens_list] , maxlen=max_input_length , padding='post')

In [11]:
enc_model , dec_model = make_inference_models()

for epoch in range( encoder_input_data.shape[0] ):
    states_values = enc_model.predict( str_to_tokens( input( 'Enter eng sentence : ' ) ) )
    #states_values = enc_model.predict( encoder_input_data[ epoch ] )
    empty_target_seq = np.zeros( ( 1 , 1 ) )
    empty_target_seq[0, 0] = french_word_dict['start']
    stop_condition = False
    decoded_translation = ''
    while not stop_condition :
        dec_outputs , h , c = dec_model.predict([ empty_target_seq ] + states_values )
        sampled_word_index = np.argmax( dec_outputs[0, -1, :] )
        sampled_word = None
        for word , index in french_word_dict.items() :
            if sampled_word_index == index :
                decoded_translation += ' {}'.format( word )
                sampled_word = word
        
        if sampled_word == 'end' or len(decoded_translation.split()) > max_output_length:
            stop_condition = True
            
        empty_target_seq = np.zeros( ( 1 , 1 ) )  
        empty_target_seq[ 0 , 0 ] = sampled_word_index
        states_values = [ h , c ] 

    print( decoded_translation )

Enter eng sentence : How are you
 comment vous end
Enter eng sentence : we all wished for peace
 nous avons une grosse erreur end
Enter eng sentence : i called him
 je lai ai en train end
Enter eng sentence : help the poor
 les dents end
Enter eng sentence : god is watching us
 restez end
Enter eng sentence : stay where you are
 où vous vous end
Enter eng sentence : i already called him
 je lai fait de la maison end
Enter eng sentence : end this now
 cest end
Enter eng sentence : bjbjhbjh


KeyError: 'bjbjhbjh'