Encoder Decoder Language Translator

In [39]:
# importing important libraries

from keras.models import Model
from keras.layers import Input,LSTM,Dense
import numpy as np

In [40]:
# training configuration and dataset location

# dataset: text file with tab-separated sentence pairs, one pair per line
data_path = '/content/fra.txt'

# upper bound on how many lines of the dataset are used
number_samples = 10000

# size of the LSTM hidden state (the latent encoding space)
latent_dimension = 256

# mini-batch size used while fitting
batch_s = 64

# how many passes over the training data
epochs = 25

In [41]:
# vectorize the data: read the sentence pairs and collect both character vocabularies

input_texts = []
target_texts = []

input_characters = set()
target_characters = set()

with open(data_path,'r',encoding='utf-8') as f:
  lines = f.read().split('\n')

# the last element of `lines` is skipped (it is the empty string after a trailing newline)
for line in lines[: min(number_samples,len(lines) - 1)]:
  fields = line.split('\t')
  if len(fields) < 2:
    # blank or malformed line: no source/target pair to extract
    continue
  # first column is the input sentence, second the target sentence;
  # any further columns are ignored instead of breaking the unpacking
  input_text, target_text = fields[:2]
  # '\t' marks the start and '\n' the end of every target sequence
  target_text = '\t' + target_text + '\n'
  input_texts.append(input_text)
  target_texts.append(target_text)

  # set.update adds every character of the string, duplicates are dropped by the set
  input_characters.update(input_text)
  target_characters.update(target_text)

# sorted vocabularies give every character a stable position
input_characters = sorted(input_characters)
target_characters = sorted(target_characters)
num_encoder_tokens = len(input_characters)
num_decoder_tokens = len(target_characters)
# default=0 keeps the cell from raising when no usable line was found
max_encoder_seq_len = max((len(txt) for txt in input_texts), default=0)
max_decoder_seq_len = max((len(txt) for txt in target_texts), default=0)
print('Number of sample:',len(input_texts))
print('Number of unique token:',num_encoder_tokens)
print('Number of unique output:',num_decoder_tokens)
print('Max seq length for input:',max_encoder_seq_len)
print('Max seq length for output:',max_decoder_seq_len)

Number of sample: 10000
Number of unique token: 70
Number of unique output: 93
Max seq length for input: 16
Max seq length for output: 59


In [42]:
# lookup tables: character -> its position in the sorted vocabulary

input_token_index = {symbol: position for position, symbol in enumerate(input_characters)}
target_token_index = {symbol: position for position, symbol in enumerate(target_characters)}

In [43]:
# allocate the zero-filled float32 tensors that will hold the one-hot encodings;
# each is indexed as (sample, timestep, token)

encoder_input_data = np.zeros(
    shape=(len(input_texts), max_encoder_seq_len, num_encoder_tokens),
    dtype=np.float32,
)

decoder_input_data = np.zeros(
    shape=(len(input_texts), max_decoder_seq_len, num_decoder_tokens),
    dtype=np.float32,
)

decoder_target_data = np.zeros(
    shape=(len(input_texts), max_decoder_seq_len, num_decoder_tokens),
    dtype=np.float32,
)

In [44]:
# one-hot encode every sentence pair; timesteps past the end of a sentence
# are filled with the one-hot vector of the space character
for i, (input_text, target_text) in enumerate(zip(input_texts, target_texts)):
    for t, char in enumerate(input_text):
        encoder_input_data[i, t, input_token_index[char]] = 1.
    # pad from len(input_text) on; unlike the inner loop variable `t`, this is
    # well defined even when input_text is empty
    encoder_input_data[i, len(input_text):, input_token_index[' ']] = 1.
    for t, char in enumerate(target_text):
        # decoder_target_data is ahead of decoder_input_data by one timestep
        decoder_input_data[i, t, target_token_index[char]] = 1.
        if t > 0:
            # decoder_target_data will be ahead by one timestep
            # and will not include the start character.
            decoder_target_data[i, t - 1, target_token_index[char]] = 1.
    decoder_input_data[i, len(target_text):, target_token_index[' ']] = 1.
    # the target sequence is one step shorter (no start character), so its
    # padding begins one position earlier; max() guards an empty target_text
    decoder_target_data[i, max(len(target_text) - 1, 0):, target_token_index[' ']] = 1.

In [45]:
# encoder: an LSTM over the one-hot input sequence; the two state tensors it
# returns alongside its output are kept to initialise the decoder

encoder_inputs = Input(shape=(None, num_encoder_tokens))
encoder = LSTM(units=latent_dimension, return_state=True)
encoder_outputs, *encoder_states = encoder(encoder_inputs)
state_h, state_c = encoder_states

In [46]:
# decoder: an LSTM started from the encoder states, followed by a softmax
# layer with one unit per target token

decoder_inputs = Input(shape=(None, num_decoder_tokens))
decoder_lstm = LSTM(units=latent_dimension, return_sequences=True, return_state=True)
decoder_dense = Dense(units=num_decoder_tokens, activation='softmax')
# element 0 of the LSTM result is the output sequence; the returned states are not needed here
decoder_outputs = decoder_dense(
    decoder_lstm(decoder_inputs, initial_state=encoder_states)[0]
)

In [47]:
# training model: takes the encoder and decoder inputs, produces the decoder softmax output

model = Model(
    inputs=[encoder_inputs, decoder_inputs],
    outputs=decoder_outputs,
)

In [48]:
# Fitting the model

model.compile(optimizer='rmsprop',loss='categorical_crossentropy',metrics=['accuracy'])

# keep the object returned by fit so the per-epoch metrics stay available
# (history.history) and the cell does not end in a bare object repr;
# validation_split=0.2 holds out 20% of the samples for validation
history = model.fit([encoder_input_data,decoder_input_data],decoder_target_data,
                    batch_size = batch_s,epochs = epochs,validation_split=0.2)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<tensorflow.python.keras.callbacks.History at 0x7f8454d9fa10>