# Language Translation (English to French)

In [None]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input,LSTM,Dense
import numpy as np

In [None]:
# Training hyperparameters and dataset size, kept together so they are easy to tune.
batch_size, epochs = 64, 10  # samples per training batch, number of training epochs
latent_dim = 256             # dimensionality of the latent (encoding) space
num_samples = 10_000         # number of samples to train on

In [None]:
# Vectorize the data: read the tab-separated sentence pairs and collect the
# set of distinct characters used on each side.
input_texts = []
target_texts = []
input_characters = set()
target_characters = set()
with open('eng-fra.txt', 'r', encoding='utf8') as f:
    # One record per line; within a record the input sentence and its
    # translation are separated by a tab character.
    lines = f.read().split('\n')

# Use at most num_samples records. The "- 1" leaves out the final element of
# `lines`, which is an empty string when the file ends with a newline.
for line in lines[:min(num_samples, len(lines) - 1)]:
    fields = line.split('\t')
    if len(fields) < 2:
        # Blank or malformed record: there is no input/target pair to read,
        # and unpacking it directly would raise ValueError.
        continue
    # Only the first two columns are used, so a record carrying extra
    # tab-separated columns no longer breaks the unpacking.
    input_text, target_text = fields[0], fields[1]

    # '\t' is the 'start sequence' character and '\n' the 'end sequence'
    # character for the targets.
    target_text = '\t' + target_text + '\n'
    input_texts.append(input_text)
    target_texts.append(target_text)

    # set.update adds every character of the string; duplicates are ignored
    # by the set itself, so no membership test is needed.
    input_characters.update(input_text)
    target_characters.update(target_text)

In [None]:
# Peek at the first two raw records read from the file.
lines[:2]

['Go.\tVa !', 'Run!\tCours\u202f!']

In [None]:
# Peek at the first two input sentences.
input_texts[:2]

['Go.', 'Run!']

In [None]:
# Peek at the first two target sentences, wrapped in the '\t' start and '\n' end markers.
target_texts[:2]

['\tVa !\n', '\tCours\u202f!\n']

In [None]:
# Display the distinct characters collected from the input sentences.
input_characters

{' ',
 '!',
 '$',
 '&',
 "'",
 ',',
 '-',
 '.',
 '0',
 '1',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '9',
 ':',
 '?',
 'A',
 'B',
 'C',
 'D',
 'E',
 'F',
 'G',
 'H',
 'I',
 'J',
 'K',
 'L',
 'M',
 'N',
 'O',
 'P',
 'Q',
 'R',
 'S',
 'T',
 'U',
 'V',
 'W',
 'Y',
 'Z',
 'a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'q',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'x',
 'y',
 'z',
 '’'}

In [None]:
# Freeze each vocabulary into a sorted list so every character has a fixed position.
input_characters = sorted(input_characters)
target_characters = sorted(target_characters)

# Vocabulary sizes: the number of distinct characters on each side.
num_encoder_tokens, num_decoder_tokens = len(input_characters), len(target_characters)

# Length, in characters, of the longest sentence on each side.
max_encoder_seq_length = max(map(len, input_texts))
max_decoder_seq_length = max(map(len, target_texts))

In [None]:
# Report the dataset dimensions, one "label value" line per quantity.
dataset_summary = [
    ('Number of Samples:', len(input_texts)),
    ('Number of unique input tokens:', num_encoder_tokens),
    ('Number of unique output tokens:', num_decoder_tokens),
    ('Max sequence length for inputs:', max_encoder_seq_length),
    ('Max sequence length for outputs:', max_decoder_seq_length),
]
for label, value in dataset_summary:
    print(label, value)

Number of Samples: 10000
Number of unique input tokens: 71
Number of unique output tokens: 91
Max sequence length for inputs: 16
Max sequence length for outputs: 59


In [None]:
# Map every character to its position in the corresponding sorted vocabulary.
input_token_index = {char: idx for idx, char in enumerate(input_characters)}

target_token_index = {char: idx for idx, char in enumerate(target_characters)}

In [None]:
# Display the character -> index mapping built for the input vocabulary.
input_token_index

{' ': 0,
 '!': 1,
 '$': 2,
 '&': 3,
 "'": 4,
 ',': 5,
 '-': 6,
 '.': 7,
 '0': 8,
 '1': 9,
 '2': 10,
 '3': 11,
 '4': 12,
 '5': 13,
 '6': 14,
 '7': 15,
 '9': 16,
 ':': 17,
 '?': 18,
 'A': 19,
 'B': 20,
 'C': 21,
 'D': 22,
 'E': 23,
 'F': 24,
 'G': 25,
 'H': 26,
 'I': 27,
 'J': 28,
 'K': 29,
 'L': 30,
 'M': 31,
 'N': 32,
 'O': 33,
 'P': 34,
 'Q': 35,
 'R': 36,
 'S': 37,
 'T': 38,
 'U': 39,
 'V': 40,
 'W': 41,
 'Y': 42,
 'Z': 43,
 'a': 44,
 'b': 45,
 'c': 46,
 'd': 47,
 'e': 48,
 'f': 49,
 'g': 50,
 'h': 51,
 'i': 52,
 'j': 53,
 'k': 54,
 'l': 55,
 'm': 56,
 'n': 57,
 'o': 58,
 'p': 59,
 'q': 60,
 'r': 61,
 's': 62,
 't': 63,
 'u': 64,
 'v': 65,
 'w': 66,
 'x': 67,
 'y': 68,
 'z': 69,
 '’': 70}

In [None]:
# Allocate the three one-hot tensors, initially filled with zeros.
# Each has the shape (number of sequences, longest sequence, vocabulary size):
#   axis 0 -> one entry per sentence pair (10000 in the recorded run)
#   axis 1 -> one entry per timestep (up to 16 for inputs, up to 59 for targets)
#   axis 2 -> one-hot vector over the vocabulary (71 input / 91 target characters)
# Shapes in the recorded run:
#   encoder_input_data  ----> (10000, 16, 71)
#   decoder_input_data  ----> (10000, 59, 91)
#   decoder_target_data ----> (10000, 59, 91)
encoder_input_data = np.zeros(
    shape=(len(input_texts), max_encoder_seq_length, num_encoder_tokens),
    dtype=np.float32,
)

decoder_input_data = np.zeros(
    shape=(len(input_texts), max_decoder_seq_length, num_decoder_tokens),
    dtype=np.float32,
)

decoder_target_data = np.zeros(
    shape=(len(target_texts), max_decoder_seq_length, num_decoder_tokens),
    dtype=np.float32,
)

In [None]:
# Display the encoder input tensor; it is still all zeros at this point in the notebook.
encoder_input_data

array([[[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]],

       [[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]],

       [[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]],

       ...,

       [[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0.

In [None]:
# One Hot Representation
# Every character of every sentence sets a single 1 in its (sample, timestep)
# row. Timesteps after the end of a sentence are padded with the one-hot
# vector of the space character ' '.
input_pad_index = input_token_index[' ']    # looked up once, not per sample
target_pad_index = target_token_index[' ']
for i, (input_text, target_text) in enumerate(zip(input_texts, target_texts)):
  for t, char in enumerate(input_text):
    encoder_input_data[i, t, input_token_index[char]] = 1.
  # Pad from the sentence length rather than from the leftover loop variable
  # `t`: for an empty sentence the inner loop never runs, so `t` would still
  # hold a value from the previous sample (or be undefined on the first one).
  encoder_input_data[i, len(input_text):, input_pad_index] = 1.

  for t, char in enumerate(target_text):
    # decoder_input_data holds the full target text, start character included.
    decoder_input_data[i, t, target_token_index[char]] = 1.
    if t > 0:
      # decoder_target_data is one timestep ahead of decoder_input_data
      # and does not include the start character.
      decoder_target_data[i, t - 1, target_token_index[char]] = 1.

  decoder_input_data[i, len(target_text):, target_pad_index] = 1.
  # The target is one character shorter than the decoder input, so its
  # padding starts one timestep earlier (never at a negative offset).
  decoder_target_data[i, max(len(target_text) - 1, 0):, target_pad_index] = 1.

# Encoder
# Explanation
# Example: the input sentence 'Omkar'
# 'i' iterates over the 10000 sequences and 't' iterates over the (up to 16) timesteps of one sequence
# for the 1st sequence (out of 10000) and the 1st timestep (out of 16), input_token_index['O'],  encoder_input_data[0,0,(index number for character 'O')] = 1
# for the 1st sequence (out of 10000) and the 2nd timestep (out of 16), input_token_index['m'],  encoder_input_data[0,1,(index number for character 'm')] = 1
# for the 1st sequence (out of 10000) and the 3rd timestep (out of 16), input_token_index['k'],  encoder_input_data[0,2,(index number for character 'k')] = 1
# for the 1st sequence (out of 10000) and the 4th timestep (out of 16), input_token_index['a'],  encoder_input_data[0,3,(index number for character 'a')] = 1
# for the 1st sequence (out of 10000) and the 5th timestep (out of 16), input_token_index['r'],  encoder_input_data[0,4,(index number for character 'r')] = 1
# Padding -----> In the first sequence, every remaining timestep (the 6th to the 16th) is filled by setting the position of the space character ' ' to 1

# Translation: Omkar ---> abcdfe, so target_text = '\t' + 'abcdfe' + '\n'
# Decoder
# Given the decoder input at timestep t, the model is trained to predict the character at position t+1 of the target text. This is typical in seq2seq models.
# for the 1st sequence (out of 10000) and the 1st timestep (out of 59), target_token_index['\t'],  decoder_input_data[0,0,(index number for the start character '\t')] = 1
# for the 1st sequence (out of 10000) and the 2nd timestep (out of 59), target_token_index['a'],  decoder_input_data[0,1,(index number for character 'a')] = 1
# Now t > 0, so the same character is also written to the target, one timestep earlier
# for the 1st sequence (out of 10000) and the 1st timestep (out of 59), target_token_index['a'],  decoder_target_data[0,0,(index number for character 'a')] = 1

In [None]:
# Check the encoder tensor dimensions: (sequences, timesteps, input vocabulary size).
encoder_input_data.shape

(10000, 16, 71)

In [None]:
# Encoder: reads a one-hot encoded input sentence and summarises it in the
# final LSTM states, which later initialise the decoder.

# Define an input sequence
# None: the sequence length is left unspecified, so it can vary with the input.
# num_encoder_tokens: number of unique input tokens (width of each one-hot vector)
encoder_inputs = Input(shape=(None, num_encoder_tokens))

# return_state=True means that, apart from the regular output,
# the LSTM also returns its final hidden state (state_h) and cell state (state_c),
# which will be used later by the decoder.
encoder = LSTM(latent_dim, return_state=True)

# The LSTM processes the input and returns three things:
# encoder_outputs: the regular layer output. return_sequences is not set here, so with
#                  the Keras default this is the output of the last timestep only
#                  (not used in this case, so it is discarded).
# state_h: the hidden state of the LSTM after processing the sequence.
# state_c: the cell state of the LSTM after processing the sequence.
encoder_outputs, state_h, state_c = encoder(encoder_inputs)

# We discard 'encoder_outputs' and only keep the states
encoder_states = [state_h, state_c]

In [None]:
# Set up the decoder, using 'encoder_states' as initial state.
decoder_inputs = Input(shape=(None, num_decoder_tokens))

# The decoder LSTM has latent_dim units, the same size as the encoder, so that the
# encoder's states can be used directly as the decoder's initial state.
# return_sequences=True: the LSTM returns the output at every timestep (needed for generating a sequence of outputs).
# return_state=True: like the encoder, the decoder also returns its internal states (hidden state state_h and cell state state_c).
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)

# Apply the LSTM to decoder_inputs, initialised with the encoder_states
# from the encoder (i.e., the hidden and cell states state_h and state_c).
# By passing initial_state=encoder_states, the decoder is primed with the context captured by the encoder
# (e.g., in a translation task, the information from the source sentence).
# _ , _: the decoder's final hidden and cell states; they are discarded here during training.
decoder_outputs , _, _ = decoder_lstm(decoder_inputs, initial_state=encoder_states)

# The Dense layer is a fully connected layer that converts the LSTM's output into a prediction for each timestep.
# activation='softmax': applies the softmax function to produce a probability distribution over the num_decoder_tokens possible tokens at each timestep.
decoder_dense = Dense(num_decoder_tokens, activation='softmax')

# decoder_outputs: the final output of the decoder, one probability distribution over the target tokens per timestep.
decoder_outputs = decoder_dense(decoder_outputs)

In [None]:
# Assemble the training model: given 'encoder_input_data' and
# 'decoder_input_data', it learns to produce 'decoder_target_data'.
model = Model(inputs=[encoder_inputs, decoder_inputs], outputs=decoder_outputs)

# Compile, then train while holding out 20% of the samples for validation.
model.compile(
    loss='categorical_crossentropy',
    optimizer='rmsprop',
    metrics=['accuracy'],
)
model.fit(
    x=[encoder_input_data, decoder_input_data],
    y=decoder_target_data,
    epochs=epochs,
    batch_size=batch_size,
    validation_split=0.2,
)

Epoch 1/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m56s[0m 424ms/step - accuracy: 0.7165 - loss: 1.2293 - val_accuracy: 0.6888 - val_loss: 1.1869
Epoch 2/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m52s[0m 415ms/step - accuracy: 0.7305 - loss: 0.9940 - val_accuracy: 0.7137 - val_loss: 1.0498
Epoch 3/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 423ms/step - accuracy: 0.7535 - loss: 0.8870 - val_accuracy: 0.7130 - val_loss: 1.3986
Epoch 4/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 415ms/step - accuracy: 0.7819 - loss: 0.7807 - val_accuracy: 0.7604 - val_loss: 0.8393
Epoch 5/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m52s[0m 413ms/step - accuracy: 0.7919 - loss: 0.7309 - val_accuracy: 0.7679 - val_loss: 0.7988
Epoch 6/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 419ms/step - accuracy: 0.8041 - loss: 0.6697 - val_accuracy: 0.7741 - val_loss: 0.7709
Epoch 7/10

<keras.src.callbacks.history.History at 0x7ccc63d67d90>