###Importing Libraries


In [2]:
import tensorflow as tf
import numpy as np
import unicodedata
import re

###Reading the Data


In [4]:
file = open('./data/Dataset.txt','r').read()

In [5]:
raw_data = [f.split('\t') for f in file.split('\n')]    #separating questions and answers
questions = [x[0] for x in raw_data] # questions for the first section
answers = [x[1] if len(x) > 1 else "" for x in raw_data] #answers for the second section

In [6]:
print("Question: ", questions[0])
print("Answer: ", answers[0])

Question:  hi, how are you doing?
Answer:  i'm fine. how about yourself?


###Tokenizing


In [7]:
#Function convert text into numerical representation suitable for machine learning for working with neural networks in tensorflow
def tokenize(lang): #lang is the text sentences in a particular language
    lang_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='') #tokenizer created responsible to convert them into numerical tokens
    lang_tokenizer.fit_on_texts(lang) # learn the vocabulary of the language
    tensor = lang_tokenizer.texts_to_sequences(lang) #convert input text into sequences of numerical tokens
    tensor = tf.keras.preprocessing.sequence.pad_sequences(tensor, padding='post') #ensure sequences have same length, post to match longest sequence
    return tensor, lang_tokenizer

###PreProcessing


In [8]:
def preprocess_sentence(sentence):
    #normalize the sentence by decomposing 
    #any composed characters into their basic components.  eg cafe with the ' on top of e
    #For example, it converts accented characters into their non-accented counterparts.
    sentence = ''.join(c for c in unicodedata.normalize('NFD', sentence) if unicodedata.category(c) != 'Mn')  #exclude combining diacritical marks. This step removes diacritics from characters. (symbols below)
    sentence = sentence.lower().strip() #converts the sentence to lowercase and removes leading and trailing whitespace.
    sentence = re.sub(r"([?.!,¿])", r" \1 ", sentence) #adds spaces around punctuation marks (such as '.', '?', '!', ',') to separate them from words.
    sentence = re.sub(r'[" "]+', " ", sentence) #replaces consecutive spaces with a single space.
    sentence = re.sub(r"[^a-zA-Z?.!,¿]+", " ", sentence)# replaces any characters that are not letters, punctuation marks, or whitespace with a space.
    sentence = '<start> ' + sentence + ' <end>' #adds special tokens <start> and <end> to the sentence. These tokens can be used to indicate the start and end of a sequence during model training.
    return sentence

In [9]:
pre_questions = [preprocess_sentence(w) for w in questions] #processing all the questions
pre_answers = [preprocess_sentence(w) for w in answers] #processing all the answers

In [10]:
data = pre_answers, pre_questions

In [11]:
    #tuple data consisting of two elements: targ_lang and inp_lang. 
    
def prepare_data(data): 

    targ_lang, inp_lang = data #These are presumably the target language (answer) and the input language (question).

    input_tensor, inp_lang_tokenizer = tokenize(inp_lang) #tokenize function from earlier
    target_tensor, targ_lang_tokenizer = tokenize(targ_lang) #tokenize function from earlier

    return input_tensor, target_tensor, inp_lang_tokenizer, targ_lang_tokenizer

# input_tensor: Tokenized and padded representations of the input sequences.
# target_tensor: Tokenized and padded representations of the target sequences.
# inp_lang_tokenizer: The tokenizer for the input language.
# targ_lang_tokenizer: The tokenizer for the target language.

#Inverse Mapping: Tokenizers also facilitate the inverse mapping, 
# i.e., converting numerical IDs back to text. This is useful for generating human-readable responses or interpreting model outputs.

In [12]:
input_tensor, target_tensor, inp_lang, targ_lang = prepare_data(data) #Calling the above function

# Calculate max_length of the target tensors
max_length_targ, max_length_inp = target_tensor.shape[1], input_tensor.shape[1]
#retrieve the sequence lengths from the dimensions of the tensors. 
# The maximum sequence length is important for model training, as it determines the length of the sequences that the model can handle.

In [13]:
print(max_length_targ)
print(max_length_inp)

24
24


### Downloading the Tokenizers


In [14]:
import pickle
#uses it to save the tokenizers for the input and target languages as binary files 
# using the pickle library. Here's what each part of the code does:
def save_tokenizer(tokenizer, filename): #tokenizer is the tokenizer object that you want to save. filename is the name of the file where the tokenizer will be saved.
    with open(filename, 'wb') as handle: #()'wb'), which is suitable for saving binary data.
        pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
#specifies that the highest available protocol should be used for pickling. 
# This can be helpful for ensuring compatibility with future Python versions.
save_tokenizer(inp_lang, 'input_tokenizer.pkl')
save_tokenizer(targ_lang, 'target_tokenizer.pkl')

#save the tokenizers to disk as binary files. 
# Saving tokenizers is useful when you want to reuse them for processing new data in the future.

###Splitting the Data


In [16]:
import numpy as np
from sklearn.model_selection import train_test_split
# The function returns four variables:
# input_tensor_train: The input sequences for the training set.
# input_tensor_val: The input sequences for the validation set.
# target_tensor_train: The target sequences for the training set.
# target_tensor_val: The target sequences for the validation set.
# Splitting the data into 90% train, 10% validation
input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = train_test_split(
    input_tensor, target_tensor, test_size=0.1, random_state=42)
#input_tensor and target_tensor are the data to be split. input_tensor likely contains the input sequences, 
# and target_tensor contains the corresponding target sequences (answers or responses).
#test_size is set to 0.1, indicating that 10% of the data will be used for validation, and the remaining 90% for training.
#random_state is set to 42, which provides a seed for the random number generator. 
#This ensures reproducibility when splitting the data. Using the same seed will result in the same split every time the code is run.

###Defining the PipeLine


In [17]:
#sets up various configuration parameters and creates a TensorFlow dataset for training a machine learning model

# length of the training input tensor (input_tensor_train). 
#This is often used to shuffle the dataset. In this case, the entire training dataset will be used for shuffling.
BUFFER_SIZE = len(input_tensor_train)  #Shuffling, in the context of training a machine learning model, is the process of randomizing the order of the training data.

BATCH_SIZE = 64 #It defines the number of training examples to be processed in each training batch.
steps_per_epoch = len(input_tensor_train)//BATCH_SIZE
embedding_dim = 400 
#embedding_dim is set to 400. It specifies the dimensionality of the word embeddings. 
# Word embeddings are dense vector representations of words.
units = 1500 # These units can represent the number of hidden units in a neural network layer.
vocab_inp_size = len(inp_lang.word_index)+1
vocab_tar_size = len(targ_lang.word_index)+1
#represent the vocabulary sizes for the input and target languages, respectively. 
# They are determined by the length of the word indices obtained from the tokenizers (inp_lang.word_index and targ_lang.word_index) and incremented by 1. 
# The addition of 1 is to account for a special token (usually <unk>) that may be added for unknown words.

dataset = tf.data.Dataset.from_tensor_slices((input_tensor_train, target_tensor_train)).shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)
#Creating a TensorFlow Dataset:
#The tf.data.Dataset.from_tensor_slices function is used to create a dataset from the training data. 
# It combines the input and target tensors (sequences) into pairs.


example_input_batch, example_target_batch = next(iter(dataset))
example_input_batch.shape, example_target_batch.shape

#Example Batches:

#example_input_batch and example_target_batch are obtained from the dataset using next(iter(dataset)). 
# These are example input and target batches for demonstration purposes.
#Shapes:

#The code prints the shapes of the example input and target batches using example_input_batch.shape and example_target_batch.shape. 
# This is often done to verify that the data has been properly batched.

#his shape corresponds to the input batch of sequences.
#  It's saying that you have a batch of 64 sequences, and each sequence has a length of 24.

(TensorShape([64, 24]), TensorShape([64, 24]))

In [18]:
class Encoder(tf.keras.Model): # custom model class that inherits from tf.keras.Model
  # The encoder's role is to process the input sequences and produce a fixed-length context representation.
    def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):
        super(Encoder, self).__init__() # custom model class that inherits from tf.keras.Model
        self.batch_sz = batch_sz # batch size.
        self.enc_units = enc_units # number of units (hidden dimensions) in the LSTM layer
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim) #dimension of the word embeddings. converts input token IDs into dense word embeddings.
        self.lstm = tf.keras.layers.LSTM(self.enc_units, #LSTM (Long Short-Term Memory) layer that processes the embedded input sequences.
                                         return_sequences=True, # LSTM should return the full sequence of outputs for each time step.
                                         return_state=True,  #indicates that the LSTM should return the final hidden state and cell state.
                                         recurrent_initializer='glorot_uniform') #specifies the initializer for the recurrent weights.

    def call(self, x, hidden): #defines the forward pass of the encoder.
      x = self.embedding(x) # two arguments: x (the input sequences) and hidden (the initial hidden state and cell state of the LSTM).
      output, state_h, state_c = self.lstm(x, initial_state=hidden)  
      #input sequences are passed through the embedding layer and then through the LSTM layer.  
      #The LSTM returns the output sequences, final hidden state (state_h), and final cell state (state_c).
      state = [state_h, state_c] #The final state is stored in a list called state and returned along with the output sequences.
      return output, state

    def initialize_hidden_state(self): #initializes the hidden state and cell state with zeros. It's used to create the initial state for the LSTM layer.
        return [tf.zeros((self.batch_sz, self.enc_units)),
                tf.zeros((self.batch_sz, self.enc_units))]

encoder = Encoder(vocab_inp_size, embedding_dim, units, BATCH_SIZE) #Encoder Initialization:

In [19]:
class Attention(tf.keras.layers.Layer): # custom layer in a machine learning model,inherits from tf.keras.layers.Layer.
    def __init__(self, units):  #number of units (hidden dimensions) for the Dense layers within the attention mechanism.
        super(Attention, self).__init__()
        self.W1 = tf.keras.layers.Dense(units) 
        self.W2 = tf.keras.layers.Dense(units)
        #dense layers that are used to transform the input query and the values (typically encoder outputs) into compatible representations 
        # for calculating attention scores.
        self.V = tf.keras.layers.Dense(1) #nother Dense layer used to produce the final attention scores.

    def call(self, query, values): #defines the forward pass of the attention layer.
        #query for which attention needs to be calculated. In sequence-to-sequence models, this is often a decoder hidden state.
        #values represent the values to which attention is applied. In sequence-to-sequence models, this is often the encoder outputs.
        query_with_time_axis = tf.expand_dims(query, 1) #expands the dimensions of the query to make it compatible with the values.
        score = self.V(tf.nn.tanh( #alculates the attention scores using the tanh activation function and the weights from self.W1 and self.W2.
        # The result is passed through the self.V layer to obtain final attention scores #!! YOU CAN MODIFY THE WAY SCORE IS CALCULATED. EG DOT PRODUCT
            self.W1(query_with_time_axis) + self.W2(values)))
        attention_weights = tf.nn.softmax(score, axis=1) #computed by applying the softmax function to the attention scores along axis=1. 
        #This produces a weight for each position in the values sequence, indicating the importance of each position.
        context_vector = attention_weights * values
        context_vector = tf.reduce_sum(context_vector, axis=1) #calculated by multiplying the attention weights with the values and then summing along axis=1. I

        return context_vector, attention_weights

In [20]:
class Decoder(tf.keras.Model):  #vocab_size is the size of the output vocabulary. embedding_dim is the dimension of the word embeddings for the decoder. 
    #dec_units is the number of units (hidden dimensions) in the LSTM layer of the decoder. batch_sz is the batch size.
    def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz):
        super(Decoder, self).__init__() #custom model class that inherits from tf.keras.Model. 
        self.batch_sz = batch_sz
        self.dec_units = dec_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.lstm = tf.keras.layers.LSTM(self.dec_units,
                                         return_sequences=True,
                                         return_state=True,
                                         recurrent_initializer='glorot_uniform') #recurrent_initializer='glorot_uniform' specifies the initializer for the recurrent weights.
        self.fc = tf.keras.layers.Dense(vocab_size) #Dense layer that produces the final output predictions by transforming the LSTM output into the shape of the output vocabulary.
        self.attention = Attention(self.dec_units) #Attention layer, which was defined earlier in your code. This is the attention mechanism that helps the decoder focus on relevant parts of the input sequence.

    def call(self, x, hidden, enc_output):
        #x (the previous predicted token), hidden (the initial hidden state and cell state of the LSTM), and enc_output (the encoder's output sequences).
        context_vector, attention_weights = self.attention(hidden[0], enc_output)

        x = self.embedding(x)

        # Concatenate context vector and embedding
        x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)

        # Passing the concatenated vector to the LSTM
        output, state_h, state_c = self.lstm(x, initial_state=hidden)  # Use LSTM with state_h and state_c

        state = [state_h, state_c]

        output = tf.reshape(output, (-1, output.shape[2]))
        x = self.fc(output)

        return x, state, attention_weights

# The attention mechanism is applied to calculate a context vector and attention weights.
# The previous predicted token is passed through the embedding layer.
# The context vector and embedded token are concatenated to create the input to the LSTM.
# The LSTM processes this input and produces an output sequence, final hidden state (state_h), and final cell state (state_c).
# The LSTM output is reshaped and passed through the Dense layer to produce the final output predictions.

decoder = Decoder(vocab_tar_size, embedding_dim, units, BATCH_SIZE)

###Adjusting Learning Rates


In [22]:
initial_learning_rate = 0.001 #adaptive learning rate
lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay( #used to create a learning rate schedule. It's an exponential decay schedule that reduces the learning rate over time.
    initial_learning_rate, decay_steps=1000, decay_rate=0.9, staircase=True #earning rate is reduced at discrete intervals (staircase decay).
)

# Define the optimizer with adaptive learning rate
optimizer = tf.keras.optimizers.Adam(learning_rate=lr_schedule) #optimizer will be used to update the model's weights during training.

# Define your loss function
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none' #function expects the model's output to be unnormalized log probabilities (logits).  loss should not be reduced to a single scalar but kept as a per-sample loss.
)

def loss_function(real, pred):  #defining loss function
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = loss_object(real, pred)

    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask

    return tf.reduce_mean(loss_)
# It takes two arguments, real (the true target values) and pred (the model's predicted values).
# A mask is created to ignore loss contributions from padding tokens (tokens with a value of 0).
# The loss is calculated using loss_object and then multiplied by the mask to zero out the loss for padding tokens.
# The final loss is obtained by computing the mean of the non-padded losses.

###Defining Train Step


In [24]:
@tf.function # defines a training step for the sequence-to-sequence model using TensorFlow. 
def train_step(inp, targ, enc_hidden): 
#inp: The input sequence.
# targ: The target (output) sequence.
# enc_hidden: The initial hidden state of the encoder.

    loss = 0

    with tf.GradientTape() as tape: #tf.GradientTape is used to record operations for automatic differentiation, allowing the computation of gradients.
        enc_output, enc_hidden = encoder(inp, enc_hidden)
#The input sequence inp is passed through the encoder (encoder) to obtain enc_output (encoder outputs) and the final enc_hidden state.

        # Initialize LSTM's initial state
        dec_hidden = [enc_hidden[0][:, :units], enc_hidden[1][:, :units]]

        dec_input = tf.expand_dims([targ_lang.word_index['<start>']] * BATCH_SIZE, 1)

        #he initial hidden state of the decoder (dec_hidden) is initialized with a slice of enc_hidden to ensure compatibility with the decoder's LSTM layer.

        # Teacher forcing - feeding the target as the next input
        for t in range(1, targ.shape[1]):
            # Passing enc_output to the decoder
            predictions, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_output)
            loss += loss_function(targ[:, t], predictions)

            # Using teacher forcing
            dec_input = tf.expand_dims(targ[:, t], 1)

            #Teacher forcing is employed, which means that during training, the model's predicted output at time step t is fed as input at time step t+1. 
            # This loop iterates over the target sequence length (targ.shape[1]).
            # For each time step, the decoder (decoder) is called with the current dec_input, dec_hidden, and enc_output. 
            # This generates predictions for the next token in the sequence.
            # The loss for the current time step is computed using the loss_function based on the predicted token and the actual target token. 
            # This loss is added to the loss variable.

    batch_loss = (loss / int(targ.shape[1]))
    #calculated by dividing the accumulated loss by the length of the target sequence (targ.shape[1]). 
    # This provides the average loss per token in the target sequence for the current batch.

    variables = encoder.trainable_variables + decoder.trainable_variables

    gradients = tape.gradient(loss, variables)

    optimizer.apply_gradients(zip(gradients, variables))

    #gradients of the loss with respect to the model's trainable variables (encoder and decoder) are computed using the tape.gradient function.
    #The optimizer is then used to apply these gradients to update the model's weights.
    return batch_loss

    # This train_step function encapsulates a single training step for the sequence-to-sequence model, including forward and backward passes. 
    # During training, this function is typically called within an epoch loop to train the model on batches of data.

In [25]:
@tf.function #used to compute the validation loss for the sequence-to-sequence model during validation or evaluation
def validation_step(inp, targ, enc_hidden):
# The validation_step function takes three arguments:
# inp: The input sequence.
# targ: The target (output) sequence.
# enc_hidden: The initial hidden state of the encoder.
    val_loss = 0 #accumulate the validation loss.

    val_samples = 0 #keep track of the number of validation samples.

    enc_output, enc_hidden = encoder(inp, enc_hidden) #Encoder Pass:

#The input sequence inp is passed through the encoder (encoder) to obtain enc_output (encoder outputs) and the final enc_hidden state.

    # Initialize LSTM's initial state
    dec_hidden = [enc_hidden[0][:, :units], enc_hidden[1][:, :units]]
    #initial hidden state of the decoder (dec_hidden) is initialized with a slice of enc_hidden to ensure compatibility with the decoder's LSTM layer.
    dec_input = tf.expand_dims([targ_lang.word_index['<start>']] * BATCH_SIZE, 1)

    for t in range(1, targ.shape[1]): #Teacher Forcing Loop for Validation:
        predictions, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_output)
        loss = loss_function(targ[:, t], predictions)
        val_loss += loss
        val_samples += 1
        dec_input = tf.expand_dims(targ[:, t], 1)

    val_loss /= val_samples #Validation Loss and Samples:

    return val_loss

###Training the Pipeline


In [28]:
#represents the training loop for your sequence-to-sequence model and 
# visualizes the training and validation losses over a specified number of epochs (EPOCHS).

import matplotlib.pyplot as plt
EPOCHS = 60

train_losses = []
val_losses = []

for epoch in range(1, EPOCHS + 1):
    enc_hidden = encoder.initialize_hidden_state() #initializes the hidden state for the encoder (enc_hidden) using the encoder.initialize_hidden_state() function.
    total_loss = 0

    # Training loop
    #Batch Training Loop:

# Within each epoch, there's an inner loop that iterates over the training dataset (dataset) for each batch.
# In each batch, it calls the train_step function to compute the training loss for the current batch. 
# The train_step function handles forward and backward passes for training.

    for (batch, (inp, targ)) in enumerate(dataset.take(steps_per_epoch)):
        batch_loss = train_step(inp, targ, enc_hidden)
        total_loss += batch_loss

    num_samples = 0
    for (batch, (inp, targ)) in enumerate(dataset.take(steps_per_epoch)):
      enc_output, enc_hidden = encoder(inp, enc_hidden)
      dec_hidden = enc_hidden
      dec_input = tf.expand_dims([targ_lang.word_index['<start>']] * BATCH_SIZE, 1)

      for t in range(1, targ.shape[1]):
        predictions, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_output)
        predicted_id = tf.argmax(predictions, axis=-1)
        num_samples += 1
        dec_input = tf.expand_dims(targ[:, t], 1)

    validation_dataset = tf.data.Dataset.from_tensor_slices((input_tensor_val, target_tensor_val))
    validation_dataset = validation_dataset.batch(BATCH_SIZE, drop_remainder=True)
#After training for one epoch, a separate validation loop is performed to evaluate the model's performance on the validation dataset.
# It initializes the hidden state for the encoder (enc_hidden) for each batch in the validation dataset.
# For each batch, it calls the validation_step function to compute the validation loss for the current batch.
    val_loss = 0
    val_samples = 0

    for (batch, (inp, targ)) in enumerate(validation_dataset):
      enc_hidden = encoder.initialize_hidden_state()  # Initialize hidden state for each batch
      val_batch_loss = validation_step(inp, targ, enc_hidden)
      val_loss += val_batch_loss
      val_samples += 1

    val_loss /= val_samples

    if epoch % 1 == 0:
        train_losses.append(total_loss / steps_per_epoch)
        val_losses.append(val_loss)
        print('Epoch:{:3d} Loss:{:.4f} Val Loss:{:.4f}'.format(
            epoch, total_loss / steps_per_epoch,  val_loss))

# Plotting the accuracy and loss graphs
plt.figure(figsize=(12, 6))

# Plot training and validation losses
plt.plot(range(1, EPOCHS + 1), train_losses, label='Train')
plt.plot(range(1, EPOCHS + 1), val_losses, label='Validation')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.title('Training and Validation Loss')
plt.legend()

plt.tight_layout()
plt.show()

#training took 3 hours to complete with colab GPU

Epoch:  1 Loss:1.7199 Val Loss:1.7099
Epoch:  2 Loss:1.5591 Val Loss:1.6226
Epoch:  3 Loss:1.4329 Val Loss:1.5790
Epoch:  4 Loss:1.3366 Val Loss:1.5512
Epoch:  5 Loss:1.2278 Val Loss:1.5518
Epoch:  6 Loss:1.1175 Val Loss:1.5585
Epoch:  7 Loss:1.0044 Val Loss:1.5651
Epoch:  8 Loss:0.8902 Val Loss:1.5885
Epoch:  9 Loss:0.7825 Val Loss:1.6053
Epoch: 10 Loss:0.6875 Val Loss:1.6335
Epoch: 11 Loss:0.6072 Val Loss:1.6574
Epoch: 12 Loss:0.5374 Val Loss:1.6918
Epoch: 13 Loss:0.4852 Val Loss:1.7324
Epoch: 14 Loss:0.4775 Val Loss:1.7416
Epoch: 15 Loss:0.4293 Val Loss:1.7674
Epoch: 16 Loss:0.4035 Val Loss:1.7962
Epoch: 17 Loss:0.3826 Val Loss:1.8100
Epoch: 18 Loss:0.3690 Val Loss:1.8319
Epoch: 19 Loss:0.3513 Val Loss:1.8428
Epoch: 20 Loss:0.3385 Val Loss:1.8569
Epoch: 21 Loss:0.3272 Val Loss:1.8767
Epoch: 22 Loss:0.3181 Val Loss:1.8974
Epoch: 23 Loss:0.3097 Val Loss:1.9002
Epoch: 24 Loss:0.2999 Val Loss:1.9210
Epoch: 25 Loss:0.2907 Val Loss:1.9264
Epoch: 26 Loss:0.2816 Val Loss:1.9481
Epoch: 27 Lo

In [23]:
encoder.save("encoder_final")
decoder.save("decoder_final")



In [24]:
!zip -r "encoder_final.zip" "encoder_final"

  adding: encoder_final/ (stored 0%)
  adding: encoder_final/variables/ (stored 0%)
  adding: encoder_final/variables/variables.index (deflated 39%)
  adding: encoder_final/variables/variables.data-00000-of-00001 (deflated 7%)
  adding: encoder_final/keras_metadata.pb (deflated 80%)
  adding: encoder_final/fingerprint.pb (stored 0%)
  adding: encoder_final/saved_model.pb (deflated 91%)
  adding: encoder_final/assets/ (stored 0%)


In [25]:
!zip -r "decoder_final.zip" "decoder_final"

  adding: decoder_final/ (stored 0%)
  adding: decoder_final/variables/ (stored 0%)
  adding: decoder_final/variables/variables.index (deflated 52%)
  adding: decoder_final/variables/variables.data-00000-of-00001 (deflated 7%)
  adding: decoder_final/keras_metadata.pb (deflated 87%)
  adding: decoder_final/fingerprint.pb (stored 0%)
  adding: decoder_final/saved_model.pb (deflated 90%)
  adding: decoder_final/assets/ (stored 0%)


In [26]:
def remove_tags(sentence):
    return sentence.split("<start>")[-1].split("<end>")[0] #utility function that removes the "<start>" and "<end>" tags from a sentence.

In [27]:
def evaluate(sentence): #function takes an input sentence as its parameter.
    sentence = preprocess_sentence(sentence)

    inputs = [inp_lang.word_index[i] for i in sentence.split(' ')] #tokenized into a sequence of word indices using the inp_lang.word_index.
    inputs = tf.keras.preprocessing.sequence.pad_sequences([inputs], #inp_lang.word_index is a dictionary where words are mapped to their corresponding numerical indices. 
                                                                                                                #For example, if you have a dictionary like {'hello': 1, 'how': 2, 'are': 3, ...}, 
                                                                                                                # it means that "hello" is represented by the index 1, "how" by 2, and so on.
                                                         maxlen=max_length_inp,
                                                         padding='post') 
    inputs = tf.convert_to_tensor(inputs)

    result = '' #This variable will store the generated response.

    hidden = [tf.zeros((1, units)), tf.zeros((1, units))] #Initial hidden states are created as [tf.zeros((1, units)), tf.zeros((1, units))].
    enc_out, enc_hidden = encoder(inputs, hidden)

    dec_hidden = enc_hidden
    dec_input = tf.expand_dims([targ_lang.word_index['<start>']], 0)#nput to the decoder is initialized with the "<start>" token.
 
    for t in range(max_length_targ): #Loop for Generating Output: iterates for a maximum of max_length_targ times. This is the maximum length for the target sequence.
        predictions, dec_hidden, attention_weights = decoder(dec_input,
                                                             dec_hidden,
                                                             enc_out)
        attention_weights = tf.reshape(attention_weights, (-1, ))

        predicted_id = tf.argmax(predictions[0]).numpy()

        result += targ_lang.index_word[predicted_id] + ' '

        if targ_lang.index_word[predicted_id] == '<end>':
            return remove_tags(result), remove_tags(sentence)

        # the predicted ID is fed back into the model
        dec_input = tf.expand_dims([predicted_id], 0)

#In each iteration:
# The decoder is called with the current dec_input, dec_hidden, and enc_out.
# The model generates predictions for the next token in the sequence.
# The predicted token with the highest probability is selected using tf.argmax.
# The predicted token is appended to the result.
# If the predicted token is "<end>", the generation process stops.
# Otherwise, the predicted token is used as the next input to the decoder for the next iteration.
    return remove_tags(result), remove_tags(sentence)
#the content of the result, and the preprocessed input sentence without tags (tags are removed using remove_tags).

### Testing some random questions


In [28]:
def test(question):
    answer, question = evaluate(question)
    print('Question:', question)
    print('Predicted answer:', answer)

test("good luck with school")


Question:  good luck with school 
Predicted answer: thank you very much . 


In [29]:
test("Hello")
test("How are you doing?")
test("What is your age?")
test("Do you have a tv?")
test("Do you like rain?")

Question:  hello 
Predicted answer: greetings ! 
Question:  how are you doing ? 
Predicted answer: fine , and you ? 
Question:  what is your age ? 
Predicted answer: i am still young by your standards . 
Question:  do you have a tv ? 
Predicted answer: yes , i do . 
Question:  do you like rain ? 
Predicted answer: yes , i love traveling and exploring new places . 


In [30]:
test("I am afraid")

Question:  i am afraid 
Predicted answer: why ? do i frighten you ? try not to be too scared . what are you afraid of ? 


In [31]:
test("I am feeling sick")

Question:  i am feeling sick 
Predicted answer: oh , really ? 


In [32]:
test("Sorry")

Question:  sorry 
Predicted answer: yeah , so do i . 


In [33]:
test("hi, how are you doing?")

Question:  hi , how are you doing ? 
Predicted answer: i m fine . how about yourself ? 


In [34]:
test("i'm pretty good. thanks for asking.")

Question:  i m pretty good . thanks for asking . 
Predicted answer: no problem . so how have you been ? 


In [35]:
test("i've been great. what about you?")

Question:  i ve been great . what about you ? 
Predicted answer: i ve been good . i m in school right now . 


In [36]:
test("what school do you go to?")

Question:  what school do you go to ? 
Predicted answer: i go to pcc . 


In [38]:
test("I don't know")

Question:  i don t know 
Predicted answer: i like the ones i can sing along with . 


In [39]:
test("nice to meet you")

Question:  nice to meet you 
Predicted answer: thank you . 


In [40]:
test("What are your hobbies?")

Question:  what are your hobbies ? 
Predicted answer: i enjoy reading books and playing the guitar . 


In [41]:
test("You are rude")

Question:  you are rude 
Predicted answer: yep . i always behave in socially unacceptable ways . 


In [42]:
test("I love you")

Question:  i love you 
Predicted answer: i love you , too . 
