In [None]:
!pip install tensorflow



In [None]:
!pip install -U scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-1.5.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Downloading scikit_learn-1.5.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.4/13.4 MB[0m [31m69.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: scikit-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 1.3.2
    Uninstalling scikit-learn-1.3.2:
      Successfully uninstalled scikit-learn-1.3.2
Successfully installed scikit-learn-1.5.1


In [None]:
!pip install model-selection

Collecting model-selection
  Downloading model_selection-0.0.1-py3-none-any.whl.metadata (1.6 kB)
Downloading model_selection-0.0.1-py3-none-any.whl (13 kB)
Installing collected packages: model-selection
Successfully installed model-selection-0.0.1


In [1]:
import tensorflow as tf
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split
import re
import tqdm
from sklearn import datasets, model_selection, linear_model, metrics, preprocessing

In [2]:
data_df = pd.read_csv('/content/data.csv')
data_df

Unnamed: 0,english,spanish
0,Go.,Ve.
1,Go.,Vete.
2,Go.,Vaya.
3,Go.,Váyase.
4,Hi.,Hola.
...,...,...
118959,There are four main causes of alcohol-related ...,Hay cuatro causas principales de muertes relac...
118960,There are mothers and fathers who will lie awa...,Hay madres y padres que se quedan despiertos d...
118961,A carbon footprint is the amount of carbon dio...,Una huella de carbono es la cantidad de contam...
118962,Since there are usually multiple websites on a...,Como suele haber varias páginas web sobre cual...


In [3]:
def clean_text(text):
    text = text.lower()
    text = re.sub(' ',' ',text)
    text = re.sub('\[.*?\]',' ', text)
    text = re.sub('https://\s+|www\.\s+',' ',text)
    text = re.sub('<.+?>+',' ',text)
    text = re.sub('\n',' ',text)
    text = re.sub(r'[^\w]',' ',text)
    text = re.sub('\w*\d\w*',' ',text)
    text = re.sub(' ',' ',text)
    return text

data_df.english = data_df.english.map(clean_text)
data_df.spanish = data_df.spanish.map(clean_text)

In [4]:
def add_start_end(text):
  text = f"<start>{text}<end>"
  return text

data_df.english = data_df.english.map(add_start_end)
data_df.spanish = data_df.spanish.map(add_start_end)

In [5]:
data_df

Unnamed: 0,english,spanish
0,<start>go <end>,<start>ve <end>
1,<start>go <end>,<start>vete <end>
2,<start>go <end>,<start>vaya <end>
3,<start>go <end>,<start>váyase <end>
4,<start>hi <end>,<start>hola <end>
...,...,...
118959,<start>there are four main causes of alcohol r...,<start>hay cuatro causas principales de muerte...
118960,<start>there are mothers and fathers who will ...,<start>hay madres y padres que se quedan despi...
118961,<start>a carbon footprint is the amount of car...,<start>una huella de carbono es la cantidad de...
118962,<start>since there are usually multiple websit...,<start>como suele haber varias páginas web sob...


In [6]:
def tokenize(lang):
    lang_tokenizer =tf.keras.preprocessing.text.Tokenizer(
        filters = '!"#$%&()*+,-./:;=?@[\\]^_`{|}~',  oov_token= '<oov>'
    )
    lang_tokenizer.fit_on_texts(lang)
    tensor = lang_tokenizer.texts_to_sequences(lang)
    tensor = tf.keras.preprocessing.sequence.pad_sequences(tensor, padding= 'post')
    return tensor, lang_tokenizer

In [7]:
eng_squence, eng_tokenizer = tokenize(data_df.english)
spn_squence, spn_tokenizer = tokenize(data_df.spanish)

In [8]:
x_train, x_test, y_train, y_test = model_selection.train_test_split(eng_squence,
                spn_squence, test_size = 0.1, random_state=42)

x_train.shape, x_test.shape, y_train.shape, y_test.shape


((107067, 48), (11897, 48), (107067, 50), (11897, 50))

In [9]:
def convert (lang, tensor):
    for t in tensor:
        if t!=0:
            print('%d---> %s' % (t, lang.index_word[t]))

print('English')
convert(eng_tokenizer, x_train[0])
print()
print('Spanish')
convert(spn_tokenizer, y_train[0])


English
84---> <start>there
78---> were
158---> two
1550---> pieces
13---> of
519---> cake
2---> <end>

Spanish
591---> <start>había
82---> dos
4257---> pedazos
3---> de
978---> torta
2---> <end>


In [10]:
vocab_inp_size = len(eng_tokenizer.word_index)+1
vocab_tar_size = len (spn_tokenizer.word_index)+1
embedding_dim = 256
units = 1024
batch_size = 32

In [11]:
def create_dataset(x, y, batch_size=32):
  data = tf.data.Dataset.from_tensor_slices((x, y))

  data = data.shuffle(1028)
  data = data.batch(batch_size, drop_remainder=True)

  data = data.prefetch(tf.data.experimental.AUTOTUNE)

  return data

train_dataset = create_dataset(x_train, y_train)
test_dataset = create_dataset(x_test, y_test)

In [12]:
for eng, spn in train_dataset.take(1):
  print(f'English:{eng.shape}\n{eng}')

  print(f'Spanish:{spn.shape}\n{spn}')

English:(32, 48)
[[ 253   16 1987 ...    0    0    0]
 [  34   52   23 ...    0    0    0]
 [   3  107 5898 ...    0    0    0]
 ...
 [   3   67    9 ...    0    0    0]
 [6428   10   25 ...    0    0    0]
 [   3   16   80 ...    0    0    0]]
Spanish:(32, 50)
[[  290    15   543 ...     0     0     0]
 [  486    10   255 ...     0     0     0]
 [   59 18690   282 ...     0     0     0]
 ...
 [   14   239     4 ...     0     0     0]
 [   39  2249    12 ...     0     0     0]
 [ 3888     5   117 ...     0     0     0]]


In [13]:
class Encoder(tf.keras.Model):
  def __init__(self, vocab_size, embedding_dim, encoder_units, batch_size):
      super(Encoder, self).__init__()

      self.batch_size = batch_size
      self.encoder_units = encoder_units
      self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim, mask_zero=True)
      self.gru = tf.keras.layers.GRU(self.encoder_units,
                                           return_sequences=True,
                                           return_state=True,
                                           recurrent_initializer = 'glorot_uniform')

  def call(self, x, hidden):
    x = self.embedding(x)
    output, state = self.gru(x, initial_state = hidden)
    return output, state

  def initialize_hidden_state(self):
    return tf.zeros((self.batch_size, self.encoder_units))

In [14]:
class Decoder(tf.keras.Model):
  def __init__(self, vocab_size, embedding_dim, decoder_units, batch_size):
      super(Decoder, self).__init__()

      self.batch_size = batch_size
      self.decoder_units = decoder_units
      self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim, mask_zero=True)
      self.gru = tf.keras.layers.GRU(self.decoder_units,
                                           return_sequences=True,
                                           return_state=True,
                                           recurrent_initializer = 'glorot_uniform')

      self.fc = tf.keras.layers.Dense(vocab_size)


  def call(self, x, hidden):
    x = self.embedding(x)
    output, hidden = self.gru(x, initial_state = hidden)
    output = tf.reshape(output, (-1, output.shape[2]))
    x =  tf.nn.softmax(self.fc(output))
    return x, hidden

In [1]:
vocab_inp_size = len(eng_tokenizer.word_index)+1
vocab_tar_size =  len(spn_tokenizer.word_index)+1
embedding_dim = 256
units = 1024
batch_size=32

encoder = Encoder(vocab_inp_size, embedding_dim, units, batch_size)
sample_hidden = encoder.initialize_hidden_state()
sample_output, sample_hidden = encoder(eng, sample_hidden)
print ('Encoder output shape: (batch size, sequence length, units) {}'.format(sample_output.shape))
print ('Encoder Hidden state shape: (batch size, units) {}'.format(sample_hidden.shape))

In [None]:
decoder = Decoder(vocab_tar_size, embedding_dim, units, batch_size)

sample_decoder_output, _ = decoder(tf.random.uniform((batch_size, 1)), sample_hidden)

print ('Decoder output shape: (batch size, vocab_size) {}'.format(sample_decoder_output.shape))

In [None]:
# create the optimizer using the Adam optimizer
optimizer = tf.keras.optimizers.Adam()
# create the loss function
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=False, reduction='none')

# define the loss function for the training
def loss_function(real, pred):
  # create the mask to ignore the padding tokens
  mask = tf.math.logical_not(tf.math.equal(real, 0))
  # mask shape == (batch_size, sequence_length)
  # calculate the loss
  loss_ = loss_object(real, pred)
  # mask the loss
  # how the mask works:
  # if the value is 1, the loss is calculated
  # if the value is 0, the loss is ignored
    #[1,1,1,1,1,1,0,0,0,0,0] mask
    # *
    #[2,6,2,1,6,3,2,1,5,7,9] input
    # =
    #[2,6,2,1,6,3,0,0,0,0,0] output
  mask = tf.cast(mask, dtype=loss_.dtype)
  # mask shape == (batch_size, sequence_length)

  loss_ *= mask
  # calculate the average loss per batch
  return tf.reduce_mean(loss_)


In [None]:
train_loss = tf.metrics.Mean(name='train loss')

test_loss = tf.metrics.Mean(name='test loss')

In [None]:
# create the training step
# using the tf.function decorator to speed up the training process by converting the training function to a TensorFlow graph
@tf.function
# define the training step
def train_step(inputs, target, enc_hidden):
  # the encoder_hidden is the initial hidden state of the encoder
  # enc_hidden shape == (batch_size, hidden_size)

  # inilaize the loss to zero
  loss = 0
  # create the gradient tape to record the gradient of the loss with respect to the weights

  with tf.GradientTape() as tape:
    # pass the input to the encoder
    # enc_output shape == (batch_size, 49, hidden_size)
    # enc_hidden shape == (batch_size, hidden_size)
    # using the encoder to get the encoder_output and the encoder_hidden
    # using the encoder_hidden as the initial hidden state of the decoder
    enc_output, enc_hidden = encoder(inputs, enc_hidden)
    # set the initial decoder hidden state to the encoder hidden state
    dec_hidden = enc_hidden

    # create the start token
    # start_token shape == (batch_size, 1)
    # repeat the start token for the batch size times
    dec_input = tf.expand_dims([spn_tokenizer.word_index['<start>']] * inputs.shape[0], 1)

    # Teacher forcing - feeding the target as the next input

    for t in range(1, target.shape[1]):
      # passing enc_output to the decoder
      predictions, dec_hidden = decoder(dec_input, dec_hidden)
      # calculate the loss for the current time step using the loss function
      loss += loss_function(target[:, t], predictions)

      # using teacher forcing
      dec_input = tf.expand_dims(target[:, t], 1)
  # calculate the loss for the current batch
  batch_loss = (loss / int(target.shape[1]))

  # get the trainable variables
  variables = encoder.trainable_variables + decoder.trainable_variables
  # calculate the gradients using the tape
  gradients = tape.gradient(loss, variables)
  # update the trainable variables
  optimizer.apply_gradients(zip(gradients, variables))
  # add the loss to the training loss metric
  train_loss(batch_loss)
  return batch_loss

In [None]:
# create the training step
# using the tf.function decorator to speed up the training process by converting the training function to a TensorFlow graph
@tf.function
def test_step(inputs, target, enc_hidden):
    # the encoder_hidden is the initial hidden state of the encoder
    # enc_hidden shape == (batch_size, hidden_size)
    # inilaize the loss to zero
    loss = 0
    # pass the input to the encoder
    # enc_output shape == (batch_size, 49, hidden_size)
    # enc_hidden shape == (batch_size, hidden_size)
    # using the encoder to get the encoder_output and the encoder_hidden
    enc_output, enc_hidden = encoder(inputs, enc_hidden)
    # set the initial decoder hidden state to the encoder hidden state
    dec_hidden = enc_hidden
    # create the start token
    # start_token shape == (batch_size, 1)
    # repeat the start token for the batch size times
    dec_input = tf.expand_dims([spn_tokenizer.word_index['<start>']] * inputs.shape[0], 1)
    for t in range(1, target.shape[1]):
        # passing enc_output to the decoder with dec_hidden as the initial hidden state
        predictions, dec_hidden = decoder(dec_input, dec_hidden)
        # calculate the loss for the current time step using the loss function
        loss += loss_function(target[:, t], predictions)

        # using teacher forcing
        dec_input = tf.expand_dims(target[:, t], 1)
    # calculate the loss for the current batch
    batch_loss = (loss / int(target.shape[1]))
    # add the batch loss to the test loss metric
    test_loss(batch_loss)

In [None]:
# set the epochs to 3
EPOCHS = 3
# set the old test loss to high number
old_test_loss=1000000
# create the training loop
for epoch in range(EPOCHS):
    # reset the training loss metric
    train_loss.reset_states()
    # reset the testing loss metric
    test_loss.reset_states()

    # initalize the hidden state of the encoder to zeros
    enc_hidden = encoder.initialize_hidden_state()
    # create the training progress bar set the total number of batches to the length of the training dataset and the batch size to the test size
    steps_per_epoch = eng_squence.shape[0]//batch_size #=> 3717 batch in the dataset
    bar = tf.keras.utils.Progbar(target=steps_per_epoch)

    count=0
    # iterate over the training dataset
    for (batch, (inputs, target)) in enumerate(train_dataset):
        # update the progress bar
        count += 1
        # run the training step
        batch_loss = train_step(inputs, target, enc_hidden)
        bar.update(count)  # manually update the progress bar




    # iterate over the testing dataset
    for (batch, (inputs, target)) in enumerate(test_dataset):
        count += 1
        # run the testing step
        batch_loss = test_step(inputs, target, enc_hidden)
        bar.update(count)
    # save the best performance model on the test dataset

    if old_test_loss> test_loss.result():
        # set the old test loss to the test loss
        old_test_loss= test_loss.result()
        encoder.save(filepath='/content/models/encoder')
        decoder.save(filepath='/content/models/decoder')
        print('Model is saved')
    # print the training and testing loss
    print('#' * 50)
    print(f'Epoch #{epoch + 1}')
    print(f'Training Loss {train_loss.result()}')
    print(f'Testing Loss {test_loss.result()}')
    print('#' * 50)

In [None]:
#  create the translate function
# the translate function takes in the english as input and translation the sentence to spanish
def translate(sentence):

  # clean the input english sentence
  sentence = clean_text(sentence)
  # add the start token to the sentence
  sentence = add_start_end(sentence)
  # tokenize the sentence
  inputs = eng_tokenizer.texts_to_sequences([sentence])
  # pad the sentence
  inputs = tf.keras.preprocessing.sequence.pad_sequences(inputs,
                                                         maxlen=49,
                                                         padding='post')

  # initalize the hidden state of the encoder to zeros
  hidden = [tf.zeros((1, units))]
  # pass the sentence to the encoder with the hidden state as the initial hidden state
  enc_out, enc_hidden = encoder(inputs, hidden)
  # set the initial decoder hidden state to the encoder hidden state
  dec_hidden = enc_hidden
  # create the start token
  # start_token shape == (batch_size, 1)
  # repeat the start token for the batch size times
  dec_input = tf.expand_dims([spn_tokenizer.word_index['<start>']], 0)
  # create the result string
  result = ''
  # loop over the length of the sentence (51)

  for t in range(51):
    # passing the encoder output and the decoder hidden state to the decoder make sure the decoder input is the previous predicted word
    predictions, dec_hidden = decoder(dec_input, dec_hidden)

    # getting the predicted word index
    predicted_id = tf.argmax(predictions[0]).numpy()
    # getting the predicted word using the predicted index
    # add the predicted word to the result string
    result += spn_tokenizer.index_word[predicted_id] + ' '
    # if the predicted word is the  token then stop the loop
    if spn_tokenizer.index_word[predicted_id] == '':
      # remove the  and  tokens from the result string
      result = result.replace('<start>', '')
      result = result.replace('<end>','')
      # remove the  and  tokens from the sentence string
      sentence = sentence.replace('<start>', '')
      sentence = sentence.replace('<end>', '')
      return  sentence, result

    # using the predicted word as the next decoder input
    dec_input = tf.expand_dims([predicted_id], 0)
  # remove the  and  tokens from the result string
  result = result.replace('<start>', '')
  result = result.replace('<end>','')
  # remove the  and  tokens from the sentence string
  sentence = sentence.replace('<start>', '')
  sentence = sentence.replace('<end>', '')





  # return the result string and the original sentence
  return sentence, result


In [None]:

translate('that s a myth ')

In [None]:
translate('go')