In [0]:
import numpy as np
import pandas as pd
import re, string, nltk, spacy
import os, sys, csv, random, time, datetime
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import seaborn as sns; sns.set(style='whitegrid')
from collections import Counter
from pickle import dump, load

In [0]:
import tensorflow as tf
from tensorflow import keras
from sklearn.model_selection import train_test_split

In [0]:
def tokenize(corpus):
  corpus = ['<start> '+line+' <end>' for line in corpus]
  tokenizer = keras.preprocessing.text.Tokenizer(filters='')
  tokenizer.fit_on_texts(corpus)
  tensor = tokenizer.texts_to_sequences(corpus)
  tensor = keras.preprocessing.sequence.pad_sequences(tensor,  padding='post')

  return tensor, tokenizer

### Preprocess

In [0]:
en = load(open('/content/drive/My Drive/Datasets/NLP/MT/French-English/Small Vocab/en.pkl', 'rb')) 
fr = load(open('/content/drive/My Drive/Datasets/NLP/MT/French-English/Small Vocab/fr.pkl', 'rb')) 

In [6]:
input_tensor, inp_lang = tokenize(en)
target_tensor, targ_lang = tokenize(fr)
input_tensor.shape, target_tensor.shape

((137860, 17), (137860, 23))

In [7]:
X_train, X_test, y_train, y_test = train_test_split(input_tensor, target_tensor, test_size=0.2, random_state=42)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(110288, 17) (27572, 17) (110288, 23) (27572, 23)


In [0]:
buffer_size = input_tensor.shape[0]
batch_size = 64
steps_per_epoch = len(X_train)//batch_size
embedding_dim = 256
units = 1024
vocab_inp_size = len(inp_lang.word_index)+1
vocab_tar_size = len(targ_lang.word_index)+1
max_length_targ, max_length_inp = target_tensor.shape[1], input_tensor.shape[1]

print(vocab_inp_size, vocab_tar_size)
print(max_length_targ, max_length_inp)

In [0]:
dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train)).shuffle(buffer_size)
dataset = dataset.batch(batch_size, drop_remainder=True)

### Class

In [0]:
class Encoder(tf.keras.Model):
  def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):
    super(Encoder, self).__init__()
    self.batch_sz = batch_sz
    self.enc_units = enc_units 
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    self.gru = tf.keras.layers.GRU(self.enc_units, return_sequences=True, return_state=True, recurrent_initializer='glorot_uniform')

  def call(self, x, hidden):
    x = self.embedding(x)
    output, state = self.gru(x, initial_state = hidden)
    return output, state

  def initialize_hidden_state(self):
    return tf.zeros((self.batch_sz, self.enc_units))

In [0]:
class Decoder(tf.keras.Model):
  def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz):
    super(Decoder, self).__init__()
    self.batch_sz = batch_sz
    self.dec_units = dec_units
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    self.gru = tf.keras.layers.GRU(self.dec_units, return_sequences=True, return_state=True, recurrent_initializer='glorot_uniform')
    self.fc = tf.keras.layers.Dense(vocab_size)

  def call(self, x, hidden, enc_output):
    x = self.embedding(x)
    x = tf.concat([tf.expand_dims(hidden, 1), x], axis=-1)
    output, state = self.gru(x)
    output = tf.reshape(output, (-1, output.shape[2]))
    x = self.fc(output)

    return x, state

In [0]:
def loss_function(real, pred):
  mask = tf.math.logical_not(tf.math.equal(real, 0))
  loss_ = loss_object(real, pred)

  mask = tf.cast(mask, dtype=loss_.dtype)
  loss_ *= mask

  return tf.reduce_mean(loss_)

In [0]:
@tf.function
def train_step(inp, targ, enc_hidden):
  loss = 0

  with tf.GradientTape() as tape:
    enc_output, enc_hidden = encoder(inp, enc_hidden)
    dec_hidden = enc_hidden
    dec_input = tf.expand_dims([targ_lang.word_index['<start>']] * batch_size, 1)

    for t in range(1, targ.shape[1]):
      predictions, dec_hidden = decoder(dec_input, dec_hidden, enc_output)
      loss += loss_function(targ[:, t], predictions)
      dec_input = tf.expand_dims(targ[:, t], 1)

  batch_loss = (loss / int(targ.shape[1]))
  variables = encoder.trainable_variables + decoder.trainable_variables
  gradients = tape.gradient(loss, variables)
  optimizer.apply_gradients(zip(gradients, variables))

  return batch_loss

### Train

In [0]:
encoder = Encoder(vocab_inp_size, embedding_dim, units, batch_size)
decoder = Decoder(vocab_tar_size, embedding_dim, units, batch_size)
optimizer = tf.keras.optimizers.Adam()
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')

In [15]:
checkpoint_dir = '/content/drive/My Drive/Datasets/NLP/Checkpoints/RNN/embedding'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
checkpoint = tf.train.Checkpoint(optimizer=optimizer, encoder=encoder, decoder=decoder)
checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7f691d9a20f0>

In [0]:
%%time
EPOCHS = 50
  
for epoch in range(EPOCHS):
  start = time.time()
  enc_hidden = encoder.initialize_hidden_state()
  total_loss = 0

  for (batch, (inp, targ)) in enumerate(dataset.take(steps_per_epoch)):
    batch_loss = train_step(inp, targ, enc_hidden)
    total_loss += batch_loss

    if batch % 100 == 0:
      print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1, batch, batch_loss.numpy()))
  if (epoch + 1) % 2 == 0:
    checkpoint.save(file_prefix = checkpoint_prefix)

  print('Epoch {} Loss {:.4f}'.format(epoch + 1, total_loss / steps_per_epoch))
  print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))

In [0]:
def evaluate(sentence):
  attention_plot = np.zeros((max_length_targ, max_length_inp))
  inputs = [inp_lang.word_index[i] for i in sentence.split(' ')]
  inputs = tf.keras.preprocessing.sequence.pad_sequences([inputs],maxlen=max_length_inp,padding='post')
  inputs = tf.convert_to_tensor(inputs)
  result = ''

  hidden = [tf.zeros((1, units))]
  enc_out, enc_hidden = encoder(inputs, hidden)
  dec_hidden = enc_hidden
  dec_input = tf.expand_dims([targ_lang.word_index['<start>']], 0)

  for t in range(max_length_targ):
    predictions, dec_hidden = decoder(dec_input,dec_hidden,enc_out)
    predicted_id = tf.argmax(predictions[0]).numpy()
    result += targ_lang.index_word[predicted_id] + ' '
    if targ_lang.index_word[predicted_id] == '<end>':
      return result
    dec_input = tf.expand_dims([predicted_id], 0)

  print(len(result))
  return result

In [0]:
def translate(sentence):
  result = evaluate(sentence)
  return result

In [0]:
inp = inp_lang.sequences_to_texts([input_tensor[0]])[0]
tar = targ_lang.sequences_to_texts([target_tensor[0]])[0]
pred = translate(inp)
print(pred)
print(tar)
print(inp)

In [0]:
df = []
with open('/content/drive/My Drive/Datasets/NLP/Predictions/RNN/predictions.txt','w') as f:
  for i, (x,y) in enumerate(zip(X_test, y_test)):
    sentence = inp_lang.sequences_to_texts([x])[0]
    pred = translate(sentence)
    y = ' '.join(targ_lang.sequences_to_texts([y])[0].split(' ')[1:-1])
    pred = ' '.join(pred.split(' ')[:-2])
    df += [[y, pred]]
    if i % 1000 == 0: 
      print(i)
      for line in df:
        f.write(line[0]+'\t'+line[1]+'\n')
      df = []

In [0]:
predictions = pd.read_csv('/content/drive/My Drive/Datasets/NLP/Predictions/RNN/predictions.txt', sep='\t', header=None)
predictions.drop_duplicates(inplace=True)
print(predictions.shape)
predictions.head(1)

In [0]:
from nltk.translate.bleu_score import sentence_bleu, corpus_bleu, SmoothingFunction

In [0]:
true = predictions[0].values
pred = predictions[1].values

In [75]:
score = [0,0,0]
for i, (y,p) in enumerate(zip(true,pred)):
  score[0] += sentence_bleu(y, p)
  score[1] += sentence_bleu(y, p, smoothing_function=SmoothingFunction().method1)
  score[2] += sentence_bleu(y, p, smoothing_function=SmoothingFunction().method2)

print(score[0]/predictions.shape[0])
print(score[1]/predictions.shape[0])
print(score[2]/predictions.shape[0])

0.7090127815964198
0.0058034304373505196
0.032554179613585484
