https://docs.google.com/document/d/1QmoZJCeAjmaZQ2QQA3X2zqo9ET4PqSUT2frL4DX_Tdo/edit

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [2]:
import numpy as np
import pandas as pd
import re, string, nltk, spacy
import os, sys, csv, random, time
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import seaborn as sns; sns.set(style='whitegrid')
from collections import Counter
from pickle import dump, load

  import pandas.util.testing as tm


In [0]:
import tensorflow as tf
from tensorflow import keras
import torch
from sklearn.model_selection import train_test_split
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

In [0]:
def tokenize(corpus):
  corpus = ['<start> '+line+' <end>' for line in corpus]
  tokenizer = keras.preprocessing.text.Tokenizer(filters='')
  tokenizer.fit_on_texts(corpus)
  tensor = tokenizer.texts_to_sequences(corpus)
  tensor = keras.preprocessing.sequence.pad_sequences(tensor,  padding='post')

  return tensor, tokenizer

# Preprocess

In [5]:
en = load(open('/content/drive/My Drive/Small Vocab/en.pkl', 'rb')) 
fr = load(open('/content/drive/My Drive/Small Vocab/fr.pkl', 'rb')) 
print(len(en), len(fr))
print(en[:2])
print(fr[:2])

137860 137860
['new jersey is sometimes quiet during autumn and it is snowy in april', 'the united states is usually chilly during july and it is usually freezing in november']
['new jersey est parfois calme pendant l automne et il est neigeux en avril', 'les etats unis est generalement froid en juillet et il gele habituellement en novembre']


In [6]:
input_tensor, inp_lang = tokenize(en)
target_tensor, targ_lang = tokenize(fr)
input_tensor.shape, target_tensor.shape

((137860, 17), (137860, 23))

In [7]:
X_train, X_test, y_train, y_test = train_test_split(input_tensor, target_tensor, test_size=0.2, random_state=42)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
print('2:', inp_lang.index_word[2])

(110288, 17) (27572, 17) (110288, 23) (27572, 23)
2: <start>


In [0]:
buffer_size = input_tensor.shape[0]
batch_size = 64
steps_per_epoch = len(X_train)//batch_size
embedding_dim = 256
units = 256
vocab_inp_size = len(inp_lang.word_index)
vocab_tar_size = len(targ_lang.word_index)
max_length_targ, max_length_inp = target_tensor.shape[1], input_tensor.shape[1]

print(vocab_inp_size, vocab_tar_size)
print(max_length_inp, max_length_targ)

In [0]:
dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train)).shuffle(buffer_size)
dataset = dataset.batch(batch_size, drop_remainder=True)
test_dataset = tf.data.Dataset.from_tensor_slices((X_test, y_test))

# Model


## No embedding

In [0]:
X_train1 = tf.cast(tf.expand_dims(X_train, axis=2), tf.float32)
y_train1 = tf.cast(tf.expand_dims(y_train, axis=2), tf.float32)

dataset1 = tf.data.Dataset.from_tensor_slices((X_train1, y_train1)).shuffle(buffer_size)
dataset1 = dataset.batch(batch_size, drop_remainder=True)

In [0]:
class Encoder_Base(tf.keras.Model):
  def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):
    super(Encoder_Base, self).__init__()
    self.batch_sz = batch_sz
    self.enc_units = enc_units
    self.gru = tf.keras.layers.GRU(self.enc_units, return_sequences=True, return_state=True, recurrent_initializer='glorot_uniform')

  def call(self, x, hidden):
    output, state = self.gru(x, initial_state = hidden)
    return output, state

  def initialize_hidden_state(self):
    return tf.zeros((self.batch_sz, self.enc_units))

In [0]:
class BahdanauAttention(tf.keras.layers.Layer):
  def __init__(self, units):
    super(BahdanauAttention, self).__init__()
    self.W1 = tf.keras.layers.Dense(units)
    self.W2 = tf.keras.layers.Dense(units)
    self.V = tf.keras.layers.Dense(1)

  def call(self, query, values):
    query_with_time_axis = tf.expand_dims(query, 1)
    score = self.V(tf.nn.tanh(self.W1(query_with_time_axis) + self.W2(values)))
    attention_weights = tf.nn.softmax(score, axis=1)
    context_vector = attention_weights * values
    context_vector = tf.reduce_sum(context_vector, axis=1)

    return context_vector, attention_weights

In [0]:
class Decoder_Base(tf.keras.Model):
  def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz):
    super(Decoder_Base, self).__init__()
    self.batch_sz = batch_sz
    self.dec_units = dec_units
    self.gru = tf.keras.layers.GRU(self.dec_units, return_sequences=True, return_state=True, recurrent_initializer='glorot_uniform')
    self.fc = tf.keras.layers.Dense(vocab_size)

    self.attention = BahdanauAttention(self.dec_units)

  def call(self, x, hidden, enc_output):
    context_vector, attention_weights = self.attention(hidden, enc_output)
    x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)
    output, state = self.gru(x)
    output = tf.reshape(output, (-1, output.shape[2]))

    x = self.fc(output)

    return x, state, attention_weights

In [0]:
@tf.function
def train_step(inp, targ, enc_hidden):
  loss = 0

  with tf.GradientTape() as tape:
    enc_output, enc_hidden = encoder(inp, enc_hidden)
    dec_hidden = enc_hidden
    dec_input = tf.expand_dims(tf.expand_dims([targ_lang.word_index['<start>']] * batch_size, 1), 2)
    dec_input = tf.cast(dec_input, tf.float32)

    for t in range(1, targ.shape[1]):
      predictions, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_output)
      loss += loss_function(targ[:, t], predictions)
      dec_input = tf.expand_dims(targ[:, t], 1)

  batch_loss = (loss / int(targ.shape[1]))
  variables = encoder.trainable_variables + decoder.trainable_variables
  gradients = tape.gradient(loss, variables)
  optimizer.apply_gradients(zip(gradients, variables))

  return batch_loss

In [0]:
def loss_function(real, pred):
  mask = tf.math.logical_not(tf.math.equal(real, 0))
  loss_ = loss_object(real, pred)

  mask = tf.cast(mask, dtype=loss_.dtype)
  loss_ *= mask

  return tf.reduce_mean(loss_)

In [0]:
encoder = Encoder_Base(vocab_inp_size, embedding_dim, units, batch_size)
decoder = Decoder_Base(vocab_tar_size, embedding_dim, units, batch_size)
optimizer = tf.keras.optimizers.Adam()
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')

In [0]:
checkpoint_dir = '/content/drive/My Drive/Datasets/NLP/Checkpoints/Attention/No-embedding'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
checkpoint = tf.train.Checkpoint(optimizer=optimizer, encoder=encoder, decoder=decoder)
# checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))

In [0]:
%%time
EPOCHS = 100
  
for epoch in range(EPOCHS):
  start = time.time()
  enc_hidden = encoder.initialize_hidden_state()
  total_loss = 0

  for (batch, (inp, targ)) in enumerate(dataset1.take(steps_per_epoch)):
    batch_loss = train_step(inp, targ, enc_hidden)
    total_loss += batch_loss2

  if (epoch + 1) % 5 == 0:
    checkpoint.save(file_prefix = checkpoint_prefix)

  print('Epoch {} Loss {:.4f}'.format(epoch + 1, total_loss / steps_per_epoch))
  print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))

In [0]:
def translate(sentence):
  result, sentence, attention_plot = evaluate(sentence)
  return result, sentence

In [0]:
def evaluate(sentence):
  attention_plot = np.zeros((max_length_targ, max_length_inp))

  inputs = [inp_lang.word_index[i] for i in sentence.split(' ')]
  inputs = tf.keras.preprocessing.sequence.pad_sequences([inputs],maxlen=max_length_inp,padding='post')
  inputs = tf.convert_to_tensor(inputs)
  inputs = tf.cast(tf.expand_dims(inputs, axis=2), tf.float32)
  result = ''
  hidden = tf.zeros((1, units))
  enc_out, enc_hidden = encoder(inputs, hidden)
  dec_hidden = enc_hidden
  dec_input = tf.cast(tf.expand_dims([targ_lang.word_index['<start>']], 1), tf.float32)

  for t in range(max_length_targ):
    dec_input = tf.cast(tf.expand_dims(dec_input, 2), tf.float32)
    dec_hidden = tf.cast(dec_hidden, tf.float32)
    predictions, dec_hidden, attention_weights = decoder(dec_input,dec_hidden,enc_out)
    # storing the attention weights to plot later on
    attention_weights = tf.reshape(attention_weights, (-1, ))
    attention_plot[t] = attention_weights.numpy()
    predicted_id = tf.argmax(predictions[0]).numpy()
    result += targ_lang.index_word[predicted_id] + ' '

    if targ_lang.index_word[predicted_id] == '<end>':
      return result, sentence, attention_plot

    # the predicted ID is fed back into the model
    dec_input = tf.expand_dims([predicted_id], 0)

  return result, sentence, attention_plot

In [0]:
df = []
with open('/content/drive/My Drive/Datasets/NLP/Predictions/Attention_no_emb_predictions.txt','w') as f:
  for i, (x,y) in enumerate(zip(X_test, y_test)):
    sentence = inp_lang.sequences_to_texts([x])[0]
    pred = translate(sentence)
    y = ' '.join(targ_lang.sequences_to_texts([y])[0].split(' ')[1:-1])
    pred = ' '.join(pred.split(' ')[:-2])
    df += [[y, pred]]
    print(df)
    break
    if i % 1000 == 0: 
      print(i)
      for line in df:
        f.write(line[0]+'\t'+line[1]+'\n')
      df = []

In [0]:
predictions = pd.read_csv('/content/drive/My Drive/Predictions/no_emb_predictions.txt', sep='\t', header=None)
predictions.drop_duplicates(inplace=True)
print(predictions.shape)
predictions.head(1)

In [0]:
true = predictions[0].values
pred = predictions[1].values

In [79]:
score = [0,0,0]
for i, (y,p) in enumerate(zip(true,pred)):
  score[0] += sentence_bleu(y, p)
  score[1] += sentence_bleu(y, p, smoothing_function=SmoothingFunction().method2)
  score[2] += sentence_bleu(y, p, smoothing_function=SmoothingFunction().method3)
print(score[0]/predictions.shape[0])
print(score[1]/predictions.shape[0])
print(score[2]/predictions.shape[0])

Corpus/Sentence contains 0 counts of 2-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().


0.7132713526227332
0.03299160315616553
0.011696062231013894


## Embedding

In [0]:
class Encoder(tf.keras.Model):
  def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):
    super(Encoder, self).__init__()
    self.batch_sz = batch_sz
    self.enc_units = enc_units
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    self.gru = tf.keras.layers.GRU(self.enc_units, return_sequences=True, return_state=True, recurrent_initializer='glorot_uniform')

  def call(self, x, hidden):
    x = self.embedding(x)
    output, state = self.gru(x, initial_state = hidden)
    return output, state

  def initialize_hidden_state(self):
    return tf.zeros((self.batch_sz, self.enc_units))

In [0]:
class BahdanauAttention(tf.keras.layers.Layer):
  def __init__(self, units):
    super(BahdanauAttention, self).__init__()
    self.W1 = tf.keras.layers.Dense(units)
    self.W2 = tf.keras.layers.Dense(units)
    self.V = tf.keras.layers.Dense(1)

  def call(self, query, values):
    query_with_time_axis = tf.expand_dims(query, 1)
    score = self.V(tf.nn.tanh(self.W1(query_with_time_axis) + self.W2(values)))
    attention_weights = tf.nn.softmax(score, axis=1)
    context_vector = attention_weights * values
    context_vector = tf.reduce_sum(context_vector, axis=1)

    return context_vector, attention_weights

In [0]:
class Decoder(tf.keras.Model):
  def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz):
    super(Decoder, self).__init__()
    self.batch_sz = batch_sz
    self.dec_units = dec_units
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    self.gru = tf.keras.layers.GRU(self.dec_units, return_sequences=True, return_state=True, recurrent_initializer='glorot_uniform')
    self.fc = tf.keras.layers.Dense(vocab_size)
    self.attention = BahdanauAttention(self.dec_units)
  def call(self, x, hidden, enc_output):
    context_vector, attention_weights = self.attention(hidden, enc_output)
    x = self.embedding(x)
    x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)
    output, state = self.gru(x)
    output = tf.reshape(output, (-1, output.shape[2]))
    x = self.fc(output)
    return x, state, attention_weights

In [0]:
def loss_function(real, pred):
  mask = tf.math.logical_not(tf.math.equal(real, 0))
  loss_ = loss_object(real, pred)

  mask = tf.cast(mask, dtype=loss_.dtype)
  loss_ *= mask

  return tf.reduce_mean(loss_)

In [0]:
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
  def __init__(self, d_model, warmup_steps=4000):
    super(CustomSchedule, self).__init__()
    self.d_model = d_model
    self.d_model = tf.cast(self.d_model, tf.float32)
    self.warmup_steps = warmup_steps
  def __call__(self, step):
    arg1 = tf.math.rsqrt(step)
    arg2 = step * (self.warmup_steps ** -1.5)
    return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)

In [0]:
@tf.function
def train_step(inp, targ, enc_hidden):
  loss = 0
  with tf.GradientTape() as tape:
    enc_output, enc_hidden = encoder(inp, enc_hidden)
    dec_hidden = enc_hidden
    dec_input = tf.expand_dims([targ_lang.word_index['<start>']] * batch_size, 1)

    for t in range(1, targ.shape[1]):
      predictions, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_output)
      loss += loss_function(targ[:, t], predictions)
      dec_input = tf.expand_dims(targ[:, t], 1)

  batch_loss = (loss / int(targ.shape[1]))
  variables = encoder.trainable_variables + decoder.trainable_variables
  gradients = tape.gradient(loss, variables)
  optimizer.apply_gradients(zip(gradients, variables))

  return batch_loss

In [0]:
encoder = Encoder(vocab_inp_size, embedding_dim, units, batch_size)
decoder = Decoder(vocab_tar_size, embedding_dim, units, batch_size)
learning_rate = CustomSchedule(units)
optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98, epsilon=1e-9)
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')

In [0]:
checkpoint_dir = '/content/drive/My Drive/Checkpoints/Attention/embedding'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
checkpoint = tf.train.Checkpoint(optimizer=optimizer, encoder=encoder, decoder=decoder)
# checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))

In [0]:
%%time
EPOCHS = 50
  
for epoch in range(EPOCHS):
  start = time.time()
  enc_hidden = encoder.initialize_hidden_state()
  total_loss = 0

  for (batch, (inp, targ)) in enumerate(dataset.take(steps_per_epoch)):
    batch_loss = train_step(inp, targ, enc_hidden)
    total_loss += batch_loss

    if batch % 100 == 0:
      print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1, batch, batch_loss.numpy()))
  if (epoch + 1) % 2 == 0:
    checkpoint.save(file_prefix = checkpoint_prefix)

  print('Epoch {} Loss {:.4f}'.format(epoch + 1, total_loss / steps_per_epoch))
  print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))

In [0]:
# checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))

In [0]:
def evaluate(sentence):
  attention_plot = np.zeros((max_length_targ, max_length_inp))
  inputs = [inp_lang.word_index[i] for i in sentence.split(' ')]
  inputs = tf.keras.preprocessing.sequence.pad_sequences([inputs],maxlen=max_length_inp,padding='post')
  inputs = tf.convert_to_tensor(inputs)
  result = ''

  hidden = [tf.zeros((1, units))]
  enc_out, enc_hidden = encoder(inputs, hidden)
  dec_hidden = enc_hidden
  dec_input = tf.expand_dims([targ_lang.word_index['<start>']], 0)

  for t in range(max_length_targ):
    predictions, dec_hidden, _ = decoder(dec_input,dec_hidden,enc_out)
    predicted_id = tf.argmax(predictions[0]).numpy()
    result += targ_lang.index_word[predicted_id] + ' '
    if targ_lang.index_word[predicted_id] == '<end>':
      return result
    dec_input = tf.expand_dims([predicted_id], 0)

  return result

def translate(sentence):
  result = evaluate(sentence)
  return result

In [0]:
inp = inp_lang.sequences_to_texts([input_tensor[0]])[0]
tar = targ_lang.sequences_to_texts([target_tensor[0]])[0]
pred = translate(inp)
print(pred)
print(inp, tar)

In [0]:
df = []
with open('/content/drive/My Drive/Predictions/Attention_emb_predictions.txt','w') as f:
  for i, (x,y) in enumerate(zip(X_test, y_test)):
    sentence = inp_lang.sequences_to_texts([x])[0]
    pred = translate(sentence)
    y = ' '.join(targ_lang.sequences_to_texts([y])[0].split(' ')[1:-1])
    pred = ' '.join(pred.split(' ')[:-2])
    df += [[y, pred]]
    
    if i % 1000 == 0: 
      print(i)
      for line in df:
        f.write(line[0]+'\t'+line[1]+'\n')
      df = []

In [0]:
predictions = pd.read_csv('/content/drive/My Drive/Predictions/Attention_emb_predictions.txt', sep='\t', header=None)
predictions.drop_duplicates(inplace=True)
print(predictions.shape)
predictions.head(1)

In [0]:
true = predictions[0].values
pred = predictions[1].values

In [82]:
score = [0,0,0]
for i, (y,p) in enumerate(zip(true,pred)):
  score[0] += sentence_bleu(y, p)
  score[1] += sentence_bleu(y, p, smoothing_function=SmoothingFunction().method2)
  score[2] += sentence_bleu(y, p, smoothing_function=SmoothingFunction().method3)
print(score[0]/predictions.shape[0])
print(score[1]/predictions.shape[0])
print(score[2]/predictions.shape[0])

Corpus/Sentence contains 0 counts of 2-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().


0.7173235244336582
0.03315686500812764
0.011759316664150355


## Static Embedding

In [0]:
en_ft = load(open('/content/drive/My Drive/Small Vocab/en_ft_vocab.pkl', 'rb'))
fr_ft = load(open('/content/drive/My Drive/Small Vocab/fr_ft_vocab.pkl', 'rb'))
len(en_ft), len(fr_ft)

In [0]:
inp_ft = np.zeros((len(inp_lang.word_index)+1,300))
for i,(k,v) in enumerate(inp_lang.word_index.items()):
  inp_ft[i+1] = en_ft[k]

targ_ft = np.zeros((len(targ_lang.word_index)+1,300))
for i,(k,v) in enumerate(targ_lang.word_index.items()):
  targ_ft[i+1] = fr_ft[k]

len(inp_ft), len(targ_ft), vocab_inp_size, vocab_tar_size

In [0]:
class Encoder(tf.keras.Model):
  def __init__(self, embedding_layer, vocab_size, enc_units, batch_sz):
    super(Encoder, self).__init__()
    self.batch_sz = batch_sz
    self.enc_units = enc_units
    self.embedding = embedding_layer
    self.gru = tf.keras.layers.GRU(self.enc_units, return_sequences=True, return_state=True, recurrent_initializer='glorot_uniform')

  def call(self, x, hidden):
    x = self.embedding(x)
    output, state = self.gru(x, initial_state = hidden)
    return output, state

  def initialize_hidden_state(self):
    return tf.zeros((self.batch_sz, self.enc_units))

In [0]:
class BahdanauAttention(tf.keras.layers.Layer):
  def __init__(self, units):
    super(BahdanauAttention, self).__init__()
    self.W1 = tf.keras.layers.Dense(units)
    self.W2 = tf.keras.layers.Dense(units)
    self.V = tf.keras.layers.Dense(1)

  def call(self, query, values):
    query_with_time_axis = tf.expand_dims(query, 1)
    score = self.V(tf.nn.tanh(self.W1(query_with_time_axis) + self.W2(values)))
    attention_weights = tf.nn.softmax(score, axis=1)
    context_vector = attention_weights * values
    context_vector = tf.reduce_sum(context_vector, axis=1)

    return context_vector, attention_weights

In [0]:
class Decoder(tf.keras.Model):
  def __init__(self, embedding_layer, vocab_size, dec_units, batch_sz):
    super(Decoder, self).__init__()
    self.batch_sz = batch_sz
    self.dec_units = dec_units
    self.embedding = embedding_layer
    self.gru = tf.keras.layers.GRU(self.dec_units, return_sequences=True, return_state=True, recurrent_initializer='glorot_uniform')
    self.fc = tf.keras.layers.Dense(vocab_size)
    self.attention = BahdanauAttention(self.dec_units)

  def call(self, x, hidden, enc_output):
    context_vector, attention_weights = self.attention(hidden, enc_output)
    x = self.embedding(x)
    x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)
    output, state = self.gru(x)
    output = tf.reshape(output, (-1, output.shape[2]))

    x = self.fc(output)

    return x, state, attention_weights

In [0]:
def loss_function(real, pred):
  mask = tf.math.logical_not(tf.math.equal(real, 0))
  loss_ = loss_object(real, pred)

  mask = tf.cast(mask, dtype=loss_.dtype)
  loss_ *= mask

  return tf.reduce_mean(loss_)

In [0]:
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
  def __init__(self, d_model, warmup_steps=4000):
    super(CustomSchedule, self).__init__()
    self.d_model = d_model
    self.d_model = tf.cast(self.d_model, tf.float32)
    self.warmup_steps = warmup_steps
  def __call__(self, step):
    arg1 = tf.math.rsqrt(step)
    arg2 = step * (self.warmup_steps ** -1.5)
    return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)

In [0]:
@tf.function
def train_step(inp, targ, enc_hidden):
  loss = 0

  with tf.GradientTape() as tape:
    enc_output, enc_hidden = encoder(inp, enc_hidden)
    dec_hidden = enc_hidden
    dec_input = tf.expand_dims([targ_lang.word_index['<start>']] * batch_size, 1)

    for t in range(1, targ.shape[1]):
      predictions, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_output)
      loss += loss_function(targ[:, t], predictions)
      dec_input = tf.expand_dims(targ[:, t], 1)

  batch_loss = (loss / int(targ.shape[1]))
  variables = encoder.trainable_variables + decoder.trainable_variables
  gradients = tape.gradient(loss, variables)
  optimizer.apply_gradients(zip(gradients, variables))

  return batch_loss

In [0]:
enc_embedding_layer = tf.keras.layers.Embedding(vocab_inp_size,300,weights=[inp_ft],input_length=max_length_inp,trainable=False)
dec_embedding_layer = tf.keras.layers.Embedding(vocab_tar_size,300,weights=[targ_ft],input_length=max_length_targ,trainable=False)

In [0]:
encoder = Encoder(enc_embedding_layer, vocab_inp_size+1, units, batch_size)
decoder = Decoder(dec_embedding_layer, vocab_tar_size+1, units, batch_size)
learning_rate = CustomSchedule(units)
optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98, epsilon=1e-9)
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')

In [0]:
checkpoint_dir = '/content/drive/My Drive/Checkpoints/Attention/static-embedding'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
checkpoint = tf.train.Checkpoint(optimizer=optimizer, encoder=encoder, decoder=decoder)
# checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))

In [0]:
%%time
EPOCHS = 50
  
for epoch in range(EPOCHS):
  start = time.time()
  enc_hidden = encoder.initialize_hidden_state()
  total_loss = 0

  for (batch, (inp, targ)) in enumerate(dataset.take(steps_per_epoch)):
    batch_loss = train_step(inp, targ, enc_hidden)
    total_loss += batch_loss
    if batch % 100 == 0:
      print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1, batch, batch_loss.numpy()))
  if (epoch + 1) % 2 == 0:
    checkpoint.save(file_prefix = checkpoint_prefix)

  print('Epoch {} Loss {:.4f}'.format(epoch + 1, total_loss / steps_per_epoch))
  print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))

In [0]:
def evaluate(sentence):
  attention_plot = np.zeros((max_length_targ, max_length_inp))
  inputs = [inp_lang.word_index[i] for i in sentence.split(' ')]
  inputs = tf.keras.preprocessing.sequence.pad_sequences([inputs],maxlen=max_length_inp,padding='post')
  inputs = tf.convert_to_tensor(inputs)
  result = ''

  hidden = [tf.zeros((1, units))]
  enc_out, enc_hidden = encoder(inputs, hidden)
  dec_hidden = enc_hidden
  dec_input = tf.expand_dims([targ_lang.word_index['<start>']], 0)

  for t in range(max_length_targ):
    predictions, dec_hidden, _ = decoder(dec_input,dec_hidden,enc_out)
    predicted_id = tf.argmax(predictions[0]).numpy()
    result += targ_lang.index_word[predicted_id] + ' '
    if targ_lang.index_word[predicted_id] == '<end>':
      return result
    dec_input = tf.expand_dims([predicted_id], 0)

  return result

def translate(sentence):
  result = evaluate(sentence)
  return result

In [0]:
df = []
with open('/content/drive/My Drive/Predictions/static_emb_predictions.txt','w') as f:
  for i, (x,y) in enumerate(zip(X_test, y_test)):
    sentence = inp_lang.sequences_to_texts([x])[0]
    pred = translate(sentence)
    y = ' '.join(targ_lang.sequences_to_texts([y])[0].split(' ')[1:-1])
    pred = ' '.join(pred.split(' ')[:-2])
    df += [[y, pred]]
    if i % 1000 == 0: 
      print(i)
      for line in df:
        f.write(line[0]+'\t'+line[1]+'\n')
      df = []

In [0]:
predictions = pd.read_csv('/content/drive/My Drive/Predictions/static_emb_predictions.txt', sep='\t', header=None)
predictions.drop_duplicates(inplace=True)
print(predictions.shape)
predictions.head(1)

In [0]:
true = predictions[0].values
pred = predictions[1].values

In [70]:
score = [0,0,0]
for i, (y,p) in enumerate(zip(true,pred)):
  score[0] += sentence_bleu(y, p)
  score[1] += sentence_bleu(y, p, smoothing_function=SmoothingFunction().method2)
  score[2] += sentence_bleu(y, p, smoothing_function=SmoothingFunction().method3)
print(score[0]/predictions.shape[0])
print(score[1]/predictions.shape[0])
print(score[2]/predictions.shape[0])

Corpus/Sentence contains 0 counts of 2-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().


0.7173823339511546
0.03314772458943005
0.011755560316377705
