In [0]:
import numpy as np
import pandas as pd
import re, string, nltk, spacy
import os, sys, csv, random, time
from unicodedata import normalize
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import seaborn as sns; sns.set(style='whitegrid')
from collections import Counter
from pickle import dump, load

In [0]:
import tensorflow as tf
import tensorflow_datasets as tfds
from tensorflow import keras
from sklearn.model_selection import train_test_split
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

# Data PreProcess

## Functions

In [0]:
def clean_corpus(corpus):
	cleaned = list()
	re_print = re.compile('[^%s]' % re.escape(string.printable))
	table = str.maketrans('', '', string.punctuation)
	for line in corpus:
		line = normalize('NFD', line).encode('ascii', 'ignore')
		line = line.decode('UTF-8')
		line = line.split()
		line = [word.lower() for word in line]
		line = [word.translate(table) for word in line]
		line = [re_print.sub('', w) for w in line]
		line = [word for word in line if word.isalpha()]
		cleaned.append(' '.join(line))
	return cleaned

In [0]:
def to_vocab(corpus, min_occurance = 0):
  tokenizer = keras.preprocessing.text.Tokenizer(filters='')
  tokenizer.fit_on_texts(corpus)
  vocab = [k for k,v in tokenizer.word_counts.items() if v > min_occurance]
  return vocab

In [0]:
def update_corpus(corpus, vocab):
	clean_corpus = list()
	for line in corpus:
		new_tokens = list()
		for token in line.split():
			if token in vocab:
				new_tokens.append(token)
			else:
				new_tokens.append('unk')
		new_line = ' '.join(new_tokens)
		clean_corpus.append(new_line)
	return clean_corpus

In [0]:
def drop_nulls(corpus1, corpus2):
    lengths = [len(line) for line in corpus1]
    idx = [i for i,line in enumerate(corpus1) if len(line)>0]

    corpus1 = [corpus1[i] for i in idx]
    corpus2 = [corpus2[i] for i in idx]

    return corpus1, corpus2

In [0]:
def preprocess(corpus, min_occurance=5):
  corpus = clean_corpus(corpus)
  vocab = to_vocab(corpus, min_occurance)
  corpus = update_corpus(corpus, vocab)
  return corpus, vocab

In [0]:
def tokenize(corpus):
  corpus = ['<start> '+line+' <end>' for line in corpus]
  tokenizer = tfds.features.text.SubwordTextEncoder.build_from_corpus(corpus, target_vocab_size=2**13)
  tensor = keras.preprocessing.sequence.pad_sequences([tokenizer.encode(line) for line in corpus],  padding='post')
  return tensor, tokenizer

## Clean Data

In [0]:
path = '/content/drive/My Drive/Small Vocab/'
files = os.listdir(path)
files

In [0]:
en = open(path+files[0],'r').read().split('\n')
fr = open(path+files[1],'r').read().split('\n')
print(len(en), len(fr))
print(en[:1])
print(fr[:1])

In [0]:
en_lengths = [len(line.split()) for line in en]
fr_lengths = [len(line.split()) for line in fr]

print('Eng:',max(en_lengths), min(en_lengths))
print('Fr:',max(fr_lengths), min(fr_lengths))

In [0]:
en, en_vocab = preprocess(en)
fr, fr_vocab = preprocess(fr)

In [0]:
for _ in range(3):
  i = random.randint(0, len(en))
  print(i,en[i])
  print(i,fr[i])

In [0]:
dump(en, open('/content/drive/My Drive/Datasets/NLP/MT/French-English/Small Vocab/en.pkl', 'wb'))
dump(fr, open('/content/drive/My Drive/Datasets/NLP/MT/French-English/Small Vocab/fr.pkl', 'wb'))

## Prepare data

In [0]:
en = load(open('/content/drive/My Drive/Small Vocab/en.pkl', 'rb')) 
fr = load(open('/content/drive/My Drive/Small Vocab/fr.pkl', 'rb')) 

In [6]:
en_lengths = [len(line.split()) for line in en]
fr_lengths = [len(line.split()) for line in fr]
print('Eng:',max(en_lengths), min(en_lengths), '\tFr:',max(fr_lengths), min(fr_lengths))

Eng: 15 3 	Fr: 21 3


In [0]:
inp_tensor, inp_lang = tokenize(en)
targ_tensor, targ_lang = tokenize(fr)

In [8]:
for _ in range(2):
  i = random.randint(0, len(en))
  print(i,en[i])
  print(i,fr[i])

25255 the peach is her favorite fruit but the lemon is their favorite
25255 la peche est son fruit prefere mais le citron est leur favori
105147 our most loved fruit is the orange but your most loved is the grape
105147 nos fruits le plus aime est lorange mais votre plus aime est le raisin


In [9]:
test_size = 0.2
buffer_size = 10000
batch_size = 64
max_input_len = inp_tensor.shape[1]
max_targ_len = targ_tensor.shape[1]
inp_vocab_size = inp_lang.vocab_size
targ_vocab_size = targ_lang.vocab_size
print('Max lengths: ', max_input_len, max_targ_len)
print('Vocab sizes:', inp_vocab_size, targ_vocab_size)

Max lengths:  21 27
Vocab sizes: 543 703


In [0]:
x_train, x_test, y_train, y_test = train_test_split(inp_tensor, targ_tensor, test_size=test_size, random_state=42)
train_data = tf.data.Dataset.from_tensor_slices((x_train, y_train)).shuffle(buffer_size).batch(batch_size)
test_data = tf.data.Dataset.from_tensor_slices((x_test, y_test)).shuffle(len(x_test))

In [11]:
inp, targ = next(iter(train_data))
print(inp.shape, targ.shape)
print(inp[0], targ[0])

(64, 21) (64, 27)
tf.Tensor(
[347   2   4  22   1  12  49   8  84  10   7   1  12  36   6  76   5   3
 349   0   0], shape=(21,), dtype=int32) tf.Tensor(
[507   2   4  15  33   1  12  53   6  67   9   7   1  12  22   6  88   5
   3 509   0   0   0   0   0   0   0], shape=(27,), dtype=int32)


# Transformer Base

#### Supporting Functions

In [0]:
def get_angles(pos, embedding_dim):
  pos = np.arange(pos)[:, np.newaxis]
  angles = 1/np.power(10000, 2*np.arange(embedding_dim)/embedding_dim)[np.newaxis, :]
  return pos * angles

def positional_encoding(vocab_size, embedding_dim):
  angles = get_angles(vocab_size, embedding_dim)
  angles[:,::2] = np.sin(angles[:,::2])
  angles[:,1::2] = np.cos(angles[:,::2])
  pos_encoding = angles[np.newaxis, ...]
    
  return tf.cast(pos_encoding, dtype=tf.float32)

def loss_function(real, pred):
  mask = tf.math.logical_not(tf.math.equal(real, 0))
  loss_ = loss_object(real, pred)

  mask = tf.cast(mask, dtype=loss_.dtype)
  loss_ *= mask
  
  return tf.reduce_sum(loss_)/tf.reduce_sum(mask)

def create_padding_mask(seq):
  seq = tf.cast(tf.math.equal(seq, 0), tf.float32)
  return seq[:, tf.newaxis, tf.newaxis, :]

def create_look_ahead_mask(size):
  mask = 1 - tf.linalg.band_part(tf.ones((size, size)), -1, 0)
  return mask  # (seq_len, seq_len)

def create_masks(inp, tar):
  enc_padding_mask = create_padding_mask(inp)
  dec_padding_mask = create_padding_mask(inp)
  look_ahead_mask = create_look_ahead_mask(tf.shape(tar)[1])
  dec_target_padding_mask = create_padding_mask(tar)
  combined_mask = tf.maximum(dec_target_padding_mask, look_ahead_mask)
  
  return enc_padding_mask, combined_mask, dec_padding_mask

class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
  def __init__(self, d_model, warmup_steps=4000):
    super(CustomSchedule, self).__init__()
    
    self.d_model = d_model
    self.d_model = tf.cast(self.d_model, tf.float32)

    self.warmup_steps = warmup_steps
    
  def __call__(self, step):
    arg1 = tf.math.rsqrt(step)
    arg2 = step * (self.warmup_steps ** -1.5)
    
    return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)

### Class

In [0]:
def scaled_dot_product_attention(q, k, v, mask):
  matmul_qk = tf.matmul(q, k, transpose_b=True)  # (..., seq_len_q, seq_len_k)
  dk = tf.cast(tf.shape(k)[-1], tf.float32)
  scaled_attention_logits = matmul_qk / tf.math.sqrt(dk)
  if mask is not None:
    scaled_attention_logits += (mask * -1e9)  
  attention_weights = tf.nn.softmax(scaled_attention_logits, axis=-1)  # (..., seq_len_q, seq_len_k)
  output = tf.matmul(attention_weights, v)  # (..., seq_len_q, depth_v)
  return output, attention_weights
class MultiHeadAttention(tf.keras.layers.Layer):
  def __init__(self, d_model, num_heads):
    super(MultiHeadAttention, self).__init__()
    self.num_heads = num_heads
    self.d_model = d_model
    assert d_model % self.num_heads == 0
    self.depth = d_model // self.num_heads
    self.wq = tf.keras.layers.Dense(d_model)
    self.wk = tf.keras.layers.Dense(d_model)
    self.wv = tf.keras.layers.Dense(d_model)
    self.dense = tf.keras.layers.Dense(d_model)
  def split_heads(self, x, batch_size):
    x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
    return tf.transpose(x, perm=[0, 2, 1, 3])
  def call(self, v, k, q, mask):
    batch_size = tf.shape(q)[0]
    q = self.wq(q)  # (batch_size, seq_len, d_model)
    k = self.wk(k)  # (batch_size, seq_len, d_model)
    v = self.wv(v)  # (batch_size, seq_len, d_model)
    q = self.split_heads(q, batch_size)  # (batch_size, num_heads, seq_len_q, depth)
    k = self.split_heads(k, batch_size)  # (batch_size, num_heads, seq_len_k, depth)
    v = self.split_heads(v, batch_size)  # (batch_size, num_heads, seq_len_v, depth)
    scaled_attention, attention_weights = scaled_dot_product_attention(q, k, v, mask)
    scaled_attention = tf.transpose(scaled_attention, perm=[0, 2, 1, 3])  # (batch_size, seq_len_q, num_heads, depth)
    concat_attention = tf.reshape(scaled_attention, (batch_size, -1, self.d_model))  # (batch_size, seq_len_q, d_model)
    output = self.dense(concat_attention)  # (batch_size, seq_len_q, d_model)
    return output, attention_weights
def point_wise_feed_forward_network(d_model, dff):
  return tf.keras.Sequential([
      tf.keras.layers.Dense(dff, activation='relu'),  # (batch_size, seq_len, dff)
      tf.keras.layers.Dense(d_model)  # (batch_size, seq_len, d_model)
  ])
class EncoderLayer(tf.keras.layers.Layer):
  def __init__(self, d_model, num_heads, dff, rate=0.1):
    super(EncoderLayer, self).__init__()
    self.mha = MultiHeadAttention(d_model, num_heads)
    self.ffn = point_wise_feed_forward_network(d_model, dff)
    self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
    self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
    self.dropout1 = tf.keras.layers.Dropout(rate)
    self.dropout2 = tf.keras.layers.Dropout(rate)
  def call(self, x, training, mask):
    attn_output, _ = self.mha(x, x, x, mask)  # (batch_size, input_seq_len, d_model)
    attn_output = self.dropout1(attn_output, training=training)
    out1 = self.layernorm1(x + attn_output)  # (batch_size, input_seq_len, d_model)
    ffn_output = self.ffn(out1)  # (batch_size, input_seq_len, d_model)
    ffn_output = self.dropout2(ffn_output, training=training)
    out2 = self.layernorm2(out1 + ffn_output)  # (batch_size, input_seq_len, d_model)
    return out2
class DecoderLayer(tf.keras.layers.Layer):
  def __init__(self, d_model, num_heads, dff, rate=0.1):
    super(DecoderLayer, self).__init__()
    self.mha1 = MultiHeadAttention(d_model, num_heads)
    self.mha2 = MultiHeadAttention(d_model, num_heads)
    self.ffn = point_wise_feed_forward_network(d_model, dff)
    self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
    self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
    self.layernorm3 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
    self.dropout1 = tf.keras.layers.Dropout(rate)
    self.dropout2 = tf.keras.layers.Dropout(rate)
    self.dropout3 = tf.keras.layers.Dropout(rate)
  def call(self, x, enc_output, training, look_ahead_mask, padding_mask):
    attn1, attn_weights_block1 = self.mha1(x, x, x, look_ahead_mask)  # (batch_size, target_seq_len, d_model)
    attn1 = self.dropout1(attn1, training=training)
    out1 = self.layernorm1(attn1 + x)
    attn2, attn_weights_block2 = self.mha2(enc_output, enc_output, out1, padding_mask)  # (batch_size, target_seq_len, d_model)
    attn2 = self.dropout2(attn2, training=training)
    out2 = self.layernorm2(attn2 + out1)  # (batch_size, target_seq_len, d_model)
    ffn_output = self.ffn(out2)  # (batch_size, target_seq_len, d_model)
    ffn_output = self.dropout3(ffn_output, training=training)
    out3 = self.layernorm3(ffn_output + out2)  # (batch_size, target_seq_len, d_model)
    return out3, attn_weights_block1, attn_weights_block2
class Encoder(tf.keras.layers.Layer):
  def __init__(self, num_layers, d_model, num_heads, dff, input_vocab_size,maximum_position_encoding, rate=0.1):
    super(Encoder, self).__init__()
    self.d_model = d_model
    self.num_layers = num_layers
    self.embedding = tf.keras.layers.Embedding(input_vocab_size, d_model)
    self.pos_encoding = positional_encoding(maximum_position_encoding, self.d_model)
    self.enc_layers = [EncoderLayer(d_model, num_heads, dff, rate) for _ in range(num_layers)]
    self.dropout = tf.keras.layers.Dropout(rate)
  def call(self, x, training, mask):
    seq_len = tf.shape(x)[1]
    x = self.embedding(x)  # (batch_size, input_seq_len, d_model)
    x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
    x += self.pos_encoding[:, :seq_len, :]
    x = self.dropout(x, training=training)
    for i in range(self.num_layers):
      x = self.enc_layers[i](x, training, mask)
    return x  # (batch_size, input_seq_len, d_model)
class Decoder(tf.keras.layers.Layer):
  def __init__(self, num_layers, d_model, num_heads, dff, target_vocab_size, maximum_position_encoding, rate=0.1):
    super(Decoder, self).__init__()
    self.d_model = d_model
    self.num_layers = num_layers
    self.embedding = tf.keras.layers.Embedding(target_vocab_size, d_model)
    self.pos_encoding = positional_encoding(maximum_position_encoding, d_model)
    self.dec_layers = [DecoderLayer(d_model, num_heads, dff, rate) for _ in range(num_layers)]
    self.dropout = tf.keras.layers.Dropout(rate)
  def call(self, x, enc_output, training, look_ahead_mask, padding_mask):
    seq_len = tf.shape(x)[1]
    attention_weights = {}
    x = self.embedding(x)  # (batch_size, target_seq_len, d_model)
    x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
    x += self.pos_encoding[:, :seq_len, :]
    x = self.dropout(x, training=training)
    for i in range(self.num_layers):
      x, block1, block2 = self.dec_layers[i](x, enc_output, training,look_ahead_mask, padding_mask)
      attention_weights['decoder_layer{}_block1'.format(i+1)] = block1
      attention_weights['decoder_layer{}_block2'.format(i+1)] = block2
    return x, attention_weights
class Transformer(tf.keras.Model):
  def __init__(self, num_layers, d_model, num_heads, dff, input_vocab_size, target_vocab_size, pe_input, pe_target, rate=0.1):
    super(Transformer, self).__init__()
    self.encoder = Encoder(num_layers, d_model, num_heads, dff, input_vocab_size, pe_input, rate)
    self.decoder = Decoder(num_layers, d_model, num_heads, dff, target_vocab_size, pe_target, rate)
    self.final_layer = tf.keras.layers.Dense(target_vocab_size)
  def call(self, inp, tar, training, enc_padding_mask, look_ahead_mask, dec_padding_mask):
    enc_output = self.encoder(inp, training, enc_padding_mask)  # (batch_size, inp_seq_len, d_model)
    dec_output, attention_weights = self.decoder(tar, enc_output, training, look_ahead_mask, dec_padding_mask)
    final_output = self.final_layer(dec_output)  # (batch_size, tar_seq_len, target_vocab_size)
    return final_output, attention_weights

## Train

In [0]:
epochs = 100
batch_size = 64
d_model = 256
dff = 1024
num_layers = 4
num_heads = 8
max_len = max([inp_tensor.shape[1], targ_tensor.shape[1]])
dropout_rate = 0.1

In [0]:
learning_rate = CustomSchedule(d_model)
optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98, epsilon=1e-9)
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')
train_loss, test_loss = tf.keras.metrics.Mean(name='train_loss'), tf.keras.metrics.Mean(name='test_loss')
train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name='train_accuracy')
test_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name='test_accuracy')

In [0]:
transformer = Transformer(num_layers, d_model, num_heads, dff, inp_vocab_size, targ_vocab_size, pe_input=inp_vocab_size, pe_target=targ_vocab_size, rate=dropout_rate)

In [0]:
checkpoint_path = "/content/drive/My Drive/Datasets/NLP/Checkpoints/Transformer"
ckpt = tf.train.Checkpoint(transformer=transformer,optimizer=optimizer)
ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=5)

# if a checkpoint exists, restore the latest checkpoint.
if ckpt_manager.latest_checkpoint:
  ckpt.restore(ckpt_manager.latest_checkpoint)
  print ('Latest checkpoint restored')

In [0]:
train_step_signature = [
    tf.TensorSpec(shape=(None, None), dtype=tf.int32),
    tf.TensorSpec(shape=(None, None), dtype=tf.int32),
]

@tf.function(input_signature=train_step_signature)
# @tf.function()
def train_step(inp, tar):
  tar_inp = tar[:, :-1]
  tar_real = tar[:, 1:]  
  enc_padding_mask, combined_mask, dec_padding_mask = create_masks(inp, tar_inp)
  
  with tf.GradientTape() as tape:
    predictions, _ = transformer(inp, tar_inp, True, enc_padding_mask, combined_mask, dec_padding_mask)
    loss = loss_function(tar_real, predictions)

  gradients = tape.gradient(loss, transformer.trainable_variables)    
  optimizer.apply_gradients(zip(gradients, transformer.trainable_variables))
  
  train_loss(loss)
  train_accuracy(tar_real, predictions)

In [0]:
%%time
epochs = 50

for epoch in range(epochs):
  start = time.time()
  train_loss.reset_states()
  train_accuracy.reset_states()
  
  for (batch, (inp, tar)) in enumerate(train_data):
    train_step(inp, tar)
    
    if batch % 200 == 0:
      print ('Epoch {} Batch {} Loss {:.4f} Accuracy {:.4f}'.format(epoch + 1, batch, train_loss.result(), train_accuracy.result()))
      
  if (epoch + 1) % 10 == 0:
    ckpt_save_path = ckpt_manager.save()
    print ('Saving checkpoint for epoch {} at {}'.format(epoch+1,ckpt_save_path))
  print ('Epoch {} Loss {:.4f} Accuracy {:.4f}'.format(epoch + 1, train_loss.result(), train_accuracy.result()))
  print ('Time taken for 1 epoch: {} secs\n'.format(time.time() - start))

In [0]:
def translate(sentence, plot=''):
  result = evaluate(sentence)
  predicted_sentence = targ_lang.decode([i for i in result if i < targ_lang.vocab_size])  

  return predicted_sentence
  
def evaluate(inp_sentence):
  inp_sentence = inp_lang.encode(inp_sentence)
  encoder_input = tf.expand_dims(inp_sentence, 0)
  decoder_input = [507]
  output = tf.expand_dims(decoder_input, 0)
    
  for i in range(max_len):
    enc_padding_mask, combined_mask, dec_padding_mask = create_masks(encoder_input, output)
    predictions, _ = transformer(encoder_input, output, False, enc_padding_mask, combined_mask, dec_padding_mask)
    predictions = predictions[: ,-1:, :]  # (batch_size, 1, vocab_size)
    predicted_id = tf.cast(tf.argmax(predictions, axis=-1), tf.int32)
    
    if predicted_id == 509:
      return tf.squeeze(output, axis=0)
    
    output = tf.concat([output, predicted_id], axis=-1)
  return tf.squeeze(output, axis=0)

In [0]:
inp = inp_lang.decode(inp_tensor[0])
tar = targ_lang.decode(targ_tensor[0])
pred = translate(inp)
print(tar, pred)

<start> new jersey est parfois calme pendant l automne et il est neigeux en avril <end> <start> new jersey est parfois calme pendant l automne et il est neigeux en avril <end


In [0]:
df = []
with open('/content/drive/My Drive/Predictions/predictions.txt','w') as f:
  for i, (x,y) in enumerate(zip(x_test, y_test)):
    sentence = inp_lang.decode(x)
    pred = translate(sentence)
    y = ' '.join(targ_lang.decode(y).split(' ')[1:-1])
    pred = ' '.join(pred.split(' ')[1:-1])
    df += [[y, pred]]
    if i % 100 == 0: 
      print(i)
      for line in df:
        f.write(line[0]+'\t'+line[1]+'\n')

In [0]:
predictions = pd.read_csv('/content/drive/My Drive/Predictions/predictions.txt', sep='\t', header=None)
predictions.drop_duplicates(inplace=True)
print(predictions.shape)
predictions.head(1)

In [0]:
true = predictions[0].values
pred = predictions[1].values

In [28]:
score = [0,0,0]
for i, (y,p) in enumerate(zip(true,pred)):
  score[0] += sentence_bleu(y, p)
  score[1] += sentence_bleu(y, p, smoothing_function=SmoothingFunction().method1)
  score[2] += sentence_bleu(y, p, smoothing_function=SmoothingFunction().method2)

  if i % 1000 == 0: print(i)
print(score[0]/predictions.shape[0])
print(score[1]/predictions.shape[0])
print(score[2]/predictions.shape[0])

0.7178099241795025
0.005937449250641525
0.033282942197759194
