# Imports

In [1]:
import numpy as np
import math
import re
import time
import zipfile
import random
import tensorflow as tf
from tensorflow.keras import layers
import tensorflow_datasets as tfds
from tqdm import tqdm

2022-04-07 11:28:21.432975: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.10.1


# Loading database

In [2]:
def load_db(file_path):
    with open(file_path) as f:
        return f.read()

euro_en = load_db('pt-en/europarl-v7.pt-en.en')
euro_pt = load_db('pt-en/europarl-v7.pt-en.pt')

In [3]:
print('en sample example: ', euro_en.split('\n')[0])
print('pt sample example: ', euro_pt.split('\n')[0])

en sample example:  Resumption of the session
pt sample example:  Reinício da sessão


# Data cleaning

In [4]:
def data_cleaning(data):
    data = re.sub(r'\.(?=[0-9]|[a-z]|[A-Z])', '$$', data)
    data = re.sub(r'\$\$', '', data)
    data = re.sub(r' +', ' ', data)
    return data.split('\n')

data_en = data_cleaning(data=euro_en)
data_pt = data_cleaning(data=euro_pt)

print('Data en: ', data_en[10])
print('Data pt: ', data_pt[10])

Data en:  Would it be appropriate for you, Madam President, to write a letter to the Sri Lankan President expressing Parliament's regret at his and the other violent deaths in Sri Lanka and urging her to do everything she possibly can to seek a peaceful reconciliation to a very difficult situation?
Data pt:  Será que a senhora Presidente poderia enviar uma carta à Presidente do Sri Lanka manifestando o pesar do Parlamento por esta e outras mortes violentas perpetradas no seu país, e instando­a a envidar todos os esforços ao seu alcance para procurar obter uma reconciliação pacífica na situação extremamente difícil que ali se vive?


In [5]:
print('en data size: {} | pt data size: {}'.format(len(data_en), len(data_pt)))

en data size: 1960408 | pt data size: 1960408


# Tokenization

In [6]:
def tokenizer_data(data, vocab_size=2**13):
    return tfds.features.text.SubwordTextEncoder.build_from_corpus(data, target_vocab_size=vocab_size)

tokenizer_en = tokenizer_data(data=data_en)
tokenizer_pt = tokenizer_data(data=data_pt)

print('En vocab size: ', tokenizer_en.vocab_size)
print('Pt vocab size: ', tokenizer_pt.vocab_size)

En vocab size:  8191
Pt vocab size:  8116


In [7]:
def token_start_end(data, tokenizer):
    vocab_size = tokenizer.vocab_size + 2
    # adding start and end token in each setense
    return [[vocab_size - 2] + tokenizer.encode(sentense) + [vocab_size - 1] for sentense in data]

inputs = token_start_end(data=data_en, tokenizer=tokenizer_en)
outputs = token_start_end(data=data_pt, tokenizer=tokenizer_pt)

print('Input example: ', inputs[0])
print('Output example: ', outputs[0])

Input example:  [8191, 2458, 972, 2108, 3, 1, 2571, 8192]
Output example:  [8116, 834, 705, 7, 3561, 8117]


removing setenses longer than 15 

In [8]:
def remove_longer_sentense(data, max_length=15):
    idx_to_remove = [idx for idx, sentense in enumerate(data) if len(sentense) > max_length]

    for idx in tqdm(reversed(idx_to_remove)):
        # remove the same setense from the data
        del inputs[idx]
        del outputs[idx]

remove_longer_sentense(data=inputs)
remove_longer_sentense(data=outputs)

print('len inputs: {} | len outputs {}'.format(len(inputs), len(outputs)))

1685300it [07:43, 3639.51it/s] 
66118it [00:10, 6408.12it/s]  

len inputs: 208990 | len outputs 208990





padding sentenses 

In [9]:
def padding_sequences(data, max_length):
    return tf.keras.preprocessing.sequence.pad_sequences(sequences=data, value=0, padding='post', maxlen=max_length)

inputs = padding_sequences(data=inputs, max_length=15)
outputs = padding_sequences(data=outputs, max_length=15)

print('Input padded sequences example: ', inputs[0])
print('Output padded sequences example: ', outputs[0])

Input padded sequences example:  [8191 2458  972 2108    3    1 2571 8192    0    0    0    0    0    0
    0]
Output padded sequences example:  [8116  834  705    7 3561 8117    0    0    0    0    0    0    0    0
    0]


final dataset cration with tf optimization

In [10]:
batch_size = 64
buffer_size = 20000

dataset = tf.data.Dataset.from_tensor_slices(tensors=(inputs, outputs))
dataset = dataset.cache()
dataset = dataset.shuffle(buffer_size=buffer_size).batch(batch_size=batch_size)
dataset = dataset.prefetch(tf.data.AUTOTUNE)

2022-04-07 11:45:04.959098: I tensorflow/compiler/jit/xla_cpu_device.cc:41] Not creating XLA devices, tf_xla_enable_xla_devices not set
2022-04-07 11:45:04.960227: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcuda.so.1
2022-04-07 11:45:04.989977: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:941] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-04-07 11:45:04.991537: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1720] Found device 0 with properties: 
pciBusID: 0000:05:00.0 name: NVIDIA GeForce GTX 1060 6GB computeCapability: 6.1
coreClock: 1.8095GHz coreCount: 10 deviceMemorySize: 5.93GiB deviceMemoryBandwidth: 178.99GiB/s
2022-04-07 11:45:04.991557: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.10.1
2022-04-07 11:45:05.053488: I tensorflow/stream_executor/platfo

# Model building

## Positional Encoding implementation

$ PE(pos, 2i) = sin(pos * angles)$

$ PE(pos, 2i + 1) = cos(pos * angles)$


$ angles = \frac{1}{10000^{2*i / dmodel}}$

In [11]:
class PositionalEncoding(layers.Layer):
    def __init__(self):
        super(PositionalEncoding, self).__init__()

    def get_angles(self, i, d_model):
        # definition of angles
        return 1 / np.power(10000., (2*(i//2)) / np.float32(d_model))

    def call(self, inputs):
        seq_length = inputs.shape.as_list()[-2]
        d_model = inputs.shape.as_list()[-1]

        # get the pos and i matrix to gerate positional encoding from a input tensor
        pos = np.arange(seq_length)[:, np.newaxis]
        i = np.arange(d_model)[np.newaxis, :]

        # calculates the angle results
        angles = self.get_angles(i=i, d_model=d_model)

        # calculates the positional encoding
        pe = pos * angles #(seq_length, d_model)
        pe[:, 0::2] = np.sin(pe[:, 0::2]) #even position
        pe[:, 1::2] = np.cos(pe[:, 1::2]) #odd position

        # transform the pos encoding dimension to the same as the input
        pos_encoding = pe[np.newaxis, ...]

        print('Inputs shape: {} | PE shape {} | Pos encoding shape: {}'.format(inputs.shape, pe.shape, pos_encoding.shape))
        
        print('Pos encoding', pos_encoding)

        return inputs + pos_encoding

Testing positional encoding class

In [12]:
pos = PositionalEncoding()

matrix_test = tf.ones((1, 4, 4))

print('Inputs + pos encoding', pos(matrix_test))

Inputs shape: (1, 4, 4) | PE shape (4, 4) | Pos encoding shape: (1, 4, 4)
Pos encoding [[[ 0.          1.          0.          1.        ]
  [ 0.84147098  0.54030231  0.00999983  0.99995   ]
  [ 0.90929743 -0.41614684  0.01999867  0.99980001]
  [ 0.14112001 -0.9899925   0.0299955   0.99955003]]]
Inputs + pos encoding tf.Tensor(
[[[1.        2.        1.        2.       ]
  [1.841471  1.5403023 1.0099999 1.9999499]
  [1.9092975 0.5838531 1.0199987 1.9998   ]
  [1.14112   0.0100075 1.0299954 1.9995501]]], shape=(1, 4, 4), dtype=float32)


## Attention mecanism

### Scaled dot product Attention

![scaled dot product attention](imgs/scaled-dot-produt-attention.png)

$Attention(Q, K, V) = softmax(\frac{QK^{T}}{\sqrt{d_{k}}})*V$

Q = queries

K = keys

V = values

$K^{T}$ = K matrix transpose

$d_{k}$ = K dimension

In [13]:
def scaled_dot_product_attention(queries, keys, values, mask):
    product = tf.matmul(queries, keys, transpose_b=True)
    keys_dim = tf.cast(tf.shape(keys)[-1], tf.float32)

    scaled_produt = product / tf.math.sqrt(keys_dim)

    if mask is not None:
        scaled_produt += (mask * -1e9)

    softmax = tf.nn.softmax(scaled_produt, axis=-1)

    return tf.matmul(softmax, values)

### Multi-Head Attention

![mult-head attention](imgs/multi-head-attention.png)

In [15]:
class MultiHeadAttention(layers.Layer):
    def __init__(self, num_proj):
        super(MultiHeadAttention, self).__init__()

        self.num_proj = num_proj

    def build(self, input_shape):
        d_model = input_shape[-1]
        assert d_model % self.num_proj == 0

        self.d_proj = d_model // self.num_proj

        # linear dense layers
        self.query_linear_dense = layers.Dense(units = self.d_model)
        self.keys_linear_dense = layers.Dense(units = self.d_model)
        self.values_linear_dense = layers.Dense(units = self.d_model)

        self.final_linear_dense = layers.Dense(units = self.d_model)

    def split_proj(self, inputs, batch_size):
        shape = (batch_size, -1, self.d_proj)
        splitted_inputs = tf.reshape(inputs, shape=shape) # (batch_size, seq_length, num_proj, d_proj)

        return tf.transpose(splitted_inputs, perm=[0, 2, 1, 3]) # (batch_size, num_proj, seq_length, d_proj)

    def scaled_dot_product_attention(self, queries, keys, values, mask=None):
        product = tf.matmul(queries, keys, transpose_b=True)
        keys_dim = tf.cast(tf.shape(keys)[-1], tf.float32)

        scaled_produt = product / tf.math.sqrt(keys_dim)

        if mask is not None:
            scaled_produt += (mask * -1e9)

        softmax = tf.nn.softmax(scaled_produt, axis=-1)

        return tf.matmul(softmax, values)

    def call(self, queries, keys, values, mask):
        batch_size = tf.shape(queries)[0]

        queries = self.query_linear_dense(queries)
        keys = self.keys_linear_dense(keys)
        values = self.values_linear_dense(values)

        queries = self.split_proj(inputs=queries, batch_size=batch_size)
        keys = self.split_proj(inputs=keys, batch_size=batch_size)
        values = self.split_proj(inputs=values, batch_size=batch_size)

        attention = self.scaled_dot_product_attention(queries = queries,
                                                      keys = keys,
                                                      values = values,
                                                      mask=mask)

        # return the same shape as the input
        attention = tf.transpose(attention, perm=[0, 2, 1, 3])
        concat = tf.concat(attention, shape=(batch_size, -1, self.d_model))

        return self.final_linear_dense(concat)

# Encoder

In [16]:
class EncoderLayer(layers.Layer):
    def __init__(self, ff_units, num_proj, dropout_rate):
        super(EncoderLayer, self).__init__()

        self.ff_units = ff_units
        self.num_proj = num_proj
        self.dropout_rate = dropout_rate

    def build(self, input_shape):
        self.d_model = input_shape[-1]

        self.multi_head_attention = MultiHeadAttention(num_proj=self.num_proj)
        self.dropout_1 = layers.Dropout(rate=self.dropout_rate)
        self.norm_1 = layers.LayerNormalization(epsilon=1e-6) #1e-6 = 0.0000001

        self.dense_1 = layers.Dense(units=self.ff_units)
        self.dense_2 = layers.Dense(units=self.ff_units)
        self.dropout_2 = layers.Dropout(rate=self.dropout_rate)
        self.norm_2 = layers.LayerNormalization(epsilon=1e-6)

    def call(self, inputs, mask, training:bool):
        attention = self.multi_head_attention(queries = inputs,
                                              keys = inputs,
                                              values = inputs,
                                              mask = mask)
        attention = self.dropout_1(attention, training = training)
        attention = self.norm_1(attention + inputs)

        outputs = self.dense_1(attention)
        outputs = self.dense_2(outputs)
        outputs = self.dropout_2(outputs, training = training)

        outputs = self.norm_2(outputs + attention)

        return outputs