In [1]:
import logging
import time

import numpy as np
import matplotlib.pyplot as plt

import tensorflow_datasets as tfds
import tensorflow as tf

import tensorflow_text

2023-03-22 07:04:24.403872: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-03-22 07:04:25.048902: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-03-22 07:04:25.048973: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory


In [2]:
# Load the `pt_to_en` configuration of the TED HRLR translation dataset.
# `with_info=True` makes `tfds.load` return a (datasets, info) pair, and
# `as_supervised=True` makes every element a 2-tuple of string tensors.
# NOTE(review): `data_dir` is an absolute path from one machine; adjust it
# before re-running the notebook elsewhere.
examples, metadata = tfds.load(
    name="ted_hrlr_translate/pt_to_en",
    data_dir="/home/mond/tensorflow_datasets/",
    as_supervised=True,
    with_info=True,
)

# Keep separate handles to the two splits.
train_examples = examples["train"]
val_examples = examples["validation"]

2023-03-22 07:04:26.255548: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:967] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-03-22 07:04:26.261770: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudnn.so.8'; dlerror: libcudnn.so.8: cannot open shared object file: No such file or directory
2023-03-22 07:04:26.261792: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1934] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...
2023-03-22 07:04:26.262622: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep

In [3]:
# Display the training split; its repr shows the element spec of each example.
train_examples

<PrefetchDataset element_spec=(TensorSpec(shape=(), dtype=tf.string, name=None), TensorSpec(shape=(), dtype=tf.string, name=None))>

In [5]:
# Print a single batch of three sentence pairs. `.numpy()` yields byte strings,
# so each sentence is decoded as UTF-8 before printing.
for pt_examples, en_examples in train_examples.batch(3).take(1):
    print("> Examples in Portuguese:")
    print("\n".join(sentence.decode("utf-8") for sentence in pt_examples.numpy()))
    print()

    print("> Examples in English:")
    print("\n".join(sentence.decode("utf-8") for sentence in en_examples.numpy()))

> Examples in Portuguese:
e quando melhoramos a procura , tiramos a única vantagem da impressão , que é a serendipidade .
mas e se estes fatores fossem ativos ?
mas eles não tinham a curiosidade de me testar .

> Examples in English:
and when you improve searchability , you actually take away the one advantage of print , which is serendipity .
but what if it were active ?
but they did n't test for curiosity .


2023-03-22 07:06:17.967710: W tensorflow/core/kernels/data/cache_dataset_ops.cc:856] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.


In [1]:
import torch
import torch.nn as nn

class MultiHeadAttention(nn.Module):
    """Multi-head dot-product attention.

    The embedding dimension is split into `n_heads` chunks of size
    `emb_size // n_heads`. Each chunk is passed through a bias-free linear
    projection (the same projection weights are shared by all heads), attention
    is computed per head, and the concatenated heads go through `fc_out`.

    Args:
        emb_size: Size of the embedding dimension; must be divisible by `n_heads`.
        n_heads: Number of attention heads.
    """

    def __init__(self, emb_size, n_heads):
        super().__init__()

        assert emb_size % n_heads == 0, "`emb_size` needs to be divisible by `n_heads`"
        self.emb_size = emb_size
        self.n_heads = n_heads
        self.head_dim = emb_size // self.n_heads

        # Per-head projections: they act on the last (`head_dim`) axis of the split tensors.
        self.values = nn.Linear(in_features=self.head_dim, out_features=self.head_dim, bias=False)
        self.keys = nn.Linear(in_features=self.head_dim, out_features=self.head_dim, bias=False)
        self.queries = nn.Linear(in_features=self.head_dim, out_features=self.head_dim, bias=False)
        self.fc_out = nn.Linear(in_features=self.emb_size, out_features=self.emb_size)

    def forward(self, queries, keys, values, mask):
        """Compute attention of `queries` over `keys`/`values`.

        Args:
            queries: Tensor of shape [n_samples, query_length, emb_size].
            keys: Tensor of shape [n_samples, key_length, emb_size].
            values: Tensor of shape [n_samples, value_length, emb_size];
                `value_length` has to equal `key_length` for the weighted sum.
            mask: Optional tensor broadcastable to
                [n_samples, n_heads, query_length, key_length]. Entries equal
                to 0 mark positions that must not be attended to. `None`
                disables masking.

        Returns:
            Tensor of shape [n_samples, query_length, emb_size].
        """
        n_samples = queries.shape[0]
        query_length, key_length, value_length = queries.shape[1], keys.shape[1], values.shape[1]

        # Split embedding into `self.n_heads` pieces
        queries = queries.reshape(n_samples, query_length, self.n_heads, self.head_dim)
        keys = keys.reshape(n_samples, key_length, self.n_heads, self.head_dim)
        values = values.reshape(n_samples, value_length, self.n_heads, self.head_dim)

        # Apply the learned projections. Without this step the three linear
        # layers registered in `__init__` would be dead parameters.
        queries = self.queries(queries)
        keys = self.keys(keys)
        values = self.values(values)

        # queries shape: [n_samples, query_length, n_heads, heads_dim]
        # keys shape: [n_samples, key_length, n_heads, heads_dim]
        # energy shape: [n_samples, n_heads, query_length, key_length]
        energy = torch.einsum("nqhd,nkhd->nhqk", queries, keys)
        if mask is not None:
            # A very large negative score becomes ~0 probability after softmax.
            energy = energy.masked_fill(mask == 0, float("-1e20"))

        # `torch.sqrt` only accepts tensors, so the scale is computed with plain
        # Python arithmetic.
        # NOTE(review): this keeps the original sqrt(emb_size) scaling; the
        # "Attention Is All You Need" formulation divides by sqrt(head_dim).
        scale = self.emb_size ** 0.5
        attention = torch.softmax(energy / scale, dim=3)

        # attention shape: [n_samples, n_heads, query_length, key_length]
        # values shape: [n_samples, value_length, n_heads, heads_dim]
        # output shape: [n_samples, query_length, n_heads, heads_dim]
        output = torch.einsum("nhql,nlhd->nqhd", attention, values).reshape(
            n_samples, query_length, self.emb_size
        )
        output = self.fc_out(output)

        return output

class TransformerBlock(nn.Module):
    """Attention sub-layer followed by a feed-forward sub-layer.

    Each sub-layer's result is added to that sub-layer's input, layer-normalized,
    and then passed through dropout.

    Args:
        emb_size: Size of the embedding dimension.
        n_heads: Number of attention heads handed to `MultiHeadAttention`.
        dropout_p: Dropout probability.
        forward_expansion: Width multiplier of the feed-forward hidden layer.
    """

    def __init__(self, emb_size, n_heads, dropout_p, forward_expansion):
        super().__init__()

        hidden_size = forward_expansion * emb_size

        self.attention = MultiHeadAttention(emb_size, n_heads)
        self.norm_1 = nn.LayerNorm(emb_size)
        self.norm_2 = nn.LayerNorm(emb_size)

        # emb_size -> hidden_size -> emb_size with a ReLU in between.
        self.feed_forward = nn.Sequential(
            nn.Linear(emb_size, hidden_size),
            nn.ReLU(inplace=True),
            nn.Linear(hidden_size, emb_size),
        )
        self.dropout = nn.Dropout(dropout_p)

    def forward(self, queries, keys, values, mask):
        """Run both sub-layers; `queries` is the residual input of the attention step."""
        # Sub-layer 1: attention + residual connection.
        attended = self.attention(queries, keys, values, mask)
        x = self.norm_1(attended + queries)
        x = self.dropout(x)

        # Sub-layer 2: feed-forward + residual connection.
        expanded = self.feed_forward(x)
        return self.dropout(self.norm_2(expanded + x))

class Encoder(nn.Module):
    """Stack of `TransformerBlock`s over summed token and position embeddings.

    Args:
        src_vocab_size: Number of rows in the token embedding table.
        emb_size: Size of the embedding dimension.
        n_layers: Number of `TransformerBlock`s in the stack.
        n_heads: Number of attention heads per block.
        forward_expansion: Feed-forward width multiplier per block.
        dropout_p: Dropout probability.
        max_length: Number of rows in the position embedding table, i.e. the
            longest sequence `forward` accepts.
        device: Stored on the instance as `self.device`. `forward` does not
            rely on it; position indices are created on the input's device.
    """

    def __init__(
            self,
            src_vocab_size,
            emb_size,
            n_layers,
            n_heads,
            forward_expansion,
            dropout_p,
            max_length,
            device
        ):
        super().__init__()

        self.emb_size = emb_size
        # Kept so existing code reading `encoder.device` keeps working.
        self.device = device
        self.word_embedding = nn.Embedding(
            num_embeddings=src_vocab_size, embedding_dim=self.emb_size
        )
        self.position_embedding = nn.Embedding(
            num_embeddings=max_length, embedding_dim=self.emb_size
        )

        self.layers = nn.ModuleList(
            [
                TransformerBlock(emb_size, n_heads, dropout_p, forward_expansion)
                for _ in range(n_layers)
            ]
        )
        self.dropout = nn.Dropout(p=dropout_p)

    def forward(self, x, mask):
        """Encode a batch of token indices.

        Args:
            x: Integer tensor of shape [n_samples, seq_length].
            mask: Passed unchanged to every layer (may be `None`).

        Returns:
            Tensor of shape [n_samples, seq_length, emb_size].

        Raises:
            IndexError: If `seq_length` exceeds the position embedding table size.
        """
        n_samples, seq_length = x.shape

        # Fail early with a readable message instead of an out-of-range lookup
        # inside the embedding layer.
        max_length = self.position_embedding.num_embeddings
        if seq_length > max_length:
            raise IndexError(
                f"sequence length {seq_length} exceeds `max_length` {max_length}"
            )

        # Build the indices directly on the input's device: this avoids a
        # CPU -> device copy and stays correct if the module was moved after
        # construction (when `self.device` would be stale).
        positions = torch.arange(seq_length, device=x.device).expand(n_samples, seq_length)
        output = self.dropout(self.word_embedding(x) + self.position_embedding(positions))

        # Self-attention: every layer uses the running output as queries, keys and values.
        for layer in self.layers:
            output = layer(output, output, output, mask)

        return output


In [14]:
# Demo of building position indices: one row 0..seq_length-1, broadcast to
# `n_samples` identical rows.
n_samples = 10
seq_length = 5
torch.arange(seq_length).expand(n_samples, -1)

tensor([[0, 1, 2, 3, 4],
        [0, 1, 2, 3, 4],
        [0, 1, 2, 3, 4],
        [0, 1, 2, 3, 4],
        [0, 1, 2, 3, 4],
        [0, 1, 2, 3, 4],
        [0, 1, 2, 3, 4],
        [0, 1, 2, 3, 4],
        [0, 1, 2, 3, 4],
        [0, 1, 2, 3, 4]])

In [15]:
# Toy random tensor of shape (B, T, C), presumably batch / time / channels.
# Seeding first makes the drawn values reproducible.
torch.manual_seed(1337)
B = 4
T = 8
C = 2
x = torch.randn((B, T, C))
x.size()

torch.Size([4, 8, 2])

In [19]:
# Show the first (T, C) slice of the toy tensor.
x[0]

tensor([[ 0.1808, -0.0700],
        [-0.3596, -0.9152],
        [ 0.6258,  0.0255],
        [ 0.9545,  0.0643],
        [ 0.3612,  1.1679],
        [-1.3499, -0.5102],
        [ 0.2360, -0.2398],
        [-0.9211,  1.5433]])

In [20]:
# Naive running average: row t of `x_bow[b]` is the mean of rows 0..t of `x[b]`.
x_bow = torch.zeros(B, T, C)
for b in range(B):
    for t in range(T):
        x_prev = x[b, : t + 1]  # [t + 1, C]
        x_bow[b, t] = x_prev.mean(dim=0)

# Display the averages for the first slice.
x_bow[0]

tensor([[ 0.1808, -0.0700],
        [-0.0894, -0.4926],
        [ 0.1490, -0.3199],
        [ 0.3504, -0.2238],
        [ 0.3525,  0.0545],
        [ 0.0688, -0.0396],
        [ 0.0927, -0.0682],
        [-0.0341,  0.1332]])

In [26]:
# Vectorised running average for the first slice, checked against the loop result.
# Left-multiplying by a lower-triangular matrix of ones produces prefix sums:
# row t of `tril @ x[0]` is the sum of rows 0..t of `x[0]`.
# (`x[0] @ tril` would be [T, C] @ [T, T], which is a shape mismatch.)
prefix_sums = torch.tril(torch.ones(T, T)) @ x[0]  # [T, T] @ [T, C] -> [T, C]
# Row t holds t + 1 summed elements; a [T, 1] column broadcasts across the C columns.
counts = torch.arange(1, T + 1).unsqueeze(dim=1)
x_tmp = prefix_sums / counts
# The summation order differs from `torch.mean`, so allow for float32 round-off.
assert torch.allclose(x_tmp, x_bow[0], atol=1e-6)
