<a href="https://colab.research.google.com/github/lorrespz/Transformers-Language-Models--Pytorch-/blob/main/Transformers_Encoder_(Bert_like_architecture).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Encoder architecture (Bert-like model) from scratch

This code is from Lazy Programmer's Transformers course

https://www.udemy.com/course/data-science-transformers-nlp/

Recall that Bert models are a family of masked language models in which the main pretraining task is performed by predicting a masked token using both the tokens that come before and after it. In short, the prediction is for the probability $p_t$

$\langle p_t\,| \ldots, p_{t-2}, p_{t-1}, p_{t+1}, p_{t+2}, \ldots\rangle$

Encoder models are therefore bidirectional by nature.

In [1]:
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import dataset
import numpy as np
import matplotlib.pyplot as plt

# Multihead Attention Block

Recall the formula:

   Attention($Q, K, D$) = softmax$\left(\dfrac{QK^T}{\sqrt{d_k}}\right)V$

  where:

   $ Q = W^Q Q_{input}$

   $ K = W^K K_{input}$

   $V = W^V V_{input}$


In [2]:
class MultiHeadAttention(nn.Module):
  def __init__(self, d_k, d_model, n_heads):
    super().__init__()
    #Assume d_v = d_k (len(Q) = len(K) = d_k, len(V) = d_v)
    self.d_k = d_k
    self.n_heads = n_heads
    self.key = nn.Linear(d_model, d_k*n_heads)
    self.query = nn.Linear(d_model, d_k*n_heads)
    self.value = nn.Linear(d_model, d_k*n_heads)
    #final linear layer
    self.fc = nn.Linear(d_k*n_heads, d_model)

  def forward(self, q, k, v, mask = None):
    q = self.query(q)   # N x T x (hd_k)
    k = self.key(k)     # N x T x (hd_k)
    v = self.value(v)    # N x T x (hd_v)
    #h = n_heads
    # N = batch size
    N = q.shape[0]
    # T = sequence length
    T = q.shape[1]

    #change the shape to:
    # (N, T, h, d_k) --> (N, h, T, d_k)
    q = q.view(N, T, self.n_heads, self.d_k).transpose(1,2)
    k = k.view(N, T, self.n_heads, self.d_k).transpose(1,2)
    v = v.view(N, T, self.n_heads, self.d_k).transpose(1,2)

    #compute attention weights
    # q * k^T
    #(N,  h, T,  d_k) x (N, h, d_k, T) --> (N, h, T, T)
    #transposing the last 2 dimensions of k
    attn_scores = q @ k.transpose(-2, -1)/math.sqrt(self.d_k)
    #apply the mask, which is a tensor of size (N,T) of values 0, 1
    #for each of the N samples, need to know which of the T tokens is important
    #Change from 2D to 4D by adding None, which introduces superfluous dim of size 1
    # (N, T) --> (N, 1, 1, T)
    if mask is not None:
      #mask_fill(arg1, arg2): if arg1 = True, apply arg2
      #softmax(-inf) = 0
       attn_scores = attn_scores.masked_fill(mask[:, None, None,:] == 0, float('-inf'))
    attn_weights = F.softmax(attn_scores, dim = -1)

    #compute attention-weighted values
    #(N, h, T, T) x (N, h, T, d_k) --> (N, h, T, d_k)
    A = attn_weights @ v

    #reshape it back before the final linear layer
    A = A.transpose(1, 2) # (N, T, h, d_k)
    A = A.contiguous().view(N, T, self.d_k*self.n_heads) #(N, T, h*d_k)

    #final step is to project A with the Linear layer to
    #get the same shape as the input sequence
    return self.fc(A)


# Transformer Block

In [3]:
class TransformerBlock(nn.Module):
  def __init__(self, d_k, d_model, n_heads, dropout_prob = 0.1):
    super().__init__()

    self.ln1 = nn.LayerNorm(d_model)
    self.ln2 = nn.LayerNorm(d_model)
    self.mha = MultiHeadAttention(d_k, d_model, n_heads)
    self.ann = nn.Sequential(
        nn.Linear(d_model, d_model*4),
        nn.GELU(),
        nn.Linear(d_model*4, d_model),
        nn.Dropout(dropout_prob)
        )
    self.dropout = nn.Dropout(p = dropout_prob)

  def forward(self, x, mask = None):
    #x is an input sequence of size (NxTXD)
    # mask is of size (NxT)
    #FIRST LAYER NORM:
    #pass x in as the query, key, value into the multihead attention block
    #then add the output to the residual 'x' to be passed in the 1st layer norm
    x = self.ln1(x+ self.mha(x,x,x,mask))
    # SECOND LAYER NORM: ann + x
    x = self.ln2(x + self.ann(x))
    x = self.dropout(x)
    return(x)

# Positional Encoding Block


$PE_{(pos, 2i)} = \sin(pos/10000^{2i/d_{model}})$

$PE_{(pos, 2i+1)} = \cos(pos/10000^{2i/d_{model}})$

In [4]:
class PositionalEncoding(nn.Module):
  def __init__(self, d_model, max_len = 2048, dropout_prob = 0.1):
    super().__init__()
    self.dropout = nn.Dropout(p = dropout_prob)
    #unsqueeze(1) adds a superfluous dim of size 1 at the end
    #so that we have a 2d array of size (max_len, 1)
    #position is pos variable in the formula
    position = torch.arange(max_len).unsqueeze(1)
    #exp_term is the '2i' in the exponent of the denominator in the formula
    exp_term = torch.arange(0, d_model, 2)
    #this is just the term 10000^(-2i/d_model)
    div_term = torch.exp(exp_term*(-math.log(10000.0)/d_model))
    #PE term
    pe = torch.zeros(1, max_len, d_model)
    #0::2 means 2, 4, 6, 8, ... indexing
    pe[0, :, 0::2] = torch.sin(position*div_term)
    #1::2 means 1,3,5, 7, ... indexing
    pe[0, :, 1::2] = torch.cos(position*div_term)
    #register_buffer allows for saving and loading the model correctly
    self.register_buffer('pe', pe)

  def forward(self, x):
    #x shape: NxTxD (D: d_model)
    x  = x + self.pe[:,:x.size(1), :]
    return self.dropout(x)

# Transformer Encoder

In [5]:
#the encoder built here is for sentiment analysis, so we have 'n_classes' as one of the inputs
class Encoder(nn.Module):
  def __init__(self, vocab_size,
               max_len, d_k, d_model, n_heads, n_layers, n_classes, dropout_prob):
    super().__init__()

    self.embedding = nn.Embedding(vocab_size, d_model)
    self.pos_encoding = PositionalEncoding(d_model, max_len, dropout_prob)
    transformers_blocks = [TransformerBlock(d_k, d_model, n_heads, dropout_prob) for _ in range(n_layers)]
    self.transformer_blocks = nn.Sequential(*transformers_blocks)
    self.ln = nn.LayerNorm(d_model)
    self.fc = nn.Linear(d_model, n_classes)

  def forward(self, x, mask = None):
    x = self.embedding(x)
    x = self.pos_encoding(x)
    for block in self.transformer_blocks:
      x = block(x, mask)

    #depends on the kind of tasks that we need, here:
    #many-to-one (x has the shape N x T x D)
    x = x[:, 0, :]
    x = self.ln(x)
    x = self.fc(x)

    return x

# Test the encoder

In [6]:
#vocab_size=20k, max_len=1024, d_k=16, d_model=64, n_heads=4, n_layers=2, n_classes=5, dropout_prob=0.1
model = Encoder(20000, 1024, 16, 64, 4, 2, 5, 0.1)

In [7]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(device)
model.to(device)

cuda:0


Encoder(
  (embedding): Embedding(20000, 64)
  (pos_encoding): PositionalEncoding(
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer_blocks): Sequential(
    (0): TransformerBlock(
      (ln1): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
      (ln2): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
      (mha): MultiHeadAttention(
        (key): Linear(in_features=64, out_features=64, bias=True)
        (query): Linear(in_features=64, out_features=64, bias=True)
        (value): Linear(in_features=64, out_features=64, bias=True)
        (fc): Linear(in_features=64, out_features=64, bias=True)
      )
      (ann): Sequential(
        (0): Linear(in_features=64, out_features=256, bias=True)
        (1): GELU(approximate='none')
        (2): Linear(in_features=256, out_features=64, bias=True)
        (3): Dropout(p=0.1, inplace=False)
      )
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (1): TransformerBlock(
      (ln1): LayerNorm((64,), eps=1e-05, 

In [8]:
# Create some random sequence of integers representing a sequence of tokens of size (N,T) =(8, 512)
x = np.random.randint(0, 20000, size = (8,512))
x_t = torch.tensor(x).to(device)
x_t

tensor([[12905, 17035, 14550,  ..., 15434,  2213, 19815],
        [16681,  7211, 12928,  ..., 19062, 18557,   923],
        [15995,  6729, 12049,  ...,  1502, 10122,  6539],
        ...,
        [16442,   539,  1048,  ..., 14176,  2276, 13180],
        [ 8661,  7149, 19561,  ..., 12264,   441,  8645],
        [12294,  1520,   717,  ..., 17228, 10852, 11991]], device='cuda:0')

In [9]:
#Create the corresponding mask for the input above
mask = np.ones((8,512))
mask[:, 256:] = 0
mask_t = torch.tensor(mask).to(device)
mask_t

tensor([[1., 1., 1.,  ..., 0., 0., 0.],
        [1., 1., 1.,  ..., 0., 0., 0.],
        [1., 1., 1.,  ..., 0., 0., 0.],
        ...,
        [1., 1., 1.,  ..., 0., 0., 0.],
        [1., 1., 1.,  ..., 0., 0., 0.],
        [1., 1., 1.,  ..., 0., 0., 0.]], device='cuda:0', dtype=torch.float64)

In [10]:
# Pass the input and mask thru the encoder
y = model(x_t, mask_t)
#The shape should be $[N=8, n_classes = 5]
y.shape

torch.Size([8, 5])

# Use the Encoder on a real dataset (train from scratch)

In [12]:
#!pip install transformers datasets

In [13]:
from transformers import AutoTokenizer, DataCollatorWithPadding

In [14]:
checkpoint = 'distilbert-base-cased'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

tokenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/465 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

In [15]:
from datasets import load_dataset
#sst2 is a dataset for sentiment analysis (part of the glue benchmark)
raw_datasets = load_dataset('glue', 'sst2')
raw_datasets

Downloading readme:   0%|          | 0.00/35.3k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/3.11M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/72.8k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/148k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/67349 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/872 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1821 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 872
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1821
    })
})

## Tokenize the dataset

In [16]:
#define a tokenizer function which applies the tokenizer to the 'sentence' column of a batch of data
def tokenizer_fn(batch):
  return tokenizer(batch['sentence'], truncation = True)

In [17]:
tokenized_datasets = raw_datasets.map(tokenizer_fn, batched = True)
data_collator = DataCollatorWithPadding(tokenizer = tokenizer)
data_collator

Map:   0%|          | 0/67349 [00:00<?, ? examples/s]

Map:   0%|          | 0/872 [00:00<?, ? examples/s]

Map:   0%|          | 0/1821 [00:00<?, ? examples/s]

DataCollatorWithPadding(tokenizer=DistilBertTokenizerFast(name_or_path='distilbert-base-cased', vocab_size=28996, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}, padding=True, max_length=None, pad_to_multiple_of=None, return_tensors='pt')

In [19]:
#Notice that the tokenized dataset now have 2 additional columns: 'input_ids' and 'attention_mask'
#compared to the 'untokenized' one
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx', 'input_ids', 'attention_mask'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx', 'input_ids', 'attention_mask'],
        num_rows: 872
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx', 'input_ids', 'attention_mask'],
        num_rows: 1821
    })
})

In [20]:
# Remove the 'sentence' and 'idx' columns in tokenized_datasets
tokenized_datasets = tokenized_datasets.remove_columns(['sentence', 'idx'])
tokenized_datasets = tokenized_datasets.rename_column('label', 'labels')

In [21]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 872
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 1821
    })
})

## Data Loader

In [23]:
from torch.utils.data import DataLoader

train_loader = DataLoader(tokenized_datasets['train'],
                          shuffle = True,
                          batch_size = 32,
                          collate_fn = data_collator)

valid_loader = DataLoader(tokenized_datasets['validation'],
                          batch_size = 32,
                          collate_fn = data_collator)


In [25]:
# check how it works
for batch in train_loader:
  for k, v in batch.items():
    print('k:', k, 'v shape:', v.shape)
  break

k: labels v shape: torch.Size([32])
k: input_ids v shape: torch.Size([32, 39])
k: attention_mask v shape: torch.Size([32, 39])


In [26]:
#Check the number of distinct labels (number of classes)
set(tokenized_datasets['train']['labels'])

{0, 1}

In [29]:
# Check the vocab size, max sequence length
tokenizer.vocab_size, tokenizer.max_model_input_sizes, tokenizer.max_model_input_sizes[checkpoint]

(28996,
 {'distilbert-base-uncased': 512,
  'distilbert-base-uncased-distilled-squad': 512,
  'distilbert-base-cased': 512,
  'distilbert-base-cased-distilled-squad': 512,
  'distilbert-base-german-cased': 512,
  'distilbert-base-multilingual-cased': 512},
 512)

## Build the encoder

In [30]:
model = Encoder(vocab_size = tokenizer.vocab_size,
                max_len = tokenizer.max_model_input_sizes[checkpoint],
                d_k = 16,
                d_model = 64,
                n_heads = 4,
                n_layers = 2,
                n_classes = 2,
                dropout_prob = 0.1,)
model.to(device)

Encoder(
  (embedding): Embedding(28996, 64)
  (pos_encoding): PositionalEncoding(
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer_blocks): Sequential(
    (0): TransformerBlock(
      (ln1): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
      (ln2): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
      (mha): MultiHeadAttention(
        (key): Linear(in_features=64, out_features=64, bias=True)
        (query): Linear(in_features=64, out_features=64, bias=True)
        (value): Linear(in_features=64, out_features=64, bias=True)
        (fc): Linear(in_features=64, out_features=64, bias=True)
      )
      (ann): Sequential(
        (0): Linear(in_features=64, out_features=256, bias=True)
        (1): GELU(approximate='none')
        (2): Linear(in_features=256, out_features=64, bias=True)
        (3): Dropout(p=0.1, inplace=False)
      )
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (1): TransformerBlock(
      (ln1): LayerNorm((64,), eps=1e-05, 

In [31]:
# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters())

In [37]:
#A function to perform the train loss
from datetime import datetime
def train(model, criterion, optimizer, train_loader, valid_loader, epochs):
  train_losses = np.zeros(epochs)
  test_losses = np.zeros(epochs)

  for it in range(epochs):
    model.train()
    t0 = datetime.now()
    train_loss = 0
    n_train = 0
    for batch in train_loader:
      batch = {k:v.to(device) for k,v in batch.items()}
      #zero the parameter grads
      optimizer.zero_grad()
      #Forward pass
      outputs = model(batch['input_ids'], batch['attention_mask'])
      loss = criterion(outputs, batch['labels'])
      #backward and optimize
      loss.backward()
      optimizer.step()

      train_loss +=loss.item()*batch['input_ids'].size(0)
      n_train +=batch['input_ids'].size(0)

    #get average train loss across batches
    train_loss = train_loss/n_train

    #Evaluation round
    model.eval()
    test_loss = 0
    n_test = 0
    for batch in valid_loader:
      batch = {k:v.to(device) for k, v in batch.items()}
      outputs = model(batch['input_ids'], batch['attention_mask'])
      loss = criterion(outputs, batch['labels'])
      test_loss +=loss.item()*batch['input_ids'].size(0)
      n_test+=batch['input_ids'].size(0)

    test_loss = test_loss/n_test

    #Save losses
    train_losses[it] = train_loss
    test_losses[it] = test_loss

    dt = datetime.now() - t0
    print(f'Epoch {it+1}/{epochs}, Train loss: {train_loss:.4f}, Test loss: {test_loss:.4f}, Duration: {dt}')

  return train_losses, test_losses

In [38]:
train_losses, test_losses = train(model, criterion, optimizer,
                                  train_loader, valid_loader, epochs = 4)

Epoch 1/4, Train loss: 0.3642, Test loss: 0.5139, Duration: 0:00:25.204881
Epoch 2/4, Train loss: 0.2951, Test loss: 0.5089, Duration: 0:00:32.813766
Epoch 3/4, Train loss: 0.2555, Test loss: 0.5106, Duration: 0:00:27.680531
Epoch 4/4, Train loss: 0.2261, Test loss: 0.5479, Duration: 0:00:30.521084


## Calculate the train & test accuracy

In [39]:
model.eval()
#Train accuracy
n_correct = 0
n_total = 0
for batch in train_loader:
  batch = {k: v.to(device) for k, v in batch.items()}
  outputs = model(batch['input_ids'], batch['attention_mask'])
  #Get predictions
  _, predictions= torch.max(outputs, 1)
  #update counts
  n_correct +=(predictions == batch['labels']).sum().item()
  n_total +=batch['labels'].shape[0]

train_acc = n_correct/n_total

#Test accuracy
n_correct = 0
n_total = 0
for batch in valid_loader:
  batch = {k: v.to(device) for k, v in batch.items()}
  outputs = model(batch['input_ids'], batch['attention_mask'])
  #Get predictions
  _, predictions= torch.max(outputs, 1)
  #update counts
  n_correct +=(predictions == batch['labels']).sum().item()
  n_total +=batch['labels'].shape[0]

test_acc = n_correct/n_total
#print results
print(f'Train acc: {train_acc:.4f}, Test acc : {test_acc: .4f}')

Train acc: 0.9430, Test acc :  0.7936
