<a href="https://colab.research.google.com/github/lorrespz/Transformers-Language-Models--Pytorch-/blob/main/Transformers_Decoder_(GPT_like)_architecture.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Decoder architecture (GPT-like model) from scratch

This code is from Lazy Programmer's Transformers course

https://www.udemy.com/course/data-science-transformers-nlp/

Recall that GPT models are a family of language models whose main pretraining task is to predict a token using **only** the tokens that come before it. In short, the model estimates the conditional probability of the token $x_t$ given all earlier tokens:

$p(x_t \mid \ldots, x_{t-2}, x_{t-1})$

The decoder architecture is largely similar to the encoder architecture; the only difference is that the attention block is the causal version (which makes each position depend solely on the current and past tokens).

In [1]:
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import dataset
import numpy as np
import matplotlib.pyplot as plt

# Causal Attention Block

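The structure of the causal attention block is almost identical to that of the normal attention block used in the Encoder Language Model. The one exception is a 'causal attention mask' applied to the attention scores before the softmax: the mask is 1 for the current and past positions and 0 for future positions, and the scores at the masked (future) positions are set to $-\infty$ so that their attention weights become 0 after the softmax.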

In [2]:
class CausalSelfAttention(nn.Module):
  """Multi-head self-attention where each position attends only to itself and earlier positions.

  Args:
    d_k: size of the query/key/value vectors of a single head (d_v = d_k is assumed).
    d_model: size of the input and output token vectors.
    n_heads: number of attention heads.
    max_len: side length of the precomputed causal mask, i.e. the longest
      sequence the mask can cover.
  """
  def __init__(self, d_k, d_model, n_heads, max_len):
    super().__init__()
    #Assume d_v = d_k (len(Q) = len(K) = d_k, len(V) = d_v)
    self.d_k = d_k
    self.n_heads = n_heads
    self.key = nn.Linear(d_model, d_k*n_heads)
    self.query = nn.Linear(d_model, d_k*n_heads)
    self.value = nn.Linear(d_model, d_k*n_heads)
    #final linear layer
    self.fc = nn.Linear(d_k*n_heads, d_model)
    #causal mask: a square matrix of size max_len x max_len
    #with the lower triangle (diagonal included) being all 1,
    #and the strict upper triangle being all 0
    cm = torch.tril(torch.ones(max_len, max_len))
    #registered as a buffer so that it follows the module across devices
    #and is stored in the state dict without being a trainable parameter
    self.register_buffer('causal_mask',
                         cm.view(1,1,max_len, max_len))

  def forward(self, q, k, v, pad_mask = None):
    """Compute causally-masked multi-head attention.

    Args:
      q, k, v: tensors of shape (N, T, d_model). All three are reshaped with
        the same N and T, so they must share batch size and sequence length.
      pad_mask: optional (N, T) tensor that is 0 at positions to be ignored
        as keys (padding) and non-zero elsewhere.

    Returns:
      Tensor of shape (N, T, d_model).
    """
    q = self.query(q)   # N x T x (hd_k)
    k = self.key(k)     # N x T x (hd_k)
    v = self.value(v)    # N x T x (hd_v)
    #h = n_heads
    # N = batch size
    N = q.shape[0]
    # T = sequence length
    T = q.shape[1]

    #change the shape to:
    # (N, T, h, d_k) --> (N, h, T, d_k)
    q = q.view(N, T, self.n_heads, self.d_k).transpose(1,2)
    k = k.view(N, T, self.n_heads, self.d_k).transpose(1,2)
    v = v.view(N, T, self.n_heads, self.d_k).transpose(1,2)

    #compute attention scores
    # q * k^T / sqrt(d_k)
    #(N,  h, T,  d_k) x (N, h, d_k, T) --> (N, h, T, T)
    #transposing the last 2 dimensions of k
    attn_scores = q @ k.transpose(-2, -1)/math.sqrt(self.d_k)

    #optional padding mask: (N, T) --> (N, 1, 1, T) so that it broadcasts
    #over heads and query positions; masked keys get a score of -inf,
    #which the softmax turns into a weight of 0
    if pad_mask is not None:
      attn_scores = attn_scores.masked_fill(pad_mask[:, None, None,:] == 0, float('-inf'))

    #HERE IS THE CAUSAL MASK !!!
    #It must be applied whether or not a padding mask is given, otherwise
    #a call without pad_mask would let every position see future tokens
    attn_scores = attn_scores.masked_fill(self.causal_mask[:, :, :T,:T] == 0, float('-inf'))
    attn_weights = F.softmax(attn_scores, dim = -1)

    #compute attention-weighted values
    #(N, h, T, T) x (N, h, T, d_k) --> (N, h, T, d_k)
    A = attn_weights @ v

    #reshape it back before the final linear layer
    A = A.transpose(1, 2) # (N, T, h, d_k)
    A = A.contiguous().view(N, T, self.d_k*self.n_heads) #(N, T, h*d_k)

    #final step is to project A with the Linear layer to
    #get the same shape as the input sequence
    return self.fc(A)


# Transformer Block

Almost the same as that used in the Encoder architecture, with the only difference being an additional input 'max_len'

In [3]:
class TransformerBlock(nn.Module):
  """One transformer layer: causal self-attention followed by a position-wise feed-forward net.

  Each sub-layer is wrapped as LayerNorm(x + sublayer(x)), and dropout is applied
  to the output of the block.

  Args:
    d_k: per-head query/key/value size passed to the attention module.
    d_model: size of the token vectors entering and leaving the block.
    n_heads: number of attention heads.
    max_len: maximum sequence length passed to the attention module.
    dropout_prob: dropout probability used inside the feed-forward net and on the output.
  """
  def __init__(self, d_k, d_model, n_heads, max_len, dropout_prob = 0.1):
    super().__init__()

    hidden_dim = 4*d_model
    # one layer norm per residual connection
    self.ln1 = nn.LayerNorm(d_model)
    self.ln2 = nn.LayerNorm(d_model)
    self.mha = CausalSelfAttention(d_k, d_model, n_heads, max_len)
    # feed-forward net: expand to 4*d_model, GELU, project back, dropout
    feed_forward = [
        nn.Linear(d_model, hidden_dim),
        nn.GELU(),
        nn.Linear(hidden_dim, d_model),
        nn.Dropout(dropout_prob),
    ]
    self.ann = nn.Sequential(*feed_forward)
    self.dropout = nn.Dropout(p = dropout_prob)

  def forward(self, x, pad_mask = None):
    """Apply the block to x, forwarding the optional pad_mask to the attention module."""
    # attention sub-layer: x serves as query, key and value at once
    attended = self.mha(x, x, x, pad_mask)
    x = self.ln1(x + attended)
    # feed-forward sub-layer with its own residual connection
    fed_forward = self.ann(x)
    x = self.ln2(x + fed_forward)
    return self.dropout(x)

# Positional Encoding Block

In [4]:
class PositionalEncoding(nn.Module):
  """Adds fixed sinusoidal positional encodings to a batch of embeddings, then applies dropout.

  Even feature indices hold sin(pos * 10000^(-2i/d_model)) and odd feature
  indices hold the matching cosine. Both even and odd d_model are supported.

  Args:
    d_model: size of each token vector.
    max_len: number of positions for which encodings are precomputed.
    dropout_prob: dropout probability applied after the encodings are added.
  """
  def __init__(self, d_model, max_len = 2048, dropout_prob = 0.1):
    super().__init__()
    self.dropout = nn.Dropout(p = dropout_prob)
    #unsqueeze(1) inserts a dim of size 1 at index 1,
    #so that we have a 2d array of size (max_len, 1)
    #position is the 'pos' variable in the formula
    position = torch.arange(max_len).unsqueeze(1)
    #exp_term is the '2i' in the exponent of the denominator in the formula
    exp_term = torch.arange(0, d_model, 2)
    #this is just the term 10000^(-2i/d_model)
    div_term = torch.exp(exp_term*(-math.log(10000.0)/d_model))
    #PE term
    pe = torch.zeros(1, max_len, d_model)
    #0::2 means 0, 2, 4, 6, ... indexing
    pe[0, :, 0::2] = torch.sin(position*div_term)
    #1::2 means 1, 3, 5, 7, ... indexing
    #when d_model is odd there is one fewer odd index than even index,
    #so only the first d_model//2 frequencies are used for the cosines
    pe[0, :, 1::2] = torch.cos(position*div_term[:d_model//2])
    #register_buffer allows for saving and loading the model correctly
    self.register_buffer('pe', pe)

  def forward(self, x):
    """Add the encodings of the first T positions to x of shape (N, T, d_model)."""
    #x shape: NxTxD (D: d_model)
    x  = x + self.pe[:,:x.size(1), :]
    return self.dropout(x)

# Decoder Block

In [5]:
class Decoder(nn.Module):
  """GPT-like language model: embedding, positional encoding, a stack of
  transformer blocks, a final layer norm and a projection onto the vocabulary.

  Args:
    vocab_size: number of distinct token ids.
    max_len: maximum sequence length passed to the positional encoding and the blocks.
    d_k: per-head query/key/value size.
    d_model: size of the token vectors.
    n_heads: number of attention heads per block.
    n_layers: number of transformer blocks.
    dropout_prob: dropout probability used throughout the model.
  """
  def __init__(self, vocab_size, max_len, d_k, d_model, n_heads, n_layers, dropout_prob):
    super().__init__()

    self.embedding = nn.Embedding(vocab_size, d_model)
    self.pos_encoding = PositionalEncoding(d_model, max_len, dropout_prob)
    # build the stack one block at a time
    blocks = []
    for _ in range(n_layers):
      blocks.append(TransformerBlock(d_k, d_model, n_heads, max_len, dropout_prob))
    self.transformer_blocks = nn.Sequential(*blocks)
    self.ln = nn.LayerNorm(d_model)
    self.fc = nn.Linear(d_model, vocab_size)

  def forward(self, x, pad_mask = None):
    """Map token ids x to one row of vocabulary logits per position."""
    hidden = self.pos_encoding(self.embedding(x))
    # the blocks are called one by one because each also needs the padding mask
    for layer in self.transformer_blocks:
      hidden = layer(hidden, pad_mask)
    # many-to-many task: every position gets its own prediction
    return self.fc(self.ln(hidden))

# Test the decoder with a random input

In [6]:
# Small throwaway model used only for the shape check below
model = Decoder(vocab_size = 20000, max_len = 1024, d_k = 16, d_model = 64,
                n_heads = 4, n_layers = 2, dropout_prob = 0.1)

In [7]:
# Use the first GPU when CUDA is available, otherwise stay on the CPU
device = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')
print(device)
model.to(device)

cuda:0


Decoder(
  (embedding): Embedding(20000, 64)
  (pos_encoding): PositionalEncoding(
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer_blocks): Sequential(
    (0): TransformerBlock(
      (ln1): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
      (ln2): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
      (mha): CausalSelfAttention(
        (key): Linear(in_features=64, out_features=64, bias=True)
        (query): Linear(in_features=64, out_features=64, bias=True)
        (value): Linear(in_features=64, out_features=64, bias=True)
        (fc): Linear(in_features=64, out_features=64, bias=True)
      )
      (ann): Sequential(
        (0): Linear(in_features=64, out_features=256, bias=True)
        (1): GELU(approximate='none')
        (2): Linear(in_features=256, out_features=64, bias=True)
        (3): Dropout(p=0.1, inplace=False)
      )
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (1): TransformerBlock(
      (ln1): LayerNorm((64,), eps=1e-05,

In [8]:
# Random token ids in [0, 20000) standing in for a batch of shape (N, T) = (8, 512)
x = np.random.randint(low = 0, high = 20000, size = (8, 512))
x_t = torch.tensor(x).to(device)
x_t

tensor([[ 6043, 10930,   177,  ...,   681, 10249,  8482],
        [ 4863,  1640, 16227,  ...,  2034,  7062, 13644],
        [11400,  7823,  1907,  ..., 18434,   898, 11997],
        ...,
        [ 5067, 19949,  6301,  ..., 17548, 17588,   612],
        [16764,  4361, 17452,  ..., 13244,  6946,   113],
        [14127, 15075, 19763,  ...,  6535,  6059, 17018]], device='cuda:0')

In [9]:
# Mask for the input above: the first 256 positions are kept (1), the last 256 are masked out (0)
mask = np.concatenate([np.ones((8, 256)), np.zeros((8, 256))], axis = 1)
mask_t = torch.tensor(mask).to(device)
mask_t

tensor([[1., 1., 1.,  ..., 0., 0., 0.],
        [1., 1., 1.,  ..., 0., 0., 0.],
        [1., 1., 1.,  ..., 0., 0., 0.],
        ...,
        [1., 1., 1.,  ..., 0., 0., 0.],
        [1., 1., 1.,  ..., 0., 0., 0.],
        [1., 1., 1.,  ..., 0., 0., 0.]], device='cuda:0', dtype=torch.float64)

In [10]:
# Pass the input and its mask through the decoder
y = model(x_t, pad_mask = mask_t)
# One logit per vocabulary entry at every position: (8, 512, vocab_size = 20000)
y.shape

torch.Size([8, 512, 20000])

# Use Decoder on a real dataset (train from scratch)

In [15]:
! pip install datasets

Collecting datasets
  Downloading datasets-2.18.0-py3-none-any.whl (510 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m14.0 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: dill, multiprocess, datasets
Successfully installed datasets-2.18.0 dill-0.3.8 multiprocess-0.70.16


In [11]:
from transformers import AutoTokenizer, DataCollatorWithPadding

In [12]:
checkpoint = 'distilbert-base-cased'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/465 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

In [16]:
from datasets import load_dataset
#sst2 is a dataset for sentiment analysis (part of the glue benchmark)
raw_datasets = load_dataset('glue', 'sst2')
raw_datasets

Downloading readme:   0%|          | 0.00/35.3k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/3.11M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/72.8k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/148k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/67349 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/872 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1821 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 872
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1821
    })
})

## Tokenizer

In [14]:
# Tokenization function meant to be used with `Dataset.map(..., batched = True)`
def tokenizer_fn(batch):
  """Tokenize the 'sentence' column of a batch using the notebook-level `tokenizer`, with truncation enabled."""
  sentences = batch['sentence']
  return tokenizer(sentences, truncation = True)

In [17]:
tokenized_datasets = raw_datasets.map(tokenizer_fn, batched = True)
data_collator = DataCollatorWithPadding(tokenizer = tokenizer)
data_collator

Map:   0%|          | 0/67349 [00:00<?, ? examples/s]

Map:   0%|          | 0/872 [00:00<?, ? examples/s]

Map:   0%|          | 0/1821 [00:00<?, ? examples/s]

DataCollatorWithPadding(tokenizer=DistilBertTokenizerFast(name_or_path='distilbert-base-cased', vocab_size=28996, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}, padding=True, max_length=None, pad_to_multiple_of=None, return_tensors='pt')

In [18]:
#Notice that the tokenized dataset now have 2 additional columns: 'input_ids' and 'attention_mask'
#compared to the 'untokenized' one
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx', 'input_ids', 'attention_mask'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx', 'input_ids', 'attention_mask'],
        num_rows: 872
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx', 'input_ids', 'attention_mask'],
        num_rows: 1821
    })
})

In [19]:
# Remove the 'sentence', 'label' and 'idx' columns in tokenized_datasets
tokenized_datasets = tokenized_datasets.remove_columns(['sentence', 'label', 'idx'])
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 872
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 1821
    })
})

## Data Loader

In [20]:
from torch.utils.data import DataLoader

# Shuffled mini-batches of 32 examples, assembled by the padding collator
train_loader = DataLoader(
    dataset = tokenized_datasets['train'],
    batch_size = 32,
    shuffle = True,
    collate_fn = data_collator,
)

# No validation loader is needed for training; one is created later in the evaluation section

In [21]:
# check how it works
for batch in train_loader:
  for k, v in batch.items():
    print('k:', k, 'v shape:', v.shape)
  break

k: input_ids v shape: torch.Size([32, 36])
k: attention_mask v shape: torch.Size([32, 36])


In [23]:
# Figure out what the pad token is
tokenizer.pad_token_id

0

## Define model, loss, optimizer

In [25]:
# The maximum length is read from the tokenizer's own `model_max_length` (512 for this
# checkpoint) instead of the deprecated `max_model_input_sizes` lookup table
model = Decoder(vocab_size = tokenizer.vocab_size,
                max_len = tokenizer.model_max_length,
                d_k = 16,
                d_model = 64,
                n_heads = 4,
                n_layers = 2,
                dropout_prob = 0.1)
model.to(device)

Decoder(
  (embedding): Embedding(28996, 64)
  (pos_encoding): PositionalEncoding(
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer_blocks): Sequential(
    (0): TransformerBlock(
      (ln1): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
      (ln2): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
      (mha): CausalSelfAttention(
        (key): Linear(in_features=64, out_features=64, bias=True)
        (query): Linear(in_features=64, out_features=64, bias=True)
        (value): Linear(in_features=64, out_features=64, bias=True)
        (fc): Linear(in_features=64, out_features=64, bias=True)
      )
      (ann): Sequential(
        (0): Linear(in_features=64, out_features=256, bias=True)
        (1): GELU(approximate='none')
        (2): Linear(in_features=256, out_features=64, bias=True)
        (3): Dropout(p=0.1, inplace=False)
      )
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (1): TransformerBlock(
      (ln1): LayerNorm((64,), eps=1e-05,

In [26]:
# Loss: cross-entropy that skips targets equal to the pad token id
criterion = nn.CrossEntropyLoss(ignore_index = tokenizer.pad_token_id)
# Optimizer: Adam with its default hyper-parameters
optimizer = torch.optim.Adam(params = model.parameters())

## Model training

In [28]:
#A function to run the training loop
from datetime import datetime
def train(model, criterion, optimizer, train_loader, epochs):
  """Train `model` for next-token prediction and return the mean loss of every epoch.

  Relies on the notebook-level names `device` (where batches are moved) and
  `tokenizer` (for its pad token id).

  Args:
    model: module called as model(input_ids, attention_mask), returning (N, T, V) logits.
    criterion: loss called as criterion(logits of shape (N, V, T), targets of shape (N, T)).
    optimizer: optimizer holding the parameters of `model`.
    train_loader: iterable of dicts with 'input_ids' and 'attention_mask' tensors.
    epochs: number of passes over `train_loader`.

  Returns:
    numpy array of length `epochs` with the average batch loss of each epoch.
  """
  #preallocate one slot per epoch so that it can be filled by index below
  #(indexing into an empty list would raise an IndexError)
  train_losses = np.zeros(epochs)

  for it in range(epochs):
    model.train()
    t0 = datetime.now()
    train_loss = []
    for batch in train_loader:
      batch = {k:v.to(device) for k,v in batch.items()}
      #zero the parameter grads
      optimizer.zero_grad()

      #Define the train targets as the shifted version of the inputs
      #clone the input
      targets = batch['input_ids'].clone().detach()
      #shift the target to the left so that position t holds token t+1
      targets = torch.roll(targets, shifts = -1, dims = 1)
      #the roll wraps the first token around to the end, which is not a
      #valid target, so the final target is overwritten with the pad token
      targets[:,-1] = tokenizer.pad_token_id

      #Forward pass
      #outputs are (N, T, V) but pytorch expects (N, V, T)
      outputs = model(batch['input_ids'], batch['attention_mask'])
      loss = criterion(outputs.transpose(2,1), targets)
      #backward and optimize
      loss.backward()
      optimizer.step()
      train_loss.append(loss.item())

    #get average train loss across batches
    train_loss = np.mean(train_loss)
    #Save losses
    train_losses[it] = train_loss
    dt = datetime.now() - t0
    print(f'Epoch {it+1}/{epochs}, Train loss: {train_loss:.4f}, Duration: {dt}')

  return train_losses

In [30]:
#train_losses = train(model, criterion, optimizer, train_loader, epochs = 15)

## Evaluate the model on the validation dataset

In [None]:
# Validation batches are not shuffled, so they are visited in dataset order
valid_loader = DataLoader(
    dataset = tokenized_datasets['validation'],
    batch_size = 32,
    collate_fn = data_collator,
)

In [None]:
# To be continued.