In [None]:
import math

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import dataset

import numpy as np
import matplotlib.pyplot as plt

In [None]:
class CausalSelfAttention(nn.Module):
  """Multi-head scaled dot-product attention restricted by a causal mask.

  Args:
    d_k: width of each head's query/key/value projection.
    d_model: width of the incoming and outgoing vectors.
    n_heads: number of attention heads.
    max_len: side length of the precomputed lower-triangular mask.
  """

  def __init__(self,d_k,d_model,n_heads,max_len):
    super().__init__()

    self.d_k = d_k
    self.n_heads = n_heads

    # One Linear per role covers all heads at once; forward() splits them apart.
    proj_width = d_k * n_heads
    self.key = nn.Linear(d_model, proj_width)
    self.query = nn.Linear(d_model, proj_width)
    self.value = nn.Linear(d_model, proj_width)

    # Maps the concatenated heads back to d_model.
    self.fc = nn.Linear(proj_width, d_model)

    # Ones on and below the diagonal, stored with shape (1, 1, max_len, max_len).
    lower_tri = torch.tril(torch.ones(max_len, max_len))
    self.register_buffer("causal_mask", lower_tri.unsqueeze(0).unsqueeze(0))

  def forward(self,q,k,v, pad_mask=None):
    """Attend from `q` over `k`/`v`, hiding future positions.

    Args:
      q, k, v: tensors whose first two dims are (batch, time); all three are
        reshaped using the batch and time sizes read from `q`.
      pad_mask: optional (batch, time) tensor; key positions where it equals 0
        are excluded from attention.

    Returns:
      Tensor of shape (batch, time, d_model).
    """
    batch_size, seq_len = q.shape[0], q.shape[1]

    def split_heads(projected):
      # (N, T, h * d_k) -> (N, h, T, d_k)
      return projected.view(
          batch_size, seq_len, self.n_heads, self.d_k).transpose(1, 2)

    q_heads = split_heads(self.query(q))
    k_heads = split_heads(self.key(k))
    v_heads = split_heads(self.value(v))

    # (N, h, T, T) similarity scores, scaled by sqrt(d_k).
    scores = torch.matmul(q_heads, k_heads.transpose(-2, -1)) / math.sqrt(self.d_k)
    if pad_mask is not None:
      key_is_pad = pad_mask[:, None, None, :] == 0
      scores = scores.masked_fill(key_is_pad, float('-inf'))
    is_future = self.causal_mask[:, :, :seq_len, :seq_len] == 0
    scores = scores.masked_fill(is_future, float('-inf'))
    weights = F.softmax(scores, dim=-1)

    # (N, h, T, d_k) -> (N, T, h * d_k)
    merged = (weights @ v_heads).transpose(1, 2).contiguous()
    merged = merged.view(batch_size, seq_len, self.d_k * self.n_heads)

    return self.fc(merged)

In [None]:
class TransformerBlock(nn.Module):
  """One decoder layer: causal self-attention followed by a feed-forward net.

  Each sub-layer's output is added to its input and the sum is passed through
  a LayerNorm; dropout is applied to the block's final output.

  Args:
    d_k: per-head projection width passed to the attention layer.
    d_model: width of the token vectors.
    n_heads: number of attention heads.
    max_len: causal-mask size passed to the attention layer.
    dropout_prob: dropout probability used inside the feed-forward net and on
      the block output.
  """

  def __init__(self, d_k, d_model, n_heads, max_len, dropout_prob=0.1):
    super().__init__()

    self.ln1 = nn.LayerNorm(d_model)
    self.ln2 = nn.LayerNorm(d_model)
    self.mha = CausalSelfAttention(d_k, d_model, n_heads, max_len)

    # Feed-forward net with a hidden layer four times as wide as d_model.
    hidden_width = 4 * d_model
    feed_forward = [
        nn.Linear(d_model, hidden_width),
        nn.GELU(),
        nn.Linear(hidden_width, d_model),
        nn.Dropout(dropout_prob),
    ]
    self.ann = nn.Sequential(*feed_forward)
    self.dropout = nn.Dropout(p=dropout_prob)

  def forward(self, x, pad_mask=None):
    """Apply the block to `x`; `pad_mask` is forwarded to the attention layer."""
    attended = self.mha(x, x, x, pad_mask)
    hidden = self.ln1(x + attended)
    hidden = self.ln2(hidden + self.ann(hidden))
    return self.dropout(hidden)

In [None]:
class PositionalEncoding(nn.Module):
  """Adds fixed sinusoidal position encodings to a sequence, then applies dropout.

  Even channels hold sin(pos * freq) and odd channels hold cos(pos * freq),
  with freq = 10000 ** (-2i / d_model) for channel pair i.

  Args:
    d_model: width of the vectors being encoded (odd widths are supported).
    max_len: number of positions to precompute.
    dropout_prob: dropout probability applied after the addition.
  """

  def __init__(self,d_model,max_len=2048, dropout_prob=0.1):
    super().__init__()
    self.dropout= nn.Dropout(dropout_prob)

    position= torch.arange(max_len).unsqueeze(1)
    exp_term =torch.arange(0, d_model, 2)
    div_term= torch.exp(exp_term * (-math.log(10000.0)/ d_model))
    angles = position * div_term  # (max_len, ceil(d_model / 2))

    pe =torch.zeros(1,max_len,d_model)
    pe[0,:,0::2]=torch.sin(angles)
    # Odd channels use cosine. When d_model is odd there is one fewer odd
    # channel than even channels, so drop the surplus frequency column.
    pe[0, :, 1::2]=torch.cos(angles[:, :d_model // 2])
    self.register_buffer('pe',pe)

  def forward(self, x):
    """Add the first `x.size(1)` position encodings to `x` and apply dropout.

    Args:
      x: tensor indexed as (batch, time, d_model).
    """
    x= x+self.pe[:, :x.size(1), :]
    return self.dropout(x)

In [None]:
class Decoder(nn.Module):
  """Decoder-only transformer that maps token ids to per-position vocabulary logits.

  Args:
    vocab_size: number of rows in the embedding and size of the output layer.
    max_len: maximum sequence length, passed to the positional encoding and
      to every block.
    d_k: per-head projection width.
    d_model: width of the token vectors.
    n_heads: number of attention heads per block.
    n_layers: number of stacked transformer blocks.
    dropout_prob: dropout probability passed to the sub-modules.
  """

  def __init__(self,
               vocab_size,
               max_len,
               d_k,
               d_model,
               n_heads,
               n_layers,
               dropout_prob):
    super().__init__()

    self.embedding = nn.Embedding(vocab_size, d_model)
    self.pos_encoding = PositionalEncoding(d_model, max_len, dropout_prob)

    layers = []
    for _ in range(n_layers):
      layers.append(
          TransformerBlock(d_k, d_model, n_heads, max_len, dropout_prob))
    # Used only as a container: forward() iterates over it so that each block
    # can also receive pad_mask.
    self.transformer_blocks = nn.Sequential(*layers)

    self.ln = nn.LayerNorm(d_model)
    self.fc = nn.Linear(d_model, vocab_size)

  def forward(self, x, pad_mask=None):
    """Return logits for token ids `x`; `pad_mask` is handed to every block."""
    hidden = self.pos_encoding(self.embedding(x))
    for layer in self.transformer_blocks:
      hidden = layer(hidden, pad_mask)
    return self.fc(self.ln(hidden))

In [None]:
# Small instance with a made-up vocabulary size, used to smoke-test the model.
model = Decoder(
    vocab_size=20000,
    max_len=1024,
    d_k=16,
    d_model=64,
    n_heads=4,
    n_layers=2,
    dropout_prob=0.1,
)

In [None]:
# Run on the first CUDA device when one is available, otherwise on the CPU.
if torch.cuda.is_available():
  device = torch.device("cuda:0")
else:
  device = torch.device("cpu")
print(device)
model.to(device)

cuda:0


Decoder(
  (embedding): Embedding(20000, 64)
  (pos_encoding): PositionalEncoding(
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer_blocks): Sequential(
    (0): TransformerBlock(
      (ln1): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
      (ln2): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
      (mha): CausalSelfAttention(
        (key): Linear(in_features=64, out_features=64, bias=True)
        (query): Linear(in_features=64, out_features=64, bias=True)
        (value): Linear(in_features=64, out_features=64, bias=True)
        (fc): Linear(in_features=64, out_features=64, bias=True)
      )
      (ann): Sequential(
        (0): Linear(in_features=64, out_features=256, bias=True)
        (1): GELU(approximate='none')
        (2): Linear(in_features=256, out_features=64, bias=True)
        (3): Dropout(p=0.1, inplace=False)
      )
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (1): TransformerBlock(
      (ln1): LayerNorm((64,), eps=1e-05,

In [None]:
# Random token ids in [0, 20000) with shape (8, 512), copied onto `device`.
x = np.random.randint(low=0, high=20_000, size=(8, 512))
x_t = torch.tensor(x, device=device)

In [None]:
# Forward pass without a padding mask; the last expression displays the output shape.
y=model(x_t)
y.shape

torch.Size([8, 512, 20000])

In [None]:
# (8, 512) mask: ones for the first 256 positions, zeros for the remaining 256.
mask = np.concatenate([np.ones((8, 256)), np.zeros((8, 512 - 256))], axis=1)
mask_t = torch.tensor(mask, device=device)

In [None]:
!pip install transformers datasets



In [None]:
from transformers import AutoTokenizer , DataCollatorWithPadding

In [None]:

# Name of the Hugging Face checkpoint whose tokenizer is loaded here.
checkpoint = 'distilbert-base-cased'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
from datasets import load_dataset

In [None]:
# Load the SST-2 configuration of the GLUE benchmark; only the sentences are
# used for language modelling, the labels are ignored.
raw_datasets = load_dataset("glue", "sst2")

In [None]:
# Display the available splits and their columns.
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 872
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1821
    })
})

In [None]:
def tokenize_fn(batch):
  """Tokenize the 'sentence' entries of `batch` with the notebook-level tokenizer.

  Truncation is enabled, so over-long sentences are cut to the tokenizer's limit.
  """
  sentences = batch['sentence']
  encoded = tokenizer(sentences, truncation=True)
  return encoded

In [None]:
# Tokenize every split in batches. tokenize_fn does not pad; padding is left to
# the collator, which is applied when batches are assembled.
tokenized_datasets = raw_datasets.map(tokenize_fn, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Map:   0%|          | 0/872 [00:00<?, ? examples/s]

In [None]:
# Display the splits again to see the columns added by tokenization.
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx', 'input_ids', 'attention_mask'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx', 'input_ids', 'attention_mask'],
        num_rows: 872
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx', 'input_ids', 'attention_mask'],
        num_rows: 1821
    })
})

In [None]:
# Keep only the tokenizer outputs. Only columns that are still present are
# removed, so re-running this cell is a no-op instead of an error.
unused_columns = ["sentence", "idx", "label"]
tokenized_datasets = tokenized_datasets.remove_columns(
    [c for c in unused_columns
     if c in tokenized_datasets["train"].column_names])

In [None]:
from torch.utils.data import DataLoader

# Shuffled training batches of 32 examples, assembled by the padding collator.
train_loader = DataLoader(
    dataset=tokenized_datasets["train"],
    batch_size=32,
    shuffle=True,
    collate_fn=data_collator,
)

In [None]:
# check how it works: pull a single batch and print the shape of each field
for batch in train_loader:
  for k, v in batch.items():
    print("k:", k, "v.shape:", v.shape)
  break  # one batch is enough for the check

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


k: input_ids v.shape: torch.Size([32, 51])
k: attention_mask v.shape: torch.Size([32, 51])


In [None]:
# Id the tokenizer uses for padding; used below as the loss's ignore_index.
tokenizer.pad_token_id

0

In [None]:
# Size the model from the tokenizer. `model_max_length` is the tokenizer's own
# length limit.
# NOTE(review): the previous `tokenizer.max_model_input_sizes[checkpoint]`
# lookup relies on a legacy per-checkpoint table that newer transformers
# releases deprecate; confirm against the installed version. For tokenizers
# with no configured limit `model_max_length` can be a very large sentinel
# value, so check it before reusing this cell with another checkpoint.
model = Decoder(
    vocab_size=tokenizer.vocab_size,
    max_len=tokenizer.model_max_length,
    d_k=16,
    d_model=64,
    n_heads=4,
    n_layers=2,
    dropout_prob=0.1,
)
model.to(device)

Decoder(
  (embedding): Embedding(28996, 64)
  (pos_encoding): PositionalEncoding(
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer_blocks): Sequential(
    (0): TransformerBlock(
      (ln1): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
      (ln2): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
      (mha): CausalSelfAttention(
        (key): Linear(in_features=64, out_features=64, bias=True)
        (query): Linear(in_features=64, out_features=64, bias=True)
        (value): Linear(in_features=64, out_features=64, bias=True)
        (fc): Linear(in_features=64, out_features=64, bias=True)
      )
      (ann): Sequential(
        (0): Linear(in_features=64, out_features=256, bias=True)
        (1): GELU(approximate='none')
        (2): Linear(in_features=256, out_features=64, bias=True)
        (3): Dropout(p=0.1, inplace=False)
      )
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (1): TransformerBlock(
      (ln1): LayerNorm((64,), eps=1e-05,

In [None]:
# Loss and optimizer
# Targets equal to the pad token id are ignored by the loss; Adam runs with its
# default hyper-parameters.
criterion = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)
optimizer = torch.optim.Adam(model.parameters())

In [None]:
from datetime import datetime

In [None]:
def train(model, criterion,optimizer,train_loader,epochs):
  """Train `model` for next-token prediction and return the mean loss per epoch.

  Every batch performs a full optimisation step (zero_grad, forward, backward,
  step). Targets are the input ids shifted one position to the left; the last
  position has no next token and is filled with the label the loss ignores.

  Args:
    model: callable as `model(input_ids, attention_mask)` returning logits of
      shape (batch, time, vocab).
    criterion: loss called as `criterion(logits_NVT, targets_NT)`. Its
      `ignore_index` attribute (default -100 when absent) marks the final
      target position.
    optimizer: optimizer over the model's parameters.
    train_loader: iterable of dicts of tensors containing 'input_ids' and
      'attention_mask'.
    epochs: number of passes over `train_loader`.

  Returns:
    np.ndarray of shape (epochs,) holding the mean batch loss of each epoch.
  """
  train_losses = np.zeros(epochs)

  # Send batches to wherever the model's parameters live, rather than relying
  # on a notebook-level `device` variable.
  first_param = next(model.parameters(), None)
  model_device = (
      first_param.device if first_param is not None else torch.device("cpu"))

  # Label for the last position of every sequence (nothing follows it).
  ignore_label = getattr(criterion, "ignore_index", -100)

  for it in range(epochs):
    model.train()
    t0 = datetime.now()
    batch_losses = []
    for batch in train_loader:
      batch = {k: v.to(model_device) for k, v in batch.items()}

      optimizer.zero_grad()

      # Shift left so position t is trained to predict token t + 1.
      targets = batch['input_ids'].clone().detach()
      targets = torch.roll(targets, shifts=-1, dims=1)
      targets[:, -1] = ignore_label

      outputs = model(batch['input_ids'], batch['attention_mask'])
      # CrossEntropyLoss expects the class dimension second: (N, V, T).
      loss = criterion(outputs.transpose(2, 1), targets)

      # These three steps must run once per batch, inside the batch loop.
      loss.backward()
      optimizer.step()
      batch_losses.append(loss.item())

    train_loss = np.mean(batch_losses)

    train_losses[it] = train_loss
    dt = datetime.now() - t0
    print(f'Epoch {it+1}/{epochs}, Train Loss: {train_loss:.4f}, Duration: {dt}')

  return train_losses

In [None]:
# Train for 15 epochs and keep the per-epoch mean losses.
train_losses = train(
    model=model,
    criterion=criterion,
    optimizer=optimizer,
    train_loader=train_loader,
    epochs=15,
)

Epoch 1/15, Train Loss: 4.4477, Duration: 0:00:22.234026
Epoch 2/15, Train Loss: 4.4461, Duration: 0:00:22.073419
Epoch 3/15, Train Loss: 4.3855, Duration: 0:00:22.385481
Epoch 4/15, Train Loss: 4.5376, Duration: 0:00:22.670897
Epoch 5/15, Train Loss: 4.7858, Duration: 0:00:24.048303
Epoch 6/15, Train Loss: 4.2954, Duration: 0:00:22.235688
Epoch 7/15, Train Loss: 4.4654, Duration: 0:00:26.513532
Epoch 8/15, Train Loss: 4.3499, Duration: 0:00:25.815533
Epoch 9/15, Train Loss: 4.4842, Duration: 0:00:27.207444
Epoch 10/15, Train Loss: 4.1911, Duration: 0:00:25.701373
Epoch 11/15, Train Loss: 4.5511, Duration: 0:00:22.059607
Epoch 12/15, Train Loss: 4.3957, Duration: 0:00:22.121972
Epoch 13/15, Train Loss: 4.2381, Duration: 0:00:22.721953
Epoch 14/15, Train Loss: 4.3993, Duration: 0:00:22.260906
Epoch 15/15, Train Loss: 4.6204, Duration: 0:00:22.349653


In [None]:
# Validation examples one at a time, in dataset order.
valid_loader = DataLoader(
    dataset=tokenized_datasets['validation'],
    collate_fn=data_collator,
    batch_size=1,
)

In [None]:
# Inference only: eval() switches off dropout and no_grad() avoids building an
# autograd graph. Only the first validation example is run.
model.eval()
with torch.no_grad():
  for batch in valid_loader:
    batch= {k: v.to(device) for k,v in batch.items()}
    outputs = model(batch['input_ids'],batch['attention_mask'])
    break

In [None]:
# Shape of the logits returned for the single validation example.
outputs.shape

torch.Size([1, 12, 28996])

In [None]:
# Highest-scoring vocabulary id at every position.
outputs.argmax(dim=-1)

tensor([[  170,   112,   188,   170,  2523,   117,   102, 20714,   102,   102,
           102,  1104]], device='cuda:0')

In [None]:
# Greedy prediction (highest-scoring id) for each position.
prediction_ids = outputs.argmax(dim=-1)

In [None]:
# Turn the predicted ids of the first (only) sequence back into text.
tokenizer.decode(prediction_ids[0])

"a's a movie, [SEP] awkwardly [SEP] [SEP] [SEP] of"

In [None]:
# Decode the input sentence itself, for comparison with the predictions.
tokenizer.decode(batch['input_ids'][0])

"[CLS] it's a charming and often affecting journey. [SEP]"

In [None]:
# Append the model's prediction at index 4 to the first five input tokens and
# decode the result, i.e. the prefix followed by the predicted next token.
tokenizer.decode(torch.concat((batch['input_ids'][0, :5], prediction_ids[:, 4])))

"[CLS] it's a movie"

In [None]:
# generate something
prompt = "it's"

# Tokenize the prompt as PyTorch tensors; the last expression displays the result.
tokenized_prompt = tokenizer(prompt, return_tensors='pt')
tokenized_prompt

{'input_ids': tensor([[ 101, 1122,  112,  188,  102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1]])}

In [None]:
# Score the prompt with its final token sliced off ([:, :-1]) so the model
# predicts what follows the prompt text. no_grad(): this is inference only.
with torch.no_grad():
  outputs = model(
      tokenized_prompt['input_ids'][:, :-1].to(device),
      tokenized_prompt['attention_mask'][:, :-1].to(device))

outputs.shape

torch.Size([1, 4, 28996])

In [None]:
# Greedy choice for the token that follows the prompt (logits of the last position).
prediction_ids = outputs[:, -1, :].argmax(dim=-1)

In [None]:
# Decode the predicted next token.
tokenizer.decode(prediction_ids[0])

'a'

In [None]:
# generate something
prompt = "it's a"

tokenized_prompt = tokenizer(prompt, return_tensors='pt')

# prepare inputs + get rid of SEP token at the end
input_ids = tokenized_prompt['input_ids'][:, :-1].to(device)
mask = tokenized_prompt['attention_mask'][:, :-1].to(device)

# Greedy decoding of up to 20 new tokens for a single sequence. eval() keeps
# dropout off even if this cell is run right after training, and no_grad()
# avoids building an autograd graph on every step.
model.eval()
with torch.no_grad():
  for _ in range(20):
    outputs = model(input_ids, mask)
    prediction_id = torch.argmax(outputs[:, -1, :], axis=-1)

    # Append the new token and extend the mask to cover it.
    input_ids = torch.hstack((input_ids, prediction_id.view(1, 1)))
    mask = torch.ones_like(input_ids)

    # Stop once the model emits the separator token.
    if prediction_id.item() == tokenizer.sep_token_id:
      break

In [None]:
# Decode the prompt together with the generated continuation.
tokenizer.decode(input_ids[0])

"[CLS] it's a movie that is a movie that is a movie that is a movie that is a movie that is a"