<a href="https://colab.research.google.com/github/m4vic/Transformer-101/blob/main/Encoder/encoder.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
# imports
import math, random
from datasets import load_dataset
import torch
from torch.utils.data import Dataset, DataLoader


# ***DataLoading***

In [3]:
raw  = load_dataset("ag_news")

train_raw = raw["train"]
test_raw = raw["test"]

train_split = train_raw.train_test_split(test_size=0.2, seed=42)
train_raw = train_split["train"]
val_raw = train_split["test"]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/18.6M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/1.23M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/120000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/7600 [00:00<?, ? examples/s]

In [4]:
# Print first few raw samples
for i in range(10):
    print(f"Text {i}: {train_raw[i]['text']}")
    print(f"Label {i}: {train_raw[i]['label']}")
    print("="*80)


Text 0: Nation #39;s Cotton Crop May Exceed Records This year #39;s cotton crop is on pace to be the largest in US history, although hurricanes that have battered the nation in the past few months may reduce the final amount.
Label 0: 2
Text 1: 18 years and still rollin #39; ALEX FERGUSON will take up the one-year rolling option on his contract and continue as Manchester United boss next season. Ferguson celebrates 18 years in charge at Old Trafford in the Manchester derby tomorrow.
Label 1: 1
Text 2: Madrid Masters: Safin beats Nalbandian Sunday #39;s final of the Madrid Masters pitted players ranked ninth, Marat Safin, and 10th, David Nalbandian, in the world; both near the top of their games.
Label 2: 1
Text 3: Sirius Satellite Signs Howard Stern to 5-Year Accord (Update9) Howard Stern, host of the top-rated radio show for young men in New York and Los Angeles, will move to Sirius Satellite Radio Inc.
Label 3: 2
Text 4: NATO, Russia To Meet Over Beslan School Siege 6 September 2004 

# **Tokenization**

In [None]:
# imorting r and collections
import re
from collections import Counter

In [None]:
# function for tokenization

def tokenize(text):
    text = text.lower()
    # match either words (\w+) OR single punctuation ([^\w\s])
    return re.findall(r"\w+|[^\w\s]", text)


In [None]:
print(tokenize("hi my name"))

['hi', 'my', 'name']


In [None]:
# defining the function counter
counter = Counter()

# counter
for ex in train_raw: # for string in dataset train sentence
  counter.update(tokenize(ex["text"])) #  update in counter tuple . by tokenizing all the sentences


In [None]:
# adding special tokens
vocab_size = 30000

most_common = counter.most_common(vocab_size-3) # reserve 3 tokens

itos = ['<PAD>','<UNK>','<CLS>'] + [w for w,_ in most_common]
stoi = {w:i for i,w in enumerate(itos)}

PAD_IDX = stoi['<PAD>']; UNK_IDX = stoi['<UNK>']; CLS_IDX = stoi['<CLS>']

def encode_text(text, max_len=256):
  tokens = tokenize(text)[:max_len-1]
  ids = [CLS_IDX] + [stoi.get(t, UNK_IDX) for t in tokens]
  return ids

In [None]:
print(tokenize(train_raw[0]["text"]))

['nation', '#', '39', ';', 's', 'cotton', 'crop', 'may', 'exceed', 'records', 'this', 'year', '#', '39', ';', 's', 'cotton', 'crop', 'is', 'on', 'pace', 'to', 'be', 'the', 'largest', 'in', 'us', 'history', ',', 'although', 'hurricanes', 'that', 'have', 'battered', 'the', 'nation', 'in', 'the', 'past', 'few', 'months', 'may', 'reduce', 'the', 'final', 'amount', '.']


In [None]:
class agnewsDataset(Dataset):
    def __init__(self, hf_dataset, max_len=256):

        self.examples = hf_dataset
        self.max_len = max_len

    def __len__(self): return len(self.examples) # return len of eg

    def __getitem__(self, idx): #returns the tensor of ids and labels
        text = self.examples[idx]['text'] # saves text
        label = int(self.examples[idx]['label']) # saves label
        ids = encode_text(text, self.max_len) # indices of text
        return torch.tensor(ids, dtype=torch.long), torch.tensor(label, dtype=torch.long)

def collate_batch(batch):
    ids, labels = zip(*batch) # unpacls and zips
    lengths = [len(x) for x in ids] # len of x
    maxl = max(lengths) # the max value
    padded = torch.full((len(ids), maxl), PAD_IDX, dtype=torch.long) # paddded is a tensor of len of ids and max len of maxl
    attn_mask = torch.zeros((len(ids), maxl), dtype=torch.long) #this is tensor of zero
    for i, x in enumerate(ids):
        padded[i, :len(x)] = x
        attn_mask[i, :len(x)] = 1
    return padded, attn_mask, torch.tensor(labels)

# **Model**

In [None]:
import torch.nn as nn
import torch.nn.functional as F

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=512):
        super().__init__()

        pe = torch.zeros(max_len, d_model)

        pos = torch.arange(0, max_len).unsqueeze(1).float()

        div = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))

        pe[:, 0::2] = torch.sin(pos * div)
        pe[:, 1::2] = torch.cos(pos * div)
        pe = pe.unsqueeze(0)   # (1, max_len, d_model)

        self.register_buffer('pe', pe)

    def forward(self, x):
        # x: (batch, seq_len, d_model)
        seq_len = x.size(1)
        return self.pe[:, :seq_len, :]



In [None]:
def scaled_dot_product_attention(q, k, v, mask=None, dropout=None):
    # q,k,v: (batch, n_heads, seq_len, head_dim)
    dk = q.size(-1)
    scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(dk)   # (batch, n_heads, seq, seq)
    if mask is not None:
        # mask: (batch, 1, 1, seq) or (batch, 1, seq, seq)
        scores = scores.masked_fill(mask == 0, float('-1e9'))
    attn = F.softmax(scores, dim=-1)
    if dropout is not None:
        attn = dropout(attn)
    output = torch.matmul(attn, v)   # (batch, n_heads, seq, head_dim)
    return output, attn


In [None]:
class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, n_heads, dropout=0.1):
        super().__init__()
        assert d_model % n_heads == 0  # checks that mod == 0
        self.d_model = d_model
        self.n_heads = n_heads
        self.head_dim = d_model // n_heads # integer devison

        self.q_lin = nn.Linear(d_model, d_model) # init nnLinear layer
        self.k_lin = nn.Linear(d_model, d_model)
        self.v_lin = nn.Linear(d_model, d_model)
        # final linear layer
        self.out_lin = nn.Linear(d_model, d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        # x: (batch, seq, d_model)
        batch, seq, _ = x.size()

        q = self.q_lin(x).view(batch, seq, self.n_heads, self.head_dim).transpose(1,2) # (batch, heads, seq, head_dim)
        k = self.k_lin(x).view(batch, seq, self.n_heads, self.head_dim).transpose(1,2) # transposing 1 to 2 means sek will become heads
        v = self.v_lin(x).view(batch, seq, self.n_heads, self.head_dim).transpose(1,2) # .view devides the d_model to n_heads to get head_dim



        if mask is not None:
            # mask: (batch, seq) -> make (batch, 1, 1, seq)
            mask = mask.unsqueeze(1).unsqueeze(1)


        attn_output, attn = scaled_dot_product_attention(q,k,v,mask, self.dropout) # scaled dot product
        # attn_output: (batch, heads, seq, head_dim)
        attn_output = attn_output.transpose(1,2).contiguous().view(batch, seq, self.d_model) # it re shapes it first transpose and thent by using
        #.view we multiply n_head * head_dim = d_model

        return self.out_lin(attn_output), attn  # return (batch, seq, d_model

In [None]:
class FeedForward(nn.Module):
    def __init__(self, d_model, dim_ff, dropout=0.1):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(d_model, dim_ff),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(dim_ff, d_model),
        )
    def forward(self, x): return self.net(x) # applying all the layer and give x as output


creating the encoder function



In [None]:
class EncoderLayer(nn.Module):
    def __init__(self, d_model, n_heads, dim_ff, dropout=0.1):
        super().__init__()
        # declaring mha,nlayernorm,ff,etc
        self.mha = MultiHeadAttention(d_model, n_heads, dropout)
        self.norm1 = nn.LayerNorm(d_model)
        self.ff = FeedForward(d_model, dim_ff, dropout)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask=None): # encoder forward pass
        # x: (batch, seq, d_model)

        attn_out, _ = self.mha(x, mask) # calling mha and returning attn out
        x = x + self.dropout(attn_out) # applying reidual + dropout  + attn out
        x = self.norm1(x) # layer norm
        ff_out = self.ff(x) # ff
        x = x + self.dropout(ff_out)
        x = self.norm2(x)
        return x


testing

In [None]:
class TransformerEncoder(nn.Module):
    def __init__(self, vocab_size, d_model=128, n_heads=4, num_layers=2, dim_ff=512, max_len=256, dropout=0.1, pad_idx=0):
        super().__init__()
        self.d_model = d_model
        # embedding layer
        self.tok_embed = nn.Embedding(vocab_size, d_model, padding_idx=pad_idx)
        # positional encoding
        self.pos_enc = PositionalEncoding(d_model, max_len)

        # model
        self.layers = nn.ModuleList([EncoderLayer(d_model, n_heads, dim_ff, dropout) for _ in range(num_layers)])

        self.dropout = nn.Dropout(dropout)

    def forward(self, input_ids, attention_mask=None):
      # applying embedding + deviding with sqrt of d_model
        x = self.tok_embed(input_ids) * math.sqrt(self.d_model)
      # positional encoding + input
        x = x + self.pos_enc(x)
      # dropout
        x = self.dropout(x)
      # model application and saving it to x
        for layer in self.layers:
            x = layer(x, attention_mask)
        return x   # (batch, seq, d_model)






In [None]:
class SentimentClassifier(nn.Module):
    def __init__(self, encoder, d_model, num_classes=2):
        super().__init__()
        self.encoder = encoder
        self.head = nn.Linear(d_model, num_classes)


    def forward(self, input_ids, attention_mask):
        enc_out = self.encoder(input_ids, attention_mask) # calling the encoder
        cls = (enc_out * attention_mask.unsqueeze(-1)).sum(1) / attention_mask.sum(1, keepdim=True)       # we added CLS at position 0 in tokenizer
        logits = self.head(cls)
        return logits


testing the collate function

# **Training Loop**

parameters -
vocab_size = 30_000
d_model = 256
n_heads = 8
num_layers = 4
dim_ff = 512
num_classes = 4
max_len = 256



In [None]:
import torch.optim as optim
from tqdm.auto import tqdm

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# instantiate
vocab_size = len(itos)  # if using scratch tokenizer
encoder = TransformerEncoder(vocab_size, d_model=256, n_heads=8, num_layers=4, dim_ff=512, max_len=256, pad_idx=PAD_IDX, dropout=0.2)
model = SentimentClassifier(encoder, d_model=256, num_classes=4).to(device)

optimizer = optim.AdamW(model.parameters(), lr=3e-4, weight_decay=1e-5)
criterion = nn.CrossEntropyLoss()




In [None]:
def train_epoch(dataloader):
    model.train()
    total_loss = 0
    correct = 0
    total = 0



    for input_ids, attn_mask, labels in tqdm(dataloader):

        input_ids = input_ids.to(device); attn_mask=attn_mask.to(device); labels=labels.to(device) # shifting to gpu


        logits = model(input_ids, attn_mask) # orward pass

        loss = criterion(logits, labels) # loss calculation

        optimizer.zero_grad() # optmizer making gradient 0

        loss.backward() # backword pass

        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)  # preventing exploding gradient

        optimizer.step() # optimizing

        total_loss += loss.item() * input_ids.size(0) # calculating loss

        preds = logits.argmax(dim=1) # prediction

        correct += (preds == labels).sum().item()

        total += labels.size(0)

    return total_loss/total, correct/total




In [None]:

def eval_epoch(dataloader):
    model.eval()
    total_loss = 0

    correct = 0; total=0

    with torch.no_grad():

        for input_ids, attn_mask, labels in dataloader:

            input_ids = input_ids.to(device); attn_mask=attn_mask.to(device); labels=labels.to(device)

            logits = model(input_ids, attn_mask)

            loss = criterion(logits, labels)

            total_loss += loss.item() * input_ids.size(0)

            preds = logits.argmax(dim=1)

            correct += (preds == labels).sum().item()

            total += labels.size(0)
    return total_loss/total, correct/total


In [None]:
 import json

In [None]:
train_ds = agnewsDataset(train_raw, max_len=256)
val_ds   = agnewsDataset(val_raw, max_len=256)
train_loader = DataLoader(train_ds, batch_size=32, shuffle=True, collate_fn=collate_batch)
val_loader   = DataLoader(val_ds, batch_size=64, shuffle=False, collate_fn=collate_batch)

best_val_acc = 0.0
for epoch in range(14, 16):
    train_loss, train_acc = train_epoch(train_loader)
    val_loss, val_acc = eval_epoch(val_loader)
    print(f"Epoch {epoch}: train_loss={train_loss:.4f} train_acc={train_acc:.4f} | val_loss={val_loss:.4f} val_acc={val_acc:.4f}")
    # save best
    if val_acc > best_val_acc:
        best_val_acc = val_acc


        torch.save({
      "epoch": epoch,
      "model_state_dict": model.state_dict(),
      "optimizer_state_dict": optimizer.state_dict(),
      "val_acc": val_acc
      }, "agnews_encoder.pt")

        # save vocab
        with open("vocab.json", "w") as f:
            json.dump({"stoi": stoi, "itos": itos}, f)

        print(f"✅ Saved new best model at epoch {epoch} with val_acc={val_acc:.4f}")


  0%|          | 0/3000 [00:00<?, ?it/s]

Epoch 14: train_loss=0.2305 train_acc=0.9213 | val_loss=0.2654 val_acc=0.9175
✅ Saved new best model at epoch 14 with val_acc=0.9175


  0%|          | 0/3000 [00:00<?, ?it/s]

Epoch 15: train_loss=0.2288 train_acc=0.9216 | val_loss=0.2855 val_acc=0.9170


MODEL ARCHITECTURE

> Input (B, 64) with pad_mask to ignore the <pad>
> Embedding LAyer dimension 128
> positonal encoding
>




# **Model Prediction**

In [None]:
import torch

def preprocess_text(text, max_len=256):
    ids = encode_text(text, max_len)
    pad_len = max_len - len(ids)
    if pad_len > 0:
        ids = ids + [PAD_IDX] * pad_len  # pad to max_len
    else:
        ids = ids[:max_len]  # truncate if too long

    attention_mask = [1 if id != PAD_IDX else 0 for id in ids]

    return torch.tensor([ids]), torch.tensor([attention_mask])


testing the embedding

In [None]:
def predict_sentiment(text, model, device="gpu"):
    model.eval()
    ids, mask = preprocess_text(text, max_len=256)
    ids, mask = ids.to(device), mask.to(device)

    with torch.no_grad():
        outputs = model(ids, mask)
        pred = torch.argmax(outputs, dim=1).item()
    if pred == 1:
      ret = "world"
    elif pred == 2:
      ret = "Sports"
    elif pred == 3:
      ret = "Business"
    else:
      ret = "sci/Tech"
    return ret

In [None]:
sample_text = "Aussie equestrian hopes end in sixth Australia #39;s dreams of an historic fourth successive three-day eventing gold medal ended in disappointment but there was still joy for the team when Andrew Hoy #39;s wife won dual gold medals."
print(predict_sentiment(sample_text, model, device))

world


Cleaned notebook saved as /content/drive/MyDrive/Colab Notebooks/encoder_clean.ipynb
