<a href="https://colab.research.google.com/github/m4vic/Transformer-101/blob/main/Encoder/Encoder.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install torch torchvision torchaudio



In [None]:
pip install transformers datasets



In [None]:
import random
from datasets import load_dataset
# Load the "imdb" dataset through the `datasets` library and bind it to `dataset`.
# NOTE(review): a later cell calls load_dataset("imdb") again and binds the result to
# `raw`; `dataset` does not appear to be used in the visible cells - confirm before removing.
dataset = load_dataset("imdb")

In [None]:
# imports
import math, random
from datasets import load_dataset
import torch
from torch.utils.data import Dataset, DataLoader


In [None]:
from datasets import load_dataset

# Load IMDb; the returned object is indexed by split name below.
raw = load_dataset("imdb")

train_raw = raw["train"]
test_raw = raw["test"]

# Hold out 10% of the training split as a validation set. The fixed seed keeps
# the split identical from run to run.
train_split = train_raw.train_test_split(test_size=0.1, seed=42)
train_raw = train_split["train"]
val_raw = train_split["test"]

# (A stray bare name `r` used to end this cell; it was undefined and raised
# NameError on a fresh "Run All", so it has been removed.)

In [None]:

# Show the first 40 training reviews, each followed by its label and a separator line.
for i in range(40):
    sample = train_raw[i]  # fetch the row once and read both fields from it
    print(f"Text {i}: {sample['text']}")
    print(f"Label {i}: {sample['label']}")
    print("="*80)


Text 0: With these people faking so many shots, using old footage, and gassing animals to get them out, not to mention that some of the scenes were filmed on a created set with actors, what's to believe? Old film of countries is nice, but the animal abuse and degradation of natives is painful to watch in these films. I know, racism is OK in these old films, but there is more to that to make this couple lose credibility. Portrayed as fliers, they never flew their planes, Martin Johnson was an ex-vaudevillian, used friends like Jack London for financial gain while stiffing them of royalties, denying his wife's apparent depression, using her as a cute prop, all this makes these films unbearable. They were by no means the first to travel to these lands, or the first to write about them. He was OK as a filmmaker and photographer, but that's about it.
Label 0: 0
Text 1: I don't know the stars, or modern Chinese teenage music - but I do know a thoroughly entertaining movie when I see one.<br 

# **Tokenization**

In [None]:
from collections import Counter
import re

def simple_tokenize(text):
    """Lowercase `text` and split it into word and punctuation tokens.

    A token is either a maximal run of word characters (`\\w+`) or a single
    character that is neither whitespace nor a word character. Whitespace is
    dropped.

    Args:
        text: Input string.

    Returns:
        List of token strings in order of appearance (empty for empty input).
    """
    # The alternation `|` must stay unescaped: `\|` matches a literal pipe character,
    # which made the previous pattern match only "word|punct" sequences and return
    # (almost) no tokens for ordinary text.
    return re.findall(r"\w+|[^\s\w]", text.lower())

# ---- Vocabulary construction ----

vocab_size = 20000  # total vocabulary budget, special tokens included

# Token frequencies are counted over the training split only.
counter = Counter()
for ex in train_raw:
    counter.update(simple_tokenize(ex["text"]))

# Three slots are reserved for the special tokens; the remaining slots go to
# the most frequent tokens.
most_common = counter.most_common(vocab_size - 3)

# itos: index -> token, stoi: token -> index.
itos = ['<PAD>', '<UNK>', '<CLS>']
itos.extend(token for token, _count in most_common)
stoi = dict(zip(itos, range(len(itos))))

PAD_IDX = stoi['<PAD>']
UNK_IDX = stoi['<UNK>']
CLS_IDX = stoi['<CLS>']


def encode_text(text, max_len=256):
    """Convert `text` into a list of vocabulary ids that starts with the CLS id.

    The text is tokenized with `simple_tokenize`; only the first `max_len - 1`
    tokens are kept, leaving one slot for the leading CLS id. Tokens missing
    from `stoi` are mapped to `UNK_IDX`.

    Args:
        text: Raw input string.
        max_len: Slice bound used for truncation (CLS id included).

    Returns:
        List of int ids; no padding is applied here.
    """
    token_budget = max_len - 1  # one position is taken by the CLS id
    ids = [CLS_IDX]
    for token in simple_tokenize(text)[:token_budget]:
        ids.append(stoi.get(token, UNK_IDX))
    return ids


**Dataset class and batch collation**

In [None]:
class IMDbDataset(Dataset):
    """Map-style dataset that turns rows with 'text' and 'label' fields into tensors.

    Args:
        hf_dataset: Indexable collection whose items expose 'text' and 'label' keys.
        max_len: Maximum sequence length forwarded to `encode_text`.
    """

    def __init__(self, hf_dataset, max_len=256):
        self.examples = hf_dataset
        self.max_len = max_len

    def __len__(self):
        """Return the number of examples."""
        return len(self.examples)

    def __getitem__(self, idx):
        """Return `(ids, label)` for example `idx`.

        Returns:
            ids: 1-D long tensor of token ids produced by `encode_text` (unpadded).
            label: 0-D long tensor holding the class label.
        """
        # Fetch the row once: indexing the backing dataset can be costly, and the
        # previous code did it twice per sample (once for text, once for label).
        example = self.examples[idx]
        ids = encode_text(example['text'], self.max_len)
        label = int(example['label'])
        return torch.tensor(ids, dtype=torch.long), torch.tensor(label, dtype=torch.long)

def collate_batch(batch):
    """Pad a list of `(ids, label)` pairs into batch tensors.

    Args:
        batch: Sequence of `(ids, label)` pairs, where `ids` is a 1-D tensor.

    Returns:
        padded: Long tensor `(batch, longest)`; unused positions hold `PAD_IDX`.
        attn_mask: Long tensor of the same shape, 1 on real tokens and 0 on padding.
        labels: Tensor built from the labels, in batch order.
    """
    sequences, labels = zip(*batch)
    batch_size = len(sequences)
    longest = max(len(seq) for seq in sequences)

    # Start from an all-PAD / all-zero canvas and copy each sequence into its row.
    padded = torch.full((batch_size, longest), PAD_IDX, dtype=torch.long)
    attn_mask = torch.zeros((batch_size, longest), dtype=torch.long)
    for row, seq in enumerate(sequences):
        n_tokens = len(seq)
        padded[row, :n_tokens] = seq
        attn_mask[row, :n_tokens] = 1

    return padded, attn_mask, torch.tensor(labels)


# **Model components**

***Positional encoding***

In [None]:
import torch.nn as nn
import torch.nn.functional as F

class PositionalEncoding(nn.Module):
    """Fixed sinusoidal positional-encoding table.

    Even feature columns hold `sin(pos * freq)` and odd columns hold
    `cos(pos * freq)`. The table is stored as a buffer named `pe` with shape
    `(1, max_len, d_model)`.

    Note that `forward` returns only the slice of the table for the current
    sequence length; it does NOT add it to the input, so the caller is expected
    to do `x + pos_enc(x)`.

    Args:
        d_model: Feature dimension (odd values are supported).
        max_len: Number of positions in the table.
    """

    def __init__(self, d_model, max_len=512):
        super().__init__()

        pos = torch.arange(0, max_len).unsqueeze(1).float()  # (max_len, 1)
        div = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        angles = pos * div  # (max_len, ceil(d_model / 2))

        pe = torch.zeros(max_len, d_model)
        pe[:, 0::2] = torch.sin(angles)
        # With an odd d_model there is one fewer odd column than even columns, so the
        # cosine half is trimmed to fit (previously this raised a shape mismatch).
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])

        # Registered as a buffer so it follows the module across devices but is not trained.
        self.register_buffer('pe', pe.unsqueeze(0))  # (1, max_len, d_model)

    def forward(self, x):
        """Return the encoding slice matching `x`'s sequence length.

        Args:
            x: Tensor whose dimension 1 is the sequence length.

        Returns:
            Tensor `(1, seq_len, d_model)`; at most `max_len` positions are available.
        """
        seq_len = x.size(1)
        return self.pe[:, :seq_len, :]






***Scaled dot-product attention***

In [None]:
def scaled_dot_product_attention(q, k, v, mask=None, dropout=None):
    """Compute softmax(q @ k^T / sqrt(d_k)) @ v.

    Args:
        q, k, v: Tensors whose last two dimensions are `(seq, head_dim)`;
            the call site passes `(batch, n_heads, seq, head_dim)`.
        mask: Optional tensor broadcastable to the score shape; positions where
            `mask == 0` are filled with -1e9 before the softmax.
        dropout: Optional callable applied to the attention weights.

    Returns:
        (output, attn): the weighted values and the attention weights
        (after dropout, when a dropout callable is given).
    """
    scale = math.sqrt(q.size(-1))
    scores = (q @ k.transpose(-2, -1)) / scale
    if mask is not None:
        # A large negative score makes the softmax weight of masked keys vanish.
        scores = scores.masked_fill(mask == 0, -1e9)
    weights = F.softmax(scores, dim=-1)
    attn = weights if dropout is None else dropout(weights)
    return attn @ v, attn


**MultiHeadAttention**

In [None]:
class MultiHeadAttention(nn.Module):
    """Multi-head self-attention over a single input sequence.

    Args:
        d_model: Model (feature) dimension; must be divisible by `n_heads`.
        n_heads: Number of attention heads.
        dropout: Dropout probability applied to the attention weights.
    """

    def __init__(self, d_model, n_heads, dropout=0.1):
        super().__init__()
        assert d_model % n_heads == 0  # every head gets an equal slice of d_model
        self.d_model = d_model
        self.n_heads = n_heads
        self.head_dim = d_model // n_heads

        # Separate projections for queries, keys and values, plus the output projection.
        self.q_lin = nn.Linear(d_model, d_model)
        self.k_lin = nn.Linear(d_model, d_model)
        self.v_lin = nn.Linear(d_model, d_model)
        self.out_lin = nn.Linear(d_model, d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        """Apply self-attention to `x`.

        Args:
            x: Tensor `(batch, seq, d_model)`.
            mask: Optional `(batch, seq)` tensor; zeros mark key positions to ignore.

        Returns:
            (output, attn): output is `(batch, seq, d_model)`; attn is the
            attention-weight tensor returned by `scaled_dot_product_attention`.
        """
        batch, seq, _ = x.size()

        def split_heads(t):
            # (batch, seq, d_model) -> (batch, heads, seq, head_dim)
            return t.view(batch, seq, self.n_heads, self.head_dim).transpose(1, 2)

        q = split_heads(self.q_lin(x))
        k = split_heads(self.k_lin(x))
        v = split_heads(self.v_lin(x))

        if mask is not None:
            # (batch, seq) -> (batch, 1, 1, seq) so it broadcasts over heads and queries.
            mask = mask[:, None, None]

        attn_output, attn = scaled_dot_product_attention(q, k, v, mask, self.dropout)

        # Merge the heads back: (batch, heads, seq, head_dim) -> (batch, seq, d_model).
        merged = attn_output.transpose(1, 2).reshape(batch, seq, self.d_model)
        return self.out_lin(merged), attn

**FeedForward NN**

In [None]:
class FeedForward(nn.Module):
    """Position-wise feed-forward block: Linear -> ReLU -> Dropout -> Linear.

    Args:
        d_model: Input and output feature dimension.
        dim_ff: Hidden (expanded) dimension.
        dropout: Dropout probability applied after the ReLU.
    """

    def __init__(self, d_model, dim_ff, dropout=0.1):
        super().__init__()
        expand = nn.Linear(d_model, dim_ff)
        project = nn.Linear(dim_ff, d_model)
        self.net = nn.Sequential(expand, nn.ReLU(), nn.Dropout(dropout), project)

    def forward(self, x):
        """Run `x` through the stacked layers and return the result."""
        return self.net(x)


***Single encoder layer***

In [None]:
class EncoderLayer(nn.Module):
    """One encoder block: self-attention and feed-forward sub-layers.

    Each sub-layer output goes through dropout, is added to its input
    (residual connection) and is then layer-normalised (post-norm layout).

    Args:
        d_model: Feature dimension.
        n_heads: Number of attention heads.
        dim_ff: Hidden size of the feed-forward sub-layer.
        dropout: Dropout probability shared by the sub-layers and the residual paths.
    """

    def __init__(self, d_model, n_heads, dim_ff, dropout=0.1):
        super().__init__()
        self.mha = MultiHeadAttention(d_model, n_heads, dropout)
        self.norm1 = nn.LayerNorm(d_model)
        self.ff = FeedForward(d_model, dim_ff, dropout)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        """Transform `x` of shape `(batch, seq, d_model)`; `mask` is passed to the attention."""
        # Attention sub-layer; the attention weights themselves are not needed here.
        attn_out, _ = self.mha(x, mask)
        x = self.norm1(x + self.dropout(attn_out))

        # Feed-forward sub-layer.
        x = self.norm2(x + self.dropout(self.ff(x)))
        return x


***TransformerEncoder***

In [None]:
class TransformerEncoder(nn.Module):
    """Token embedding + positional encoding followed by a stack of encoder layers.

    Args:
        vocab_size: Number of rows in the embedding table.
        d_model: Feature dimension.
        n_heads: Attention heads per layer.
        num_layers: Number of stacked `EncoderLayer` blocks.
        dim_ff: Hidden size of each feed-forward sub-layer.
        max_len: Number of positions in the positional-encoding table.
        dropout: Dropout probability.
        pad_idx: Token id used as `padding_idx` of the embedding.
    """

    def __init__(self, vocab_size, d_model=128, n_heads=4, num_layers=2, dim_ff=512, max_len=256, dropout=0.1, pad_idx=0):
        super().__init__()
        self.d_model = d_model
        self.tok_embed = nn.Embedding(vocab_size, d_model, padding_idx=pad_idx)
        self.pos_enc = PositionalEncoding(d_model, max_len)

        blocks = []
        for _ in range(num_layers):
            blocks.append(EncoderLayer(d_model, n_heads, dim_ff, dropout))
        self.layers = nn.ModuleList(blocks)

        self.dropout = nn.Dropout(dropout)

    def forward(self, input_ids, attention_mask=None):
        """Encode a batch of token ids.

        Args:
            input_ids: Long tensor `(batch, seq)` of token ids.
            attention_mask: Optional `(batch, seq)` mask handed to every layer.

        Returns:
            Tensor `(batch, seq, d_model)`.
        """
        # Embeddings are scaled up by sqrt(d_model) (a multiplication, not a division).
        scale = math.sqrt(self.d_model)
        x = self.tok_embed(input_ids) * scale

        # pos_enc returns only the positional table slice; it is added here.
        x = self.dropout(x + self.pos_enc(x))

        for block in self.layers:
            x = block(x, attention_mask)
        return x


***Sentiment classifier***

In [None]:
class SentimentClassifier(nn.Module):
    """Classification head on top of an encoder.

    The encoder output at sequence position 0 (where the tokenizer places the
    CLS token) is fed through a linear layer to produce class logits.

    Args:
        encoder: Module called as `encoder(input_ids, attention_mask)` that
            returns a `(batch, seq, d_model)` tensor.
        d_model: Feature dimension of the encoder output.
        num_classes: Number of output classes.
    """

    def __init__(self, encoder, d_model, num_classes=2):
        super().__init__()
        self.encoder = encoder
        self.head = nn.Linear(d_model, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return logits of shape `(batch, num_classes)`."""
        hidden = self.encoder(input_ids, attention_mask)
        cls_vector = hidden[:, 0, :]  # representation at the CLS position
        return self.head(cls_vector)


# ***TRAINING LOOP***

In [None]:
import torch.optim as optim
from tqdm.auto import tqdm

# Train on the GPU when one is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# --- model ---
# The embedding table is sized from the vocabulary list built by the tokenizer cell.
vocab_size = len(itos)
encoder = TransformerEncoder(
    vocab_size,
    d_model=128,
    n_heads=4,
    num_layers=2,
    dim_ff=512,
    max_len=256,
    pad_idx=PAD_IDX,
)
model = SentimentClassifier(encoder, d_model=128, num_classes=2).to(device)

# --- optimisation ---
optimizer = optim.AdamW(
    model.parameters(),
    lr=3e-4,
    weight_decay=1e-5,
)
criterion = nn.CrossEntropyLoss()


In [None]:
def train_epoch(dataloader):
    """Train the global `model` for one pass over `dataloader`.

    Uses the notebook-level `model`, `optimizer`, `criterion` and `device`.

    Args:
        dataloader: Iterable yielding `(input_ids, attn_mask, labels)` batches.

    Returns:
        (mean_loss, accuracy), both averaged over the number of samples seen.
    """
    model.train()
    loss_sum, n_correct, n_seen = 0, 0, 0

    for input_ids, attn_mask, labels in tqdm(dataloader):
        # Move the batch to the training device.
        input_ids = input_ids.to(device)
        attn_mask = attn_mask.to(device)
        labels = labels.to(device)

        # Forward pass and loss.
        logits = model(input_ids, attn_mask)
        loss = criterion(logits, labels)

        # Backward pass; gradients are clipped to norm 1.0 before the update.
        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()

        # Running statistics; the batch loss is re-weighted by the batch size
        # so the final division yields a per-sample average.
        loss_sum += loss.item() * input_ids.size(0)
        n_correct += (logits.argmax(dim=1) == labels).sum().item()
        n_seen += labels.size(0)

    return loss_sum/n_seen, n_correct/n_seen




In [None]:

def eval_epoch(dataloader):
    """Evaluate the global `model` on `dataloader` without updating weights.

    Uses the notebook-level `model`, `criterion` and `device`.

    Args:
        dataloader: Iterable yielding `(input_ids, attn_mask, labels)` batches.

    Returns:
        (mean_loss, accuracy), both averaged over the number of samples seen.
    """
    model.eval()
    loss_sum, n_correct, n_seen = 0, 0, 0

    # Gradient tracking is switched off for the whole evaluation pass.
    with torch.no_grad():
        for input_ids, attn_mask, labels in dataloader:
            input_ids = input_ids.to(device)
            attn_mask = attn_mask.to(device)
            labels = labels.to(device)

            logits = model(input_ids, attn_mask)
            loss = criterion(logits, labels)

            loss_sum += loss.item() * input_ids.size(0)
            n_correct += (logits.argmax(dim=1) == labels).sum().item()
            n_seen += labels.size(0)

    return loss_sum/n_seen, n_correct/n_seen


In [None]:
# Wrap the raw splits and build the loaders; only the training loader shuffles.
train_ds = IMDbDataset(train_raw, max_len=256)
val_ds = IMDbDataset(val_raw, max_len=256)
train_loader = DataLoader(
    train_ds,
    batch_size=32,
    shuffle=True,
    collate_fn=collate_batch,
)
val_loader = DataLoader(
    val_ds,
    batch_size=64,
    shuffle=False,
    collate_fn=collate_batch,
)

# Epochs are numbered 1..9; the weights are checkpointed whenever the
# validation accuracy improves on the best value seen so far.
best_val_acc = 0.0
for epoch in range(1, 10):
    train_loss, train_acc = train_epoch(train_loader)
    val_loss, val_acc = eval_epoch(val_loader)
    print(f"Epoch {epoch}: train_loss={train_loss:.4f} train_acc={train_acc:.4f} | val_loss={val_loss:.4f} val_acc={val_acc:.4f}")

    improved = val_acc > best_val_acc
    if improved:
        best_val_acc = val_acc
        torch.save(model.state_dict(), "best_encoder_imdb.pt")


  0%|          | 0/704 [00:00<?, ?it/s]

Epoch 1: train_loss=0.6942 train_acc=0.4945 | val_loss=0.6954 val_acc=0.4972


  0%|          | 0/704 [00:00<?, ?it/s]

Epoch 2: train_loss=0.6937 train_acc=0.5049 | val_loss=0.6935 val_acc=0.5028


  0%|          | 0/704 [00:00<?, ?it/s]

Epoch 3: train_loss=0.6942 train_acc=0.4968 | val_loss=0.6934 val_acc=0.5028


  0%|          | 0/704 [00:00<?, ?it/s]

Epoch 4: train_loss=0.6937 train_acc=0.4975 | val_loss=0.6936 val_acc=0.5028


  0%|          | 0/704 [00:00<?, ?it/s]

Epoch 5: train_loss=0.6941 train_acc=0.4957 | val_loss=0.6965 val_acc=0.4972


  0%|          | 0/704 [00:00<?, ?it/s]

Epoch 6: train_loss=0.6940 train_acc=0.4991 | val_loss=0.6937 val_acc=0.4972


  0%|          | 0/704 [00:00<?, ?it/s]

Epoch 7: train_loss=0.6937 train_acc=0.5033 | val_loss=0.6932 val_acc=0.4972


  0%|          | 0/704 [00:00<?, ?it/s]

Epoch 8: train_loss=0.6937 train_acc=0.4956 | val_loss=0.6932 val_acc=0.5028


  0%|          | 0/704 [00:00<?, ?it/s]

Epoch 9: train_loss=0.6936 train_acc=0.5022 | val_loss=0.6932 val_acc=0.5028


In [None]:
import torch

def preprocess_text(text, max_len=256):
    """Encode a single text into fixed-length id and mask tensors.

    The ids from `encode_text` are right-padded with `PAD_IDX` up to `max_len`
    (or cut to `max_len` when already that long). The mask is 1 wherever the
    id differs from `PAD_IDX` and 0 elsewhere.

    Args:
        text: Raw input string.
        max_len: Target sequence length.

    Returns:
        (ids, attention_mask): two tensors of shape `(1, max_len)`.
    """
    token_ids = encode_text(text, max_len)

    # Pad when short, then cut to max_len (a no-op once padding has been applied).
    n_missing = max(max_len - len(token_ids), 0)
    token_ids = (token_ids + [PAD_IDX] * n_missing)[:max_len]

    attention_mask = [int(token_id != PAD_IDX) for token_id in token_ids]

    return torch.tensor([token_ids]), torch.tensor([attention_mask])


In [None]:
def predict_sentiment(text, model, device=None):
    """Classify `text` as "Positive" or "Negative" with `model`.

    Args:
        text: Raw review text.
        model: Module called as `model(ids, mask)` that returns logits of
            shape `(1, num_classes)`; class 1 is reported as positive.
        device: Device on which to place the inputs. When omitted, the device of
            the model's first parameter is used (CPU if the model has no
            parameters). The former default, "gpu", is not a valid torch device
            string, so every call that relied on it failed.

    Returns:
        "Positive" if the arg-max class is 1, otherwise "Negative".
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")

    model.eval()
    ids, mask = preprocess_text(text, max_len=256)
    ids, mask = ids.to(device), mask.to(device)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        outputs = model(ids, mask)
        pred = torch.argmax(outputs, dim=1).item()

    return "Positive" if pred == 1 else "Negative"


In [None]:
# Review excerpt used as a quick smoke test of the classifier. The two adjacent
# string literals are joined by Python into a single string.
sample_text = "It's all up to you, Ed?" "Now you get to play the game. The day my father left my mother, he took me to see this film with his new girlfriend, on opening day at the Cinerama Dome in Hollywood. I was but fourteen years old. I wasn't even entirely sure what I was seeing, but I will never forgot it, especially the scene with Ned Beatty.<br /><br />Watching it today, thirty-six years l"
# The model's inputs are placed on the notebook-level `device`.
print(predict_sentiment(sample_text, model, device))

Positive


100%|██████████| 100/100 [00:00<00:00, 187664.61it/s]

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99



