# **Google Drive**

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# **Installation**

In [None]:
!pip install torchmetrics

Collecting torchmetrics
  Downloading torchmetrics-1.5.1-py3-none-any.whl.metadata (20 kB)
Collecting lightning-utilities>=0.8.0 (from torchmetrics)
  Downloading lightning_utilities-0.11.8-py3-none-any.whl.metadata (5.2 kB)
Downloading torchmetrics-1.5.1-py3-none-any.whl (890 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m890.6/890.6 kB[0m [31m13.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading lightning_utilities-0.11.8-py3-none-any.whl (26 kB)
Installing collected packages: lightning-utilities, torchmetrics
Successfully installed lightning-utilities-0.11.8 torchmetrics-1.5.1


In [None]:
!pip install torch==1.12.1 torchdata==0.4.1 torchtext==0.13.1

Collecting torch==1.12.1
  Downloading torch-1.12.1-cp310-cp310-manylinux1_x86_64.whl.metadata (22 kB)
Collecting torchdata==0.4.1
  Downloading torchdata-0.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.0 kB)
Collecting torchtext==0.13.1
  Downloading torchtext-0.13.1-cp310-cp310-manylinux1_x86_64.whl.metadata (6.9 kB)
Collecting portalocker>=2.0.0 (from torchdata==0.4.1)
  Downloading portalocker-2.10.1-py3-none-any.whl.metadata (8.5 kB)
Downloading torch-1.12.1-cp310-cp310-manylinux1_x86_64.whl (776.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m776.3/776.3 MB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading torchdata-0.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.4/4.4 MB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading torchtext-0.13.1-cp310-cp310-manylinux1_x86_64.whl (1.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

# **Imports**

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, random_split

import torchtext

import numpy as np
import matplotlib.pyplot as plt
from torchmetrics import Accuracy

from tqdm import tqdm

In [None]:
print(torch.__version__)
print(torchtext.__version__)

1.12.1+cu102
0.13.1


# **Preprocessing**

In the following you'll see some preprocessing and transformation techniques used for natural language processing

In [None]:
from torchtext.data.utils import get_tokenizer

In [None]:
tokenizer = get_tokenizer('basic_english')

In [None]:
example1 = 'hello world!'
tokenized_example1 = tokenizer(example1)
tokenized_example1

['hello', 'world', '!']

## **Vectorization**

In [None]:
from torchtext.vocab import GloVe

In [None]:
vocab = GloVe(name='6B', dim=50)

.vector_cache/glove.6B.zip: 862MB [02:42, 5.31MB/s]                           
100%|█████████▉| 399999/400000 [00:12<00:00, 31317.82it/s]


We can look up a word by its index (`vocab.itos`) and an index by its word (`vocab.stoi`).

In [None]:
# Peek at the first entries of the GloVe index-to-string list; the full list
# holds 400,000 tokens, so displaying all of it floods the notebook output.
vocab.itos[:20]

['the',
 ',',
 '.',
 'of',
 'to',
 'and',
 'in',
 'a',
 '"',
 "'s",
 'for',
 '-',
 'that',
 'on',
 'is',
 'was',
 'said',
 'with',
 'he',
 'as',
 'it',
 'by',
 'at',
 '(',
 ')',
 'from',
 'his',
 "''",
 '``',
 'an',
 'be',
 'has',
 'are',
 'have',
 'but',
 'were',
 'not',
 'this',
 'who',
 'they',
 'had',
 'i',
 'which',
 'will',
 'their',
 ':',
 'or',
 'its',
 'one',
 'after',
 'new',
 'been',
 'also',
 'we',
 'would',
 'two',
 'more',
 "'",
 'first',
 'about',
 'up',
 'when',
 'year',
 'there',
 'all',
 '--',
 'out',
 'she',
 'other',
 'people',
 "n't",
 'her',
 'percent',
 'than',
 'over',
 'into',
 'last',
 'some',
 'government',
 'time',
 '$',
 'you',
 'years',
 'if',
 'no',
 'world',
 'can',
 'three',
 'do',
 ';',
 'president',
 'only',
 'state',
 'million',
 'could',
 'us',
 'most',
 '_',
 'against',
 'u.s.',
 'so',
 'them',
 'what',
 'him',
 'united',
 'during',
 'before',
 'may',
 'since',
 'many',
 'while',
 'where',
 'states',
 'because',
 'now',
 'city',
 'made',
 'like',
 

In [None]:
len(vocab.itos)

400000

In [None]:
vec_boy = vocab.get_vecs_by_tokens('boy')
vec_girl = vocab.get_vecs_by_tokens('girl')
vec_earth = vocab.get_vecs_by_tokens('earth')

In [None]:
# Only the last expression of a cell is displayed, so keep both similarities
# in variables and show them together as a tuple.
sim_boy_girl = F.cosine_similarity(vec_boy, vec_girl, dim=0)
sim_girl_earth = F.cosine_similarity(vec_girl, vec_earth, dim=0)
sim_boy_girl, sim_girl_earth

tensor(0.2822)

## **Transforms**

In [None]:
from torchtext import transforms as T

In [None]:
VOCAB_FILE = "https://huggingface.co/bert-base-uncased/resolve/main/vocab.txt"

In [None]:
tokenizer = T.BERTTokenizer(vocab_path=VOCAB_FILE,
                            do_lower_case=True,
                            return_tokens=True)

100%|██████████| 232k/232k [00:00<00:00, 553kB/s]


In [None]:
example2 = 'HI! HOW IS IT GOING?'
tokenizer(example2)

['hi', '!', 'how', 'is', 'it', 'going', '?']

Extracting the index of words

In [None]:
example3 = 'I am using my computer.'
example4 = 'I am a programmer.'

In [None]:
example3_tokens = tokenizer(example3)
example4_tokens = tokenizer(example4)

In [None]:
# Map every token to its row index in the GloVe vocabulary via `vocab.stoi`.
# NOTE(review): `stoi[token]` presumably raises KeyError for a token that GloVe
# does not contain (the tokens here come from the BERT tokenizer) — confirm
# whether an explicit fallback is wanted.
example3_indices = [vocab.stoi[token] for token in example3_tokens]
example4_indices = [vocab.stoi[token] for token in example4_tokens]
# Collect both index lists into one batch; their lengths may differ.
token_ids = [example3_indices, example4_indices]

token_ids

[[41, 913, 622, 192, 951, 2], [41, 913, 7, 19226, 2]]

Padding both index lists to the same length for further processing

In [None]:
resizing_ids = T.ToTensor(padding_value=0)
resizing_ids(token_ids)
# T.ToTensor(padding_value=0)(token_ids)

tensor([[   41,   913,   622,   192,   951,     2],
        [   41,   913,     7, 19226,     2,     0]])

In [None]:
T.Truncate(max_seq_len=3)(example3_tokens)

['i', 'am', 'using']

# **Dataset**

In [None]:
from torchtext import datasets

In [None]:
train_set, test_set = datasets.AG_NEWS('/content/', split=('train', 'test'))

In [None]:
from torch.nn.utils.rnn import pad_sequence

In [None]:
next(iter(train_set))

(3,
 "Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindling\\band of ultra-cynics, are seeing green again.")

So based on the structure of our dataset, we extract the text and label using a custom function

In [None]:
from torchtext.data.utils import get_tokenizer

In [None]:
tokenizer = get_tokenizer('basic_english')

In [None]:
def collate(batch):
  """Convert a list of (label, text) samples into model-ready tensors.

  Each text is tokenized with the notebook-level `tokenizer`, embedded with
  `vocab.get_vecs_by_tokens`, and the per-sample embedding matrices are padded
  to a common length with `pad_sequence`.

  Returns:
    (token_vectors, labels): the padded embeddings and a LongTensor holding
    each sample's label minus 1.
  """
  raw_labels, raw_texts = [], []
  for sample in batch:
    raw_labels.append(sample[0])
    raw_texts.append(sample[1])

  # Shift labels down by one (dataset labels presumably start at 1 — confirm).
  labels = torch.LongTensor(raw_labels) - 1

  tokenized = list(map(tokenizer, raw_texts))
  embedded = list(map(vocab.get_vecs_by_tokens, tokenized))
  token_vectors = pad_sequence(embedded)

  return token_vectors, labels

Applying the custom function on our dataset

In [None]:
train_loader = DataLoader(train_set, batch_size=128, shuffle=True, collate_fn=collate)
test_loader = DataLoader(test_set, batch_size=256, collate_fn=collate)

In [None]:
train_loader

<torch.utils.data.dataloader.DataLoader at 0x7d19ce5aa080>

In [None]:
x, y = next(iter(train_loader))



In [None]:
x, y = next(iter(test_loader))

## Utils

In [None]:
class AverageMeter:
    """Running-average tracker: keeps the latest value, weighted sum, count and mean."""

    def __init__(self):
        self.reset()

    def reset(self):
        """Set every tracked statistic back to zero."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record `val` as the latest value with weight `n` and refresh the mean."""
        self.val = val
        self.count += n
        self.sum += val * n
        self.avg = self.sum / self.count

In [None]:
def num_params(model):
  """Return the total number of elements across `model.parameters()`, in millions."""
  total = 0
  for param in model.parameters():
    total += param.numel()
  return total / 1e6

## Init

In [None]:
num_cls = 4

device = 'cuda' if torch.cuda.is_available() else 'cpu'

# **Model**

This part demonstrates how to load a pre-trained RoBERTa model, modify its structure, and manipulate embeddings for downstream tasks, particularly classification.

In neural networks, especially those used for natural language processing (NLP), **embedding layers** and **embedding bags** play a crucial role in transforming categorical inputs (like words) into dense numerical vectors that a model can understand and process.

### 1. **What is an Embedding Layer?**
An **embedding layer** is a type of neural network layer that maps discrete, categorical values (like words or tokens) to continuous vector representations. Each unique categorical value (e.g., each word in a vocabulary) gets assigned a unique vector of fixed dimensions. These vectors are learned during training, capturing semantic relationships between categories.

For example, words with similar meanings (like "king" and "queen") will tend to have embeddings that are close in the vector space, as the embedding layer learns to encode such relationships based on the task it is trained on.

#### Example of an Embedding Layer:
Suppose we have a vocabulary of 5 words: `["cat", "dog", "fish", "lion", "tiger"]`, and we want each word to be represented by a vector of 3 dimensions.

1. We assign an index to each word: `cat=0, dog=1, fish=2, lion=3, tiger=4`.
2. We initialize an embedding matrix of shape (5, 3). Each row of this matrix is a 3-dimensional vector representing one word:
   ```
   [[0.1, 0.2, 0.3],   # embedding for "cat"
    [0.4, 0.5, 0.6],   # embedding for "dog"
    [0.7, 0.8, 0.9],   # embedding for "fish"
    [0.2, 0.3, 0.4],   # embedding for "lion"
    [0.5, 0.6, 0.7]]   # embedding for "tiger"
   ```
3. When we input a word (like "dog") into the embedding layer, it returns the corresponding vector `[0.4, 0.5, 0.6]`.

### 2. **What is an Embedding Bag?**
An **embedding bag** is an extension of the embedding layer that is designed to handle variable-length sequences and perform operations like averaging or summing over embeddings within a "bag" or a group of tokens. Instead of getting an embedding for each individual token, an embedding bag takes a sequence of token indices and computes a single embedding vector for the entire sequence by applying a reduction operation (e.g., sum or average).

This is particularly useful for tasks like sentence classification, where we may want to represent a sentence with a single vector derived from its constituent word embeddings.

#### Example of an Embedding Bag:
Suppose we have a sentence represented by the token indices `[0, 1, 3]` (corresponding to "cat," "dog," and "lion").

1. The embedding bag takes these indices and looks up each token's vector:
   ```
   "cat" -> [0.1, 0.2, 0.3]
   "dog" -> [0.4, 0.5, 0.6]
   "lion" -> [0.2, 0.3, 0.4]
   ```
2. The embedding bag then combines these vectors, for example by summing:
   ```
   [0.1 + 0.4 + 0.2, 0.2 + 0.5 + 0.3, 0.3 + 0.6 + 0.4]
   = [0.7, 1.0, 1.3]
   ```
3. This summed vector `[0.7, 1.0, 1.3]` now represents the entire sentence.


### 3. **How Embedding Works: Step-by-Step Example**

Imagine we have the following input sentence and embedding configuration:
- **Sentence**: "I love dogs"
- **Vocabulary**: `{"I": 0, "love": 1, "dogs": 2}`
- **Embedding dimension**: 2

1. **Initialize an Embedding Matrix** of size `(vocabulary_size, embedding_dimension)`:
   ```
   [[0.2, 0.8],    # "I" embedding
    [0.6, 0.1],    # "love" embedding
    [0.3, 0.7]]    # "dogs" embedding
   ```
   
2. **Convert Words to Indices**:
   ```
   "I" -> 0
   "love" -> 1
   "dogs" -> 2
   ```
   Our sentence becomes: `[0, 1, 2]`.

3. **Lookup the Embedding Vectors for Each Token**:
   ```
   [0.2, 0.8],  # "I"
   [0.6, 0.1],  # "love"
   [0.3, 0.7]   # "dogs"
   ```

4. **Combine the Embeddings**:
   For individual embeddings, we leave them as-is. If we use an embedding bag with "sum" mode, we’d get:
   ```
   [0.2 + 0.6 + 0.3, 0.8 + 0.1 + 0.7] = [1.1, 1.6]
   ```
   This `[1.1, 1.6]` vector can now represent the sentence as a whole in tasks like classification.

In [None]:
from torchtext import models
from torchtext.functional import to_tensor

In [None]:
roberta_base = models.ROBERTA_BASE_ENCODER
# Give the bundle a pass-through head (nn.Identity returns its input unchanged).
roberta_base._head = nn.Identity()

In [None]:
input_batch = ["Hello world", "How are you!"]

In [None]:
roberta_encoder = roberta_base.get_model()
roberta_transformer = roberta_base.transform()

In [None]:
input_tensor = to_tensor(roberta_transformer(input_batch), padding_value=1)

In [None]:
# Pull the token-embedding matrix out of the encoder, save it to disk, and
# load it back from the SAME path that was just written.
EMBEDDING_WEIGHTS_PATH = 'roberta_embedded_weights.pt'
roberta_embedded_weights = roberta_encoder.encoder.transformer.token_embedding.weight
torch.save(roberta_embedded_weights, EMBEDDING_WEIGHTS_PATH)
loaded_roberta_embedded_weights = torch.load(EMBEDDING_WEIGHTS_PATH)

# Rows and columns of the loaded embedding matrix.
n, d = loaded_roberta_embedded_weights.shape

# `from_pretrained` is a classmethod that builds the layer straight from the
# weight matrix, so no throwaway `nn.Embedding(n, d)` instance is required.
embedding = nn.Embedding.from_pretrained(loaded_roberta_embedded_weights)

In [None]:
from torchtext import models
from torchtext.functional import to_tensor

In [None]:
class RNNModel(nn.Module):
  """Recurrent sequence classifier: recurrent encoder, mean over time, linear head.

  Args:
    RNN: recurrent layer class to instantiate (e.g. `nn.LSTM`). It is called with
      `input_size`, `hidden_size`, `num_layers`, `bidirectional` and
      `batch_first=False`, and must return `(outputs, state)` when called.
    input_size: feature size of each time step.
    hidden_size: hidden size per direction.
    num_layers: number of stacked recurrent layers.
    bidirectional: whether the recurrent layer runs in both directions.
    num_cls: number of output classes.
  """

  def __init__(self, RNN, input_size, hidden_size, num_layers, bidirectional, num_cls):
    super().__init__()
    self.rnn = RNN(input_size=input_size,
                   hidden_size=hidden_size,
                   num_layers=num_layers,
                   bidirectional=bidirectional,
                   batch_first=False)
    # The encoder's output width is known at construction time (hidden_size per
    # direction for torch's RNN/GRU/LSTM), so a regular Linear is used instead of
    # LazyLinear. The head's parameters therefore exist immediately and can be
    # counted or handed to an optimizer before the first forward pass.
    num_directions = 2 if bidirectional else 1
    self.fc = nn.Linear(hidden_size * num_directions, num_cls)

  def forward(self, x):
    """Return logits of shape (batch, num_cls) for `x` of shape (seq_len, batch, input_size)."""
    outputs, _ = self.rnn(x)
    # Average over the time axis (dim 0, because batch_first=False) before the
    # head: the head is affine, so this matches averaging the per-step logits
    # while applying the linear layer once per sequence.
    # NOTE(review): the mean covers every time step, including padded ones if
    # `x` was padded — confirm this is intended.
    pooled = outputs.mean(dim=0)
    return self.fc(pooled)

In [None]:
model = RNNModel(nn.LSTM, 50, 128, 1, False, num_cls)



## Functions

In [None]:
def train_one_epoch(model, train_loader, loss_fn, optimizer, epoch=None):
  """Train `model` for one pass over `train_loader`.

  Args:
    model: network to optimise; it is switched to training mode.
    train_loader: iterable yielding (inputs, targets) batches.
    loss_fn: callable computing the loss from (outputs, targets).
    optimizer: optimizer whose `step()` updates the model parameters.
    epoch: optional epoch number shown in the progress-bar description.

  Returns:
    (model, mean training loss, training accuracy as returned by the metric).

  Relies on the notebook-level names `device`, `num_cls`, `AverageMeter`,
  `Accuracy` and `tqdm`.
  """
  model.train()
  loss_train = AverageMeter()
  acc_train = Accuracy(task='MULTICLASS', num_classes=num_cls).to(device)
  with tqdm(train_loader, unit="batch") as tepoch:
    # The description is the same for every batch, so set it once up front.
    if epoch is not None:
      tepoch.set_description(f"Epoch {epoch}")
    for inputs, targets in tepoch:
      inputs = inputs.to(device)
      targets = targets.to(device)

      # Clear gradients before the backward pass so gradients left on the
      # parameters by earlier code cannot leak into this update.
      optimizer.zero_grad()

      outputs = model(inputs)
      loss = loss_fn(outputs, targets)
      loss.backward()
      optimizer.step()

      # Weight each batch by its size so a smaller final batch does not skew
      # the epoch mean (assumes loss_fn returns a per-batch mean, which is the
      # nn.CrossEntropyLoss default).
      loss_train.update(loss.item(), n=targets.size(0))
      acc_train(outputs, targets.int())
      tepoch.set_postfix(loss=loss_train.avg,
                         accuracy=100.*acc_train.compute().item())
  return model, loss_train.avg, acc_train.compute().item()

In [None]:
def validation(model, test_loader, loss_fn):
  """Evaluate `model` on `test_loader` with gradients disabled.

  Args:
    model: network to evaluate; it is switched to eval mode.
    test_loader: iterable yielding (inputs, targets) batches.
    loss_fn: callable computing the loss from (outputs, targets).

  Returns:
    (mean loss, accuracy as returned by the metric).

  Relies on the notebook-level names `device`, `num_cls`, `AverageMeter`
  and `Accuracy`.
  """
  model.eval()
  loss_valid = AverageMeter()
  acc_valid = Accuracy(task='MULTICLASS', num_classes=num_cls).to(device)
  with torch.no_grad():
    for inputs, targets in test_loader:
      inputs = inputs.to(device)
      targets = targets.to(device)

      outputs = model(inputs)
      loss = loss_fn(outputs, targets)

      # Weight each batch by its size so a smaller final batch does not skew
      # the mean (assumes loss_fn returns a per-batch mean, which is the
      # nn.CrossEntropyLoss default).
      loss_valid.update(loss.item(), n=targets.size(0))
      acc_valid(outputs, targets.int())
  return loss_valid.avg, acc_valid.compute().item()

## Train

### Step 1: check forward path

Calculate loss for one batch

In [None]:
model = RNNModel(nn.LSTM, 50, 128, 1, False, num_cls).to(device)
loss_fn = nn.CrossEntropyLoss()

x_batch, y_batch = next(iter(train_loader))
outputs = model(x_batch.to(device))
loss = loss_fn(outputs, y_batch.to(device))
print(loss)



tensor(1.3891, grad_fn=<NllLossBackward0>)


### Step 2: check backward path

Select 500 random samples (served in batches of 20) and train the model on them

In [None]:
from torchtext.data.functional import to_map_style_dataset

In [None]:
train_set_map_style = to_map_style_dataset(train_set)



In [None]:
_, mini_train_dataset = random_split(train_set_map_style,
                                     (len(train_set_map_style)-500, 500))
mini_train_loader = DataLoader(mini_train_dataset, 20, collate_fn=collate)

In [None]:
model = RNNModel(nn.LSTM, 50, 128, 1, False, num_cls).to(device)
loss_fn = nn.CrossEntropyLoss()

In [None]:
optimizer = optim.SGD(model.parameters(), lr=0.1, momentum=0.9)

In [None]:
num_epochs = 100
for epoch in range(num_epochs):
  model, _, _ = train_one_epoch(model, mini_train_loader, loss_fn, optimizer, epoch)

### Step 3: select best lr

Train on all the data for one epoch with each candidate learning rate

In [None]:
# Learning-rate sweep: every candidate gets a freshly built model and
# optimizer and is trained for `num_epochs` epoch(s) on the full training set.
num_epochs = 1
for lr in (0.1, 0.01, 0.001, 0.0001):
  print(f'LR={lr}')
  model = RNNModel(nn.LSTM, 50, 128, 1, False, num_cls).to(device)
  optimizer = optim.SGD(model.parameters(),
                        lr=lr,
                        momentum=0.9,
                        weight_decay=1e-4)
  for epoch in range(num_epochs):
    model, _, _ = train_one_epoch(model, train_loader, loss_fn, optimizer, epoch)
  print()

### Step 4: small grid (optional)

Create a small grid based on the WD and the best LR



In [None]:
# Small grid over learning rate and weight decay; each configuration trains a
# freshly built model for `num_epochs` epochs.
num_epochs = 5

for lr in [0.05, 0.04, 0.03, 0.02, 0.01, 0.009, 0.008, 0.007, 0.006, 0.005]:
  for wd in [1e-4, 1e-5, 0.]:
    # Build a new model per configuration. `model` is already an RNNModel
    # instance at this point, so calling `model()` would invoke forward()
    # without an input and raise instead of creating a network.
    model = RNNModel(nn.LSTM, 50, 128, 1, False, num_cls).to(device)
    # momentum=0.9 matches the optimizer used in the other training steps, so
    # the values picked here carry over to the final run.
    optimizer = optim.SGD(model.parameters(), lr=lr, weight_decay=wd, momentum=0.9)
    print(f'LR={lr}, WD={wd}')

    for epoch in range(num_epochs):
      model, _, _ = train_one_epoch(model, train_loader, loss_fn, optimizer, epoch)
    print()

### Step 5: train more epochs

In [None]:
model = RNNModel(nn.LSTM, 50, 128, 1, False, num_cls).to(device)



In [None]:
# Hyper-parameters and optimizer for the long training run.
# NOTE(review): lr=0.5 lies outside every range searched in Steps 3 and 4
# (largest candidate 0.1), and the recorded run stays at loss ~1.386 (= ln 4)
# with ~25% accuracy, i.e. chance level for 4 classes. Presumably this should
# be the best LR found by the search — confirm.
lr = 0.5
wd = 1e-4
optimizer = optim.SGD(model.parameters(), lr=lr, weight_decay=wd, momentum=0.9)

In [None]:
loss_train_hist = []
loss_valid_hist = []

acc_train_hist = []
acc_valid_hist = []

best_loss_valid = torch.inf
epoch_counter = 0

In [None]:
num_epochs = 15

for epoch in range(num_epochs):
  # One training pass followed by one evaluation pass.
  model, loss_train, acc_train = train_one_epoch(
      model, train_loader, loss_fn, optimizer, epoch)
  loss_valid, acc_valid = validation(model, test_loader, loss_fn)

  # Record this epoch's metrics for the plots further down.
  loss_train_hist.append(loss_train)
  acc_train_hist.append(acc_train)
  loss_valid_hist.append(loss_valid)
  acc_valid_hist.append(acc_valid)

  # Checkpoint whenever the validation loss beats the best value seen so far.
  if best_loss_valid > loss_valid:
    torch.save(model, 'model.pt')
    best_loss_valid = loss_valid
    print('Model Saved!')

  print(f'Valid: Loss = {loss_valid:.4}, Acc = {acc_valid:.4}')
  print()

  # Number of completed epochs; used as the x-axis length when plotting.
  epoch_counter += 1

Epoch 0: : 938batch [04:58,  3.14batch/s, accuracy=25.2, loss=1.39]


Model Saved!
Valid: Loss = 1.386, Acc = 0.2524



Epoch 1: : 938batch [05:03,  3.09batch/s, accuracy=25.2, loss=1.39]


Valid: Loss = 1.386, Acc = 0.2524



Epoch 2: : 938batch [05:25,  2.88batch/s, accuracy=25.2, loss=1.39]


Valid: Loss = 1.386, Acc = 0.2524



Epoch 3: : 703batch [03:43,  3.14batch/s, accuracy=24.9, loss=1.39]


KeyboardInterrupt: 

## Plot

In [None]:
# Loss per epoch: training in red, validation in blue.
fig, ax = plt.subplots()
epochs = range(epoch_counter)
ax.plot(epochs, loss_train_hist, 'r-', label='Train')
ax.plot(epochs, loss_valid_hist, 'b-', label='Validation')

ax.set_xlabel('Epoch')
ax.set_ylabel('loss')
ax.grid(True)
ax.legend()

In [None]:
# Accuracy per epoch: training in red, validation in blue.
fig, ax = plt.subplots()
epochs = range(epoch_counter)
ax.plot(epochs, acc_train_hist, 'r-', label='Train')
ax.plot(epochs, acc_valid_hist, 'b-', label='Validation')

ax.set_xlabel('Epoch')
ax.set_ylabel('Acc')
ax.grid(True)
ax.legend()