# SST-2 binary text classification with the XLM-RoBERTa model

In [1]:
import torch
import torch.nn as nn

# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
_device_name = "cuda" if torch.cuda.is_available() else "cpu"
DEVICE = torch.device(_device_name)

## Data Transformation

In [2]:
import torchtext.transforms as T
from torch.hub import load_state_dict_from_url

# Special-token ids and sequence-length budget for the preprocessing pipeline.
# NOTE(review): these ids are presumably the XLM-R vocabulary's <s>/<pad>/</s>
# entries -- confirm against the downloaded vocab.
bos_idx = 0
padding_idx = 1
eos_idx = 2
max_seq_len = 256

# Remote locations of the vocabulary and the SentencePiece model.
xlmr_vocab_path = r"https://download.pytorch.org/models/text/xlmr.vocab.pt"
xlmr_spm_model_path = r"https://download.pytorch.org/models/text/xlmr.sentencepiece.bpe.model"

# Build the two downloaded components first (tokenizer, then vocab).
xlmr_tokenizer = T.SentencePieceTokenizer(xlmr_spm_model_path)
xlmr_vocab = load_state_dict_from_url(xlmr_vocab_path)

# Raw string -> sub-word pieces -> vocab ids -> truncated -> wrapped in BOS/EOS.
text_transform = T.Sequential(
    xlmr_tokenizer,
    T.VocabTransform(xlmr_vocab),
    # Reserve two slots so that BOS + EOS keep the result within max_seq_len.
    T.Truncate(max_seq_len - 2),
    T.AddToken(token=bos_idx, begin=True),
    T.AddToken(token=eos_idx, begin=False),
)

100%|██████████| 5.07M/5.07M [00:00<00:00, 6.97MB/s]
Downloading: "https://download.pytorch.org/models/text/xlmr.vocab.pt" to C:\Users\theRun/.cache\torch\hub\checkpoints\xlmr.vocab.pt


  0%|          | 0.00/4.85M [00:00<?, ?B/s]

## Dataset

In [10]:
from torchtext.datasets import SST2
from torch.utils.data import DataLoader

batch_size = 16


def apply_text_transform(sample):
    """Run ``text_transform`` on the text of one sample and keep its label.

    Args:
        sample: an indexable record whose item 0 is the raw text and item 1
            is the label.

    Returns:
        A ``(token_ids, label)`` tuple.

    A named function is used instead of a ``lambda`` because datapipes warn
    that lambdas are not supported by pickle, which matters as soon as the
    pipeline is handed to worker processes.
    """
    return text_transform(sample[0]), sample[1]


def prepare_datapipe(datapipe, batch_size=batch_size):
    """Tokenize, batch and columnarize a raw SST-2 datapipe.

    Args:
        datapipe: datapipe yielding raw samples (text at index 0, label at 1).
        batch_size: number of samples grouped into each batch.

    Returns:
        A datapipe yielding one dict per batch with the keys ``"token_ids"``
        and ``"target"``.
    """
    return (
        datapipe
        .map(apply_text_transform)
        .batch(batch_size)
        .rows2columnar(["token_ids", "target"])
    )


# Both splits go through exactly the same preprocessing.
train_datapipe = prepare_datapipe(SST2(split="train"))
dev_datapipe = prepare_datapipe(SST2(split="dev"))

# batch_size=None: batching is already done inside the datapipe, so the
# DataLoader must not batch a second time.
train_dataloader = DataLoader(train_datapipe, batch_size=None)
dev_dataloader = DataLoader(dev_datapipe, batch_size=None)



## Model Preparation

In [5]:
from torchtext.models import RobertaClassificationHead, XLMR_BASE_ENCODER

# Two output classes for the binary classification task.
num_classes = 2
# Width of the vector handed to the classification head.
input_dim  = 768

# Attach a classification head to the pre-trained encoder.
classifier_head = RobertaClassificationHead(num_classes=num_classes, input_dim=input_dim)
model = XLMR_BASE_ENCODER.get_model(head=classifier_head)

# Move the parameters to the selected device; as the last expression of the
# cell this also displays the module structure.
model.to(DEVICE)

Downloading: "https://download.pytorch.org/models/text/xlmr.base.encoder.pt" to C:\Users\theRun/.cache\torch\hub\checkpoints\xlmr.base.encoder.pt


  0%|          | 0.00/1.03G [00:00<?, ?B/s]

RobertaModel(
  (encoder): RobertaEncoder(
    (transformer): TransformerEncoder(
      (token_embedding): Embedding(250002, 768, padding_idx=1)
      (layers): ModuleList(
        (0): TransformerEncoderLayer(
          (dropout): Dropout(p=0.1, inplace=False)
          (attention): MultiheadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (input_projection): Linear(in_features=768, out_features=2304, bias=True)
            (output_projection): Linear(in_features=768, out_features=768, bias=True)
          )
          (residual_mlp): ResidualMLP(
            (mlp): Sequential(
              (0): Linear(in_features=768, out_features=3072, bias=True)
              (1): GELU()
              (2): Dropout(p=0.1, inplace=False)
              (3): Linear(in_features=3072, out_features=768, bias=True)
              (4): Dropout(p=0.1, inplace=False)
            )
          )
          (attention_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)


## Training methods

In [8]:
import torchtext.functional as F
from torch.optim import AdamW

# Loss over the classifier outputs, and the optimizer updating every model parameter.
learning_rate = 1e-5
criteria = nn.CrossEntropyLoss()
optim = AdamW(params=model.parameters(), lr=learning_rate)

def train_step(input_, target):
    """Run one optimisation step on a single batch.

    Args:
        input_: batch of inputs passed straight to ``model``.
        target: labels for the batch, passed to ``criteria`` with the model output.

    Returns:
        None. The model parameters are updated in place through ``optim``.
    """
    # evaluate() switches the model to eval mode and never switches it back,
    # so without this call every epoch after the first would train with
    # dropout disabled.
    model.train()

    output = model(input_)
    loss = criteria(output, target)

    # Clear gradients left over from the previous step before accumulating new ones.
    optim.zero_grad()
    loss.backward()
    optim.step()

def eval_step(input_, target):
    """Score one batch without updating the model.

    Args:
        input_: batch of inputs passed straight to ``model``.
        target: labels for the batch.

    Returns:
        A ``(loss, correct)`` pair of Python floats: the value of ``criteria``
        on the batch and the number of samples whose argmax along dim 1
        equals the label.
    """
    logits = model(input_)
    batch_loss = float(criteria(logits, target).item())
    hits = (logits.argmax(1) == target).type(torch.float)
    return batch_loss, hits.sum().item()

def evaluate():
    """Evaluate the model on every batch of ``dev_dataloader``.

    Returns:
        A ``(mean_loss, accuracy)`` pair: the unweighted mean of the per-batch
        losses and the fraction of correctly classified samples.

    Note:
        The model is switched to eval mode and left there; callers that
        resume training afterwards need to call ``model.train()`` again.
    """
    model.eval()

    loss_sum = 0
    n_correct = 0
    n_seen = 0
    n_batches = 0

    # Gradients are not needed for scoring.
    with torch.no_grad():
        for n_batches, batch in enumerate(dev_dataloader, start=1):
            # Pad the variable-length id lists into one rectangular tensor.
            input_ = F.to_tensor(batch["token_ids"], padding_value=padding_idx).to(DEVICE)
            target = torch.tensor(batch["target"]).to(DEVICE)

            batch_loss, batch_correct = eval_step(input_, target)

            loss_sum += batch_loss
            n_correct += batch_correct
            n_seen += len(target)

    return loss_sum / n_batches, n_correct / n_seen

## Train

In [11]:
num_epochs = 1


def _batch_to_tensors(batch):
    """Turn one columnar batch into (padded token ids, labels), both on DEVICE."""
    token_ids = F.to_tensor(batch["token_ids"], padding_value=padding_idx).to(DEVICE)
    labels = torch.tensor(batch["target"]).to(DEVICE)
    return token_ids, labels


for e in range(num_epochs):
    # One full pass over the training batches.
    for batch in train_dataloader:
        input_, target = _batch_to_tensors(batch)
        train_step(input_, target)

    # Report dev-set metrics at the end of each epoch.
    loss, accuracy = evaluate()
    print("Epoch = [{}], loss = [{}], accuracy = [{}]".format(e, loss, accuracy))

Epoch = [0], loss = [0.25478339956396007], accuracy = [0.9105504587155964]
