# RoBERTa

Fine-tunes `roberta-base` for 4-class text classification. Supported by [huggingface/transformers](https://github.com/huggingface/transformers), PyTorch version.

In [1]:
# from google.colab import drive
# drive.mount('/content/drive')

In [2]:
%%capture
!pip install numpy pandas scikit-learn
!pip install transformers torchmetrics

In [3]:
import logging
import os

import numpy as np
import pandas as pd
import torch
import torch.optim.lr_scheduler as lr_scheduler
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from torchmetrics import F1Score, Accuracy
from tqdm import tqdm
from transformers import (AutoModelForSequenceClassification,
                          AutoTokenizer)

# Only surface log records at ERROR level or above.
logging.basicConfig(level=logging.ERROR)

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(f"PyTorch Device: {device}")

PyTorch Device: cuda


In [4]:
# --- Tunable hyperparameters ---
MAX_LEN = 512           # token length every example is truncated / padded to
BATCH_SIZE = 16
EPOCHS = 3
LEARNING_RATE = 3e-5    # initial learning rate handed to the optimizer
TRAIN_SIZE = 0.8        # fraction of the labelled data used for training

# --- Other constants ---
# Alternative project root when running from Google Drive on Colab:
# PROJ_DIR = "drive/MyDrive/CS4248 Project/"
PROJ_DIR = "./"
RANDOM_SEED = 123

# Seed NumPy and PyTorch as well (not only the train/validation split) so that
# weight initialisation, batch shuffling and dropout are repeatable as far as
# the libraries allow.
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

# Checkpoint file name encodes the main hyperparameters of the run.
MODEL_NAME = f"RoBERTa_L{MAX_LEN}_B{BATCH_SIZE}_LR{LEARNING_RATE}_W.pth"
print(MODEL_NAME)

RoBERTa_L512_B16_LR3e-05_W.pth


## Build Dataset

In [5]:
class MyDataset(Dataset):
    """Map-style dataset that tokenizes one row of a text/label table on demand.

    Args:
        data: table with ``'text'`` and ``'label'`` columns; both columns must
            support positional ``.iloc`` access.
        tokenizer: callable invoked with ``text``, ``max_length``,
            ``truncation``, ``padding`` and ``return_token_type_ids`` keyword
            arguments; its result must be indexable by ``'input_ids'``,
            ``'attention_mask'`` and ``'token_type_ids'``.
        max_len: length passed to the tokenizer as ``max_length``.
    """

    def __init__(self, data, tokenizer, max_len):
        self.text = data['text']
        self.label = data['label']
        self.max_len = max_len
        self.tokenizer = tokenizer

    def __len__(self):
        """Number of examples, i.e. the length of the text column."""
        return len(self.text)

    def __getitem__(self, index):
        """Tokenize the ``index``-th row and return its tensors.

        Returns:
            dict with ``'input_ids'``, ``'attention_mask'``,
            ``'token_type_ids'`` and ``'labels'``; every value is a
            ``torch.long`` tensor that has been moved to the module-level
            ``device``.
        """
        encoded = self.tokenizer(
            text=self.text.iloc[index],
            max_length=self.max_len,
            truncation=True,
            padding="max_length",
            return_token_type_ids=True
        )

        def as_long(values):
            # Build an int64 tensor and place it on the target device.
            return torch.tensor(values, dtype=torch.long).to(device)

        item = {
            field: as_long(encoded[field])
            for field in ('input_ids', 'attention_mask', 'token_type_ids')
        }
        item['labels'] = as_long(self.label.iloc[index])
        return item

In [6]:
# Read the labelled corpus; column names are supplied explicitly.
fulltrain = pd.read_csv(
    PROJ_DIR + 'raw_data/fulltrain.csv',
    names=['label', 'text'],
)
# Shift the labels down by one so that they become 0, 1, 2, 3.
fulltrain['label'] = fulltrain['label'].sub(1)

# Shuffled, seeded split into training and validation frames.
train, valid = train_test_split(
    fulltrain,
    random_state=RANDOM_SEED,
    shuffle=True,
    train_size=TRAIN_SIZE,
)

print(f"all labels: {fulltrain['label'].unique()}")
for caption, frame in (("full data", fulltrain), ("train data", train), ("valid data", valid)):
    print(f"{caption}: {frame.shape}")
train

all labels: [0 1 2 3]
full data: (48854, 2)
train data: (39083, 2)
valid data: (9771, 2)


Unnamed: 0,label,text
10635,0,According to a study published Thursday by res...
32278,2,Gulf War Veteran Charlie McGrath Issues Import...
29715,2,Using Nature to Solve Natures ProblemsHeather ...
147,0,"In a statement delivered to friends, family me..."
46173,3,But Zelaya's delegation immediately rejected t...
...,...,...
7763,0,During a press conference Thursday at the Mass...
15377,1,School Does Something SICKENING To Kids On 9/1...
17730,1,"He Saw This In The Distance, Did A Double Take..."
28030,2,Jurors Still Angry About Hershberger Trial; Fo...


In [7]:
# dataset and dataloader
# The tokenizer is loaded without extra keyword arguments: truncation is
# requested per call inside MyDataset, and RoBERTa's tokenizer has no
# lower-casing option, so `truncation=` / `do_lower_case=` had no effect here.
tokenizer = AutoTokenizer.from_pretrained('roberta-base')

train_ds = MyDataset(train, tokenizer, MAX_LEN)
valid_ds = MyDataset(valid, tokenizer, MAX_LEN)

# num_workers stays at 0 because MyDataset places tensors on `device` inside
# __getitem__, which is not suited to loading in worker processes.
train_loader = DataLoader(
    train_ds,
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=0
)
# Evaluation does not need shuffling; a fixed order keeps the batch-averaged
# validation loss identical from one evaluation to the next.
valid_loader = DataLoader(
    valid_ds,
    batch_size=BATCH_SIZE,
    shuffle=False,
    num_workers=0
)

In [8]:
# Drop the notebook-level names of the raw frames; the datasets built from
# them hold their own references to the columns they use.
del fulltrain, train, valid

## Fine-tune RoBERTa

In [9]:
# Load the pretrained roberta-base checkpoint with a 4-label sequence
# classification head, then move the model to the selected device.
model = AutoModelForSequenceClassification.from_pretrained(
    'roberta-base',
    num_labels=4,
)
model.to(device)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.bias', 'roberta.pooler.dense.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifie

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerN

In [10]:
# Shared metric objects used by valid_process; both are configured for the
# same 4-class, support-weighted setting and live on the selected device.
_metric_kwargs = dict(task="multiclass", num_classes=4, average="weighted")
f1 = F1Score(**_metric_kwargs).to(device)
acc = Accuracy(**_metric_kwargs).to(device)

def train_process(dataloader, model, loss_fn, optimizer):
    """Run one training pass over ``dataloader``.

    Args:
        dataloader: iterable of batches. Each batch is a dict that is unpacked
            into ``model(**batch)`` and must contain a ``'labels'`` entry;
            ``dataloader.dataset`` must support ``len()``.
        model: module whose call result exposes ``.logits``.
        loss_fn: callable ``loss_fn(logits, labels)`` returning a scalar tensor.
        optimizer: optimizer that updates the model's parameters.

    Returns:
        None. The running loss is printed every 500 batches.
    """
    model.train()   # set train mode
    size = len(dataloader.dataset)

    # Wrap the dataloader itself rather than enumerate(...): tqdm can then read
    # its length and show a total / ETA instead of a bare iteration counter.
    for batch, data in enumerate(tqdm(dataloader)):
        logits = model(**data).logits
        loss = loss_fn(logits, data['labels'])

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if batch % 500 == 0:
            # Approximate number of samples seen so far (batch size taken
            # from the current batch).
            current = (batch + 1) * len(data['labels'])
            print(f"\tloss: {loss.item():>7f}  [{current:>5d}/{size:>5d}]")

def valid_process(dataloader, model, loss_fn):
    """Evaluate ``model`` on ``dataloader`` and print loss, accuracy and F1.

    Args:
        dataloader: sized iterable of batches. Each batch is a dict that is
            unpacked into ``model(**batch)`` and must contain ``'labels'``.
        model: module whose call result exposes ``.logits``.
        loss_fn: callable ``loss_fn(logits, labels)`` returning a scalar tensor.

    Returns:
        None. The summary line is printed; the loss is the mean of the
        per-batch losses, and accuracy / F1 come from the module-level
        ``acc`` and ``f1`` metric objects.

    Raises:
        ValueError: if ``dataloader`` contains no batches.
    """
    model.eval()    # set eval mode
    num_batches = len(dataloader)
    if num_batches == 0:
        raise ValueError("valid_process received an empty dataloader")

    test_loss = 0.0
    # Collect per-batch tensors and concatenate once at the end: this keeps
    # the integer label dtype and avoids re-copying the growing result on
    # every batch.
    real_batches = []
    pred_batches = []

    with torch.no_grad():
        for data in tqdm(dataloader):
            logits = model(**data).logits
            real_batches.append(data['labels'])
            pred_batches.append(logits.argmax(1))
            test_loss += loss_fn(logits, data['labels']).item()
    test_loss /= num_batches

    real_label = torch.cat(real_batches)
    pred_label = torch.cat(pred_batches)
    f1_score = f1(pred_label, real_label)
    accuracy = acc(pred_label, real_label)
    print(f"Test Process: \n Avg loss: {test_loss:>8f}, Acc: {accuracy:>4f}, F1: {f1_score:>4f}\n")

In [11]:
# Optimisation setup: cross-entropy objective, AdamW over all model
# parameters, and a step schedule that multiplies the learning rate by 0.1
# each time scheduler.step() is called (once per epoch below).
optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)
loss_fn = torch.nn.CrossEntropyLoss()
scheduler = lr_scheduler.StepLR(optimizer, gamma=0.1, step_size=1)

# Each epoch: one training pass, one validation pass, then decay the LR.
for epoch in range(EPOCHS):
    header = f"Epoch {epoch + 1}\n-------------------------------"
    print(header)

    train_process(train_loader, model, loss_fn, optimizer)
    valid_process(valid_loader, model, loss_fn)

    scheduler.step()

print("Done!")

Epoch 1
-------------------------------


1it [00:00,  2.95it/s]

	loss: 1.360174  [   16/39083]


501it [02:39,  3.10it/s]

	loss: 0.010188  [ 8016/39083]


1001it [05:16,  3.13it/s]

	loss: 0.003823  [16016/39083]


1501it [07:54,  3.20it/s]

	loss: 0.001379  [24016/39083]


2001it [10:31,  3.18it/s]

	loss: 0.121089  [32016/39083]


2443it [12:50,  3.17it/s]
611it [01:20,  7.60it/s]


Test Process: 
 Avg loss: 0.052958, Acc: 0.982192, F1: 0.982196

Epoch 2
-------------------------------


1it [00:00,  3.12it/s]

	loss: 0.003878  [   16/39083]


501it [02:37,  3.25it/s]

	loss: 0.001184  [ 8016/39083]


1001it [05:15,  3.16it/s]

	loss: 0.000690  [16016/39083]


1501it [07:53,  3.15it/s]

	loss: 0.000405  [24016/39083]


2001it [10:31,  3.06it/s]

	loss: 0.000617  [32016/39083]


2443it [12:52,  3.16it/s]
611it [01:22,  7.42it/s]


Test Process: 
 Avg loss: 0.006513, Acc: 0.998158, F1: 0.998158

Epoch 3
-------------------------------


1it [00:00,  3.12it/s]

	loss: 0.000631  [   16/39083]


501it [02:38,  3.13it/s]

	loss: 0.000349  [ 8016/39083]


1001it [05:16,  2.94it/s]

	loss: 0.000772  [16016/39083]


1501it [07:54,  3.22it/s]

	loss: 0.000406  [24016/39083]


2001it [10:32,  3.15it/s]

	loss: 0.000363  [32016/39083]


2443it [12:52,  3.16it/s]
611it [01:19,  7.64it/s]

Test Process: 
 Avg loss: 0.006487, Acc: 0.997851, F1: 0.997851

Done!





In [12]:
# Create the checkpoint directory first so that a missing folder cannot make
# the save fail after training has already finished.
os.makedirs(PROJ_DIR + "model/", exist_ok=True)
torch.save(model.state_dict(), PROJ_DIR + "model/" + MODEL_NAME)

## Test

In [13]:
# Rebuild the tokenizer and a 4-label roberta-base classifier, then overwrite
# its weights with the fine-tuned state dict saved earlier.
tokenizer = AutoTokenizer.from_pretrained('roberta-base')
model = AutoModelForSequenceClassification.from_pretrained('roberta-base', num_labels=4)
# map_location remaps the stored tensors onto the current device, so a
# checkpoint written on a GPU can also be loaded on a CPU-only machine.
model.load_state_dict(torch.load(PROJ_DIR+"model/"+MODEL_NAME, map_location=device))
model.to(device)
loss_fn = torch.nn.CrossEntropyLoss()


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.bias', 'roberta.pooler.dense.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifie

In [14]:
# Read the test set; column names are supplied explicitly.
balancedtest = pd.read_csv(PROJ_DIR + 'raw_data/balancedtest.csv', names = ['label', 'text'])
# Shift the labels down by one, matching the preprocessing of the training data.
balancedtest['label'] = balancedtest['label'] - 1

test_ds = MyDataset(balancedtest, tokenizer, MAX_LEN)
# Evaluation does not need shuffling; a fixed order keeps the batch-averaged
# test loss identical from one evaluation to the next.
test_loader = DataLoader(
    test_ds,
    batch_size=BATCH_SIZE,
    shuffle=False,
    num_workers=0
)

In [15]:
# Evaluate the reloaded model on the training split (the data it was fitted on).
valid_process(train_loader, model, loss_fn)

2443it [05:18,  7.68it/s]

Test Process: 
 Avg loss: 0.001864, Acc: 0.999514, F1: 0.999514






In [16]:
# Evaluate the reloaded model on the validation split held out from fulltrain.csv.
valid_process(valid_loader, model, loss_fn)

611it [01:20,  7.63it/s]

Test Process: 
 Avg loss: 0.006487, Acc: 0.997851, F1: 0.997851






In [17]:
# Evaluate the reloaded model on the separate balancedtest.csv test set.
# NOTE(review): in the recorded run this scored far lower (Acc ~0.70) than the
# validation split (Acc ~0.998); the cause is not established here.
valid_process(test_loader, model, loss_fn)

188it [00:23,  7.86it/s]

Test Process: 
 Avg loss: 2.173389, Acc: 0.698333, F1: 0.642204




