In [7]:
!pip install datasets



In [8]:
import torch
from transformers import RobertaForSequenceClassification, RobertaTokenizer

MODEL_NAME_OR_PATH = 'roberta-base'
MAX_INPUT_LENGTH = 256
BATCH_SIZE = 16
TRAINING_EPOCHS = 2
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Initialize model and tokenizer
model = RobertaForSequenceClassification.from_pretrained(MODEL_NAME_OR_PATH).to(DEVICE)
tokenizer = RobertaTokenizer.from_pretrained(MODEL_NAME_OR_PATH)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
from datasets import load_dataset
from torch.utils.data import DataLoader

qnli_dataset = load_dataset('glue', 'qnli')

def convert_example_to_features(example: dict) -> dict:
    features = tokenizer(
        example['question'], example['sentence'],
        max_length=MAX_INPUT_LENGTH, padding='max_length',
        truncation='longest_first'
    )
    features['labels'] = example['label']
    return features

def collate(batch: list) -> dict:
    return {
        'input_ids': torch.tensor([item['input_ids'] for item in batch]).to(DEVICE),
        'attention_mask': torch.tensor([item['attention_mask'] for item in batch]).to(DEVICE),
        'labels': torch.tensor([item['labels'] for item in batch]).to(DEVICE),
    }

train_dataset = qnli_dataset['train'].map(convert_example_to_features)
validation_dataset = qnli_dataset['validation'].map(convert_example_to_features)

train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate)
validation_dataloader = DataLoader(validation_dataset, batch_size=BATCH_SIZE, collate_fn=collate)

README.md:   0%|          | 0.00/35.3k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/872k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/877k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/104743 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/5463 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/5463 [00:00<?, ? examples/s]

Map:   0%|          | 0/104743 [00:00<?, ? examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

Map:   0%|          | 0/5463 [00:00<?, ? examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


In [10]:
from transformers import get_linear_schedule_with_warmup

WEIGHT_DECAY = 0.01
LEARNING_RATE = 2e-5
WARMUP_PROPORTION = 0.1

no_decay = ["bias", "LayerNorm.weight"]
optimizer_grouped_parameters = [
    {
        "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
        "weight_decay": WEIGHT_DECAY,
        "lr": LEARNING_RATE
    },
    {
        "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
        "weight_decay": 0.0,
        "lr": LEARNING_RATE
    },
]

optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=LEARNING_RATE)
num_training_steps = len(train_dataloader) * TRAINING_EPOCHS
num_warmup_steps = int(WARMUP_PROPORTION * num_training_steps)
lr_scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps)

In [11]:
from torch.cuda.amp import GradScaler, autocast

MIXED_PRECISION_TRAINING = torch.cuda.is_available()
scaler = GradScaler() if MIXED_PRECISION_TRAINING else None

  scaler = GradScaler() if MIXED_PRECISION_TRAINING else None


In [13]:
import tqdm
from sklearn.metrics import f1_score

def training_step(batch):
    model.train()
    optimizer.zero_grad()

    with autocast(enabled=MIXED_PRECISION_TRAINING):
        loss = model(**batch).loss

    if MIXED_PRECISION_TRAINING:
        scaler.scale(loss).backward()
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        scaler.step(optimizer)
        scaler.update()
    else:
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()

    lr_scheduler.step()
    return loss.item()

def evaluate(dataloader):
    model.eval()
    predictions, labels = [], []

    with torch.no_grad():
        for batch in tqdm.tqdm(dataloader, desc="Evaluation"):
            with autocast(enabled=MIXED_PRECISION_TRAINING):
                logits = model(**batch).logits
            predictions.append(logits.argmax(dim=-1).detach().cpu())
            labels.append(batch['labels'].detach().cpu())

    predictions = torch.cat(predictions)
    labels = torch.cat(labels)

    f1 = f1_score(labels.numpy(), predictions.numpy(), average='weighted')
    return f1

In [14]:
for epoch in range(TRAINING_EPOCHS):
    model.train()
    total_loss = 0
    for batch in tqdm.tqdm(train_dataloader, desc=f"Epoch {epoch+1}"):
        total_loss += training_step(batch)
    print(f"Epoch {epoch+1} Loss: {total_loss / len(train_dataloader)}")

    f1 = evaluate(validation_dataloader)
    print(f"Validation F1 Score: {f1}")

  with autocast(enabled=MIXED_PRECISION_TRAINING):
Epoch 1: 100%|██████████| 6547/6547 [23:58<00:00,  4.55it/s]


Epoch 1 Loss: 0.3491826747372412


  with autocast(enabled=MIXED_PRECISION_TRAINING):
Evaluation: 100%|██████████| 342/342 [00:19<00:00, 17.86it/s]


Validation F1 Score: 0.9150063523049076


  with autocast(enabled=MIXED_PRECISION_TRAINING):
Epoch 2: 100%|██████████| 6547/6547 [23:56<00:00,  4.56it/s]


Epoch 2 Loss: 0.2172062936524444


  with autocast(enabled=MIXED_PRECISION_TRAINING):
Evaluation: 100%|██████████| 342/342 [00:19<00:00, 17.90it/s]


Validation F1 Score: 0.9229339156493394
