In [1]:
import torch

In [2]:
# Pick the compute backend in order of preference: CUDA GPU, then Apple MPS,
# and finally plain CPU when neither accelerator is reported as available.
if torch.cuda.is_available():
    device = "cuda:0"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
print(f"Using {device} device")

Using cuda:0 device


In [3]:
from datasets import load_dataset

In [4]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding

In [5]:
# Tokenizer and model are both loaded from the same pretrained checkpoint name.
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# A sequence-classification model with two output labels is requested; the
# warning printed below reports that the classifier weights are newly
# initialized rather than loaded from the checkpoint.
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=2,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
def tokenize_function(example):
    """Tokenize the `sentence1` / `sentence2` fields of `example` together.

    Both fields are handed to the module-level `tokenizer` in a single call
    with `truncation=True`, and the tokenizer's result is returned unchanged.
    """
    first_sentence = example["sentence1"]
    second_sentence = example["sentence2"]
    return tokenizer(first_sentence, second_sentence, truncation=True)

In [7]:
# Load the "mrpc" configuration of the "glue" dataset.
raw_datasets = load_dataset(path="glue", name="mrpc")
# Apply the tokenizer to every split; batched=True passes groups of examples
# to tokenize_function instead of one example at a time.
tokenized_datasets = raw_datasets.map(function=tokenize_function, batched=True)
# Collator that is later handed to the DataLoaders as their collate_fn.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Using the latest cached version of the dataset since glue couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'mrpc' at /home/mtheologitis/.cache/huggingface/datasets/glue/mrpc/0.0.0/bcdcba79d07bc864c1c254ccfcedcce55bcc9a8c (last modified on Sat Jul 27 15:52:18 2024).


In [9]:
# Inspect the validation split (its feature names and number of rows).
raw_datasets['validation']

Dataset({
    features: ['sentence1', 'sentence2', 'label', 'idx'],
    num_rows: 408
})

In [10]:
# Class balance of the validation split: how many rows carry each label value.
validation_labels = raw_datasets['validation'].to_pandas()['label']
dict(validation_labels.value_counts())

{1: 279, 0: 129}

In [11]:
# Total number of validation examples, read from the dataset itself rather
# than re-typed from the printed label counts, so it stays correct if the
# split ever changes.
n = raw_datasets['validation'].num_rows

In [12]:
# Fraction of validation examples with label 1, i.e. the accuracy of always
# predicting that label. The count is computed from the data instead of being
# copied by hand from an earlier cell's output.
int(raw_datasets['validation'].to_pandas()['label'].eq(1).sum()) / n

0.6838235294117647

In [8]:
# Reduce the tokenized splits to what the training loop feeds the model
# (every remaining column is passed as a keyword argument via `model(**batch)`),
# and expose the target under the name `labels`.
#
# Each step is guarded so the cell can be re-run safely: the unguarded version
# raises on a second execution because the columns are already gone/renamed.
# NOTE(review): column presence is checked on the "train" split only; this
# assumes every split has the same columns — confirm if splits ever diverge.
_existing_columns = tokenized_datasets["train"].column_names
_columns_to_drop = [
    name for name in ("sentence1", "sentence2", "idx") if name in _existing_columns
]
if _columns_to_drop:
    tokenized_datasets = tokenized_datasets.remove_columns(_columns_to_drop)
if "label" in _existing_columns:
    tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
# Have the datasets return torch tensors when indexed.
tokenized_datasets.set_format("torch")

In [25]:
# Sanity check: list the columns that remain in the training split.
tokenized_datasets["train"].column_names

['labels', 'input_ids', 'token_type_ids', 'attention_mask']

In [26]:
from torch.utils.data import DataLoader

# Both loaders use the same batch size and the same collate function.
BATCH_SIZE = 8

# Training batches are reshuffled on every pass over the data.
train_dataloader = DataLoader(
    tokenized_datasets["train"],
    batch_size=BATCH_SIZE,
    shuffle=True,
    collate_fn=data_collator,
)

# NOTE(review): evaluation currently iterates the "test" split; replace the
# key with "validation" to evaluate on the validation split instead.
eval_dataloader = DataLoader(
    tokenized_datasets["test"],
    batch_size=BATCH_SIZE,
    collate_fn=data_collator,
)

In [27]:
from torch.optim import AdamW
#from transformers import AdamW

In [28]:
# AdamW over all model parameters with a fixed learning rate.
learning_rate = 5e-5
optimizer = AdamW(params=model.parameters(), lr=learning_rate)

In [29]:
# Number of full passes over the training data.
num_epochs = 10
# One optimizer step is taken per batch, so the total step count (used to size
# the progress bar) is batches-per-epoch times the number of epochs.
steps_per_epoch = len(train_dataloader)
num_training_steps = steps_per_epoch * num_epochs

In [30]:
# Move the model to the selected device. The result is bound to `_` so that
# the cell does not display the returned value.
_ = model.to(device)

In [31]:
import evaluate

def compute_metrics(model, dataloader=None):
    """Evaluate `model` with the GLUE/MRPC metric, print and return the result.

    Args:
        model: Model to evaluate. It is called as `model(**batch)` and its
            output must expose `.logits`. It is switched to eval mode and is
            left in eval mode when this function returns, so callers that
            continue training must call `model.train()` again.
        dataloader: Iterable of batches (dicts of tensors that include a
            "labels" entry). Defaults to the module-level `eval_dataloader`,
            which preserves the original single-argument call.

    Returns:
        The value produced by `metric.compute()` (the same object that is
        printed), so callers can keep a per-epoch history instead of relying
        on the printed text alone.
    """
    if dataloader is None:
        dataloader = eval_dataloader

    # A fresh metric object per call, so no predictions carry over between
    # evaluations.
    metric = evaluate.load("glue", "mrpc")
    model.eval()

    # No gradients are needed for evaluation.
    with torch.no_grad():
        for batch in dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            logits = model(**batch).logits
            # Predicted class = index of the largest logit along the last axis.
            predictions = torch.argmax(logits, dim=-1)
            metric.add_batch(predictions=predictions, references=batch["labels"])

    results = metric.compute()
    print(results)
    return results

In [32]:
from tqdm.auto import tqdm

# One progress-bar tick per optimizer step across all epochs.
progress_bar = tqdm(range(num_training_steps))

for epoch in range(num_epochs):

    # compute_metrics() switches the model to eval mode at the end of every
    # epoch, so training mode has to be re-enabled here each time.
    model.train()
    for raw_batch in train_dataloader:
        # Move every tensor of the batch onto the model's device.
        inputs = {name: tensor.to(device) for name, tensor in raw_batch.items()}
        loss = model(**inputs).loss

        # Backpropagate, apply the update, then clear the gradients so they
        # do not accumulate into the next step. No learning-rate scheduler is
        # stepped here, so the optimizer keeps its initial learning rate.
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        progress_bar.update(1)

    # Report metrics on the evaluation dataloader after each epoch.
    compute_metrics(model)

  0%|          | 0/4590 [00:00<?, ?it/s]

{'accuracy': 0.6817391304347826, 'f1': 0.8062124955877162}
{'accuracy': 0.8034782608695652, 'f1': 0.8540680154972019}
{'accuracy': 0.8133333333333334, 'f1': 0.8619210977701544}
{'accuracy': 0.8023188405797101, 'f1': 0.8571428571428571}
{'accuracy': 0.7971014492753623, 'f1': 0.8516949152542372}
{'accuracy': 0.7878260869565218, 'f1': 0.849009900990099}
{'accuracy': 0.7553623188405797, 'f1': 0.8346394984326019}
{'accuracy': 0.7820289855072464, 'f1': 0.8492381716118684}
{'accuracy': 0.7756521739130435, 'f1': 0.8467326732673267}
{'accuracy': 0.664927536231884, 'f1': 0.7987465181058496}


In [None]:
# NOTE(review): this cell was never executed; it appears to hold metric output
# pasted from a different run. The accuracies are multiples of 1/408, which
# matches the 408-row validation split, so these are presumably per-epoch
# validation results — confirm and move into a markdown cell if so.
{'accuracy': 0.8186274509803921, 'f1': 0.8697183098591549}
{'accuracy': 0.7745098039215687, 'f1': 0.8145161290322581}
{'accuracy': 0.8406862745098039, 'f1': 0.8869565217391304}
{'accuracy': 0.8259803921568627, 'f1': 0.8830313014827018}
{'accuracy': 0.8406862745098039, 'f1': 0.8849557522123894}
{'accuracy': 0.821078431372549, 'f1': 0.8717047451669596}

In [38]:
# Inspect the training split (its feature names and number of rows).
raw_datasets['train']

Dataset({
    features: ['sentence1', 'sentence2', 'label', 'idx'],
    num_rows: 3668
})