https://huggingface.co/docs/transformers/main/model_doc/bert#transformers.BertForSequenceClassification

In [8]:
import torch
from transformers import AutoTokenizer, BertForSequenceClassification, BertConfig

tokenizer = AutoTokenizer.from_pretrained("textattack/bert-base-uncased-yelp-polarity")
model = BertForSequenceClassification.from_pretrained("textattack/bert-base-uncased-yelp-polarity")

encoding = tokenizer("Hello, my dog is cute", return_tensors="pt")

config = BertConfig(
    vocab_size=len(tokenizer),
    num_labels=4 
)
model = BertForSequenceClassification(config)

labels = torch.tensor([1])
outputs = model(**encoding, labels=labels)

for k, t in encoding.items():
    print(k, t.shape)

for k, t in outputs.items():
    print(k, t.shape)



input_ids torch.Size([1, 8])
token_type_ids torch.Size([1, 8])
attention_mask torch.Size([1, 8])
loss torch.Size([])
logits torch.Size([1, 4])


In [9]:
print(model.to(device='cuda'))

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

## Data Prepare

In [None]:
# 1. Read and Parse Data
import torch
from datasets import load_dataset
from torch.utils.data import Dataset, DataLoader

dataset = load_dataset("ehovy/race", "middle")

print(dataset)

class CustomDataset(Dataset):
    def __init__(self, data):
        self.data = data
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, i):
        example = self.data[i]
        prompt = f'{example['article']}'
        question = f'{example['question']}\n\n{'\n\n'.join(f'{i}. {opt}' for opt in example['options'])}'
        label = ord(example['answer']) - ord('A')
        encoding = tokenizer(prompt, question, return_tensors="pt", padding=True, truncation = 'only_first', max_length=512)
        encoding['labels'] = label
        return encoding

dataset = {
    'test': CustomDataset(dataset['test']),
    'train': CustomDataset(dataset['train']),
    'validation': CustomDataset(dataset['validation'])
}


In [None]:
# 5. Create DataLoader
from torch.nn.utils.rnn import pad_sequence
from collections import defaultdict

def _pad_sequence(tensors: list[torch.Tensor]):
    tensors = [
        t.squeeze(0)
        for t in tensors
    ]
    batch = pad_sequence(tensors, batch_first=True, padding_value=0)
    return batch.contiguous()

def collect_fn(batch):
    ret = defaultdict(list)
    labels = []
    for encoding in batch:
        for k in encoding.keys():
            if k == 'labels':
                labels.append(encoding[k])
            else:
                ret[k].append(encoding[k])
        

    ret = {k: _pad_sequence(tensors) for k, tensors in ret.items()}
    ret['labels'] = torch.tensor(labels, dtype=torch.uint8)
    
    return ret
    
test_dataloader = DataLoader(dataset['test'], batch_size=2, collate_fn=collect_fn, shuffle=True)

for example in test_dataloader:
    for k, tensor in example.items():
        print(k, tensor.shape)
    break

input_ids torch.Size([2, 338])
token_type_ids torch.Size([2, 338])
attention_mask torch.Size([2, 338])
labels torch.Size([2])


## Train

In [None]:
from transformers import TrainingArguments, Trainer

# 訓練參數
training_args = TrainingArguments(
    output_dir="./results",
    logging_steps=1000,
    eval_steps=2000,
    eval_strategy='steps',
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=1,
    learning_rate=2e-5,
    weight_decay=0.01,
    logging_dir="./logs",
)

trainer = Trainer(
    data_collator=collect_fn,
    model=model,
    args=training_args,
    train_dataset=dataset['train'],
    eval_dataset=dataset['validation']
)

# 開始訓練
trainer.train()

Step,Training Loss,Validation Loss
2000,1.4274,1.387358
4000,1.4127,1.435543
6000,1.4098,1.420333
8000,1.4113,1.428348
10000,1.3974,1.39047
12000,1.4032,1.3923
14000,1.4038,1.394403
16000,1.3984,1.398624
18000,1.3992,1.388769
20000,1.3979,1.389451


KeyboardInterrupt: 

In [None]:
def test(model):
    acc_count = 0
    total_count = 0
    with torch.no_grad():
        model.eval()
        for batch in test_dataloader:
            batch = {k:t.to(device='cuda')for k,t in batch.items()}
            outputs = model(**batch)
            _, preds = outputs.logits.max(dim = 1)
            labels = batch['labels']
            acc_count += (labels == preds).sum().item()
            total_count += labels.shape[0]

    print(acc_count / total_count)

test(model)

0.27019498607242337


2786MiB