In [2]:
import torch
torch.cuda.synchronize()
torch.cuda.empty_cache()

https://huggingface.co/docs/transformers/main/model_doc/albert#transformers.AlbertForMultipleChoice

In [3]:
from transformers import AutoTokenizer, AlbertForMultipleChoice
import torch

tokenizer = AutoTokenizer.from_pretrained("albert/albert-base-v2")
model = AlbertForMultipleChoice.from_pretrained("albert/albert-base-v2")

prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
choice0 = "It is eaten with a fork and a knife."
choice1 = "It is eaten while held in the hand."
labels = torch.tensor(0).unsqueeze(0)  # choice0 is correct (according to Wikipedia ;)), batch size 1

encoding = tokenizer([prompt, prompt], [choice0, choice1], return_tensors="pt", padding=True)
outputs = model(**{k: v.unsqueeze(0) for k, v in encoding.items()}, labels=labels)  # batch size is 1

for k, t in encoding.items():
    print(k, t.shape)

for k, t in outputs.items():
    print(k, t.shape)


  from .autonotebook import tqdm as notebook_tqdm
Some weights of AlbertForMultipleChoice were not initialized from the model checkpoint at albert/albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


input_ids torch.Size([2, 35])
token_type_ids torch.Size([2, 35])
attention_mask torch.Size([2, 35])
loss torch.Size([])
logits torch.Size([1, 2])


In [4]:
model.to(device = 'cuda')

AlbertForMultipleChoice(
  (albert): AlbertModel(
    (embeddings): AlbertEmbeddings(
      (word_embeddings): Embedding(30000, 128, padding_idx=0)
      (position_embeddings): Embedding(512, 128)
      (token_type_embeddings): Embedding(2, 128)
      (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0, inplace=False)
    )
    (encoder): AlbertTransformer(
      (embedding_hidden_mapping_in): Linear(in_features=128, out_features=768, bias=True)
      (albert_layer_groups): ModuleList(
        (0): AlbertLayerGroup(
          (albert_layers): ModuleList(
            (0): AlbertLayer(
              (full_layer_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
              (attention): AlbertSdpaAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bia

## Data Prepare

In [5]:
# 1. Read and Parse Data
import torch
from datasets import load_dataset
from torch.utils.data import Dataset, DataLoader

dataset = load_dataset("ehovy/race", "middle")

print(dataset)

class CustomDataset(Dataset):
    def __init__(self, data):
        self.data = data
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, i):
        example = self.data[i]
        prompt = f'{example['article']}'
        options = [f'{example['question']}\n\n{option}' for option in example['options']]
        label = ord(example['answer']) - ord('A')
        encoding = tokenizer([prompt] * len(options), options, return_tensors="pt", padding=True, truncation = 'only_first', max_length=512)
        encoding['labels'] = label
        return encoding

dataset = {
    'test': CustomDataset(dataset['test']),
    'train': CustomDataset(dataset['train']),
    'validation': CustomDataset(dataset['validation'])
}


DatasetDict({
    test: Dataset({
        features: ['example_id', 'article', 'answer', 'question', 'options'],
        num_rows: 1436
    })
    train: Dataset({
        features: ['example_id', 'article', 'answer', 'question', 'options'],
        num_rows: 25421
    })
    validation: Dataset({
        features: ['example_id', 'article', 'answer', 'question', 'options'],
        num_rows: 1436
    })
})


In [6]:
# 5. Create DataLoader
from torch.nn.utils.rnn import pad_sequence
from collections import defaultdict

def _pad_sequence(tensors: list[torch.Tensor]):
    tensors = [
        t.transpose(0, 1)
        for t in tensors
    ]
    batch = pad_sequence(tensors, batch_first=True, padding_value=0)
    return batch.transpose(1, 2).contiguous()

def collect_fn(batch):
    ret = defaultdict(list)
    labels = []
    for encoding in batch:
        for k in encoding.keys():
            if k == 'labels':
                labels.append(encoding[k])
            else:
                ret[k].append(encoding[k])
        

    ret = {k: _pad_sequence(tensors) for k, tensors in ret.items()}
    ret['labels'] = torch.tensor(labels, dtype=torch.uint8)
    
    return ret
    
test_dataloader = DataLoader(dataset['test'], batch_size=2, collate_fn=collect_fn, shuffle=True)

for example in test_dataloader:
    for k, tensor in example.items():
        print(k, tensor.shape)
    break

input_ids torch.Size([2, 4, 243])
token_type_ids torch.Size([2, 4, 243])
attention_mask torch.Size([2, 4, 243])
labels torch.Size([2])


## Test

In [7]:
def test(model):
    acc_count = 0
    total_count = 0
    with torch.no_grad():
        model.eval()
        for batch in test_dataloader:
            batch = {k:t.to(device='cuda')for k,t in batch.items()}
            outputs = model(**batch)
            _, preds = outputs.logits.max(dim = 1)
            labels = batch['labels']
            acc_count += (labels == preds).sum().item()
            total_count += labels.shape[0]

    print(acc_count / total_count)

test(model)

0.24094707520891365


## Train

In [8]:
from transformers import TrainingArguments, Trainer

# 訓練參數
training_args = TrainingArguments(
    output_dir="./results",
    logging_steps=1000,
    eval_steps=2000,
    eval_strategy='steps',
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=1,
    learning_rate=2e-5,
    weight_decay=0.01,
    logging_dir="./logs",
)

trainer = Trainer(
    data_collator=collect_fn,
    model=model,
    args=training_args,
    train_dataset=dataset['train'],
    eval_dataset=dataset['validation']
)

# 開始訓練
trainer.train()

Step,Training Loss,Validation Loss
2000,1.3895,1.386295
4000,1.3863,1.386294
6000,1.388,1.386294
8000,1.3875,1.386294
10000,1.39,1.386294
12000,1.3876,1.386294


TrainOutput(global_step=12711, training_loss=1.3884767651342218, metrics={'train_runtime': 1179.9444, 'train_samples_per_second': 21.544, 'train_steps_per_second': 10.773, 'total_flos': 1498065346657488.0, 'train_loss': 1.3884767651342218, 'epoch': 1.0})

In [9]:
test(model)

0.2576601671309192


8640MiB