In [None]:
#@title Load Model & Tokenizer

!pip install datasets
!pip install evaluate
!pip install transformers

import torch
import transformers

torch.set_grad_enabled(False)

DEVICE = torch.device("cuda")
print(f'{DEVICE=}')

pretrain_model_name = 'gpt2'  # gpt2-small or gpt2-medium
print(f'{pretrain_model_name=}')

TOKENIZER = transformers.AutoTokenizer.from_pretrained(pretrain_model_name)
TOKENIZER.pad_token = TOKENIZER.eos_token
MODEL = transformers.AutoModelForCausalLM.from_pretrained(pretrain_model_name, pad_token_id=TOKENIZER.eos_token_id).to(DEVICE)

print(f'{TOKENIZER=} {MODEL=}')

DEVICE=device(type='cuda')
pretrain_model_name='gpt2'


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


TOKENIZER=GPT2TokenizerFast(name_or_path='gpt2', vocab_size=50257, model_max_length=1024, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>', 'pad_token': '<|endoftext|>'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	50256: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
} MODEL=GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e

In [None]:
#@title add dropout & freeze layers
# MODEL.transformer.drop.p = 0.3
print(f'{MODEL.transformer.drop.p=}')

num_layers = len(MODEL.transformer.h)
freeze_layer = 6

for i, layer in enumerate(MODEL.transformer.h):
    if i < freeze_layer:
        for param in layer.parameters():
            param.requires_grad = False
    else:
        for param in layer.parameters():
            param.requires_grad = True

# Verify which parameters are frozen
for name, param in MODEL.named_parameters():
    print(f"{name}: requires_grad={param.requires_grad}")

MODEL.transformer.drop.p=0.1
transformer.wte.weight: requires_grad=True
transformer.wpe.weight: requires_grad=True
transformer.h.0.ln_1.weight: requires_grad=False
transformer.h.0.ln_1.bias: requires_grad=False
transformer.h.0.attn.c_attn.weight: requires_grad=False
transformer.h.0.attn.c_attn.bias: requires_grad=False
transformer.h.0.attn.c_proj.weight: requires_grad=False
transformer.h.0.attn.c_proj.bias: requires_grad=False
transformer.h.0.ln_2.weight: requires_grad=False
transformer.h.0.ln_2.bias: requires_grad=False
transformer.h.0.mlp.c_fc.weight: requires_grad=False
transformer.h.0.mlp.c_fc.bias: requires_grad=False
transformer.h.0.mlp.c_proj.weight: requires_grad=False
transformer.h.0.mlp.c_proj.bias: requires_grad=False
transformer.h.1.ln_1.weight: requires_grad=False
transformer.h.1.ln_1.bias: requires_grad=False
transformer.h.1.attn.c_attn.weight: requires_grad=False
transformer.h.1.attn.c_attn.bias: requires_grad=False
transformer.h.1.attn.c_proj.weight: requires_grad=False

In [None]:
from datasets import Dataset
from torch.utils.data import DataLoader
import random


min_value=1000
max_value=10000
def gen():
  for i in range(size):
    a = random.randint(min_value, max_value)
    b = random.randint(min_value, max_value)
    question = f'{a} + {b} ='
    answer = a + b
    yield {'question': question, 'answer': str(answer)}

size=100000
TRAIN_DATASET = Dataset.from_generator(gen)
size=100
VALIDATE_DATASET = Dataset.from_generator(gen)
size=100
TEST_DATASET = Dataset.from_generator(gen)

# preprocess
def preprocess(batch):
  max_length = 7
  input = [q.strip() for q in batch['question']]
  input = TOKENIZER(input, max_length=max_length, truncation=True, padding="max_length")
  labels  = batch["answer"]
  labels = TOKENIZER(labels, truncation=True, max_length=max_length, padding="max_length")
  # Replace padding token id in labels with -100 for PyTorch loss calculation
  labels["input_ids"] = [
        [(label if label != TOKENIZER.pad_token_id else -100) for label in label_ids]
        for label_ids in labels["input_ids"]
  ]

  # for label_ids in labels["input_ids"]:
  #   for i, label in enumerate(label_ids):
  #     if label == -100:
  #       label_ids[i] = TOKENIZER.eos_token_id
  #       break
  input['labels'] = labels['input_ids']
  return input

Generating train split: 0 examples [00:00, ? examples/s]

In [None]:
def collate_fn(batch):
    """Collate function to process a batch of prompts and answers."""
    prompts = [item['question'] for item in batch]
    answers = torch.tensor([item['answer'] for item in batch], dtype=torch.float32)
    return prompts, answers

print(f'{len(TRAIN_DATASET)=} {len(VALIDATE_DATASET)=} {len(TEST_DATASET)=}')
print(TRAIN_DATASET[0])
print(TRAIN_DATASET[10])

TRAIN_DATASET = TRAIN_DATASET.map(preprocess, batched=True, remove_columns=TRAIN_DATASET.column_names)
TRAIN_DATASET.set_format("torch")
VALIDATE_DATASET = VALIDATE_DATASET.map(preprocess, batched=True, remove_columns=VALIDATE_DATASET.column_names)
VALIDATE_DATASET.set_format("torch")
TEST_DATASET = TEST_DATASET.map(preprocess, batched=True, remove_columns=TEST_DATASET.column_names)
TEST_DATASET.set_format("torch")

print(TRAIN_DATASET[0])
print(TRAIN_DATASET[10])

BATCH_SIZE = 64
TRAIN_DATALOADER = DataLoader(TRAIN_DATASET, shuffle=True, batch_size=BATCH_SIZE)
VALID_DATALOADER = DataLoader(VALIDATE_DATASET, shuffle=True, batch_size=BATCH_SIZE)
TEST_DATALOADER = DataLoader(TEST_DATASET, shuffle=True, batch_size=BATCH_SIZE)

print(f'{TRAIN_DATALOADER=} {VALID_DATALOADER=} {TEST_DATALOADER=}')

len(TRAIN_DATASET)=100000 len(VALIDATE_DATASET)=100 len(TEST_DATASET)=100
{'question': '9855 + 7131 =', 'answer': '16986'}
{'question': '1454 + 7652 =', 'answer': '9106'}


Map:   0%|          | 0/100000 [00:00<?, ? examples/s]

{'input_ids': tensor([ 4089,  2816,  1343,   767, 22042,   796, 50256]), 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 0]), 'labels': tensor([ 1433, 49087,  -100,  -100,  -100,  -100,  -100])}
{'input_ids': tensor([ 1415,  4051,  1343,   767, 43193,   796, 50256]), 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 0]), 'labels': tensor([   24, 15801,  -100,  -100,  -100,  -100,  -100])}
TRAIN_DATALOADER=<torch.utils.data.dataloader.DataLoader object at 0x7c24fb5bcf40> VALID_DATALOADER=<torch.utils.data.dataloader.DataLoader object at 0x7c24da49b220> TEST_DATALOADER=<torch.utils.data.dataloader.DataLoader object at 0x7c25049ebb50>


In [None]:
#@title eval before fine tuning
import evaluate

torch.set_grad_enabled(False)
MODEL.eval()
output_tokens = MODEL.generate(torch.as_tensor(TOKENIZER.encode("47 + 35=")).view(1, -1).to(DEVICE))
print('generated: ', output_tokens[0])
print('generated: ', TOKENIZER.decode(output_tokens[0]))

output_tokens = MODEL.generate(torch.as_tensor(TOKENIZER.encode("15 + 70=")).view(1, -1).to(DEVICE))
print('generated: ', output_tokens[0])
print('generated: ', TOKENIZER.decode(output_tokens[0]))

metric = evaluate.load("accuracy")

for batch in VALID_DATALOADER:
    input_ids = batch["input_ids"].clone().detach().to(DEVICE)
    attention_mask = batch["attention_mask"].clone().detach().to(DEVICE)
    labels = batch["labels"].clone().detach().to(DEVICE)
    with torch.no_grad():
      outputs = MODEL(input_ids=input_ids, attention_mask=attention_mask, labels=labels)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    for p, l in zip(predictions, labels):
      metric.add_batch(predictions=p, references=l)

metric.compute()


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


generated:  tensor([2857, 1343, 3439,   28,   15,   13,   20,  198,  198,  464, 1708, 3084,
        2523,  262, 2811, 1271,  286, 2173,  583,  983], device='cuda:0')
generated:  47 + 35=0.5

The following table shows the average number of points per game
generated:  tensor([1314, 1343, 4317,   28,   15,   13,   20,  198,  198,   10, 4317,   28,
          15,   13,   20, 1343, 4317,   28,   15,   13], device='cuda:0')
generated:  15 + 70=0.5

+ 70=0.5 + 70=0.


{'accuracy': 0.0}

In [None]:
#@title fine tune

from torch.optim import AdamW
from tqdm.auto import tqdm
from transformers import get_scheduler

torch.set_grad_enabled(True)

optimizer = AdamW(MODEL.parameters(), lr=1e-4)
num_epochs = 1000
num_training_steps = num_epochs * len(TRAIN_DATALOADER)
lr_scheduler = get_scheduler(
    name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)

progress_bar = tqdm(range(num_training_steps))


for epoch in range(num_epochs):
    train_loss = 0
    MODEL.train()
    for batch in TRAIN_DATALOADER:
        input_ids = batch["input_ids"].clone().detach().to(DEVICE)
        attention_mask = batch["attention_mask"].clone().detach().to(DEVICE)
        labels = batch["labels"].clone().detach().to(DEVICE)
        outputs = MODEL(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        train_loss += loss.item()
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)
    avg_train_loss = train_loss / len(TRAIN_DATALOADER)
    print(f"Epoch {epoch + 1}: Training Loss = {avg_train_loss:.4f}")

    MODEL.eval()
    val_loss = 0
    with torch.no_grad():
        for batch in VALID_DATALOADER:
            input_ids = batch["input_ids"].clone().detach().to(DEVICE)
            attention_mask = batch["attention_mask"].clone().detach().to(DEVICE)
            labels = batch["labels"].clone().detach().to(DEVICE)

            outputs = MODEL(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            val_loss += loss.item()
    avg_val_loss = val_loss / len(VALID_DATALOADER)
    print(f"Epoch {epoch + 1}: Validation Loss = {avg_val_loss:.4f}")

    # checkpoint
    file_name = "math_checkpoint/epoch_" + str(epoch)
    torch.save(MODEL.state_dict(), file_name)





  0%|          | 0/1563000 [00:00<?, ?it/s]

Epoch 1: Training Loss = 5.9793
Epoch 1: Validation Loss = 6.1348
Epoch 2: Training Loss = 5.9582
Epoch 2: Validation Loss = 6.0424
Epoch 3: Training Loss = 5.9533
Epoch 3: Validation Loss = 6.0641
Epoch 4: Training Loss = 5.9491
Epoch 4: Validation Loss = 6.0689


In [None]:
#@title evaluate test split

import evaluate

torch.set_grad_enabled(False)
MODEL.eval()

prompts = ["8699 + 8256 =", "1233 + 4567 =", "9955 + 7732 ="]

for p in prompts:
  output_tokens = MODEL.generate(torch.as_tensor(TOKENIZER.encode(p)).view(1, -1).to(DEVICE),
                                num_beams=3,
                                 early_stopping=True,
                                 max_new_tokens=5,
                                do_sample=True,
                                pad_token_id=TOKENIZER.eos_token_id
                                )
  print('generated: ', TOKENIZER.decode(output_tokens[0]))

# "what holiday is first weekend of september"

metric = evaluate.load("accuracy")

for batch in TEST_DATALOADER:
    input_ids = batch["input_ids"].clone().detach().to(DEVICE)
    attention_mask = batch["attention_mask"].clone().detach().to(DEVICE)
    labels = batch["labels"].clone().detach().to(DEVICE)
    with torch.no_grad():
      outputs = MODEL(input_ids=input_ids, attention_mask=attention_mask, labels=labels)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    for p, l in zip(predictions, labels):
      metric.add_batch(predictions=p, references=l)

metric.compute()