In [1]:
#@title Load Model & Tokenizer

!pip install datasets > /dev/null
!pip install evaluate > /dev/null
!pip install transformers > /dev/null

import torch
import transformers

torch.set_grad_enabled(False)

DEVICE = torch.device("cuda")
print(f'{DEVICE=}')

pretrain_model_name = 'gpt2'  # gpt2-small or gpt2-medium
print(f'{pretrain_model_name=}')

TOKENIZER = transformers.AutoTokenizer.from_pretrained(pretrain_model_name)
TOKENIZER.pad_token = TOKENIZER.eos_token
MODEL = transformers.AutoModelForCausalLM.from_pretrained(pretrain_model_name, pad_token_id=TOKENIZER.eos_token_id).to(DEVICE)

print(f'{TOKENIZER=} {MODEL=}')

DEVICE=device(type='cuda')
pretrain_model_name='gpt2'


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


TOKENIZER=GPT2TokenizerFast(name_or_path='gpt2', vocab_size=50257, model_max_length=1024, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>', 'pad_token': '<|endoftext|>'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	50256: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
} MODEL=GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e

In [2]:
#@title add dropout & freeze layers
# MODEL.transformer.drop.p = 0.5
print(f'{MODEL.transformer.drop.p=} {len(MODEL.transformer.h)=}')


num_layers = len(MODEL.transformer.h)
freeze_up_to = num_layers // 2

for i, layer in enumerate(MODEL.transformer.h):
    if i < freeze_up_to:
        for param in layer.parameters():
            param.requires_grad = False
    else:
        for param in layer.parameters():
            param.requires_grad = True

# Verify which parameters are frozen
for name, param in MODEL.named_parameters():
    print(f"{name}: requires_grad={param.requires_grad}")

MODEL.transformer.drop.p=0.1 len(MODEL.transformer.h)=12
transformer.wte.weight: requires_grad=True
transformer.wpe.weight: requires_grad=True
transformer.h.0.ln_1.weight: requires_grad=False
transformer.h.0.ln_1.bias: requires_grad=False
transformer.h.0.attn.c_attn.weight: requires_grad=False
transformer.h.0.attn.c_attn.bias: requires_grad=False
transformer.h.0.attn.c_proj.weight: requires_grad=False
transformer.h.0.attn.c_proj.bias: requires_grad=False
transformer.h.0.ln_2.weight: requires_grad=False
transformer.h.0.ln_2.bias: requires_grad=False
transformer.h.0.mlp.c_fc.weight: requires_grad=False
transformer.h.0.mlp.c_fc.bias: requires_grad=False
transformer.h.0.mlp.c_proj.weight: requires_grad=False
transformer.h.0.mlp.c_proj.bias: requires_grad=False
transformer.h.1.ln_1.weight: requires_grad=False
transformer.h.1.ln_1.bias: requires_grad=False
transformer.h.1.attn.c_attn.weight: requires_grad=False
transformer.h.1.attn.c_attn.bias: requires_grad=False
transformer.h.1.attn.c_proj

# Load and Preprocess

## step 1, (1 of 2) Load squad_v2

In [13]:
from datasets import load_dataset
from torch.utils.data import DataLoader

squad_dataset = load_dataset("squad_v2")
TRAIN_DATASET = load_dataset("squad_v2", split='train[:100]')
# TRAIN_DATASET = squad_dataset['train']
squad_dataset = squad_dataset['validation'].train_test_split(test_size=0.2)
VALIDATE_DATASET = squad_dataset['train']
TEST_DATASET = squad_dataset['test']

# only keep the examples with answers
def keep_true(example):
  return len(example['answers']["text"]) > 0

# preprocess
def preprocess(batch):
  max_length = 16
  # input = [c + q for c, q in zip(batch["context"], batch['question'])]
  input = batch["question"]
  input = TOKENIZER(input, max_length=max_length, truncation=True, padding="max_length")

  labels  = [
      a["text"][0] if len(a["text"]) > 0 else "No answer"
      for a in batch["answers"]
  ]
  labels = TOKENIZER(labels, truncation=True, max_length=max_length, padding="max_length")
  # Replace padding token id in labels with -100 for PyTorch loss calculation
  labels["input_ids"] = [
        [(label if label != TOKENIZER.pad_token_id else -100) for label in label_ids]
        for label_ids in labels["input_ids"]
  ]
  input['labels'] = labels['input_ids']
  return input

## step 1, (2 of 2) Load wiki_qa

In [None]:
#@title wikiqa

from datasets import load_dataset
from torch.utils.data import DataLoader

squad_dataset = load_dataset("wiki_qa")
TRAIN_DATASET = squad_dataset['train']
VALIDATE_DATASET = squad_dataset['validation']
TEST_DATASET = squad_dataset['test']

# only keep the true examples
def keep_true(example):
  return example['label'] == 1

# preprocess
def preprocess(batch):
  max_length = 50
  input = batch["question"]
  input = TOKENIZER(input, max_length=max_length, truncation=True, padding="max_length")

  labels = batch["answer"]
  labels = TOKENIZER(labels, truncation=True, max_length=max_length, padding="max_length")
  # Replace padding token id in labels with -100 for PyTorch loss calculation
  labels["input_ids"] = [
        [(label if label != TOKENIZER.pad_token_id else -100) for label in label_ids]
        for label_ids in labels["input_ids"]
  ]
  input['labels'] = labels['input_ids']
  return input

## step 2, load data

In [14]:
TRAIN_DATASET = TRAIN_DATASET.filter(keep_true)
VALIDATE_DATASET = VALIDATE_DATASET.filter(keep_true)
TEST_DATASET = TEST_DATASET.filter(keep_true)

print(f'{len(TRAIN_DATASET)=} {len(VALIDATE_DATASET)=} {len(TEST_DATASET)=}')

print(TRAIN_DATASET[0])
print(TRAIN_DATASET[10])
print(TRAIN_DATASET[22])
print(TRAIN_DATASET[42])

TRAIN_DATASET = TRAIN_DATASET.map(preprocess, batched=True, remove_columns=TRAIN_DATASET.column_names)
TRAIN_DATASET.set_format("torch")
VALIDATE_DATASET = VALIDATE_DATASET.map(preprocess, batched=True, remove_columns=VALIDATE_DATASET.column_names)
VALIDATE_DATASET.set_format("torch")
TEST_DATASET = TEST_DATASET.map(preprocess, batched=True, remove_columns=TEST_DATASET.column_names)
TEST_DATASET.set_format("torch")

print(TRAIN_DATASET[0])
print(TRAIN_DATASET[10])
print(TRAIN_DATASET[22])
print(TRAIN_DATASET[42])

print(TOKENIZER.decode(TRAIN_DATASET[0]['input_ids']))
print(TOKENIZER.decode(TRAIN_DATASET[0]['labels'][:5]))

print(TOKENIZER.decode(TRAIN_DATASET[10]['input_ids']))
print(TOKENIZER.decode(TRAIN_DATASET[10]['labels'][:5]))

print(TOKENIZER.decode(TRAIN_DATASET[22]['input_ids']))
print(TOKENIZER.decode(TRAIN_DATASET[22]['labels'][:2]))

print(TOKENIZER.decode(TRAIN_DATASET[42]['input_ids']))
print(TOKENIZER.decode(TRAIN_DATASET[42]['labels'][:2]))

BATCH_SIZE = 16
TRAIN_DATALOADER = DataLoader(TRAIN_DATASET, shuffle=False, batch_size=BATCH_SIZE)
VALID_DATALOADER = DataLoader(VALIDATE_DATASET, shuffle=True, batch_size=BATCH_SIZE)
TEST_DATALOADER = DataLoader(TEST_DATASET, shuffle=True, batch_size=BATCH_SIZE)

print(f'{TRAIN_DATALOADER=} {VALID_DATALOADER=} {TEST_DATALOADER=}')

Filter:   0%|          | 0/9498 [00:00<?, ? examples/s]

Filter:   0%|          | 0/2375 [00:00<?, ? examples/s]

len(TRAIN_DATASET)=100 len(VALIDATE_DATASET)=4727 len(TEST_DATASET)=1201
{'id': '56be85543aeaaa14008c9063', 'title': 'Beyoncé', 'context': 'Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny\'s Child. Managed by her father, Mathew Knowles, the group became one of the world\'s best-selling girl groups of all time. Their hiatus saw the release of Beyoncé\'s debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy in Love" and "Baby Boy".', 'question': 'When did Beyonce start becoming popular?', 'answers': {'text': ['in the late 1990s'], 'answer_start': [269]}}
{'id': '56d43c5f2ccc5a1400d830ab', 't

Map:   0%|          | 0/4727 [00:00<?, ? examples/s]

Map:   0%|          | 0/1201 [00:00<?, ? examples/s]

{'input_ids': tensor([ 2215,   750, 37361,   344,   923,  5033,  2968,    30, 50256, 50256,
        50256, 50256, 50256, 50256, 50256, 50256]), 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]), 'labels': tensor([ 259,  262, 2739, 6303,   82, -100, -100, -100, -100, -100, -100, -100,
        -100, -100, -100, -100])}
{'input_ids': tensor([ 2061,   373,   262,   717,  5062, 37361, 32682,  2716,   355,   257,
        12199,  6802,    30, 50256, 50256, 50256]), 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0]), 'labels': tensor([  35, 2564, 3481,  287, 5896, -100, -100, -100, -100, -100, -100, -100,
        -100, -100, -100, -100])}
{'input_ids': tensor([ 2215,   750, 17886,   338,  5932,   886,   511,  1448,   719,    30,
        50256, 50256, 50256, 50256, 50256, 50256]), 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0]), 'labels': tensor([15749,  5075,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
    

# training

In [None]:
#@title eval before fine tuning
import evaluate

torch.set_grad_enabled(False)
MODEL.eval()
output_tokens = MODEL.generate(torch.as_tensor(TOKENIZER.encode("When did Beyonce start becoming popular?")).view(1, -1).to(DEVICE))
print('generated: ', output_tokens[0])
print('generated: ', TOKENIZER.decode(output_tokens[0]))

output_tokens = MODEL.generate(torch.as_tensor(TOKENIZER.encode("what holiday is first weekend of september")).view(1, -1).to(DEVICE))
print('generated: ', output_tokens[0])
print('generated: ', TOKENIZER.decode(output_tokens[0]))

metric = evaluate.load("accuracy")

for batch in VALID_DATALOADER:
    input_ids = batch["input_ids"].clone().detach().to(DEVICE)
    attention_mask = batch["attention_mask"].clone().detach().to(DEVICE)
    labels = batch["labels"].clone().detach().to(DEVICE)
    with torch.no_grad():
      outputs = MODEL(input_ids=input_ids, attention_mask=attention_mask, labels=labels)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    for p, l in zip(predictions, labels):
      metric.add_batch(predictions=p, references=l)

metric.compute()


In [17]:
#@title fine tune

from torch.optim import AdamW
from tqdm.auto import tqdm
from transformers import get_scheduler

torch.set_grad_enabled(True)

optimizer = AdamW(MODEL.parameters(), lr=1e-4)
num_epochs = 100
num_training_steps = num_epochs * len(TRAIN_DATALOADER)
lr_scheduler = get_scheduler(
    name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)

progress_bar = tqdm(range(num_training_steps))


for epoch in range(num_epochs):
    train_loss = 0
    MODEL.train()
    for batch in TRAIN_DATALOADER:
        input_ids = batch["input_ids"].clone().detach().to(DEVICE)
        attention_mask = batch["attention_mask"].clone().detach().to(DEVICE)
        labels = batch["labels"].clone().detach().to(DEVICE)
        outputs = MODEL(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        train_loss += loss.item()
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)
    avg_train_loss = train_loss / len(TRAIN_DATALOADER)
    print(f"Epoch {epoch + 1}: Training Loss = {avg_train_loss:.4f}")

    # MODEL.eval()
    # val_loss = 0
    # with torch.no_grad():
    #     for batch in VALID_DATALOADER:
    #         input_ids = batch["input_ids"].clone().detach().to(DEVICE)
    #         attention_mask = batch["attention_mask"].clone().detach().to(DEVICE)
    #         labels = batch["labels"].clone().detach().to(DEVICE)

    #         outputs = MODEL(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
    #         loss = outputs.loss
    #         val_loss += loss.item()
    # avg_val_loss = val_loss / len(VALID_DATALOADER)
    # print(f"Epoch {epoch + 1}: Validation Loss = {avg_val_loss:.4f}")

    # checkpoint
    file_name = "checkpoint/epoch_" + str(epoch)
    torch.save(MODEL.state_dict(), file_name)





  0%|          | 0/700 [00:00<?, ?it/s]

Epoch 1: Training Loss = 7.3730
Epoch 2: Training Loss = 5.9649
Epoch 3: Training Loss = 4.9902
Epoch 4: Training Loss = 4.0637
Epoch 5: Training Loss = 3.4828
Epoch 6: Training Loss = 3.0920
Epoch 7: Training Loss = 2.8462
Epoch 8: Training Loss = 2.6110
Epoch 9: Training Loss = 2.5125
Epoch 10: Training Loss = 2.4019
Epoch 11: Training Loss = 2.2551
Epoch 12: Training Loss = 2.3136
Epoch 13: Training Loss = 2.0754
Epoch 14: Training Loss = 2.2022
Epoch 15: Training Loss = 2.3312
Epoch 16: Training Loss = 2.3175
Epoch 17: Training Loss = 2.0591
Epoch 18: Training Loss = 2.2257
Epoch 19: Training Loss = 2.3998
Epoch 20: Training Loss = 1.8238
Epoch 21: Training Loss = 2.0362
Epoch 22: Training Loss = 2.2178
Epoch 23: Training Loss = 1.9152
Epoch 24: Training Loss = 1.9454
Epoch 25: Training Loss = 2.0198
Epoch 26: Training Loss = 2.3852
Epoch 27: Training Loss = 2.1300
Epoch 28: Training Loss = 1.7987
Epoch 29: Training Loss = 1.9594
Epoch 30: Training Loss = 2.1177
Epoch 31: Training 

In [18]:
#@title evaluate test split

import evaluate

torch.set_grad_enabled(False)
MODEL.eval()

prompts = [
    "When did Beyonce start becoming popular?",
    "What was the first album Beyoncé released as a solo artist?",
    "When did Destiny's Child end their group act?",
    "Beyonce's childhood home believed in what religion?",
    "When did Beyonce start becoming popular?",
           """Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny's Child. Managed by her father, Mathew Knowles, the group became one of the world's best-selling girl groups of all time. Their hiatus saw the release of Beyoncé's debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy in Love" and "Baby Boy". When did Beyonce start becoming popular?""",
           """Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny's Child. Managed by her father, Mathew Knowles, the group became one of the world's best-selling girl groups of all time. Their hiatus saw the release of Beyoncé's debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy in Love" and "Baby Boy". In what city and state did Beyonce grow up? """,
           'What was the first album Beyoncé released as a solo artist?',
           'Which individual began a library at Notre Dame?',
           """The first degrees from the college were awarded in 1849. The university was expanded with new buildings to accommodate more students and faculty. With each new president, new academic programs were offered and new buildings built to accommodate them. The original Main Building built by Sorin just after he arrived was replaced by a larger "Main Building" in 1865, which housed the university's administration, classrooms, and dormitories. Beginning in 1873, a library collection was started by Father Lemonnier. By 1879 it had grown to ten thousand volumes that were housed in the Main Building. Which individual began a library at Notre Dame?""" ,
           ]

# prompts = ['what holiday is first weekend of september',
#            'what does life insurance cover?',
#            'what caused ww',
#            'how long did the roman empire last']

for p in prompts:
  output_tokens = MODEL.generate(torch.as_tensor(TOKENIZER.encode(p)).view(1, -1).to(DEVICE),
                                num_beams=3,
                                 early_stopping=True,
                                 max_new_tokens=50,
                                no_repeat_ngram_size=2,
                                do_sample=True,
                                pad_token_id=TOKENIZER.eos_token_id,
                                 repetition_penalty=1.2,
                                 temperature=0.7,  # Controls randomness
                                  top_p=0.9,         # Focuses on the most probable tokens
                                )
  print('generated: ', TOKENIZER.decode(output_tokens[0]))

# metric = evaluate.load("accuracy")

# for batch in TEST_DATALOADER:
#     input_ids = batch["input_ids"].clone().detach().to(DEVICE)
#     attention_mask = batch["attention_mask"].clone().detach().to(DEVICE)
#     labels = batch["labels"].clone().detach().to(DEVICE)
#     with torch.no_grad():
#       outputs = MODEL(input_ids=input_ids, attention_mask=attention_mask, labels=labels)

#     logits = outputs.logits
#     predictions = torch.argmax(logits, dim=-1)
#     for p, l in zip(predictions, labels):
#       metric.add_batch(predictions=p, references=l)

# metric.compute()

generated:  When did Beyonce start becoming popular?s No.s 1990s Love and Hip-Hop Records Recordscé Records. 1990's No Child's I Love Myself Recordsroulette Recordsheward Recordser Recordsett Records Blacker and Brown Records Roots Records Ty Doll and the B. B
generated:  What was the first album Beyoncé released as a solo artist? 1990's No. 1 Love My Life and My. B.s.? No.? 1990cécésssss.cé Records Records.roulette Recordser Records owner Records Entertainment Records Music. Records Knowles Records Johnson Records Roots Records Williams Records Ty
generated:  When did Destiny's Child end their group act?s in 2014's.s,s ands.sss No. Records Records owner Records Entertainment Records left Records in 2005césser Recordsard Recordsrou Recordsett Recordshew Records Ty Records Williams Records Roots Records My Records Blacker. My.
generated:  Beyonce's childhood home believed in what religion? Church's and United Church of Christ's. United Methodist Church and Methodist Episcopal Church. Me

In [None]:
model = transformers.AutoModelForCausalLM.from_pretrained("gpt2").to(DEVICE)
model.load_state_dict(torch.load('math_checkpoint/epoch_70', weights_only=False))
model.eval()

output_tokens = model.generate(torch.as_tensor(TOKENIZER.encode('1234 + 4567 =')).view(1, -1).to(DEVICE),
                                num_beams=3,
                                 early_stopping=True,
                                 max_new_tokens=4,
                                # no_repeat_ngram_size=2,
                                do_sample=True,
                                pad_token_id=TOKENIZER.eos_token_id
                                )
print('generated: ', TOKENIZER.decode(output_tokens[0]))

In [11]:
!rm -rf checkpoint
!rm -rf math_checkpoint
!mkdir checkpoint
!mkdir math_checkpoint

In [None]:
import torch, gc
gc.collect()
torch.cuda.empty_cache()