# Generating QB Questions
In this HW I'll be loading Meta's new LLaMA 3 8B Instruct model and using it to generate quizbowl questions, given an answer (first). I will fine tune it on a particular chat format, taking in the system prompt, an answer, then a question (in inference, I'll provide the answer).

## Load LLaMA-3

In [1]:
%load_ext autoreload
%autoreload 2
import numpy as np
import matplotlib.pyplot as plt
import transformers
import datasets
import torch
import pandas as pd
from tqdm.auto import tqdm
import pickle
import einops
import os
from datetime import datetime
import transformers

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from transformers import AutoTokenizer, AutoModelForCausalLM, LlamaForCausalLM, LlamaTokenizer
from peft import PeftModel
# model_name = "meta-llama/Meta-Llama-3-8B-Instruct"
model =LlamaForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16, device_map='auto')
tokenizer = AutoTokenizer.from_pretrained(model_name)
# model = LlamaForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf", torch_dtype=torch.bfloat16, device_map='auto')
# tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf")
tokenizer.pad_token = tokenizer.eos_token
# tokenizer.add_special_tokens(
#     {

#         "pad_token": "<PAD>",
#     }
# )


Loading checkpoint shards: 100%|██████████| 2/2 [00:19<00:00,  9.87s/it]


In [3]:
def generate_sentence(str, model, tokenizer, with_logprobs=False, max_new_tokens=10, top_tokens=5, show_token_strs=True, **kwargs):
    tokenized_str = tokenizer(str, return_tensors="pt").input_ids.cuda()
    
    try:
        generated_output = model.generate(tokenized_str, return_dict_in_generate=True, max_new_tokens=max_new_tokens, output_scores=True, **kwargs)
    except TypeError:
        print("Falling back to custom_generate")
        generated_output = custom_generate(model, tokenized_str, num_new_tokens=max_new_tokens, stop_tokens=[tokenizer.eos_token_id], **kwargs)

    # generated_output = custom_generate(model_fn, tokenized_str, num_new_tokens=max_new_tokens, **kwargs)
    
    tokenized_result = generated_output['sequences'][0]
    # print(tokenized_result)
    if with_logprobs:
        # rows should be token number, columns should be alternating ith token and probability of ith token, fill in with probabilities
        data = []
        for score in generated_output['scores']:
            # a tensor of logits, translate into probabilities
            probs = torch.nn.functional.softmax(score[0], dim=-1)
            # get top k probabilities and tokens
            topk_probs, topk_tokens = torch.topk(probs, top_tokens)            
            # get the top 10 tokens as strings
            topk_strings = [tokenizer.decode(token) for token in topk_tokens]

            row = {}
            # fill in df
            for i in range(top_tokens):
                row[f'Token_{i+1}'] = topk_tokens[i].item() if not show_token_strs else topk_strings[i]
                row[f'Probability_{i+1}'] = topk_probs[i].item()
            data.append(row)
        probs_df = pd.DataFrame(data)

        return tokenizer.decode(tokenized_result), probs_df
    else:
        return tokenizer.decode(tokenized_result)

generate_sentence("Hey how are you doing today?", model, tokenizer, max_new_tokens=10)

"<s> Hey how are you doing today?\n\nI'm doing well, thanks for"

## Load Dataset

In [4]:
# open qanta.buzztrain.json
import json

with open('qanta.buzztrain.json', 'r') as f:
    data = json.load(f)

print(len(data))
print(data[0].keys())
print(data[0])

train_data = data[:int(len(data)*.8)]
test_data = data[int(len(data)*.8):int(len(data)*.9)]
gen_data = data[int(len(data)*.9):]

train_data = train_data[:5000]

18460
dict_keys(['text', 'answer', 'page', 'category', 'subcategory', 'tournament', 'difficulty', 'year', 'proto_id', 'qdb_id', 'dataset', 'qanta_id', 'tokenizations', 'first_sentence', 'answer_prompt', 'gameplay', 'fold'])
{'text': 'After this character relates a story about how he didn\'t know the proper way to use a wheelbarrow, he tells of how a captain dining with his father mistakenly rubbed his hands in a punch bowl.\xa0This "sea Prince of Wales" leaves his home by hiding out in a canoe near a coral reef, and he is mistakenly called "Hedgehog" by a character who offers him a ninetieth lay, a partner of Bildad named Peleg. A door is broken down in Mrs. Hussey\'s establishment after he locks himself in his room during a "Ramadan."\xa0He is first encountered in the Spouter-Inn where the landlord thinks he may be late because "he can\'t sell his head," and his coffin helps save the narrator after the ship he\'s on sinks.\xa0For 10 points, name this native of Rokovoko and savage comp

In [5]:
trivia_system_message = {"role": "system", "content": """You are a helpful assistant generating trivia questions. I will provide an answer, and you must generate a quizbowl question that gives clues about the answer starting with easy and ending with hard questions."""}
from datasets import Dataset

max_length = 512
# Assuming 'data' is your list of dictionaries
train_dataset = Dataset.from_pandas(pd.DataFrame(train_data))
test_dataset = Dataset.from_pandas(pd.DataFrame(test_data))
gen_dataset = Dataset.from_pandas(pd.DataFrame(gen_data))
def preprocess_llama_dataset_for_hf(example, include_question=True, sys_msg=trivia_system_message, tokenizer=tokenizer):
    # make a dialogue
    answer_msg = {"role": "user", "content": f"Answer: {example['answer']}"}
    dialogue = [sys_msg, answer_msg]
    
    if include_question:
        question_msg = {"role": "assistant", "content": f"Question:\n{example['text']}"}
        dialogue.append(question_msg)
        # chat = tokenizer.apply_chat_template(dialogue, truncation=True, padding=True, max_length=512)
        chat = tokenizer.apply_chat_template(dialogue)
    else:
        question_msg = {"role": "assistant", "content": "Question:\n"}
        dialogue.append(question_msg)
        chat = tokenizer.apply_chat_template(dialogue)[:-1]

    # Return the necessary fields
    return {
        "input_ids": chat,
        "formatted_prompt": tokenizer.decode(chat),
        "labels": chat,
    }

# Apply the preprocessing function to each item in the dataset
train_dataset = train_dataset.map(preprocess_llama_dataset_for_hf, batched=False)
test_dataset = test_dataset.map(preprocess_llama_dataset_for_hf, batched=False)
gen_dataset = gen_dataset.map(lambda example: preprocess_llama_dataset_for_hf(example, include_question=False), batched=False)


Map: 100%|██████████| 5000/5000 [00:10<00:00, 477.82 examples/s]
Map: 100%|██████████| 1846/1846 [00:03<00:00, 528.10 examples/s]
Map: 100%|██████████| 1846/1846 [00:01<00:00, 1248.87 examples/s]


### Sample Generation

In [6]:
# test generation
print(tokenizer.decode(train_dataset[0]['input_ids']))
# print(len(train_dataset[1]['input_ids']))
sample_input = torch.tensor(gen_dataset[0]['input_ids'], device="cuda").unsqueeze(0)
# print(tokenizer.decode(sample_input))
generation = model.generate(sample_input, max_new_tokens=100)
print(tokenizer.batch_decode(generation)[0])

<s> [INST] <<SYS>>
You are a helpful assistant generating trivia questions. I will provide an answer, and you must generate a quizbowl question that gives clues about the answer starting with easy and ending with hard questions.
<</SYS>>

Answer: Queequeg [/INST] Question:
After this character relates a story about how he didn't know the proper way to use a wheelbarrow, he tells of how a captain dining with his father mistakenly rubbed his hands in a punch bowl. This "sea Prince of Wales" leaves his home by hiding out in a canoe near a coral reef, and he is mistakenly called "Hedgehog" by a character who offers him a ninetieth lay, a partner of Bildad named Peleg. A door is broken down in Mrs. Hussey's establishment after he locks himself in his room during a "Ramadan." He is first encountered in the Spouter-Inn where the landlord thinks he may be late because "he can't sell his head," and his coffin helps save the narrator after the ship he's on sinks. For 10 points, name this native 

<s> [INST] <<SYS>>
You are a helpful assistant generating trivia questions. I will provide an answer, and you must generate a quizbowl question that gives clues about the answer starting with easy and ending with hard questions.
<</SYS>>

Answer: cochlear implant [accept rough equivalents; do not accept "hearing aid"] [/INST] Question: 

Easy: What is the primary purpose of a cochlear implant?

Medium: What is the name of the device that converts sound waves into electrical signals in a cochlear implant?

Hard: What is the process by which the electrical signals from the cochlear implant are converted into neural signals in the brain?</s>


## train a rank-16 LoRA

Parts are taken from https://github.com/meta-llama/llama-recipes/blob/main/recipes/finetuning/huggingface_trainer/peft_finetuning.ipynb

In [7]:
from peft import get_peft_model
from peft import LoraConfig, TaskType

model.train()

def create_peft_config(model):
    from peft import (
        get_peft_model,
        LoraConfig,
        TaskType,
    )

    peft_config = LoraConfig(
        task_type=TaskType.CAUSAL_LM,
        inference_mode=False,
        r=16,
        lora_alpha=32,
        lora_dropout=0.05,
        target_modules = ["q_proj", "v_proj"]
    )

    model = get_peft_model(model, peft_config)
    model.print_trainable_parameters()
    return model, peft_config

# create peft config
model, lora_config = create_peft_config(model)
model.cuda()

trainable params: 8,388,608 || all params: 6,746,804,224 || trainable%: 0.12433454005023165


PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(32000, 4096)
        (layers): ModuleList(
          (0-31): 32 x LlamaDecoderLayer(
            (self_attn): LlamaSdpaAttention(
              (q_proj): lora.Linear(
                (base_layer): Linear(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
              )
              (k_proj): Linear(in_features=4096, out_features=4096, bias=False)
      

## Couldn't get huggingface trainer to work so writing my own train loop

In [8]:
from torch.nn.utils.rnn import pad_sequence

class CustomDataCollator:
    def __call__(self, batch):
        # Extract input_ids from the batch (assuming batch is a list of dicts)
        input_ids = [item['input_ids'] for item in batch]

        # Convert input_ids into a list of tensors
        input_ids_tensors = [torch.tensor(ids) for ids in input_ids]

        # Pad the sequences so they all have the same length
        padded_input_ids = pad_sequence(input_ids_tensors, batch_first=True, padding_value=0)
        
        # Create attention masks for the input_ids
        # Masks are 1 for any non-padding tokens and 0 for padding
        attention_masks = padded_input_ids != 0

        # You can return a dictionary with the masks and the padded input ids
        return {
            'input_ids': padded_input_ids,
            'attention_mask': attention_masks
        }
from torch.utils.data import DataLoader
batch_size = 16
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=CustomDataCollator())
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True, collate_fn=CustomDataCollator())

train_iter = iter(train_loader)
test_iter = iter(test_loader)

In [9]:
num_steps = 50  # Set the number of training steps
current_step = 0

grad_accum_steps = 4
device="cuda"
model.train()  # Set the model to training mode

trainable_params = []
for name, param in model.named_parameters():
    if param.requires_grad:
        trainable_params.append(param)
from torch.optim import AdamW
optimizer = AdamW(trainable_params, lr=1e-4)
criterion = torch.nn.CrossEntropyLoss(reduce=True)
for current_step in tqdm(range(num_steps)):
    optimizer.zero_grad()  # Clear previous gradients

    tot_loss = 0
    for i in range(grad_accum_steps):
        batch = next(train_iter)
        
        # Move batch to the same device as the model
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)


        # Prepare targets: for predicting the next token, shift input_ids to the left
        labels = input_ids[:, 1:][attention_mask[:, 1:]].contiguous()

        model_output = model(input_ids[:, :-1].contiguous(), attention_mask=attention_mask[:, :-1].contiguous())
        logits = model_output.logits[attention_mask[:, 1:].contiguous()]
        # print(f"input_ids shape: {input_ids.shape}, {logits.shape=}, {labels.shape=}")

        loss = criterion(logits, labels)
        # if loss is nan ignore
        if torch.isnan(loss):
            print("Loss is nan, skipping")
            continue
        # print(loss)
        tot_loss += loss.item()

        # Backward pass and optimizer step
        loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
    optimizer.step()

    # Optionally print the loss
    if current_step % 10 == 0:
        # eval on test
        with torch.no_grad():
            model.eval()
            test_loss = 0
            for i in range(10):
                batch = next(test_iter)
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = input_ids[:, 1:][attention_mask[:, 1:]].contiguous()
                model_output = model(input_ids[:, :-1].contiguous(), attention_mask=attention_mask[:, :-1].contiguous())
                logits = model_output.logits[attention_mask[:, 1:].contiguous()]
                loss = criterion(logits, labels)
                test_loss += loss.item()
            model.train()
        print(f"Step {current_step}, Train Loss: {tot_loss / grad_accum_steps}, Test Loss: {test_loss / 10}")

print("Training complete.")
model.save_pretrained(model_name+"-qa-lora")

  0%|          | 0/100 [00:00<?, ?it/s]

  1%|          | 1/100 [00:03<06:05,  3.69s/it]

Step 0: Loss = 3.6641833782196045


 11%|█         | 11/100 [00:37<05:05,  3.43s/it]

Step 10: Loss = 2.6266692876815796


 21%|██        | 21/100 [01:12<04:35,  3.48s/it]

Step 20: Loss = 2.0988070964813232


 31%|███       | 31/100 [01:47<03:59,  3.46s/it]

Step 30: Loss = 1.7149027585983276


 41%|████      | 41/100 [02:23<03:26,  3.51s/it]

Step 40: Loss = 1.6647711098194122


 51%|█████     | 51/100 [02:58<02:54,  3.56s/it]

Step 50: Loss = 1.572461873292923


 61%|██████    | 61/100 [03:34<02:20,  3.59s/it]

Step 60: Loss = 1.6039766371250153


 71%|███████   | 71/100 [04:09<01:43,  3.58s/it]

Step 70: Loss = 1.535832405090332


 78%|███████▊  | 78/100 [04:34<01:17,  3.52s/it]


StopIteration: 

## Sample new questions

In [None]:
# can load either llama-2-7b-qa-lora or llama-3-8b-qa-lora


In [16]:
def preprocess_llama_dataset_for_hf(example, include_question=True, sys_msg=trivia_system_message, tokenizer=tokenizer):
    # make a dialogue
    answer_msg = {"role": "user", "content": f"Answer: {example['answer']}"}
    dialogue = [sys_msg, answer_msg]
    
    if include_question:
        question_msg = {"role": "assistant", "content": f"Question:\n{example['text']}"}
        dialogue.append(question_msg)
        # chat = tokenizer.apply_chat_template(dialogue, truncation=True, padding=True, max_length=512)
        chat = tokenizer.apply_chat_template(dialogue)
    else:
        question_msg = {"role": "assistant", "content": "Question:\n"}
        dialogue.append(question_msg)
        chat = tokenizer.apply_chat_template(dialogue)[:-1]

    # Return the necessary fields
    return {
        "input_ids": chat,
        "formatted_prompt": tokenizer.decode(chat),
        "labels": chat,
    }

def sample_question(answer, sys_msg=trivia_system_message, tokenizer=tokenizer, max_new_tokens=100):
    answer_msg = {"role": "user", "content": f"Answer: {answer}"}
    dialogue = [sys_msg, answer_msg]
    
    question_msg = {"role": "assistant", "content": "Question:\n"}
    dialogue.append(question_msg)
    chat = tokenizer.apply_chat_template(dialogue)[:-1]

    sample_input = torch.tensor(chat, device="cuda").unsqueeze(0)
    with torch.no_grad():
        generation = model.generate(sample_input, max_new_tokens=max_new_tokens)
    question_alone = generation[0][len(chat)+1:-1]
    return tokenizer.decode(question_alone)

for i in range(10):
    answer = gen_dataset[i]['answer']
    print(f"Answer: {answer}\nReal Question: {gen_dataset[i]['text']}\nGenerated question: {sample_question(answer)}")
    print("\n\n")



Answer: cochlear implant [accept rough equivalents; do not accept "hearing aid"]
Real Question: In the 1970s, the Bilger report contradicted the NIH's wishes to disband all research related to this technology, and in the 1980s the NIH gave a grant to Australia's Graham Clark to further develop the multi-channel version of this device. It can be traced back to Volta's description of sensations evoked by electrically stimulating his ears, and current manufacturers include Med-El and Advanced Bionics. Sometimes called the "bionic ear", for 10 points, name this device that restores hearing to deaf people, an implant in the part of the ear containing hair cells.
Generated question: In addition to being used to treat this condition, this device is also used to treat the condition of tinnitus, which causes the perception of ringing in the ears.  The electrodes are placed in the mastoid cavity, and the device is activated by a microphone in the ear canal.  FTP, name this device which has been 

## Broken trainer code

In [7]:
from transformers import DataCollatorForLanguageModeling

# Assuming 'tokenizer' is your tokenizer instance
from torch.nn.utils.rnn import pad_sequence

max_seq_len = 384
class CustomDataCollator:
    def __call__(self, batch):
        # Extract input_ids from the batch (assuming batch is a list of dicts)
        input_ids = [item['input_ids'] for item in batch]

        # Convert input_ids into a list of tensors
        input_ids_tensors = [torch.tensor(ids) for ids in input_ids]

        # Pad the sequences so they all have the same length
        padded_input_ids = pad_sequence(input_ids_tensors, batch_first=True, padding_value=0)
        padded_input_ids = padded_input_ids[:, :max_seq_len]
        
        # Create attention masks for the input_ids
        # Masks are 1 for any non-padding tokens and 0 for padding
        attention_masks = padded_input_ids != 0

        # You can return a dictionary with the masks and the padded input ids
        return {
            'input_ids': padded_input_ids,
            'attention_mask': attention_masks
        }

In [9]:

from trl import SFTTrainer
from peft import LoraConfig
from transformers import TrainingArguments

from trl import SFTTrainer
from peft import LoraConfig
from transformers import TrainingArguments

# path where the Trainer will save its checkpoints and logs
output_dir = 'PhillipGuo/llama3_lora'
model.train()

# based on config
training_args = TrainingArguments(
    fp16=True, # specify bf16=True instead when training on GPUs that support bf16
    do_eval=True,
    evaluation_strategy="epoch",
    gradient_accumulation_steps=4,
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={"use_reentrant": False},
    learning_rate=2.0e-05,
    log_level="info",
    logging_steps=5,
    logging_strategy="steps",
    lr_scheduler_type="cosine",
    max_steps=-1,
    num_train_epochs=1,
    output_dir=output_dir,
    overwrite_output_dir=True,
    per_device_eval_batch_size=8, # originally set to 8
    per_device_train_batch_size=8, # originally set to 8
    # push_to_hub=True,
    # hub_model_id="zephyr-7b-sft-lora",
    # hub_strategy="every_save",
    # report_to="tensorboard",
    save_strategy="no",
    save_total_limit=None,
    seed=42,
)

# based on config
peft_config = LoraConfig(
        r=16,
        lora_alpha=16,
        lora_dropout=0.1,
        bias="none",
        task_type="CAUSAL_LM",
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
)

trainer = SFTTrainer(
        model=model,
        # model_init_kwargs=model_kwargs,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        dataset_text_field="formatted_prompt",
        tokenizer=tokenizer,
        packing=True,
        peft_config=peft_config,
        max_seq_length=tokenizer.model_max_length,
    )

train_result = trainer.train()

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using auto half precision backend
The following columns in the training set don't have a corresponding argument in `PeftModelForCausalLM.forward` and have been ignored: proto_id, fold, answer_prompt, first_sentence, gameplay, subcategory, answer, tokenizations, page, year, dataset, text, category, difficulty, formatted_prompt, qdb_id, tournament, qanta_id. If proto_id, fold, answer_prompt, first_sentence, gameplay, subcategory, answer, tokenizations, page, year, dataset, text, category, difficulty, formatted_prompt, qdb_id, tournament, qanta_id are not expected by `PeftModelForCausalLM.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 5,000
  Num Epochs

ValueError: The model did not return a loss from the inputs, only the following keys: logits. For reference, the inputs it received are input_ids,attention_mask.

In [9]:
# Define training args
from transformers import TrainingArguments, Trainer

output_dir = "PhillipGuo/llama3_lora"
config = {
    'lora_config': lora_config,
    'learning_rate': 1e-4,
    'num_train_epochs': 1,
    'gradient_accumulation_steps': 1,
    'per_device_train_batch_size': 1,
    'gradient_checkpointing': False,
}
enable_profiler=False

training_args = TrainingArguments(
    output_dir=output_dir,
    overwrite_output_dir=True,
    bf16=True,  # Use BF16 if available
    # logging strategies
    logging_dir=f"{output_dir}/logs",
    logging_strategy="steps",
    logging_steps=10,
    save_strategy="no",
    optim="adamw_torch_fused",
    # max_steps=total_steps if enable_profiler else -1,
    **{k:v for k,v in config.items() if k != 'lora_config'}
)

# Create Trainer instance
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=CustomDataCollator()
)

# Start training
trainer.train()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mphilliphguo[0m ([33mquirky_lats_at_mats[0m). Use [1m`wandb login --relogin`[0m to force relogin


ValueError: The model did not return a loss from the inputs, only the following keys: logits,past_key_values. For reference, the inputs it received are input_ids,attention_mask.

In [10]:
from transformers import AutoTokenizer, DefaultDataCollator, AutoModelForQuestionAnswering, TrainingArguments, Trainer
training_args = TrainingArguments(
    output_dir="PhillipGuo/llama3_qa_lora",
    learning_rate=1e-3,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=2,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False,
)

trainer = Trainer(
    model=peft_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    # compute_metrics=compute_metrics,
)

trainer.train()



TypeError: LlamaForCausalLM.forward() got an unexpected keyword argument 'decoder_input_ids'