# Generating QB Questions
In this HW I'll be loading Meta's new LLaMA 3 8B model and using it to generate quizbowl questions, given an answer (first). I will fine tune it on a particular chat format, taking in the system prompt, an answer, then a question (in inference, I'll provide the answer).

## Load LLaMA-3

In [1]:
%load_ext autoreload
%autoreload 2
import numpy as np
import matplotlib.pyplot as plt
import transformers
import datasets
import torch
import pandas as pd
from tqdm import tqdm
import pickle
import einops
import os
from datetime import datetime
import transformers

In [2]:
from transformers import AutoTokenizer, AutoModelForCausalLM, LlamaForCausalLM, LlamaTokenizer
from peft import PeftModel
model =LlamaForCausalLM.from_pretrained("meta-llama/Meta-Llama-3-8B", load_in_8bit=True, device_map='auto', torch_dtype=torch.float16)
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B")
tokenizer.add_special_tokens(
    {

        "pad_token": "<PAD>",
    }
)


The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


1

In [3]:
def generate_sentence(str, model, tokenizer, with_logprobs=False, max_new_tokens=10, top_tokens=5, show_token_strs=True, **kwargs):
    tokenized_str = tokenizer(str, return_tensors="pt").input_ids.cuda()
    
    try:
        generated_output = model.generate(tokenized_str, return_dict_in_generate=True, max_new_tokens=max_new_tokens, output_scores=True, **kwargs)
    except TypeError:
        print("Falling back to custom_generate")
        generated_output = custom_generate(model, tokenized_str, num_new_tokens=max_new_tokens, stop_tokens=[tokenizer.eos_token_id], **kwargs)

    # generated_output = custom_generate(model_fn, tokenized_str, num_new_tokens=max_new_tokens, **kwargs)
    
    tokenized_result = generated_output['sequences'][0]
    # print(tokenized_result)
    if with_logprobs:
        # rows should be token number, columns should be alternating ith token and probability of ith token, fill in with probabilities
        data = []
        for score in generated_output['scores']:
            # a tensor of logits, translate into probabilities
            probs = torch.nn.functional.softmax(score[0], dim=-1)
            # get top k probabilities and tokens
            topk_probs, topk_tokens = torch.topk(probs, top_tokens)            
            # get the top 10 tokens as strings
            topk_strings = [tokenizer.decode(token) for token in topk_tokens]

            row = {}
            # fill in df
            for i in range(top_tokens):
                row[f'Token_{i+1}'] = topk_tokens[i].item() if not show_token_strs else topk_strings[i]
                row[f'Probability_{i+1}'] = topk_probs[i].item()
            data.append(row)
        probs_df = pd.DataFrame(data)

        return tokenizer.decode(tokenized_result), probs_df
    else:
        return tokenizer.decode(tokenized_result)

# generate_sentence("Hey how are you doing today?", llama3, tokenizer, max_new_tokens=10)

## Load Dataset

In [4]:
# open qanta.buzztrain.json
import json

with open('qanta.buzztrain.json', 'r') as f:
    data = json.load(f)

print(len(data))
print(data[0].keys())
print(data[0])

train_data = data[:int(len(data)*.8)]
test_data = data[int(len(data)*.8):int(len(data)*.9)]
gen_data = data[int(len(data)*.9):]


18460
dict_keys(['text', 'answer', 'page', 'category', 'subcategory', 'tournament', 'difficulty', 'year', 'proto_id', 'qdb_id', 'dataset', 'qanta_id', 'tokenizations', 'first_sentence', 'answer_prompt', 'gameplay', 'fold'])
{'text': 'After this character relates a story about how he didn\'t know the proper way to use a wheelbarrow, he tells of how a captain dining with his father mistakenly rubbed his hands in a punch bowl.\xa0This "sea Prince of Wales" leaves his home by hiding out in a canoe near a coral reef, and he is mistakenly called "Hedgehog" by a character who offers him a ninetieth lay, a partner of Bildad named Peleg. A door is broken down in Mrs. Hussey\'s establishment after he locks himself in his room during a "Ramadan."\xa0He is first encountered in the Spouter-Inn where the landlord thinks he may be late because "he can\'t sell his head," and his coffin helps save the narrator after the ship he\'s on sinks.\xa0For 10 points, name this native of Rokovoko and savage comp

In [5]:
trivia_system_message = {"role": "system", "content": """You are a helpful assistant generating trivia questions. I will provide an answer, and you must generate a quizbowl question that gives clues about the answer starting with easy and ending with hard questions."""}
from datasets import Dataset

max_length = 512
# Assuming 'data' is your list of dictionaries
train_dataset = Dataset.from_pandas(pd.DataFrame(train_data))
test_dataset = Dataset.from_pandas(pd.DataFrame(test_data))
gen_dataset = Dataset.from_pandas(pd.DataFrame(gen_data))
def preprocess_llama_dataset_for_hf(example, include_question=True, sys_msg=trivia_system_message, tokenizer=tokenizer):
    # make a dialogue
    answer_msg = {"role": "user", "content": f"Answer: {example['answer']}"}
    dialogue = [sys_msg, answer_msg]
    
    if include_question:
        question_msg = {"role": "assistant", "content": f"Question: {example['text']}"}
        dialogue.append(question_msg)
    
    dialogue.append(answer_msg)
    chat = tokenizer.apply_chat_template(dialogue, truncation=True, padding=True, max_length=512)
    # Return the necessary fields
    return {
        "input_ids": chat,
        "formatted_prompt": tokenizer.decode(chat)
    }

# Apply the preprocessing function to each item in the dataset
train_dataset = train_dataset.map(preprocess_llama_dataset_for_hf, batched=False)
test_dataset = test_dataset.map(preprocess_llama_dataset_for_hf, batched=False)
gen_dataset = gen_dataset.map(lambda example: preprocess_llama_dataset_for_hf(example, include_question=False), batched=False)
# Now, 'processed_dataset' contains the original data along with the new columns 'input_ids' and 'attention_mask'



Map:   0%|          | 0/14768 [00:00<?, ? examples/s]


No chat template is defined for this tokenizer - using a default chat template that implements the ChatML format (without BOS/EOS tokens!). If the default is not appropriate for your model, please set `tokenizer.chat_template` to an appropriate template. See https://huggingface.co/docs/transformers/main/chat_templating for more information.



Map:   0%|          | 0/1846 [00:00<?, ? examples/s]

Map:   0%|          | 0/1846 [00:00<?, ? examples/s]

In [6]:
tokenizer.decode(train_dataset[0]['input_ids'])

'<|im_start|>system\nYou are a helpful assistant generating trivia questions. I will provide an answer, and you must generate a quizbowl question that gives clues about the answer starting with easy and ending with hard questions.<|im_end|>\n<|im_start|>user\nAnswer: Queequeg<|im_end|>\n<|im_start|>assistant\nQuestion: After this character relates a story about how he didn\'t know the proper way to use a wheelbarrow, he tells of how a captain dining with his father mistakenly rubbed his hands in a punch bowl.\xa0This "sea Prince of Wales" leaves his home by hiding out in a canoe near a coral reef, and he is mistakenly called "Hedgehog" by a character who offers him a ninetieth lay, a partner of Bildad named Peleg. A door is broken down in Mrs. Hussey\'s establishment after he locks himself in his room during a "Ramadan."\xa0He is first encountered in the Spouter-Inn where the landlord thinks he may be late because "he can\'t sell his head," and his coffin helps save the narrator after 

In [13]:
len(train_dataset[0]['input_ids'])

512

## train a rank-16 LoRA

Parts are taken from https://github.com/meta-llama/llama-recipes/blob/main/recipes/finetuning/huggingface_trainer/peft_finetuning.ipynb

In [8]:
from peft import get_peft_model
from peft import LoraConfig, TaskType

model.train()

def create_peft_config(model):
    from peft import (
        get_peft_model,
        LoraConfig,
        TaskType,
    )

    peft_config = LoraConfig(
        task_type=TaskType.CAUSAL_LM,
        inference_mode=False,
        r=8,
        lora_alpha=32,
        lora_dropout=0.05,
        target_modules = ["q_proj", "v_proj"]
    )

    model = get_peft_model(model, peft_config)
    model.print_trainable_parameters()
    return model, peft_config

# create peft config
model, lora_config = create_peft_config(model)

trainable params: 3,407,872 || all params: 8,033,669,120 || trainable%: 0.0424


## Couldn't get huggingface trainer to work so writing my own train loop

In [None]:
train_loader = 

## Broken trainer code

In [9]:
from transformers import DataCollatorForLanguageModeling

# Assuming 'tokenizer' is your tokenizer instance
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,  # For causal language models, set mlm (Masked Language Model) to False
)


In [12]:
# Define training args
from transformers import TrainingArguments, Trainer

output_dir = "PhillipGuo/llama3_lora"
config = {
    'lora_config': lora_config,
    'learning_rate': 1e-4,
    'num_train_epochs': 1,
    'gradient_accumulation_steps': 2,
    'per_device_train_batch_size': 2,
    'gradient_checkpointing': False,
}
enable_profiler=False

training_args = TrainingArguments(
    output_dir=output_dir,
    overwrite_output_dir=True,
    bf16=True,  # Use BF16 if available
    # logging strategies
    logging_dir=f"{output_dir}/logs",
    logging_strategy="steps",
    logging_steps=10,
    save_strategy="no",
    optim="adamw_torch_fused",
    # max_steps=total_steps if enable_profiler else -1,
    **{k:v for k,v in config.items() if k != 'lora_config'}
)

# Create Trainer instance
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    # data_collator=data_collator,
)

# Start training
trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mphilliphguo[0m ([33mquirky_lats_at_mats[0m). Use [1m`wandb login --relogin`[0m to force relogin


../aten/src/ATen/native/cuda/Indexing.cu:1290: indexSelectLargeIndex: block: [692,0,0], thread: [64,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1290: indexSelectLargeIndex: block: [692,0,0], thread: [65,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1290: indexSelectLargeIndex: block: [692,0,0], thread: [66,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1290: indexSelectLargeIndex: block: [692,0,0], thread: [67,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1290: indexSelectLargeIndex: block: [692,0,0], thread: [68,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1290: indexSelectLargeIndex: block: [692,0,0], thread: [69,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1290: indexSelectLargeIndex: block: [692,

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [10]:
from transformers import AutoTokenizer, DefaultDataCollator, AutoModelForQuestionAnswering, TrainingArguments, Trainer
training_args = TrainingArguments(
    output_dir="PhillipGuo/llama3_qa_lora",
    learning_rate=1e-3,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=2,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False,
)

trainer = Trainer(
    model=peft_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    # compute_metrics=compute_metrics,
)

trainer.train()



TypeError: LlamaForCausalLM.forward() got an unexpected keyword argument 'decoder_input_ids'