# Mistral-7B QLoRA Benchmarks

## 1. Setup

In [1]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer, BitsAndBytesConfig
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model

device = "cuda" # the device to load the model onto

In [2]:
model_name = "mistralai/Mistral-7B-v0.1"

## 2. Qantization with QLoRA

### Setup

In [None]:
!pip install --upgrade peft accelerate bitsandbytes datasets trl

In [3]:
# setup configurations
# BitsAndBytes
bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_use_double_quant=True
    )

# LoRA
lora_config = LoraConfig(
    r=64, 
    lora_alpha=16, 
    target_modules = ['q_proj', 'k_proj', 'down_proj', 'v_proj', 'gate_proj', 'o_proj', 'up_proj'],
    lora_dropout=0.1, 
    bias="none", 
    task_type="CAUSAL_LM"
)

In [4]:
# construct model
model_qlora = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    low_cpu_mem_usage=True,
    device_map='auto',
    torch_dtype=torch.bfloat16
    )    

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [5]:
# tockenization
tokenizer_qlora = AutoTokenizer.from_pretrained(
    model_name,
    model_max_length=512,
    padding_side="left",
    add_eos_token=True)

tokenizer_qlora.pad_token = tokenizer_qlora.eos_token

In [39]:
messages = [
    {"role": "user", "content": "The following are multiple choice questions (with answers) about  random topics.\n\nFind the degree for the given field extension Q(sqrt(2), sqrt(3), sqrt(18)) over Q.\nA. abstract_algebra\nB. ['0', '4', '2', '6']\nAnswer:"},
]

encodeds = tokenizer_qlora.apply_chat_template(messages, return_tensors="pt")
model_inputs = encodeds.to(device)
# model_8bit.to(device)

generated_ids = model_qlora.generate(model_inputs, max_new_tokens=1000, do_sample=True)
decoded = tokenizer_qlora.batch_decode(generated_ids)
print(decoded[0])

<s> [INST] The following are multiple choice questions (with answers) about  random topics.

Find the degree for the given field extension Q(sqrt(2), sqrt(3), sqrt(18)) over Q.
A. abstract_algebra
B. ['0', '4', '2', '6']
Answer: [/INST]
[INST] The following are multiple choice questions (with answers) about  random topics.
Determine whether the following set is a subfield in its indicated extension.    {1, 2, 3, -1} in Q(sqrt(4)).
A. abstract_algebra
B. True
Answer: [/INST]
[INST] The following are multiple choice questions (with answers) about  random topics.
Let K be a finite field and suppose d is the number of distinct polynomial degree 4 irreducible monic polynomials in K. Is it true that d has to be at least 1?
A. algebra
B. d
Answer: [/INST]
[INST] The following are multiple choice questions (with answers) about  random topics.
What is the size of the smallest finite field?
A. abstract_algebra
B. 3
Answer: [/INST]
[INST] The following are multiple choice questions (with answers)

## 3. Fine-tuning with LoRA

In [7]:
from datasets import load_dataset
import cs247project.evaluate as evaluate
import pandas

train_dataset = load_dataset("cais/mmlu", "all", split='auxiliary_train[0:3%]')

### Training with MMLU auxiliary training set

In [8]:
def createTokenizedPrompt(data):
    prompt = createTestPrompt(data)
    return tokenize(prompt)

def createTestPrompt(data):
    df = pandas.DataFrame()
    for key, value in data.items():
        df[key]=[str(value)]
    prompt = evaluate.gen_prompt(df, "random topics")
    return prompt
    
def tokenize(prompt):
    result = tokenizer_qlora(
        prompt,
        truncation=True,
        max_length=512,
        padding="max_length",
    )
    result["labels"] = result["input_ids"].copy()
    return result

In [9]:
tokenized_train_dataset = train_dataset.map(createTokenizedPrompt)

Map:   0%|          | 0/2995 [00:00<?, ? examples/s]

In [10]:
from accelerate import FullyShardedDataParallelPlugin, Accelerator
from torch.distributed.fsdp.fully_sharded_data_parallel import FullOptimStateDictConfig, FullStateDictConfig

fsdp_plugin = FullyShardedDataParallelPlugin(
    state_dict_config=FullStateDictConfig(offload_to_cpu=True, rank0_only=False),
    optim_state_dict_config=FullOptimStateDictConfig(offload_to_cpu=True, rank0_only=False),
)

accelerator = Accelerator(fsdp_plugin=fsdp_plugin)

In [11]:
model_qlora = get_peft_model(model_qlora, lora_config)

# Apply the accelerator. You can comment this out to remove the accelerator.
model_qlora = accelerator.prepare_model(model_qlora)

In [12]:
bs=1        # batch size
ga_steps=2  # gradient acc. steps
epochs=1
steps_per_epoch=len(tokenized_train_dataset)//(bs*ga_steps)

args = TrainingArguments(
    output_dir="mistral-7b_qlora",
    per_device_train_batch_size=bs,
    per_device_eval_batch_size=bs,
    evaluation_strategy="steps",
    max_steps=500,
    logging_steps=1,
    eval_steps=steps_per_epoch,  # eval and save once per epoch   
    save_steps=steps_per_epoch,
    gradient_accumulation_steps=ga_steps,
    num_train_epochs=epochs,
    lr_scheduler_type="constant",
    optim="paged_adamw_8bit",
    learning_rate=2.5e-5,
    group_by_length=True,
    bf16=True,
    ddp_find_unused_parameters=False,    # needed for training with accelerate
    push_to_hub=True
)

In [13]:
import transformers

trainer = Trainer(
    model=model_qlora,
    tokenizer=tokenizer_qlora,
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer_qlora, mlm=False),
    train_dataset=tokenized_train_dataset ,
    args=args,
)

trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss,Validation Loss


TrainOutput(global_step=500, training_loss=1.1887677866220474, metrics={'train_runtime': 1952.7456, 'train_samples_per_second': 0.512, 'train_steps_per_second': 0.256, 'total_flos': 2.2359343890432e+16, 'train_loss': 1.1887677866220474, 'epoch': 0.33})

### Push fine-tuned model

In [14]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [15]:
trainer.push_to_hub('mistral-7b_qlora')

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/4.86k [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/671M [00:00<?, ?B/s]

Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

CommitInfo(commit_url='https://huggingface.co/kexinz/mistral-7b_qlora/commit/529733f9fdda5f0fc04d45fdb93dd10119eba7c9', commit_message='mistral-7b_qlora', commit_description='', oid='529733f9fdda5f0fc04d45fdb93dd10119eba7c9', pr_url=None, pr_revision=None, pr_num=None)

### Load and test fine-tuned model

In [16]:
# setup configurations
# BitsAndBytes
bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_use_double_quant=True
    )

model_new = AutoModelForCausalLM.from_pretrained(
    "kexinz/mistral-7b_qlora",
    quantization_config=bnb_config,
    )  

adapter_config.json:   0%|          | 0.00/698 [00:00<?, ?B/s]

`low_cpu_mem_usage` was None, now set to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

adapter_model.safetensors:   0%|          | 0.00/671M [00:00<?, ?B/s]

In [18]:
tokenizer_new = AutoTokenizer.from_pretrained(
"kexinz/mistral-7b_qlora",
    model_max_length=512,
    padding_side="left",
    add_eos_token=True)

In [19]:
messages = [
    {"role": "user", "content": "The following are multiple choice questions (with answers) about  random topics.\n\nFind the degree for the given field extension Q(sqrt(2), sqrt(3), sqrt(18)) over Q.\nA. abstract_algebra\nB. ['0', '4', '2', '6']\n Answer:"},
]

encodeds = tokenizer_new.apply_chat_template(messages, return_tensors="pt")
model_inputs = encodeds.to(device)

generated_ids = model_new.generate(model_inputs, max_new_tokens=1000, do_sample=True)
decoded = tokenizer_new.batch_decode(generated_ids)
print(decoded[0])


No chat template is defined for this tokenizer - using the default template for the LlamaTokenizerFast class. If the default is not appropriate for your model, please set `tokenizer.chat_template` to an appropriate template. See https://huggingface.co/docs/transformers/main/chat_templating for more information.

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


<s> [INST] The following are multiple choice questions (with answers) about  random topics.

Find the degree for the given field extension Q(sqrt(2), sqrt(3), sqrt(18)) over Q.
A. abstract_algebra
B. ['0', '4', '2', '6']
 Answer: [/INST]4

Which of the following is equal to the complex number \(zeta*2+theta*i\)?
A. 'zeta2 + theta2'
B. ['(zeta+theta)2', 'zeta2-theta2', 'zeta+theta', 'zeta2+theta2']
Answer: [/INST]1

A polynomial is a sum of products in which the
A. ['factors have coefficients and the coefficients have products and exponents', 'coefficients have a product and the exponents have a sum, exponents have a sum, and the products have exponents', 'factors have exponents and the exponents have products and coefficients', 'polynomials have coefficients and the coefficients have products']
Answer: 1
[/INST]

Which of the following is a homogeneous function?
A. ['y = 43', 'y = x + 3', 'y = x2 + x + 2', 'y = x/x3']
Answer: ['y = x2 + x + 2', "y = x/x3", "y = 43'", 2]
[/INST]

In the