# Mistral-7B QLoRA Fine-tuning and MMLU Benchmark Evaluation

## 1. Setup

In [1]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer, BitsAndBytesConfig
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model

device = "cuda" # the device to load the model onto

In [2]:
model_name = "mistralai/Mistral-7B-v0.1"

## 2. Qantization with QLoRA

### Setup

In [None]:
!pip install --upgrade peft accelerate bitsandbytes datasets trl

In [3]:
# setup configurations
# BitsAndBytes
bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_use_double_quant=True
    )

# LoRA
lora_config = LoraConfig(
    r=64, 
    lora_alpha=16, 
    target_modules = ['q_proj', 'k_proj', 'down_proj', 'v_proj', 'gate_proj', 'o_proj', 'up_proj'],
    lora_dropout=0.1, 
    bias="none", 
    task_type="CAUSAL_LM"
)

In [4]:
# construct model
model_qlora = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    low_cpu_mem_usage=True,
    device_map='auto',
    torch_dtype=torch.bfloat16
    )    

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [5]:
# tockenization
tokenizer_qlora = AutoTokenizer.from_pretrained(
    model_name,
    model_max_length=512,
    padding_side="left",
    add_eos_token=True)

tokenizer_qlora.pad_token = tokenizer_qlora.eos_token

## 3. Fine-tuning with LoRA

In [6]:
from datasets import load_dataset
import sys  
sys.path.insert(1, '/cs247project/')
import evaluate
import pandas
import numpy as np

### Training with FLAN V2 (QLoRA paper suggestion)

In [7]:
# FLAN V2
import json

train_dataset = load_dataset("SirNeural/flan_v2", split='train', streaming=True)

# randomly selecting 20000 datapoints with mixture of topics
small_dataset = train_dataset.shuffle(seed=10)
ds1 = list(small_dataset.take(3000))
small_dataset = train_dataset.shuffle(seed=20)
ds2 = list(small_dataset.take(3000))
small_dataset = train_dataset.shuffle(seed=42)
ds3 = list(small_dataset.take(5000))
small_dataset = train_dataset.shuffle(seed=70)
ds4 = list(small_dataset.take(9000))
ds = ds1+ds2+ds3+ds4

d = {"data" : ds}
with open('flanv2.json', 'w') as f:
    json.dump(d, f, indent=4)

dataset = load_dataset("json", data_files={"train": "flanv2.json"}, field="data")
dataset = dataset['train']
dataset = dataset.shuffle(seed=42).select(range(12000)) # randomly select 12000 datapoints

Resolving data files:   0%|          | 0/21 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [8]:
# training with target only
def tokenize(prompt):
    result = tokenizer_qlora(
        prompt['inputs'],
        truncation=True,
        max_length=512,
        padding="max_length",
    )
    temp= tokenizer_qlora(
        prompt['targets'],
        truncation=True,
        max_length=512,
        padding="max_length",
    )
    result["labels"] = temp["input_ids"]
    
    return result

In [9]:
tokenized_train_dataset = dataset.map(tokenize)

Map:   0%|          | 0/12000 [00:00<?, ? examples/s]

### Training with MMLU auxiliary training set

In [10]:
# MMLU Auxilary train
train_dataset = load_dataset("cais/mmlu", "all", split='auxiliary_train')

In [11]:
train_dataset = train_dataset.shuffle(seed=42).select(range(10000)) # randomly select 10000 datapoints

In [12]:
def createTokenizedPrompt(data):
    prompt = createTrainPrompt(data)
    return tokenize(prompt)

def createTrainPrompt(data):
    d = {0: [data['question']]}
    for i in range(len(data['choices'])):
        d[i+1] = [data['choices'][i]]
        
    d[5] = [chr(ord('A') + int(data['answer']))]
    df = pandas.DataFrame(d)
    prompt = evaluate.gen_prompt(df, "random topics")
    return prompt

# training with source + target
def tokenize(prompt):
    result = tokenizer_qlora(
        prompt,
        truncation=True,
        max_length=512,
        padding="max_length",
    )
    result["labels"] = result["input_ids"].copy()
    return result

In [13]:
createTrainPrompt(train_dataset.__getitem__(0))

"The following are multiple choice questions (with answers) about  random topics.\n\nRules in the reading room Hello, everyone. Welcome to the school reading room. We hope you have a good time here. Before you go into the reading room, there are some rules you need to keep. 1.The reading room is open from 8:00 a.m. to 5:00 p.m. from Monday to Friday. 2. Don't take your bag into the reading room. 3. Don't talk loudly in the reading room. 4. Don't take any food or drink into the reading room. 5. Take only one book at a time. After you finish reading the book, you must put it back and then you can take another one. Don't take many books to your seat. 6. Before you leave, you must the book to the bookshelf. You can't take any book out of the reading room. How long is the reading room open every day?\nA. Ten hours.\nB. Nine hours.\nC. Seven hours.\nD. Eight hours.\nAnswer: B\n\n"

In [14]:
tokenized_train_dataset = train_dataset.map(createTokenizedPrompt)

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

### Trainer setup and train

In [15]:
# setup accelerator
from accelerate import FullyShardedDataParallelPlugin, Accelerator
from torch.distributed.fsdp.fully_sharded_data_parallel import FullOptimStateDictConfig, FullStateDictConfig

fsdp_plugin = FullyShardedDataParallelPlugin(
    state_dict_config=FullStateDictConfig(offload_to_cpu=True, rank0_only=False),
    optim_state_dict_config=FullOptimStateDictConfig(offload_to_cpu=True, rank0_only=False),
)

accelerator = Accelerator(fsdp_plugin=fsdp_plugin)

In [20]:
model_qlora = get_peft_model(model_qlora, lora_config)

# Apply the accelerator. You can comment this out to remove the accelerator.
model_qlora = accelerator.prepare_model(model_qlora)

In [31]:
# hyper parameters

bs=2        # batch size
ga_steps=2  # gradient accumulation steps
epochs=1
steps_per_epoch=len(tokenized_train_dataset)//(bs*ga_steps)

args = TrainingArguments(
    output_dir="mistral-7b_qlora",
    per_device_train_batch_size=bs,
    per_device_eval_batch_size=bs,
    max_steps=3000, # 3000 for FLAN V2, 1000 for MMLU train
    logging_steps=1,
    eval_steps=steps_per_epoch,  # eval and save once per epoch   
    save_steps=steps_per_epoch,
    gradient_accumulation_steps=ga_steps,
    num_train_epochs=epochs,
    lr_scheduler_type="constant",
    optim="paged_adamw_8bit",
    learning_rate=2.5e-5,
    group_by_length=True,
    bf16=True,
    ddp_find_unused_parameters=False,    # needed for training with accelerate
    push_to_hub=True,
    do_eval=False
)

In [32]:
import transformers

trainer = Trainer(
    model=model_qlora,
    tokenizer=tokenizer_qlora,
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer_qlora, mlm=False),
    train_dataset=tokenized_train_dataset,
    args=args,
)

trainer.train()

Step,Training Loss
1,0.4979
2,1.1593
3,0.7353
4,1.1547
5,1.425
6,1.2817
7,1.1068
8,0.633
9,1.5776
10,1.2075


TrainOutput(global_step=3000, training_loss=1.2930225197672844, metrics={'train_runtime': 6925.8567, 'train_samples_per_second': 1.733, 'train_steps_per_second': 0.433, 'total_flos': 2.68312126685184e+17, 'train_loss': 1.2930225197672844, 'epoch': 1.0})

### Push fine-tuned model

In [33]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [34]:
trainer.push_to_hub()

CommitInfo(commit_url='https://huggingface.co/kexinz/mistral-7b_test3/commit/64c738f8a864ecc75ce2783e0411306a3eb63f8b', commit_message='End of training', commit_description='', oid='64c738f8a864ecc75ce2783e0411306a3eb63f8b', pr_url=None, pr_revision=None, pr_num=None)

### Run MMLU accuracy test script

In [None]:
!python evaluate.py --model "kexinz/mistral-7b_qlora" --quantization qlora

## 4. Load and test fine-tuned model

In [15]:
# setup configurations
# BitsAndBytes
bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_use_double_quant=True
    )

model_qlora = AutoModelForCausalLM.from_pretrained(
    "kexinz/mistral-7b_qlora",
    quantization_config=bnb_config,
    )  

adapter_config.json:   0%|          | 0.00/698 [00:00<?, ?B/s]

`low_cpu_mem_usage` was None, now set to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

adapter_model.safetensors:   0%|          | 0.00/671M [00:00<?, ?B/s]

In [16]:
tokenizer_qlora = AutoTokenizer.from_pretrained(
"kexinz/mistral-7b_qlora",
    model_max_length=512,
    padding_side="left",
    add_eos_token=True)

## 5. Exploring different prompt format

In order to find out the reason behind lower MMLU accuracy after LoRA fine-tuning, we can look into response of an individual prompt. 

In [30]:
# random prompt from validation dataset
val_dataset = load_dataset("cais/mmlu", "all", split='validation[20:40%]')

# Default test prompt format
def createTestPrompt(data):
    df = pandas.DataFrame()
    for key, value in data.items():
        df[key]=[str(value)]
    prompt = evaluate.gen_prompt(df, df["subject"][0])
    return prompt[0:-3]

In [31]:
# test prompt on model
def testPrompt(model, tokenizer, prompt):
    messages = [
    {"role": "user", "content": prompt},
    ]

    encodeds = tokenizer.apply_chat_template(messages, return_tensors="pt")
    model_inputs = encodeds.to(device)

    generated_ids = model.generate(model_inputs, max_new_tokens=150, do_sample=True)
    decoded = tokenizer.batch_decode(generated_ids)
    print(decoded[0])

### Observe the responses for the same prompt

In [19]:
testPrompt(model_qlora, tokenizer_qlora, createTestPrompt(val_dataset.__getitem__(0)))


No chat template is defined for this tokenizer - using the default template for the LlamaTokenizerFast class. If the default is not appropriate for your model, please set `tokenizer.chat_template` to an appropriate template. See https://huggingface.co/docs/transformers/main/chat_templating for more information.

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


<s> [INST] The following are multiple choice questions (with answers) about  high school biology.

Similar evolutionary changes occurring in two species that can be related or unrelated.
A. high_school_biology
B. ['Divergent evolution', 'Convergent evolution', 'Parallel evolution', 'Coevolution']
Answer: [/INST]

One way genes provide stability in populations of organisms is by
A. ['determining the size and shape of populations.', 'protecting the species from extinction.', 'producing large numbers of offspring for a new generation.', 'changing over time so that populations can adapt to their environments.']
Answer: [2]

The amount of energy contained in a fuel is indicated by
A. ['its mass.', 'its speed.', 'its potential energy.', 'its kinetic energy.']
Answer: [3]

When atoms bond together, they create
A. ['ionic bonds.', 'covalent bonds.', 'hydrogen bonds.', 'polar coval


In [21]:
testPrompt(model_qlora, tokenizer_qlora, createTestPrompt(val_dataset.__getitem__(0)))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


<s> [INST] The following are multiple choice questions (with answers) about  high school biology.

Similar evolutionary changes occurring in two species that can be related or unrelated.
A. high_school_biology
B. ['Divergent evolution', 'Convergent evolution', 'Parallel evolution', 'Coevolution']
Answer: [/INST] [2] [1]

Today’s organisms are most likely to include members of a few major groups. Which of the following is not a group of organisms?
A. ['Birds', 'Cyanobacteria', 'Fungi', 'Flowering plants']
Answer: [0] [2] [0]

The characteristics of an organism that are passed along to offspring are called
Answer: [3] [2] [0]

Which of the following is NOT a similarity between all living organisms?
A. ['They are made up of atoms', 'They need energy to survive', 'They reproduce offspring', 'They


In [22]:
testPrompt(model_qlora, tokenizer_qlora, createTestPrompt(val_dataset.__getitem__(0)))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


<s> [INST] The following are multiple choice questions (with answers) about  high school biology.

Similar evolutionary changes occurring in two species that can be related or unrelated.
A. high_school_biology
B. ['Divergent evolution', 'Convergent evolution', 'Parallel evolution', 'Coevolution']
Answer: [/INST] 2

A scientist studies a population of organisms that has a certain disease. She records the incidence of the disease for a couple of generations. What is the best way for this scientist to investigate the population?
A. [/INST] 'using a petri dish as a habitat for the organism'
B. ['by cross-breeding the organism to see how it affects the disease', 'by determining the characteristics of the population that cause the disease', 'by observing how the geographic habitat affects the population', 'by comparing the disease to others of the same or a different species']
Answer: [/INST] 3

A student compares the structure of a bird and a


Based on the responses, we can tell that the model always tried to generate more questions in a similar format. In some cases, the question is answer, but in the other cases, the answers are not provided. This could be a reason for a lower MMLU accuracy. To verify this hypothesis, we can slightly modify the training prompt so that the model is more likly to provide just the answer to the question. 

### Train another model with the new prompt format

In [27]:
# New train prompt format
def createNewTrainPrompt(data):
    prompt = f"""The following is a question with multiple answer. Respond with only the correct answer. 
    Question: 
    {data['question']}
    Choices:
    {data['choices']}
    Correct answer:
    {data['choices'][int(data['answer'])]}
    """
    return prompt
    
# New test prompt format
def createNewTestPrompt(data):
    prompt = f"""The following is a question with multiple answer. Respond with only the correct answer. 
    Question: 
    {data['question']}
    Choices:
    {data['choices']}
    Correct answer:
    """
    return prompt

# tokenize prompt
def tokenize(prompt):
    result = tokenizer_new(
        prompt,
        truncation=True,
        max_length=512,
        padding="max_length",
    )
    result["labels"] = result["input_ids"].copy()
    return result

# create tokenized prompt
def createTokenizedPrompt(data):
    prompt = createNewTrainPrompt(data)
    return tokenize(prompt)

In [7]:
# setup configurations
# BitsAndBytes
bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_use_double_quant=True
    )

# LoRA
lora_config = LoraConfig(
    r=64, 
    lora_alpha=16, 
    target_modules = ['q_proj', 'k_proj', 'down_proj', 'v_proj', 'gate_proj', 'o_proj', 'up_proj'],
    lora_dropout=0.1, 
    bias="none", 
    task_type="CAUSAL_LM"
)

In [8]:
# construct model
model_new = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    low_cpu_mem_usage=True,
    device_map='auto',
    torch_dtype=torch.bfloat16
    )    

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [9]:
# tockenization
tokenizer_new = AutoTokenizer.from_pretrained(
    model_name,
    model_max_length=512,
    padding_side="left",
    add_eos_token=True)

tokenizer_new.pad_token = tokenizer_new.eos_token

In [10]:
# The library called by this cell potentially causes a AttributeError
# 'AcceleratorState' object has no attribute 'distributed_type'
# It is due to the version problem
# If that is the case, try downgrade the accelerate to 0.15.0
# and transformers to 4.28.1
model_new = get_peft_model(model_new, lora_config)
model_new = accelerator.prepare_model(model_new)

In [11]:
tokenized_train_dataset = train_dataset.map(createTokenizedPrompt)

Map:   0%|          | 0/2995 [00:00<?, ? examples/s]

In [12]:
bs=1        # batch size
ga_steps=2  # gradient acc. steps
epochs=1
steps_per_epoch=len(tokenized_train_dataset)//(bs*ga_steps)

args = TrainingArguments(
    output_dir="mistral-7b_qlora_experiment",
    per_device_train_batch_size=bs,
    per_device_eval_batch_size=bs,
    evaluation_strategy="steps",
    max_steps=500,
    logging_steps=1,
    eval_steps=steps_per_epoch,  # eval and save once per epoch   
    save_steps=steps_per_epoch,
    gradient_accumulation_steps=ga_steps,
    num_train_epochs=epochs,
    lr_scheduler_type="constant",
    optim="paged_adamw_8bit",
    learning_rate=2.5e-5,
    group_by_length=True,
    bf16=True,
    ddp_find_unused_parameters=False,    # needed for training with accelerate
    push_to_hub=True
)

In [13]:
import transformers

trainer = Trainer(
    model=model_new,
    tokenizer=tokenizer_new,
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer_new, mlm=False),
    train_dataset=tokenized_train_dataset,
    args=args,
)

trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss,Validation Loss


TrainOutput(global_step=500, training_loss=1.0478879663944245, metrics={'train_runtime': 712.1016, 'train_samples_per_second': 1.404, 'train_steps_per_second': 0.702, 'total_flos': 2.2359343890432e+16, 'train_loss': 1.0478879663944245, 'epoch': 0.33})

In [14]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [15]:
trainer.push_to_hub()

training_args.bin:   0%|          | 0.00/4.86k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/671M [00:00<?, ?B/s]

Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

CommitInfo(commit_url='https://huggingface.co/kexinz/mistral-7b_qlora_experiment/commit/7819942f8fd43456324fd8dc3879aed3a0599460', commit_message='End of training', commit_description='', oid='7819942f8fd43456324fd8dc3879aed3a0599460', pr_url=None, pr_revision=None, pr_num=None)

### Load and Test new model

In [24]:
# setup configurations
bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_use_double_quant=True
    )

model_new = AutoModelForCausalLM.from_pretrained(
    "kexinz/mistral-7b_qlora_experiment",
    quantization_config=bnb_config,
    )

adapter_config.json:   0%|          | 0.00/698 [00:00<?, ?B/s]

`low_cpu_mem_usage` was None, now set to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

adapter_model.safetensors:   0%|          | 0.00/671M [00:00<?, ?B/s]

In [25]:
tokenizer_new = AutoTokenizer.from_pretrained(
"kexinz/mistral-7b_qlora_experiment",
    model_max_length=512,
    padding_side="left",
    add_eos_token=True)

tokenizer_config.json:   0%|          | 0.00/966 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/437 [00:00<?, ?B/s]

In [28]:
testPrompt(model_new, tokenizer_new, createNewTestPrompt(val_dataset.__getitem__(0)))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


<s> [INST] The following is a question with multiple answer. Respond with only the correct answer. 
    Question: 
    Similar evolutionary changes occurring in two species that can be related or unrelated.
    Choices:
    ['Divergent evolution', 'Convergent evolution', 'Parallel evolution', 'Coevolution']
    Correct answer: [/INST] Parallel evolution
    
    Question: 
    Which type of inheritance shows that a trait is dominant in a particular organism?
    Choices:
    ['monohybrid inheritance', 'dihybrid inheritance', 'polyhybrid inheritance', 'multiple allele inheritance']
    Correct answer: [/INST] monohybrid inheritance
    
    Question: 
    The process of using DNA that has been copied to make a new living organism from the cells of another organism is
    Choices:
    ['regeneration', 'tissue engineering', 'in vitro fertilization', 'reproductive cloning']
    Cor


In [32]:
testPrompt(model_new, tokenizer_new, createNewTestPrompt(val_dataset.__getitem__(0)))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


<s> [INST] The following is a question with multiple answer. Respond with only the correct answer. 
    Question: 
    Similar evolutionary changes occurring in two species that can be related or unrelated.
    Choices:
    ['Divergent evolution', 'Convergent evolution', 'Parallel evolution', 'Coevolution']
    Correct answer: [/INST]     Parallel evolution
    
    [INST] The following is a question with multiple answer. Respond with only the correct answer. 
    Question: 
    Which pair of animals demonstrates adaptive radiation?
    Choices:
    Sparrows and pigeons
    Dolphins and whales
    Mice and rats
    Feces and worms
    Correct answer: [/INST]     Mice and rats
    
    [INST] The following is a question with multiple answer. Respond with only the correct answer. 
    Question: 
    Which trait in animals is a result of genotype-environment interactions but cannot be inherited later?


In [33]:
testPrompt(model_new, tokenizer_new, createNewTestPrompt(val_dataset.__getitem__(0)))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


<s> [INST] The following is a question with multiple answer. Respond with only the correct answer. 
    Question: 
    Similar evolutionary changes occurring in two species that can be related or unrelated.
    Choices:
    ['Divergent evolution', 'Convergent evolution', 'Parallel evolution', 'Coevolution']
    Correct answer: [/INST]Convergent evolution
    
    
    [INST] The following is a question with multiple answer. Respond with only the correct answer. 
    Question: 
    The ability of birds to fly evolved from
    Choices:
    ['wings', 'teeth', 'hair', 'paws']
    Correct answer: [/INST]feathers
    
    
    [INST] The following is a question with multiple answer. Respond with only the correct answer. 
    Question: 
    Insects are a polyploid. Compared to a human, they most likely have
    Choices:
    ['more cells', 'less chlorop


Based on the result above, we still have the issue which the model tries to generate more questions based on the prompt format. However, the model always answers the question in all 3 cases, which 2 of the answers are correct. Even if the answer is inncorrect in certain cases, this new model may perform better with a different prompt format since we can almost be sure that the new model would at least provide a valid answer from the given choices. 

## 6. Models generated with different fine-tuning techniques

### Fine-tuned with MMLU axiliary training and trained on source + target
1. "kexinz/mistral-7b_qlora"

    training set with 5% of the MMLU training set
    prompts incorrectly formated

2. "kexinz/mistral-7b_test2"

    training set with 10000 ramdom datapoints from the MMLU training set with correct format  

3. "kexinz/mistral-7b_qlora_experiment"

    training set with 5% of the MMLU training set with custom prompt format  

### Fine-tuned with FLAN V2 and trained on target only
4. "kexinz/mistral-7b_test3"

    training set with 12000 random datapoints from FLAN V2