In [None]:
# run the following command to install the required packages into your environments
# pip install -r requirements.txt

In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from typing import List
import torch
import pandas as pd
import json
import termtables as tt
import numpy as np
from datasets import Dataset, load_dataset
import sys
import os

# ----------------------------------------------------------------------------------------------------------------------
# For LoRA:

from transformers import TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model, AutoPeftModelForCausalLM

# ----------------------------------------------------------------------------------------------------------------------
# For benchmarking:

from deepeval.benchmarks import MMLU, TruthfulQA
from deepeval.benchmarks.tasks import MMLUTask, TruthfulQATask
from deepeval.benchmarks.modes import TruthfulQAMode
from deepeval.models.base_model import DeepEvalBaseLLM
# ----------------------------------------------------------------------------------------------------------------------

import yaml
from mergekit.config import MergeConfiguration
from mergekit.merge import MergeOptions, run_merge
# ----------------------------------------------------------------------------------------------------------------------

torch_device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
print(f"Using device: {torch_device}")

# ----------------------------------------------------------------------------------------------------------------------
### Change working directory to huggingface home directory
# This directory needs to contain all the models and tokenizers, meaning there should be a lot of available room in this directory

# To set HF_HOME directory, run the following command in terminal:
# export HF_HOME=/path/to/HF_HOME/directory

# After this, you also need huggingface access tokens to the huggingface models, 
# so go to the huggingface website and create an access token
# then go the page of the model you need and click to verify your intent with the model
# then write in the terminal:
# huggingface-cli login
# and paste the access token

# '/path/to/HF_HOME/directory'
os.chdir('/dtu/blackhole/06/187238/cache') # Change this to the directory where you want to save the models
print("Current working directory:", os.getcwd())

  backends.update(_get_backends("networkx.backends"))


Using device: cuda:0
Current working directory: /dtu/blackhole/06/187238/cache


In [8]:
# List of available models
models = { 
#   name                  : path
    "Mistral 7B"          : "mistralai/Mistral-7B-v0.1",
    "Mistral 7B Instruct" : "mistralai/Mistral-7B-Instruct-v0.1",
    "BioMistral 7B"       : "BioMistral/BioMistral-7B",
    "MetaMath 7B"         : "meta-math/MetaMath-Mistral-7B",
    "MetaBioMerge 7B"     : "./hub/models--MetaBioMerged--MathBio--MistralInstruct--7B",
    "LoRA-Bio 7B"         : "./hub/models--LoRABio--MistralInstruct--PubMedQA--FullDataset--7B--v1--unloaded",
    "LoRA-Math 7B"        : "./hub/models--LoRAMath--MistralInstruct--MetaMathQA--FullDataset--7B--v1--unloaded",
    "LoRA-Merged 7B"      : "./hub/models--LoRA_Merged--MathBio--MistralInstruct--FullDatasets--7B"
}
# List of available datasets
datasets = {
    "Math" : "meta-math/MetaMathQA",    
    "Bio" : "qiaojin/PubMedQA"
}
labels = {"Math" : ["query", "reponse"],
          "Bio" : ["question", "long_answer"]
}

---

# LoRA Code
Need torch_device to be a gpu with atleast 30 GB VRAM

In [4]:
model_name = "Mistral 7B Instruct"
dataset_name = "Bio"



question_label = labels[dataset_name][0]
answer_label = labels[dataset_name][1]

print("selected:")
print("model:"+model_name)
print("dataset:"+dataset_name)

# 1. Load the model and tokenizer
print("Loading model and tokenizer... ")
tokenizer = AutoTokenizer.from_pretrained(models[model_name])
model = AutoModelForCausalLM.from_pretrained(models[model_name], torch_dtype=torch.float16)
print("Loading model and tokenizer... Done")
tokenizer.pad_token = tokenizer.eos_token

# 2. Define the LoRA configuration
lora_config = LoraConfig(
    r=8,  # LoRA rank
    lora_alpha=32,  # Scaling factor
    target_modules=["q_proj", "v_proj"],  # Target attention layers   # Check if this is correct
    lora_dropout=0.1,  # Dropout probability
    bias="none"  # Don't train biases
)

# Wrap the model with LoRA
model = get_peft_model(model, lora_config)


# 3. Prepare the dataset
if dataset_name == "Bio":
    ds = load_dataset(datasets[dataset_name], "pqa_artificial")
else:
    ds = load_dataset(datasets[dataset_name])


total_data_points = 400000 # 400000 is all the datapoints in the dataset. Change this to a smaller number to train on a smaller dataset (Takes much less time)
# For testing purposes, try only 400 data points

ds = ds['train'].to_pandas()[:total_data_points]

# Split the dataset
split = 0.95
split_idx = int(len(ds) * split)
train_data_raw = ds[:split_idx]
eval_data_raw = ds[split_idx:]


# Preprocess the dataset
train_data = []
eval_data = []
for i in range(len(train_data_raw)):
    train_data.append({"prompt": train_data_raw.iloc[i][question_label], "answer": train_data_raw.iloc[i][answer_label]})
for i in range(len(eval_data_raw)):
    eval_data.append({"prompt": eval_data_raw.iloc[i][question_label], "answer": eval_data_raw.iloc[i][answer_label]})

def preprocess_function(example):
    prompt = example["prompt"]
    answer = example["answer"]
    tokenized = tokenizer(
        prompt,
        answer,
        max_length=512,
        padding="max_length",
        truncation=True,
        return_tensors="pt"
    )
    tokenized["labels"] = tokenized["input_ids"].clone()
    return tokenized

train_dataset = Dataset.from_list(train_data).map(preprocess_function, batched=True)
eval_dataset = Dataset.from_list(eval_data).map(preprocess_function, batched=True)


train_data_length = len(train_dataset)
epochs = 1.5
batch_size = 4
gradient_accumulation_steps = 2
effective_batch_size = gradient_accumulation_steps*batch_size

total_steps = epochs*train_data_length/effective_batch_size

# Creating output model directory
dataset_name_ = datasets[dataset_name][datasets[dataset_name].find('/')+1:]
print(dataset_name_)
Output_Model_Dir = f"LoRA{dataset_name}--MistralInstruct--"
Output_Model_Dir += dataset_name_
if total_data_points == 400000:
    Output_Model_Dir += "--FullDataset"
Output_Model_Dir += "--7B--v1"
print(Output_Model_Dir)

# 4. Training configuration
training_args = TrainingArguments(
    output_dir=f"./hub/models--{Output_Model_Dir}", # Output directory
    eval_strategy="steps", # Evaluate every 500 steps
    save_strategy="steps", # Save every 500 steps
    save_steps=int(total_steps/10),
    use_cpu = False,
    per_device_train_batch_size=batch_size, # Batch size per GPU
    #auto_find_batch_size = True,
    gradient_accumulation_steps=gradient_accumulation_steps, # Accumulate gradients
    num_train_epochs=epochs,
    learning_rate=2e-4,
    fp16=True, # Use mixed precision
    logging_strategy="steps", # Log every 100 steps
    logging_dir="./logs", # Logs
    logging_steps=int(total_steps/15), # Log every 100 steps
    save_total_limit=10, # Save only the last 10 checkpoints
    report_to="none", # Don't report to Hugging Face
)


#training_args = training_args.set_dataloader(train_batch_size=10, eval_batch_size=64)

print("#########################################################################")
print(f"Total steps:{total_steps}")
print(f"Batch size: {training_args.train_batch_size}")
print(f"Effective Batch size: {effective_batch_size}")
print("#########################################################################")

# 5. Trainer setup
trainer = Trainer(
    model=model,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    args=training_args,
    tokenizer=tokenizer
)

# 6. Fine-tune the model
print("Fine-tuning the model... ")
trainer.train()
print("Fine-tuning the model... Done")

# File path where you want to save the log
if not os.path.exists("./log"):
    os.makedirs("./log")
log_file_path = f"./log/Log_History--{Output_Model_Dir}.json"

# Save log history to a file
with open(log_file_path, "w") as log_file:
    json.dump(trainer.state.log_history, log_file, indent=4)

# 7. Save the fine-tuned LoRA model
model.save_pretrained(f"./hub/models--{Output_Model_Dir}")
tokenizer.save_pretrained(f"./hub/models--{Output_Model_Dir}")

selected:
model:Mistral 7B Instruct
dataset:Bio
Loading model and tokenizer... 


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading model and tokenizer... Done


Map:   0%|          | 0/380 [00:00<?, ? examples/s]

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

PubMedQA
LoRABio--MistralInstruct--PubMedQA--7B--v1


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
  trainer = Trainer(


#########################################################################
Total steps:71.25
Batch size: 4
Effective Batch size: 8
#########################################################################
Fine-tuning the model... 


Step,Training Loss,Validation Loss
4,3.0244,No log
8,2.0953,No log
12,0.8866,No log
16,0.8535,No log
20,0.8319,No log
24,0.8205,No log
28,0.7788,No log
32,0.7885,No log
36,0.7155,No log
40,0.6402,No log


Fine-tuning the model... Done


('./hub/models--LoRABio--MistralInstruct--PubMedQA--7B--v1/tokenizer_config.json',
 './hub/models--LoRABio--MistralInstruct--PubMedQA--7B--v1/special_tokens_map.json',
 './hub/models--LoRABio--MistralInstruct--PubMedQA--7B--v1/tokenizer.model',
 './hub/models--LoRABio--MistralInstruct--PubMedQA--7B--v1/added_tokens.json',
 './hub/models--LoRABio--MistralInstruct--PubMedQA--7B--v1/tokenizer.json')

### Unpacking the PEFT Model

In [4]:
# Turn peft adapter model into a transformers model and save it
model_path_peft = "models--LoRAMath--MistralInstruct--MetaMathQA--FullDataset--7B--v1" # Model path from LoRA training

# Local path, where the model is saved and select last checkpoint
model_id = f"./hub/{model_path_peft}/checkpoint-70359"
peft_model = AutoPeftModelForCausalLM.from_pretrained(model_id)
print(type(peft_model))

merged_model = peft_model.merge_and_unload()
# The adapters are merged now and it is transformers class again
print(type(merged_model))

merged_model.save_pretrained(f'./hub/{model_path_peft}--unloaded')

# Load and save tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_id)  # or use the base model directory
tokenizer.save_pretrained(f'./hub/{model_path_peft}--unloaded')

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

<class 'peft.peft_model.PeftModelForCausalLM'>
<class 'transformers.models.mistral.modeling_mistral.MistralForCausalLM'>


('./hub/models--LoRAMath--MistralInstruct--MetaMathQA--FullDataset--7B--v1--unloaded/tokenizer_config.json',
 './hub/models--LoRAMath--MistralInstruct--MetaMathQA--FullDataset--7B--v1--unloaded/special_tokens_map.json',
 './hub/models--LoRAMath--MistralInstruct--MetaMathQA--FullDataset--7B--v1--unloaded/tokenizer.model',
 './hub/models--LoRAMath--MistralInstruct--MetaMathQA--FullDataset--7B--v1--unloaded/added_tokens.json',
 './hub/models--LoRAMath--MistralInstruct--MetaMathQA--FullDataset--7B--v1--unloaded/tokenizer.json')

---

# Merging

Need 120GB of RAM (No need for GPU)

In [None]:
# Example of configuration file:

# models:
#   - model: ./hub/models--MistralInstruct--BioLoRA--FullDataset--7B--v1--unloaded
#     parameters:
#       density: 0.5
#       weight: 0.5
#   - model: ./hub/models--MistralInstruct--MathLoRA--FullDataset--7B--v1--unloaded
#     parameters:
#       density: 0.5
#       weight: 0.5
# merge_method: dare_ties
# base_model: mistralai/Mistral-7B-Instruct-v0.1
# parameters:
#   normalize: false
#   int8_mask: true
# dtype: float16

In [7]:
OUTPUT_PATH = "./hub/models--MetaBioMerged--MathBio--MistralInstruct--7B"  # folder to store the result in
LORA_MERGE_CACHE = "../mergekit/merge_cache"  # change if you want to keep these for some reason. "mergekit" is where you git cloned it to
CONFIG_YML = "./MergeConfigs/meta-bio.yml"  # merge configuration file. See above for an example or in the folder "Merge Config Files" on the Github
# Make sure the CONFIG_YML path is correct
COPY_TOKENIZER = True  # you want a tokenizer? yeah, that's what i thought
LAZY_UNPICKLE = False  # experimental low-memory model loader
LOW_CPU_MEMORY = False  # enable if you somehow have more VRAM than RAM+swap

In [8]:
with open(CONFIG_YML, "r", encoding="utf-8") as fp:
    merge_config = MergeConfiguration.model_validate(yaml.safe_load(fp))

run_merge(
    merge_config,
    out_path=OUTPUT_PATH,
    options=MergeOptions(
        lora_merge_cache=LORA_MERGE_CACHE,
        cuda=torch.cuda.is_available(),
        copy_tokenizer=COPY_TOKENIZER,
        lazy_unpickle=LAZY_UNPICKLE,
        low_cpu_memory=LOW_CPU_MEMORY,
    ),
)
print("Done!")

Warmup loader cache:   0%|                                                                                                         | 0/3 [00:00<?, ?it/s]

Fetching 9 files:   0%|          | 0/9 [00:00<?, ?it/s]

Warmup loader cache:  33%|████████████████████████████████▎                                                                | 1/3 [00:00<00:01,  1.38it/s]

Fetching 7 files:   0%|          | 0/7 [00:00<?, ?it/s]

  shard = torch.load(model_path, map_location="meta")
Warmup loader cache: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:09<00:00,  3.09s/it]
Executing graph: 100%|███████████████████████████████████████████████████████████████████████████████████████████████| 1748/1748 [12:58<00:00,  2.25it/s]


Done!


---

# Benchmarking

Need atleast 30 GB of VRAM on GPU

In [3]:
model_name = "LoRA-Merged 7B" # Choose model name from the list at the top


# Define wrapper class for models
class DeepEvalModelWrapper(DeepEvalBaseLLM):
    def __init__(
        self,
        model_name,
        model,
        tokenizer
    ):
        self.model_name = model_name
        self.model = model
        self.model.to(torch_device)
        self.tokenizer = tokenizer

    def load_model(self):
        return self.model

    def generate(self, prompt: str) -> str:
        def find_answer_choice(new_tokens, answer_choices):
            if new_tokens in ["A", "B", "C", "D"]:
                return new_tokens
            for answer_choice in answer_choices:
                if new_tokens in answer_choice:
                    return answer_choice[0]
            return None
        
        if self.model_name not in ["MetaMath 7B"]:
            prompt = prompt[:-55]
        answer_choices = [prompt[prompt.find("A. "):prompt.find("\n", prompt.find("A. "))].strip(), 
                          prompt[prompt.find("B. "):prompt.find("\n", prompt.find("B. "))].strip(), 
                            prompt[prompt.find("C. "):prompt.find("\n", prompt.find("C. "))].strip(), 
                            prompt[prompt.find("D. "):prompt.find("\n", prompt.find("D. "))].strip()]

        model_inputs = self.tokenizer([prompt], return_tensors="pt").to(torch_device)

        generated_ids = self.model.generate(**model_inputs, max_new_tokens=(50 if self.model_name in ["MetaMath 7B"] else 1), do_sample=True, pad_token_id=self.tokenizer.eos_token_id, temperature=0.0001)
        decoded_tokens = self.tokenizer.batch_decode(generated_ids)[0]

        new_tokens = decoded_tokens[len(prompt)+4+(1 if self.model_name in ['MetaMath 7B'] else 0):].strip()
        
        if self.model_name in ["MetaMath 7B"]:
            fac = find_answer_choice(new_tokens, answer_choices)
            if fac:
                return fac
            if "The answer is: " in new_tokens:
                new_tokens = new_tokens[new_tokens.find("The answer is: ")+15:]
            elif "he answer is: " in new_tokens:
                new_tokens = new_tokens[new_tokens.find("he answer is: ")+14:]
            fac = find_answer_choice(new_tokens, answer_choices)
            if fac:
                return fac
            new_tokens = new_tokens.replace("</s>", "")
            fac = find_answer_choice(new_tokens, answer_choices)
            if fac:
                return fac
            if new_tokens.strip() == "":
                return ""

            if new_tokens[-1] == ".":
                new_tokens = new_tokens[:-1]
                fac = find_answer_choice(new_tokens, answer_choices)
                if fac:
                    return fac
            if "\\text{" in new_tokens:
                new_tokens = new_tokens.replace("\\text{", "")
                new_tokens = new_tokens[:-1]
                fac = find_answer_choice(new_tokens, answer_choices)
                if fac:
                    return fac
            if new_tokens[0] == "(" or new_tokens[0] == "[":
                new_tokens = new_tokens[1:-1]
                fac = find_answer_choice(new_tokens, answer_choices)
                if fac:
                    return fac
        return new_tokens

    async def a_generate(self, prompt: str) -> str:
        return self.generate(prompt)

    # This is optional.
    def batch_generate(self, promtps: List[str]) -> List[str]:
        device = "cuda" # the device to load the model onto

        model_inputs = self.tokenizer(promtps, return_tensors="pt").to(device)
        self.model.to(device)

        generated_ids = self.model.generate(**model_inputs, max_new_tokens=100, do_sample=True)
        return self.tokenizer.batch_decode(generated_ids)

    def get_model_name(self):
        return self.model_name

In [4]:
# Quick command to load wrapped model
def load_model(model_name: str):
    model_name_path = models[model_name]
    global model_wrapped
    global model
    global tokenizer
    print("Loading model...")
    model = AutoModelForCausalLM.from_pretrained(model_name_path)
    print("Model loaded")
    print("Loading tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(model_name_path)
    print("Tokenizer loaded")
    print("Wrapping model...")
    model_wrapped = DeepEvalModelWrapper(model_name, model, tokenizer)
    print("Model wrapped")

In [9]:
# Load chosen model
load_model(model_name)

Loading model...


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Model loaded
Loading tokenizer...
Tokenizer loaded
Wrapping model...
Model wrapped


In [10]:
def quick_test(model):
    print('### Q1 ###')
    print('---')
    print('Correct: A')
    print(model.generate("In a population of giraffes, an environmental change occurs that favors individuals that are tallest. As a result, more of the taller individuals are able to obtain nutrients and survive to pass along their genetic information. This is an example of\nA. directional selection.\nB. stabilizing selection.\nC. sexual selection.\nD. disruptive selection.\nAnswer:\n\nOutput 'A', 'B', 'C', or 'D'. Full answer not needed."))
    print('---')
    print('### Q2 ###')
    print('Correct: A')
    print(model.generate("Which of the changes below following the start codon in an mRNA would most likely have the greatest deleterious effect?\nA. a deletion of a single nucleotide\nB. a deletion of a nucleotide triplet\nC. a single nucleotide substitution of the nucleotide occupying the first codon position\nD. a single nucleotide substitution of the nucleotide occupying the third codon position\nAnswer:\n\nOutput 'A', 'B', 'C', or 'D'. Full answer not needed."))
    print('---')
    print('### Q3 ###')
    print('Correct: C')
    print(model.generate("The energy given up by electrons as they move through the electron transport chain is used to\nA. break down glucose\nB. make glucose\nC. produce ATP\nD. make NADH\nAnswer:\n\nOutput 'A', 'B', 'C', or 'D'. Full answer not needed."))

    print('### Math Questions ###')
    print('---')
    print('### Q1 ###')
    print('Correct: A')
    print(model.generate("What is 5 minus 2?\nA. 3\nB. 5\nC. 25\nD. 50\nAnswer:\n\nOutput 'A', 'B', 'C', or 'D'. Full answer not needed."))
    print('---')
    print('### Q2 ###')
    print('Correct: A')
    print(model.generate("The following are multiple choice questions (with answers) about high school biology.\n\nWhich of the following is not a way to form recombinant DNA?\nA. Translation\nB. Conjugation\nC. Specialized transduction\nD. Transformation\nAnswer:\n\nOutput 'A', 'B', 'C', or 'D'. Full answer not needed."))
    print('---')
    print('### Q3 ###')
    print('Correct: D')
    print(model.generate("If a metamath_7bn P with vertices at (– 2, – 4), (– 4, 1), (–1, 4), (2, 4), and (3, 0) is reflected across the line y = x to get a new pentagon, P’, then one of the vertices of P’ is\nA. (0, – 3)\nB. (4, 1)\nC. (2, 2)\nD. (– 4, –2)\nAnswer:\n\nOutput 'A', 'B', 'C', or 'D'. Full answer not needed."))

In [11]:
quick_test(model_wrapped)

### Q1 ###
---
Correct: A
A
---
### Q2 ###
Correct: A
B
---
### Q3 ###
Correct: C
D
### Math Questions ###
---
### Q1 ###
Correct: A
A
---
### Q2 ###
Correct: A
A
---
### Q3 ###
Correct: D
B


In [12]:
# Define evaluation benchmarks
n_shots = 0

mm_tasks_math = [MMLUTask.HIGH_SCHOOL_MATHEMATICS,
                 MMLUTask.ABSTRACT_ALGEBRA,
                 MMLUTask.MACHINE_LEARNING,
                 MMLUTask.ELEMENTARY_MATHEMATICS,
                 MMLUTask.COLLEGE_MATHEMATICS,
                 MMLUTask.FORMAL_LOGIC,
                 MMLUTask.HIGH_SCHOOL_STATISTICS]

mm_tasks_bio = [MMLUTask.CLINICAL_KNOWLEDGE,
                MMLUTask.MEDICAL_GENETICS,
                MMLUTask.ANATOMY,
                MMLUTask.PROFESSIONAL_MEDICINE,
                MMLUTask.COLLEGE_BIOLOGY,
                MMLUTask.COLLEGE_MEDICINE,
                MMLUTask.HIGH_SCHOOL_BIOLOGY]

mm_tasks_all = mm_tasks_math + mm_tasks_bio

tqa_tasks = [TruthfulQATask.SCIENCE]
tqa_mode = TruthfulQAMode.MC1 # Use MC1 as a benchmark for pinpoint accuracy and MC2 for depth of understanding.

# Define benchmark with specific tasks and shots
all_benchmark = MMLU(
    tasks=mm_tasks_all,
    n_shots=n_shots
)

math_benchmark = MMLU(
    tasks=mm_tasks_all,
    n_shots=n_shots
)

bio_benchmark = MMLU(
    tasks=mm_tasks_all,
    n_shots=n_shots
)

TQABenchmark = TruthfulQA(
    tasks=tqa_tasks,
    mode=tqa_mode
)

In [13]:
benchmark = all_benchmark # Choose benchmark from the list above

benchmark.evaluate(model=model_wrapped)
results = benchmark.predictions

README.md:   0%|          | 0.00/28.7k [00:00<?, ?B/s]

mmlu.py:   0%|          | 0.00/5.01k [00:00<?, ?B/s]

Processing high_school_mathematics: 100%|██████████████████████████████████████████████████████████████████████████████| 270/270 [00:40<00:00,  6.69it/s]


MMLU Task Accuracy (task=high_school_mathematics): 0.3037037037037037


Processing abstract_algebra: 100%|█████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:13<00:00,  7.43it/s]


MMLU Task Accuracy (task=abstract_algebra): 0.35


Processing machine_learning: 100%|█████████████████████████████████████████████████████████████████████████████████████| 112/112 [00:16<00:00,  6.60it/s]


MMLU Task Accuracy (task=machine_learning): 0.45535714285714285


Processing elementary_mathematics: 100%|███████████████████████████████████████████████████████████████████████████████| 378/378 [00:54<00:00,  7.00it/s]


MMLU Task Accuracy (task=elementary_mathematics): 0.3253968253968254


Processing college_mathematics: 100%|██████████████████████████████████████████████████████████████████████████████████| 100/100 [00:15<00:00,  6.42it/s]


MMLU Task Accuracy (task=college_mathematics): 0.28


Processing formal_logic: 100%|█████████████████████████████████████████████████████████████████████████████████████████| 126/126 [00:24<00:00,  5.14it/s]


MMLU Task Accuracy (task=formal_logic): 0.35714285714285715


Processing high_school_statistics: 100%|███████████████████████████████████████████████████████████████████████████████| 216/216 [00:44<00:00,  4.81it/s]


MMLU Task Accuracy (task=high_school_statistics): 0.3611111111111111


Processing clinical_knowledge: 100%|███████████████████████████████████████████████████████████████████████████████████| 265/265 [00:37<00:00,  7.16it/s]


MMLU Task Accuracy (task=clinical_knowledge): 0.5471698113207547


Processing medical_genetics: 100%|█████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:13<00:00,  7.49it/s]


MMLU Task Accuracy (task=medical_genetics): 0.53


Processing anatomy: 100%|██████████████████████████████████████████████████████████████████████████████████████████████| 135/135 [00:18<00:00,  7.11it/s]


MMLU Task Accuracy (task=anatomy): 0.4666666666666667


Processing professional_medicine: 100%|████████████████████████████████████████████████████████████████████████████████| 272/272 [01:21<00:00,  3.32it/s]


MMLU Task Accuracy (task=professional_medicine): 0.5036764705882353


Processing college_biology: 100%|██████████████████████████████████████████████████████████████████████████████████████| 144/144 [00:23<00:00,  6.20it/s]


MMLU Task Accuracy (task=college_biology): 0.4930555555555556


Processing college_medicine: 100%|█████████████████████████████████████████████████████████████████████████████████████| 173/173 [00:34<00:00,  5.03it/s]


MMLU Task Accuracy (task=college_medicine): 0.43352601156069365


Processing high_school_biology: 100%|██████████████████████████████████████████████████████████████████████████████████| 310/310 [00:50<00:00,  6.14it/s]

MMLU Task Accuracy (task=high_school_biology): 0.6
Overall MMLU Accuracy: 0.43391336542021475





In [14]:
# print all results
print(f'Scores for {model_wrapped.get_model_name()}:')

math_rows = 0
for task in mm_tasks_math:
    math_rows += len(math_benchmark.load_benchmark_dataset(task))
results_math = results.iloc[:math_rows]
results_bio = results.iloc[math_rows:]

math_mean_score = results_math['Correct'].mean()
bio_mean_score = results_bio['Correct'].mean()

print("-"*50)
print("Counting all answers: ")
print("     Accuracy math score:     " + str(math_mean_score))
print("     Accuracy bio score:      " + str(bio_mean_score))
print("     Accuracy overall score:  " + str(benchmark.overall_score))

## Counting only correctly formatted answers

results_correct_format = results[results['Prediction'].isin(["A", "B", "C", "D"])]
acc_all_correct_format = results_correct_format['Correct'].mean()

results_math_correct_format = results_math[results_math['Prediction'].isin(["A", "B", "C", "D"])]
acc_math_correct_format = results_math_correct_format['Correct'].mean()

results_bio_correct_format = results_bio[results_bio['Prediction'].isin(["A", "B", "C", "D"])]
acc_bio_correct_format = results_bio_correct_format['Correct'].mean()
print("-"*50)
print("Counting only correctly formatted answers:")
print("     Accuracy math score:     " + str(acc_math_correct_format))
print("     Accuracy bio score:      " + str(acc_bio_correct_format))
print("     Accuracy overall score:  " + str(acc_all_correct_format))

print("-"*50)
print("Benchmark dataset sizes:")
print("     Number of correctly formatted answers in math:  " + str(len(results_math_correct_format)) + " out of " + str(len(results_math)) + f". ({100*len(results_math_correct_format)/len(results_math):.5f}%)")
print("     Number of correctly formatted answers in bio:   " + str(len(results_bio_correct_format)) + " out of " + str(len(results_bio)) + f". ({100*len(results_bio_correct_format)/len(results_bio):.5f}%)")
print("     Number of correctly formatted answers overall:  " + str(len(results_correct_format)) + " out of " + str(len(results)) + f". ({100*len(results_correct_format)/len(results):.5f}%)")

print("-"*50)
pd.set_option("display.max_rows", None)
benchmark.task_scores

Scores for LoRA-Merged 7B:
--------------------------------------------------
Counting all answers: 
     Accuracy math score:     0.33947772657450076
     Accuracy bio score:      0.5218012866333095
     Accuracy overall score:  0.43391336542021475
--------------------------------------------------
Counting only correctly formatted answers:
     Accuracy math score:     0.33947772657450076
     Accuracy bio score:      0.5218012866333095
     Accuracy overall score:  0.43391336542021475
--------------------------------------------------
Benchmark dataset sizes:
     Number of correctly formatted answers in math:  1302 out of 1302. (100.00000%)
     Number of correctly formatted answers in bio:   1399 out of 1399. (100.00000%)
     Number of correctly formatted answers overall:  2701 out of 2701. (100.00000%)
--------------------------------------------------


Unnamed: 0,Task,Score
0,high_school_mathematics,0.303704
1,abstract_algebra,0.35
2,machine_learning,0.455357
3,elementary_mathematics,0.325397
4,college_mathematics,0.28
5,formal_logic,0.357143
6,high_school_statistics,0.361111
7,clinical_knowledge,0.54717
8,medical_genetics,0.53
9,anatomy,0.466667


In [15]:
# pd.set_option("display.max_rows", None)
pd.set_option("display.max_rows", 12)
print(results['Prediction'])
print(results['Input'][2])

0       B
1       B
2       A
3       B
4       B
       ..
2696    D
2697    A
2698    D
2699    B
2700    D
Name: Prediction, Length: 2701, dtype: object
A positive integer n is called “powerful” if, for every prime factor p of n, p^2 is also a factor of n. An example of a powerful number is
A. 392
B. 336
C. 300
D. 297
Answer:
