# Requirements

In [11]:
# ! pip install -U deepeval
# ! pip install numpy
# ! pip install transformers
# ! pip3 install torch torchvision torchaudio

In [86]:
# ! pip install sentencepiece
# ! pip install tokenizers
# ! pip install accelerate
# ! pip install bitsandbytes
# ! pip install vllm
# ! pip install fraction
# ! pip install protobuf
# ! pip install termtables

In [1]:
from deepeval.benchmarks import MMLU, TruthfulQA
from deepeval.benchmarks.tasks import MMLUTask, TruthfulQATask
from deepeval.benchmarks.modes import TruthfulQAMode
from transformers import AutoModelForCausalLM, AutoTokenizer
from deepeval.models.base_model import DeepEvalBaseLLM
from typing import List
import torch
import pandas as pd
import json
import termtables as tt
import numpy as np

# Device used for the model and its inputs throughout the notebook:
# the first CUDA GPU when PyTorch can see one, otherwise the CPU.
torch_device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
print(f"Using device: {torch_device}")

Using device: cuda:0


---

# Model Loading

In [2]:
# Configuration: cache directory, model selection and the registry of known models.
import os

# Remember to set HF_HOME to the blackhole-cache directory beforehand.
# The default below is the original author's location; set the environment
# variable BLACKHOLE_CACHE_DIR to use another directory without editing the cell.
blackhole_dir = os.environ.get("BLACKHOLE_CACHE_DIR", "/dtu/blackhole/06/187238/cache")
model_name = "Mistral Instruct 7B" # Choose model name from the list below

# Fail with an actionable message instead of a bare chdir error.
if not os.path.isdir(blackhole_dir):
    raise FileNotFoundError(
        f"Cache directory {blackhole_dir!r} does not exist; "
        "set BLACKHOLE_CACHE_DIR or edit blackhole_dir."
    )
# Relative model paths in the registry below (e.g. "./hub/...") are presumably
# meant to be resolved from this directory, hence the change of working directory.
os.chdir(blackhole_dir)

# List of model names:
#--------------------------------------------------------------------------------
models = {
#   name                  : path
    "Mistral 7B"          : "mistralai/Mistral-7B-v0.1",
    "Mistral Instruct 7B" : "mistralai/Mistral-7B-Instruct-v0.1",
    "BioMistral 7B"       : "BioMistral/BioMistral-7B",
    "MetaMath 7B"         : "meta-math/MetaMath-Mistral-7B",
    "MetaBioMerge 7B"     : "./hub/models--merge--meta--bio--7B"          ##### This is what my model is called, so please give yours the same name
}
#--------------------------------------------------------------------------------

# Catch a typo in the selection here rather than when the model is loaded.
if model_name not in models:
    raise KeyError(f"Unknown model_name {model_name!r}; choose one of: {', '.join(models)}")

In [3]:
# Define wrapper class for models
class DeepEvalModelWrapper(DeepEvalBaseLLM):
    """Expose a Hugging Face causal language model through the ``DeepEvalBaseLLM`` interface.

    ``generate`` is tailored to four-option multiple-choice prompts. Most models
    are asked for a single new token; the models listed in ``VERBOSE_MODELS``
    get 50 new tokens and their free-text reply is mapped back to an answer
    letter when possible.
    """

    # Trailing instruction carried by the multiple-choice prompts in this notebook
    # (see quick_test). It is 55 characters long, i.e. exactly what the previous
    # hard-coded ``prompt[:-55]`` removed.
    MMLU_INSTRUCTION_SUFFIX = "\n\nOutput 'A', 'B', 'C', or 'D'. Full answer not needed."
    # Models that keep the instruction and answer in free text.
    VERBOSE_MODELS = ("MetaMath 7B",)
    ANSWER_LETTERS = ("A", "B", "C", "D")

    def __init__(
        self,
        model_name,
        model,
        tokenizer
    ):
        """Store the model and tokenizer and move the model to ``torch_device``.

        Args:
            model_name: Display name; also selects the generation/parsing mode.
            model: Object providing ``to(device)`` and ``generate(...)``.
            tokenizer: Tokenizer matching ``model``.
        """
        self.model_name = model_name
        self.model = model
        self.model.to(torch_device)
        self.tokenizer = tokenizer

    def load_model(self):
        """Return the already loaded model."""
        return self.model

    @classmethod
    def _extract_answer_choices(cls, prompt: str) -> List[str]:
        """Return the option lines ("A. ...", "B. ...", ...) of the last question in ``prompt``.

        Options are located by their position at the start of a line, searching
        from the end of the prompt. This avoids matching text inside the question
        (e.g. "... DNA. ") and picks the final question rather than an earlier
        example when the prompt contains several. Options that cannot be found
        are left out.
        """
        choices = []
        for letter in cls.ANSWER_LETTERS:
            start = prompt.rfind(f"\n{letter}. ")
            if start == -1:
                continue
            start += 1  # skip the newline that anchors the match
            end = prompt.find("\n", start)
            line = (prompt[start:] if end == -1 else prompt[start:end]).strip()
            if line:
                choices.append(line)
        return choices

    @classmethod
    def _match_answer_choice(cls, candidate: str, answer_choices: List[str]):
        """Map ``candidate`` to an answer letter, or return ``None``.

        A bare letter is returned as-is; otherwise the first option whose text
        contains ``candidate`` wins. An empty candidate never matches: the empty
        string is a substring of every option, which previously turned empty
        generations into the first option.
        """
        candidate = candidate.strip()
        if not candidate:
            return None
        if candidate in cls.ANSWER_LETTERS:
            return candidate
        for choice in answer_choices:
            if candidate in choice:
                return choice[0]
        return None

    def _parse_verbose_answer(self, text: str, answer_choices: List[str]) -> str:
        """Reduce a free-text reply to an answer letter through successive clean-up steps.

        After each step the remaining text is matched against the options.
        Returns the letter on the first match, ``""`` when nothing but markup is
        left, and otherwise the partially cleaned text.
        """
        letter = self._match_answer_choice(text, answer_choices)
        if letter:
            return letter

        # Keep only what follows the "answer is" marker; the second form also
        # covers a lowercase "the answer is: ".
        if "The answer is: " in text:
            text = text[text.find("The answer is: ") + 15:]
        elif "he answer is: " in text:
            text = text[text.find("he answer is: ") + 14:]
        letter = self._match_answer_choice(text, answer_choices)
        if letter:
            return letter

        text = text.replace("</s>", "")
        letter = self._match_answer_choice(text, answer_choices)
        if letter:
            return letter
        if text.strip() == "":
            return ""

        if text[-1] == ".":
            text = text[:-1]
            letter = self._match_answer_choice(text, answer_choices)
            if letter:
                return letter
        if "\\text{" in text:
            # Drop the LaTeX wrapper and its closing brace.
            text = text.replace("\\text{", "")
            text = text[:-1]
            letter = self._match_answer_choice(text, answer_choices)
            if letter:
                return letter
        # ``text`` may have become empty above, so guard the index access.
        if text and text[0] in ("(", "["):
            text = text[1:-1]
            letter = self._match_answer_choice(text, answer_choices)
            if letter:
                return letter
        return text

    def generate(self, prompt: str) -> str:
        """Answer a multiple-choice ``prompt``.

        Args:
            prompt: Question with options "A. " to "D. ", optionally ending with
                ``MMLU_INSTRUCTION_SUFFIX``.

        Returns:
            The stripped completion. For ``VERBOSE_MODELS`` this is an answer
            letter whenever the reply can be matched to one.
        """
        verbose = self.model_name in self.VERBOSE_MODELS
        # Single-token models should see the prompt end right after "Answer:".
        # Only strip the instruction when it is really there.
        if not verbose and prompt.endswith(self.MMLU_INSTRUCTION_SUFFIX):
            prompt = prompt[:-len(self.MMLU_INSTRUCTION_SUFFIX)]

        model_inputs = self.tokenizer([prompt], return_tensors="pt").to(torch_device)
        prompt_length = model_inputs["input_ids"].shape[1]

        # Greedy decoding keeps benchmark runs reproducible; it replaces sampling
        # at temperature 1e-4, which was only approximately deterministic.
        generated_ids = self.model.generate(
            **model_inputs,
            max_new_tokens=(50 if verbose else 1),
            do_sample=False,
            pad_token_id=self.tokenizer.eos_token_id,
        )
        # Decode only the newly generated ids instead of slicing the decoded
        # string by character offsets, which breaks as soon as the decoded prompt
        # differs in length from the original text.
        new_tokens = self.tokenizer.decode(
            generated_ids[0][prompt_length:], skip_special_tokens=True
        ).strip()

        if verbose:
            return self._parse_verbose_answer(new_tokens, self._extract_answer_choices(prompt))
        return new_tokens

    async def a_generate(self, prompt: str) -> str:
        """Async entry point; delegates to the synchronous ``generate``."""
        return self.generate(prompt)

    # This is optional.
    def batch_generate(self, promtps: List[str]) -> List[str]:
        """Generate up to 100 sampled new tokens for each prompt and return the decoded sequences.

        The misspelled parameter name ``promtps`` is kept so keyword callers
        keep working.
        NOTE(review): the tokenizer is called without padding, so prompts that
        tokenize to different lengths will likely fail to batch — confirm before
        relying on this method.
        """
        # Use the notebook-wide device rather than a hard-coded "cuda", so the
        # CPU fallback works; the model was already moved there in __init__.
        model_inputs = self.tokenizer(promtps, return_tensors="pt").to(torch_device)

        generated_ids = self.model.generate(**model_inputs, max_new_tokens=100, do_sample=True)
        return self.tokenizer.batch_decode(generated_ids)

    def get_model_name(self):
        """Return the display name given at construction."""
        return self.model_name

In [4]:
# Reset loaded models to avoid memory issues
def free_memory():
    """Drop the notebook-level model objects and release cached GPU memory.

    Removes the globals ``model_wrapped``, ``model`` and ``tokenizer`` if they
    exist, runs the garbage collector, and empties the CUDA cache when CUDA is
    available. Safe to call when nothing is loaded.

    The earlier version used ``del`` on these names without declaring them
    ``global``. That made them function-local, so the first ``del`` raised
    ``UnboundLocalError``, the bare ``except`` hid it, and nothing was freed.
    """
    import gc

    namespace = globals()
    for name in ("model_wrapped", "model", "tokenizer"):
        namespace.pop(name, None)

    # Collect first so the tensors are actually released before the cache is emptied.
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    
# Quick command to load wrapped model
def load_model(model_name: str):
    """Load the model registered under ``model_name`` and wrap it for deepeval.

    Sets the globals ``model``, ``tokenizer`` and ``model_wrapped``. Any
    previously loaded objects are released first via ``free_memory``.

    Args:
        model_name: A key of the ``models`` registry.

    Raises:
        KeyError: If ``model_name`` is not in ``models``. The check runs before
            anything is released, so a typo does not discard the loaded model.
    """
    global model_wrapped
    global model
    global tokenizer

    if model_name not in models:
        raise KeyError(f"Unknown model name {model_name!r}; choose one of: {', '.join(models)}")
    model_name_path = models[model_name]

    free_memory()
    print("Loading model...")
    model = AutoModelForCausalLM.from_pretrained(model_name_path)
    print("Model loaded")
    print("Loading tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(model_name_path)
    print("Tokenizer loaded")
    print("Wrapping model...")
    model_wrapped = DeepEvalModelWrapper(model_name, model, tokenizer)
    print("Model wrapped")

In [5]:
# Load the model selected via `model_name` in the configuration cell.
# This sets the globals `model`, `tokenizer` and `model_wrapped`.
load_model(model_name)

Loading model...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Model loaded
Loading tokenizer...
Tokenizer loaded
Wrapping model...
Model wrapped


In [6]:
def quick_test(model):
    """Print the model's answers to six hand-picked multiple-choice questions.

    Each question is sent to ``model.generate`` with the usual answer
    instruction appended, and the reply is printed below the expected letter.

    Args:
        model: Any object with a ``generate(prompt) -> str`` method.
    """
    prompt_tail = "\nAnswer:\n\nOutput 'A', 'B', 'C', or 'D'. Full answer not needed."

    # (expected letter, question with options)
    first_section = [
        ("A", "In a population of giraffes, an environmental change occurs that favors individuals that are tallest. As a result, more of the taller individuals are able to obtain nutrients and survive to pass along their genetic information. This is an example of\nA. directional selection.\nB. stabilizing selection.\nC. sexual selection.\nD. disruptive selection."),
        ("A", "Which of the changes below following the start codon in an mRNA would most likely have the greatest deleterious effect?\nA. a deletion of a single nucleotide\nB. a deletion of a nucleotide triplet\nC. a single nucleotide substitution of the nucleotide occupying the first codon position\nD. a single nucleotide substitution of the nucleotide occupying the third codon position"),
        ("C", "The energy given up by electrons as they move through the electron transport chain is used to\nA. break down glucose\nB. make glucose\nC. produce ATP\nD. make NADH"),
    ]
    # Printed under the "Math Questions" heading; note that the second entry is
    # a biology question with a subject preamble.
    second_section = [
        ("A", "What is 5 minus 2?\nA. 3\nB. 5\nC. 25\nD. 50"),
        ("A", "The following are multiple choice questions (with answers) about high school biology.\n\nWhich of the following is not a way to form recombinant DNA?\nA. Translation\nB. Conjugation\nC. Specialized transduction\nD. Transformation"),
        # The question previously read "metamath_7bn P", a corrupted "pentagon P".
        ("D", "If a pentagon P with vertices at (– 2, – 4), (– 4, 1), (–1, 4), (2, 4), and (3, 0) is reflected across the line y = x to get a new pentagon, P’, then one of the vertices of P’ is\nA. (0, – 3)\nB. (4, 1)\nC. (2, 2)\nD. (– 4, –2)"),
    ]

    def ask(correct, question):
        print('Correct: ' + correct)
        print(model.generate(question + prompt_tail))

    for position, (correct, question) in enumerate(first_section, start=1):
        heading = f'### Q{position} ###'
        # The very first question prints its heading before the rule; this
        # layout is kept so the printed output stays exactly as before.
        if position == 1:
            print(heading)
            print('---')
        else:
            print('---')
            print(heading)
        ask(correct, question)

    print('### Math Questions ###')
    for position, (correct, question) in enumerate(second_section, start=1):
        print('---')
        print(f'### Q{position} ###')
        ask(correct, question)

In [7]:
# Smoke test: print the wrapped model's answers next to the expected letters.
quick_test(model_wrapped)

### Q1 ###
---
Correct: A
A
---
### Q2 ###
Correct: A
B
---
### Q3 ###
Correct: C
D
### Math Questions ###
---
### Q1 ###
Correct: A
A
---
### Q2 ###
Correct: A
A
---
### Q3 ###
Correct: D
A


---

# Model Evaluation

In [8]:
# Define evaluation benchmarks
n_shots = 0  # number of few-shot examples passed to every MMLU benchmark

# MMLU tasks grouped as "math" for this study.
mm_tasks_math = [MMLUTask.HIGH_SCHOOL_MATHEMATICS,
                 MMLUTask.ABSTRACT_ALGEBRA,
                 MMLUTask.MACHINE_LEARNING,
                 MMLUTask.ELEMENTARY_MATHEMATICS,
                 MMLUTask.COLLEGE_MATHEMATICS,
                 MMLUTask.FORMAL_LOGIC,
                 MMLUTask.HIGH_SCHOOL_STATISTICS]

# MMLU tasks grouped as "bio" for this study.
mm_tasks_bio = [MMLUTask.CLINICAL_KNOWLEDGE,
                MMLUTask.MEDICAL_GENETICS,
                MMLUTask.ANATOMY,
                MMLUTask.PROFESSIONAL_MEDICINE,
                MMLUTask.COLLEGE_BIOLOGY,
                MMLUTask.COLLEGE_MEDICINE,
                MMLUTask.HIGH_SCHOOL_BIOLOGY]

# Math tasks first, then bio tasks; the score report splits the predictions
# by position and relies on this order.
mm_tasks_all = mm_tasks_math + mm_tasks_bio

tqa_tasks = [TruthfulQATask.SCIENCE]
tqa_mode = TruthfulQAMode.MC1 # Use MC1 as a benchmark for pinpoint accuracy and MC2 for depth of understanding.

# Define benchmark with specific tasks and shots
all_benchmark = MMLU(
    tasks=mm_tasks_all,
    n_shots=n_shots
)

# Each subject benchmark now gets its own task list; both were previously
# built from mm_tasks_all, which made them copies of all_benchmark.
math_benchmark = MMLU(
    tasks=mm_tasks_math,
    n_shots=n_shots
)

bio_benchmark = MMLU(
    tasks=mm_tasks_bio,
    n_shots=n_shots
)

TQABenchmark = TruthfulQA(
    tasks=tqa_tasks,
    mode=tqa_mode
)

In [16]:
# NOTE: the score report cell splits `results` into math and bio rows by
# position, which is only meaningful when `all_benchmark` is selected here.
benchmark = all_benchmark # Choose benchmark from the list above

# Runs every task of the benchmark against the wrapped model (several minutes
# in the recorded run below).
benchmark.evaluate(model=model_wrapped)
# Per-question predictions; later cells read its 'Prediction', 'Correct' and
# 'Input' columns.
results = benchmark.predictions

Processing high_school_mathematics: 100%|██████████████████████████████████████████████████████████████████████████████| 270/270 [01:07<00:00,  4.02it/s]


MMLU Task Accuracy (task=high_school_mathematics): 0.3037037037037037


Processing abstract_algebra: 100%|█████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:20<00:00,  4.91it/s]


MMLU Task Accuracy (task=abstract_algebra): 0.33


Processing machine_learning: 100%|█████████████████████████████████████████████████████████████████████████████████████| 112/112 [00:24<00:00,  4.61it/s]


MMLU Task Accuracy (task=machine_learning): 0.45535714285714285


Processing elementary_mathematics: 100%|███████████████████████████████████████████████████████████████████████████████| 378/378 [01:28<00:00,  4.28it/s]


MMLU Task Accuracy (task=elementary_mathematics): 0.3412698412698413


Processing college_mathematics: 100%|██████████████████████████████████████████████████████████████████████████████████| 100/100 [00:26<00:00,  3.81it/s]


MMLU Task Accuracy (task=college_mathematics): 0.38


Processing formal_logic: 100%|█████████████████████████████████████████████████████████████████████████████████████████| 126/126 [00:39<00:00,  3.19it/s]


MMLU Task Accuracy (task=formal_logic): 0.35714285714285715


Processing high_school_statistics: 100%|███████████████████████████████████████████████████████████████████████████████| 216/216 [01:17<00:00,  2.77it/s]


MMLU Task Accuracy (task=high_school_statistics): 0.4074074074074074


Processing clinical_knowledge: 100%|███████████████████████████████████████████████████████████████████████████████████| 265/265 [01:01<00:00,  4.31it/s]


MMLU Task Accuracy (task=clinical_knowledge): 0.6075471698113207


Processing medical_genetics: 100%|█████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:22<00:00,  4.39it/s]


MMLU Task Accuracy (task=medical_genetics): 0.62


Processing anatomy: 100%|██████████████████████████████████████████████████████████████████████████████████████████████| 135/135 [00:31<00:00,  4.22it/s]


MMLU Task Accuracy (task=anatomy): 0.45925925925925926


Processing professional_medicine: 100%|████████████████████████████████████████████████████████████████████████████████| 272/272 [02:12<00:00,  2.05it/s]


MMLU Task Accuracy (task=professional_medicine): 0.5625


Processing college_biology: 100%|██████████████████████████████████████████████████████████████████████████████████████| 144/144 [00:41<00:00,  3.50it/s]


MMLU Task Accuracy (task=college_biology): 0.5902777777777778


Processing college_medicine: 100%|█████████████████████████████████████████████████████████████████████████████████████| 173/173 [00:56<00:00,  3.08it/s]


MMLU Task Accuracy (task=college_medicine): 0.49710982658959535


Processing high_school_biology: 100%|██████████████████████████████████████████████████████████████████████████████████| 310/310 [01:24<00:00,  3.69it/s]

MMLU Task Accuracy (task=high_school_biology): 0.6387096774193548
Overall MMLU Accuracy: 0.47130692336171787





In [15]:
# print all results
VALID_ANSWERS = ["A", "B", "C", "D"]


def _well_formatted(predictions):
    """Return only the rows whose prediction is exactly one of the answer letters."""
    return predictions[predictions['Prediction'].isin(VALID_ANSWERS)]


def _coverage(kept, total):
    """Format '<kept> out of <total>. (<percent>%)'; the percentage is nan for an empty total."""
    share = 100 * len(kept) / len(total) if len(total) else float("nan")
    return str(len(kept)) + " out of " + str(len(total)) + f". ({share:.5f}%)"


print(f'Scores for {model_wrapped.get_model_name()}:')

# Split the predictions by position: the first `math_rows` rows are taken to be
# the math questions. This assumes the evaluated benchmark ran mm_tasks_math
# followed by mm_tasks_bio (i.e. all_benchmark) and stores rows in task order.
math_rows = sum(len(math_benchmark.load_benchmark_dataset(task)) for task in mm_tasks_math)
results_math = results.iloc[:math_rows]
results_bio = results.iloc[math_rows:]

# These are per-question (micro) averages, so they differ from the per-task
# (macro) averages computed further down in the notebook.
math_mean_score = results_math['Correct'].mean()
bio_mean_score = results_bio['Correct'].mean()

print("-"*50)
print("Counting all answers: ")
print("     Accuracy math score:     " + str(math_mean_score))
print("     Accuracy bio score:      " + str(bio_mean_score))
print("     Accuracy overall score:  " + str(benchmark.overall_score))

## Counting only correctly formatted answers

results_correct_format = _well_formatted(results)
acc_all_correct_format = results_correct_format['Correct'].mean()

results_math_correct_format = _well_formatted(results_math)
acc_math_correct_format = results_math_correct_format['Correct'].mean()

results_bio_correct_format = _well_formatted(results_bio)
acc_bio_correct_format = results_bio_correct_format['Correct'].mean()
print("-"*50)
print("Counting only correctly formatted answers:")
print("     Accuracy math score:     " + str(acc_math_correct_format))
print("     Accuracy bio score:      " + str(acc_bio_correct_format))
print("     Accuracy overall score:  " + str(acc_all_correct_format))

print("-"*50)
print("Benchmark dataset sizes:")
print("     Number of correctly formatted answers in math:  " + _coverage(results_math_correct_format, results_math))
print("     Number of correctly formatted answers in bio:   " + _coverage(results_bio_correct_format, results_bio))
print("     Number of correctly formatted answers overall:  " + _coverage(results_correct_format, results))

print("-"*50)
# Lift the row limit so the per-task score table below is shown in full.
pd.set_option("display.max_rows", None)
benchmark.task_scores

Scores for Mistral Instruct 7B:
--------------------------------------------------
Counting all answers: 
     Accuracy math score:     0.3579109062980031
     Accuracy bio score:      0.5768406004288777
     Accuracy overall score:  0.47130692336171787
--------------------------------------------------
Counting only correctly formatted answers:
     Accuracy math score:     0.3579109062980031
     Accuracy bio score:      0.5768406004288777
     Accuracy overall score:  0.47130692336171787
--------------------------------------------------
Benchmark dataset sizes:
     Number of correctly formatted answers in math:  1302 out of 1302. (100.00000%)
     Number of correctly formatted answers in bio:   1399 out of 1399. (100.00000%)
     Number of correctly formatted answers overall:  2701 out of 2701. (100.00000%)
--------------------------------------------------


Unnamed: 0,Task,Score
0,high_school_mathematics,0.303704
1,abstract_algebra,0.33
2,machine_learning,0.455357
3,elementary_mathematics,0.34127
4,college_mathematics,0.38
5,formal_logic,0.357143
6,high_school_statistics,0.407407
7,clinical_knowledge,0.607547
8,medical_genetics,0.62
9,anatomy,0.459259


In [11]:
# Cap the display at 12 rows so the prediction column is printed in truncated
# form (use pd.set_option("display.max_rows", None) to list every row).
pd.set_option("display.max_rows", 12)
print(results['Prediction'])
# Prompt text recorded for the benchmark question with row label 2.
print(results['Input'][2])

0       A
1       B
2       A
3       B
4       B
       ..
2696    B
2697    B
2698    D
2699    B
2700    A
Name: Prediction, Length: 2701, dtype: object
A positive integer n is called “powerful” if, for every prime factor p of n, p^2 is also a factor of n. An example of a powerful number is
A. 392
B. 336
C. 300
D. 297
Answer:


---

# Results

In [9]:
# Mistral 7B Instruct
print("""
Processing high_school_mathematics: 100%|█████| 270/270 [00:40<00:00,  6.60it/s]
MMLU Task Accuracy (task=high_school_mathematics): 0.3037037037037037
Processing abstract_algebra: 100%|████████████| 100/100 [00:13<00:00,  7.24it/s]
MMLU Task Accuracy (task=abstract_algebra): 0.33
Processing machine_learning: 100%|████████████| 112/112 [00:17<00:00,  6.49it/s]
MMLU Task Accuracy (task=machine_learning): 0.45535714285714285
Processing elementary_mathematics: 100%|██████| 378/378 [00:55<00:00,  6.85it/s]
MMLU Task Accuracy (task=elementary_mathematics): 0.3412698412698413
Processing college_mathematics: 100%|█████████| 100/100 [00:15<00:00,  6.30it/s]
MMLU Task Accuracy (task=college_mathematics): 0.38
Processing formal_logic: 100%|████████████████| 126/126 [00:24<00:00,  5.10it/s]
MMLU Task Accuracy (task=formal_logic): 0.35714285714285715
Processing high_school_statistics: 100%|██████| 216/216 [00:45<00:00,  4.78it/s]
MMLU Task Accuracy (task=high_school_statistics): 0.4074074074074074
Processing clinical_knowledge: 100%|██████████| 265/265 [00:37<00:00,  7.06it/s]
MMLU Task Accuracy (task=clinical_knowledge): 0.6075471698113207
Processing medical_genetics: 100%|████████████| 100/100 [00:13<00:00,  7.38it/s]
MMLU Task Accuracy (task=medical_genetics): 0.62
Processing anatomy: 100%|█████████████████████| 135/135 [00:19<00:00,  7.01it/s]
MMLU Task Accuracy (task=anatomy): 0.45925925925925926
Processing professional_medicine: 100%|███████| 272/272 [01:21<00:00,  3.32it/s]
MMLU Task Accuracy (task=professional_medicine): 0.5625
Processing college_biology: 100%|█████████████| 144/144 [00:23<00:00,  6.15it/s]
MMLU Task Accuracy (task=college_biology): 0.5902777777777778
Processing college_medicine: 100%|████████████| 173/173 [00:34<00:00,  5.00it/s]
MMLU Task Accuracy (task=college_medicine): 0.49710982658959535
Processing high_school_biology: 100%|█████████| 310/310 [00:51<00:00,  6.07it/s]
MMLU Task Accuracy (task=high_school_biology): 0.6387096774193548
Overall MMLU Accuracy: 0.47130692336171787
--------------------------------------------------
Overall score for Mistral 7B Instruct on all tasks: 0.47130692336171787
""")


Processing high_school_mathematics: 100%|█████| 270/270 [00:40<00:00,  6.60it/s]
MMLU Task Accuracy (task=high_school_mathematics): 0.3037037037037037
Processing abstract_algebra: 100%|████████████| 100/100 [00:13<00:00,  7.24it/s]
MMLU Task Accuracy (task=abstract_algebra): 0.33
Processing machine_learning: 100%|████████████| 112/112 [00:17<00:00,  6.49it/s]
MMLU Task Accuracy (task=machine_learning): 0.45535714285714285
Processing elementary_mathematics: 100%|██████| 378/378 [00:55<00:00,  6.85it/s]
MMLU Task Accuracy (task=elementary_mathematics): 0.3412698412698413
Processing college_mathematics: 100%|█████████| 100/100 [00:15<00:00,  6.30it/s]
MMLU Task Accuracy (task=college_mathematics): 0.38
Processing formal_logic: 100%|████████████████| 126/126 [00:24<00:00,  5.10it/s]
MMLU Task Accuracy (task=formal_logic): 0.35714285714285715
Processing high_school_statistics: 100%|██████| 216/216 [00:45<00:00,  4.78it/s]
MMLU Task Accuracy (task=high_school_statistics): 0.4074074074074074

In [None]:
# BioMistral 7B
print("""
0-shot
Processing high_school_biology: 100%|█████████| 310/310 [00:50<00:00,  6.08it/s]
MMLU Task Accuracy (task=high_school_biology): 0.6387096774193548
Processing medical_genetics: 100%|████████████| 100/100 [00:13<00:00,  7.39it/s]
MMLU Task Accuracy (task=medical_genetics): 0.67
Processing virology: 100%|████████████████████| 166/166 [00:22<00:00,  7.32it/s]
MMLU Task Accuracy (task=virology): 0.4457831325301205
Processing professional_medicine: 100%|███████| 272/272 [01:21<00:00,  3.33it/s]
MMLU Task Accuracy (task=professional_medicine): 0.5514705882352942
Processing nutrition: 100%|███████████████████| 306/306 [00:46<00:00,  6.54it/s]
MMLU Task Accuracy (task=nutrition): 0.6045751633986928
Processing anatomy: 100%|█████████████████████| 135/135 [00:19<00:00,  7.00it/s]
MMLU Task Accuracy (task=anatomy): 0.4962962962962963
Processing college_medicine: 100%|████████████| 173/173 [00:34<00:00,  5.01it/s]
MMLU Task Accuracy (task=college_medicine): 0.5491329479768786
Processing college_biology: 100%|█████████████| 144/144 [00:23<00:00,  6.12it/s]
MMLU Task Accuracy (task=college_biology): 0.6180555555555556
Overall MMLU Accuracy: 0.5759651307596513
--------------------------------------------------
Overall score for BioMistral 7B: 0.5759651307596513
--------------------------------------------------
3-shot
Processing high_school_biology: 100%|█████████| 310/310 [02:42<00:00,  1.91it/s]
MMLU Task Accuracy (task=high_school_biology): 0.6516129032258065
Processing medical_genetics: 100%|████████████| 100/100 [00:50<00:00,  1.99it/s]
MMLU Task Accuracy (task=medical_genetics): 0.65
Processing virology: 100%|████████████████████| 166/166 [01:23<00:00,  1.99it/s]
MMLU Task Accuracy (task=virology): 0.4819277108433735
Processing professional_medicine: 100%|███████| 272/272 [02:57<00:00,  1.53it/s]
MMLU Task Accuracy (task=professional_medicine): 0.5845588235294118
Processing nutrition: 100%|███████████████████| 306/306 [02:38<00:00,  1.93it/s]
MMLU Task Accuracy (task=nutrition): 0.5980392156862745
Processing anatomy: 100%|█████████████████████| 135/135 [01:08<00:00,  1.97it/s]
MMLU Task Accuracy (task=anatomy): 0.43703703703703706
Processing college_medicine: 100%|████████████| 173/173 [01:38<00:00,  1.76it/s]
MMLU Task Accuracy (task=college_medicine): 0.5953757225433526
Processing college_biology: 100%|█████████████| 144/144 [01:16<00:00,  1.89it/s]
MMLU Task Accuracy (task=college_biology): 0.5833333333333334
Overall MMLU Accuracy: 0.5821917808219178
--------------------------------------------------
Overall score for BioMistral 7B: 0.5821917808219178
--------------------------------------------------
5-shot
Processing high_school_biology: 100%|█████████| 310/310 [03:27<00:00,  1.49it/s]
MMLU Task Accuracy (task=high_school_biology): 0.6612903225806451
Processing medical_genetics: 100%|████████████| 100/100 [01:04<00:00,  1.54it/s]
MMLU Task Accuracy (task=medical_genetics): 0.64
Processing virology: 100%|████████████████████| 166/166 [01:48<00:00,  1.53it/s]
MMLU Task Accuracy (task=virology): 0.45180722891566266
Processing professional_medicine: 100%|███████| 272/272 [03:43<00:00,  1.22it/s]
MMLU Task Accuracy (task=professional_medicine): 0.5845588235294118
Processing nutrition: 100%|███████████████████| 306/306 [03:22<00:00,  1.51it/s]
MMLU Task Accuracy (task=nutrition): 0.5784313725490197
Processing anatomy: 100%|█████████████████████| 135/135 [01:28<00:00,  1.53it/s]
MMLU Task Accuracy (task=anatomy): 0.43703703703703706
Processing college_medicine: 100%|████████████| 173/173 [02:02<00:00,  1.42it/s]
MMLU Task Accuracy (task=college_medicine): 0.5895953757225434
Processing college_biology: 100%|█████████████| 144/144 [01:36<00:00,  1.49it/s]
MMLU Task Accuracy (task=college_biology): 0.6041666666666666
Overall MMLU Accuracy: 0.5778331257783312
--------------------------------------------------
Overall score for BioMistral 7B: 0.5778331257783312 
""")


0-shot
Processing high_school_biology: 100%|█████████| 310/310 [00:50<00:00,  6.08it/s]
MMLU Task Accuracy (task=high_school_biology): 0.6387096774193548
Processing medical_genetics: 100%|████████████| 100/100 [00:13<00:00,  7.39it/s]
MMLU Task Accuracy (task=medical_genetics): 0.67
Processing virology: 100%|████████████████████| 166/166 [00:22<00:00,  7.32it/s]
MMLU Task Accuracy (task=virology): 0.4457831325301205
Processing professional_medicine: 100%|███████| 272/272 [01:21<00:00,  3.33it/s]
MMLU Task Accuracy (task=professional_medicine): 0.5514705882352942
Processing nutrition: 100%|███████████████████| 306/306 [00:46<00:00,  6.54it/s]
MMLU Task Accuracy (task=nutrition): 0.6045751633986928
Processing anatomy: 100%|█████████████████████| 135/135 [00:19<00:00,  7.00it/s]
MMLU Task Accuracy (task=anatomy): 0.4962962962962963
Processing college_medicine: 100%|████████████| 173/173 [00:34<00:00,  5.01it/s]
MMLU Task Accuracy (task=college_medicine): 0.5491329479768786
Processing col

In [17]:
# Summary table of zero-shot accuracies per model.
# The MATH / BIO rows hold the per-task averages computed in the cells below;
# '-' marks a value that has not been filled in.
header = ['Accuracy Metric', 'Mistral 7B Instruct', 'BioMistral 7B', 'MetaMath 7B', 'BioLoRA 7B', 'MetaLoRA 7B', 'MetaBio 7B', 'MetaBioLoRA 7B']
model_count = len(header) - 1

summary_rows = [
    ['Overall Score', 0.47131, 0.48019, '-', '-', '-', 0.49897, '-'],
    # The escape codes colour this one value green when printed to a terminal.
    ['Overall Score - MATH', 0.36784, 0.36213, '-', '-', '-', '\033[32m0.36883\033[0m', '-'],
    ['Overall Score - BIO', 0.56791, 0.59179, '-', '-', '-', '-', '-'],
]

# Per-task rows, one per benchmark task, all still unfilled.
task_labels = [
    'HS Math',
    'Abstract Algebra',
    'Machine Learning',
    'Elementary Math',
    'College Math',
    'Formal Logic',
    'HS Stats',
    'Clinical KG',
    'Medical Genetics',
    'Anatomy',
    'Professional Medicine',
    'College Biology',
    'College Medicine',
    'HS Biology',
]
data = summary_rows + [[label] + ['-'] * model_count for label in task_labels]

print('All models are evaluated using zero-shot prediction.')
tt.print(
    data,
    header=header,
    style=tt.styles.ascii_thin_double,
    padding=(0, 1),
)

All models are evaluated using zero-shot prediction.
+-----------------------+---------------------+---------------+-------------+------------+-------------+------------+----------------+
| Accuracy Metric       | Mistral 7B Instruct | BioMistral 7B | MetaMath 7B | BioLoRA 7B | MetaLoRA 7B | MetaBio 7B | MetaBioLoRA 7B |
| Overall Score         | 0.47131             | 0.48019       | -           | -          | -           | 0.49897    | -              |
+-----------------------+---------------------+---------------+-------------+------------+-------------+------------+----------------+
| Overall Score - MATH  | 0.36784             | 0.36213       | -           | -          | -           | [32m0.36883[0m    | -              |
+-----------------------+---------------------+---------------+-------------+------------+-------------+------------+----------------+
| Overall Score - BIO   | 0.56791             | 0.59179       | -           | -          | -           | -          | -         

In [20]:
# Mistral 7B Instruct: unweighted mean of the per-task MMLU accuracies
# (values copied from the evaluation log above).
mistral_instruct_math = {
    "high_school_mathematics": 0.3037037037037037,
    "abstract_algebra": 0.33,
    "machine_learning": 0.45535714285714285,
    "elementary_mathematics": 0.3412698412698413,
    "college_mathematics": 0.38,
    "formal_logic": 0.35714285714285715,
    "high_school_statistics": 0.4074074074074074,
}
mistral_instruct_bio = {
    "clinical_knowledge": 0.6075471698113207,
    "medical_genetics": 0.62,
    "anatomy": 0.45925925925925926,
    "professional_medicine": 0.5625,
    "college_biology": 0.5902777777777778,
    "college_medicine": 0.49710982658959535,
    "high_school_biology": 0.6387096774193548,
}

for label, task_scores in (('MATH', mistral_instruct_math), ('BIO', mistral_instruct_bio)):
    print(label)
    print(np.round(sum(task_scores.values()) / len(task_scores), 5))

MATH
0.36784
BIO
0.56791


In [12]:
# BioMistral 7B: unweighted mean of the per-task MMLU accuracies.
biomistral_math = {
    "high_school_mathematics": 0.3148148148148148,
    "abstract_algebra": 0.3,
    "machine_learning": 0.4642857142857143,
    "elementary_mathematics": 0.34656084656084657,
    "college_mathematics": 0.35,
    "formal_logic": 0.3333333333333333,
    "high_school_statistics": 0.42592592592592593,
}
biomistral_bio = {
    "clinical_knowledge": 0.6188679245283019,
    "medical_genetics": 0.67,
    "anatomy": 0.4962962962962963,
    "professional_medicine": 0.5514705882352942,
    "college_biology": 0.6180555555555556,
    "college_medicine": 0.5491329479768786,
    "high_school_biology": 0.6387096774193548,
}

for label, task_scores in (('MATH', biomistral_math), ('BIO', biomistral_bio)):
    print(label)
    print(np.round(sum(task_scores.values()) / len(task_scores), 5))

MATH
0.36213
BIO
0.59179


In [15]:
# MetaBio: unweighted mean of the seven math-task accuracies.
metabio_math = [0.307407, 0.300000, 0.482143, 0.349206, 0.360000, 0.357143, 0.425926]
np.round(sum(metabio_math) / len(metabio_math), 5)

0.36883