# Requirements

In [11]:
# ! pip install -U deepeval
# ! pip install numpy
# ! pip install transformers
# ! pip3 install torch torchvision torchaudio

In [86]:
# ! pip install sentencepiece
# ! pip install tokenizers
# ! pip install accelerate
# ! pip install bitsandbytes
# ! pip install vllm
# ! pip install fraction
# ! pip install protobuf
# ! pip install termtables

In [2]:
from deepeval.benchmarks import MMLU, TruthfulQA
from deepeval.benchmarks.tasks import MMLUTask, TruthfulQATask
from deepeval.benchmarks.modes import TruthfulQAMode
from transformers import AutoModelForCausalLM, AutoTokenizer
from deepeval.models.base_model import DeepEvalBaseLLM
from typing import List
import torch
import pandas as pd
import json
import termtables as tt
import numpy as np

torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Using device: {torch_device}")



Using device: cuda


---

# Model Loading

In [None]:
# Reset loaded models to avoid memory issues
model = None
tokenizer = None
mistral_7b_instruct = None

bio_model = None
bio_tokenizer = None
biomistral_7b = None

math_model = None
math_tokenizer = None
metamath_7b = None

# Set cache directory
blackhole_dir = #black_dir

# Load chosen model
# model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1")
# tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1")

# model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-Instruct-v0.1", cache_dir=blackhole_dir)
# tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.1", cache_dir=blackhole_dir)

bio_tokenizer = AutoTokenizer.from_pretrained("BioMistral/BioMistral-7B", cache_dir=blackhole_dir)
bio_model = AutoModelForCausalLM.from_pretrained("BioMistral/BioMistral-7B", cache_dir=blackhole_dir)

# math_model = AutoModelForCausalLM.from_pretrained("meta-math/MetaMath-Mistral-7B", cache_dir=blackhole_dir)
# math_tokenizer = AutoTokenizer.from_pretrained("meta-math/MetaMath-Mistral-7B", cache_dir=blackhole_dir)

In [4]:
# Define wrapper class for Mistral 7B Instruct
class Mistral7BInstruct(DeepEvalBaseLLM):
    def __init__(
        self,
        model,
        tokenizer
    ):
        self.model = model
        self.tokenizer = tokenizer

    def load_model(self):
        return self.model

    def generate(self, prompt: str) -> str:
        model = self.load_model()

        device = "cuda" # the device to load the model onto 

        prompt = prompt[:-55] # len("\n\nOutput 'A', 'B', 'C', or 'D'. Full answer not needed.") = 55, so we remove that from the prompt

        model_inputs = self.tokenizer([prompt], return_tensors="pt").to(device)
        model.to(device)

        generated_ids = model.generate(**model_inputs, max_new_tokens=1, do_sample=True, pad_token_id=self.tokenizer.eos_token_id, temperature=0.0001)
        decoded_tokens = self.tokenizer.batch_decode(generated_ids)[0]
        new_tokens = decoded_tokens[len(prompt)+4:].strip()
        return new_tokens

    async def a_generate(self, prompt: str) -> str:
        return self.generate(prompt)

    # This is optional.
    def batch_generate(self, prompts: List[str]) -> List[str]:
        model = self.load_model()
        device = "cuda" # the device to load the model onto

        prompts = [prompt[:-55] for prompt in prompts]

        model_inputs = self.tokenizer(prompts, return_tensors="pt").to(device)
        model.to(device)

        generated_ids = model.generate(**model_inputs, max_new_tokens=1, do_sample=True, pad_token_id=self.tokenizer.eos_token_id, temperature=0.0001)
        return self.tokenizer.batch_decode(generated_ids)

    def get_model_name(self):
        return "Mistral 7B Instruct"

# Define wrapper class for BioMistral 7B
class BioMistral7B(DeepEvalBaseLLM):
    def __init__(
        self,
        model,
        tokenizer
    ):
        self.model = model
        self.tokenizer = tokenizer

    def load_model(self):
        return self.model

    def generate(self, prompt: str) -> str:
        model = self.load_model()

        device = "cuda" # the device to load the model onto 

        prompt = prompt[:-55] # len("\n\nOutput 'A', 'B', 'C', or 'D'. Full answer not needed.") = 55, so we remove that from the prompt

        model_inputs = self.tokenizer([prompt], return_tensors="pt").to(device)
        model.to(device)

        generated_ids = model.generate(**model_inputs, max_new_tokens=1, do_sample=True, pad_token_id=self.tokenizer.eos_token_id, temperature=0.0001)
        decoded_tokens = self.tokenizer.batch_decode(generated_ids)[0]
        new_tokens = decoded_tokens[len(prompt)+4:].strip()
        return new_tokens

    async def a_generate(self, prompt: str) -> str:
        return self.generate(prompt)

    # This is optional.
    def batch_generate(self, prompts: List[str]) -> List[str]:
        model = self.load_model()
        device = "cuda" # the device to load the model onto

        prompts = [prompt[:-55] for prompt in prompts]

        model_inputs = self.tokenizer(prompts, return_tensors="pt").to(device)
        model.to(device)

        generated_ids = model.generate(**model_inputs, max_new_tokens=1, do_sample=True, pad_token_id=self.tokenizer.eos_token_id, temperature=0.0001)
        return self.tokenizer.batch_decode(generated_ids)

    def get_model_name(self):
        return "BioMistral 7B"

# Define wrapper class for MetaMath 7B
class MetaMath7B(DeepEvalBaseLLM):
    def __init__(
        self,
        model,
        tokenizer
    ):
        self.model = model
        self.tokenizer = tokenizer

    def load_model(self):
        return self.model

    def generate(self, prompt: str) -> str:
        model = self.load_model()

        device = "cuda" # the device to load the model onto 

        # prompt = "[INST] " + prompt + " [/INST]"
        # prompt = prompt + "\nYou will only respond with a JSON object with the keys Explanation and Answer Choice, where Explanation is a string and Answer Choice is either 'A', 'B', 'C' or 'D'."
        # prompt = """Please answer the following question strictly in JSON format. Your response must contain only a JSON object 
        # with the keys 'Explanation' and 'Answer_choice'. The 'Answer_choice' should be one of four options: 'A', 'B', 'C' or 'D'. 
        # No extra text, tokens or explanation outside the JSON format is allowed.\n
        # Examples on how to answer questions:\n
        # What color is the sky?\nA. green\nB. blue \nC. red\nD. orange\nAnswer: 
        # {"Explanation": "The color of the sky depends on the time of day, the weather, and the location. In general, the sky is blue during the day, but it can also be other colors such as gray or black during certain weather conditions. Therefore, the answer cannot be determined with the given information.",
        # "Answer_choice": "D"}
        
        # Answer the following question with the json format and nothing else then stop generating:\n""" + prompt[:-57]

        # prompt = """
        # The correct answer to the following question is formatted as a JSON object with the keys "Explanation" and "Answer_Choice". The "Explanation" key should contain a string explaining the answer, and the "Answer_Choice" key should contain the letter of the correct answer, either A, B, C or D. For example, if the correct answer is "A", the JSON object should look like this: {"Explanation": "This is the explanation.", "Answer_Choice": "A"}. I want you to only print out the value of the key "Answer_Choice". The question is as follows:\n 
        # """ + prompt[:-57]

        model_inputs = self.tokenizer([prompt], return_tensors="pt").to(device)
        model.to(device)

        generated_ids = model.generate(**model_inputs, max_new_tokens=50, do_sample=True, pad_token_id=self.tokenizer.eos_token_id, temperature=0.0001)
        decoded_tokens = self.tokenizer.batch_decode(generated_ids)[0]
        # return decoded_tokens[len(prompt)-50:]
        # return decoded_tokens
        new_tokens = decoded_tokens[len(prompt)+5:]#.strip()
        if "The answer is: " in new_tokens:
            new_tokens = new_tokens[new_tokens.find("The answer is: ")+15:]
        new_tokens = new_tokens.replace("</s>", "")
        if new_tokens.strip() == "":
            return ""
        answer_choices = [prompt[prompt.find("A. "):prompt.find("\n", prompt.find("A. "))].strip(), 
                          prompt[prompt.find("B. "):prompt.find("\n", prompt.find("B. "))].strip(), 
                            prompt[prompt.find("C. "):prompt.find("\n", prompt.find("C. "))].strip(), 
                            prompt[prompt.find("D. "):prompt.find("\n", prompt.find("D. "))].strip()]
        
        def find_answer_choice(new_tokens, answer_choices):
            if new_tokens in ["A", "B", "C", "D"]:
                return new_tokens
            for answer_choice in answer_choices:
                if new_tokens in answer_choice:
                    return answer_choice[0]
            return None
        fac = find_answer_choice(new_tokens, answer_choices)
        if fac:
            return fac

        if new_tokens[-1] == ".":
            new_tokens = new_tokens[:-1]
            fac = find_answer_choice(new_tokens, answer_choices)
            if fac:
                return fac
        if "\\text{" in new_tokens:
            new_tokens = new_tokens.replace("\\text{", "")
            new_tokens = new_tokens[:-1]
            fac = find_answer_choice(new_tokens, answer_choices)
            if fac:
                return fac
        if new_tokens[0] == "(" or new_tokens[0] == "[":
            new_tokens = new_tokens[1:-1]
            fac = find_answer_choice(new_tokens, answer_choices)
            if fac:
                return fac
        return new_tokens
        new_tokens = new_tokens[new_tokens.find("{"):new_tokens.rfind("}")+1]
        try:
            new_tokens_json = json.loads(new_tokens, strict=False)
            return new_tokens_json['Answer_choice']
        except:
            new_tokens = new_tokens.replace("{", "").replace("}", "")
            new_tokens = "{" + new_tokens + "}"
            print(new_tokens)
            return new_tokens
        return decoded_tokens[len(prompt):]
        return decoded_tokens[len(prompt)+5:]
        return self.tokenizer.batch_decode(generated_ids)[0]

    async def a_generate(self, prompt: str) -> str:
        return self.generate(prompt)

    # This is optional.
    def batch_generate(self, promtps: List[str]) -> List[str]:
        model = self.load_model()
        device = "cuda" # the device to load the model onto

        model_inputs = self.tokenizer(promtps, return_tensors="pt").to(device)
        model.to(device)

        generated_ids = model.generate(**model_inputs, max_new_tokens=100, do_sample=True)
        return self.tokenizer.batch_decode(generated_ids)

    def get_model_name(self):
        return "MetaMath Mistral 7B"

In [5]:
# Check if model is loaded and initialize wrapper class
model_name = None

if model != None and tokenizer != None:
    mistral_7b_instruct = Mistral7BInstruct(model=model, tokenizer=tokenizer)
    model_name = 'Mistral 7B Instruct'
elif bio_model != None and bio_tokenizer != None:
    biomistral_7b = BioMistral7B(model=bio_model, tokenizer=bio_tokenizer)
    model_name = 'BioMistral 7B'
elif math_model != None and math_tokenizer != None:
    metamath_7b = MetaMath7B(model=math_model, tokenizer=math_tokenizer)
    model_name = 'MetaMath 7B'
else:
    print("No model loaded")
print(f'{model_name} loaded')

BioMistral 7B loaded


In [41]:
# MetaMath
# We want to evaluate the model by calculating its overall accuracy (this is bad, at around ~12%),
# as well as its accuracy when it answers in the correct format (A, B, C or D) - which is hopefully better.

In [6]:
def quick_test(model_name):
    if model_name == 'Mistral 7B Instruct':
        print('### Q1 (Bio) ###')
        print(mistral_7b_instruct.generate("In a population of giraffes, an environmental change occurs that favors individuals that are tallest. As a result, more of the taller individuals are able to obtain nutrients and survive to pass along their genetic information. This is an example of\nA. directional selection.\nB. stabilizing selection.\nC. sexual selection.\nD. disruptive selection.\nAnswer:\n\nOutput 'A', 'B', 'C', or 'D'. Full answer not needed."))
        print('---')
        print('### Q2 (Bio) ###')
        print(mistral_7b_instruct.generate("Which of the changes below following the start codon in an mRNA would most likely have the greatest deleterious effect?\nA. a deletion of a single nucleotide\nB. a deletion of a nucleotide triplet\nC. a single nucleotide substitution of the nucleotide occupying the first codon position\nD. a single nucleotide substitution of the nucleotide occupying the third codon position\nAnswer:\n\nOutput 'A', 'B', 'C', or 'D'. Full answer not needed."))
        print('---')
        print('### Q3 (Bio) ###')
        print(mistral_7b_instruct.generate("The energy given up by electrons as they move through the electron transport chain is used to\nA. break down glucose\nB. make glucose\nC. produce ATP\nD. make NADH\nAnswer:\n\nOutput 'A', 'B', 'C', or 'D'. Full answer not needed."))
        print('---')
        print('### Q1 (Math) ###')
        print(mistral_7b_instruct.generate("If a metamath_7bn P with vertices at (– 2, – 4), (– 4, 1), (–1, 4), (2, 4), and (3, 0) is reflected across the line y = x to get a new pentagon, P’, then one of the vertices of P’ is\nA. (0, – 3)\nB. (4, 1)\nC. (2, 2)\nD. (– 4, –2)\nAnswer:\n\nOutput 'A', 'B', 'C', or 'D'. Full answer not needed."))
    elif model_name == 'BioMistral 7B':
        print('### Q1 (Bio) ###')
        print(biomistral_7b.generate("In a population of giraffes, an environmental change occurs that favors individuals that are tallest. As a result, more of the taller individuals are able to obtain nutrients and survive to pass along their genetic information. This is an example of\nA. directional selection.\nB. stabilizing selection.\nC. sexual selection.\nD. disruptive selection.\nAnswer:\n\nOutput 'A', 'B', 'C', or 'D'. Full answer not needed."))
        print('---')
        print('### Q2 (Bio) ###')
        print(biomistral_7b.generate("Which of the changes below following the start codon in an mRNA would most likely have the greatest deleterious effect?\nA. a deletion of a single nucleotide\nB. a deletion of a nucleotide triplet\nC. a single nucleotide substitution of the nucleotide occupying the first codon position\nD. a single nucleotide substitution of the nucleotide occupying the third codon position\nAnswer:\n\nOutput 'A', 'B', 'C', or 'D'. Full answer not needed."))
        print('---')
        print('### Q3 (Bio) ###')
        print(biomistral_7b.generate("The energy given up by electrons as they move through the electron transport chain is used to\nA. break down glucose\nB. make glucose\nC. produce ATP\nD. make NADH\nAnswer:\n\nOutput 'A', 'B', 'C', or 'D'. Full answer not needed."))
        print('---')
        print('### Q1 (Math) ###')
        print(biomistral_7b.generate("If a metamath_7bn P with vertices at (– 2, – 4), (– 4, 1), (–1, 4), (2, 4), and (3, 0) is reflected across the line y = x to get a new pentagon, P’, then one of the vertices of P’ is\nA. (0, – 3)\nB. (4, 1)\nC. (2, 2)\nD. (– 4, –2)\nAnswer:\n\nOutput 'A', 'B', 'C', or 'D'. Full answer not needed."))
    elif model_name == 'MetaMath 7B':
        print('### Q1 (Bio) ###')
        print(metamath_7b.generate("In a population of giraffes, an environmental change occurs that favors individuals that are tallest. As a result, more of the taller individuals are able to obtain nutrients and survive to pass along their genetic information. This is an example of\nA. directional selection.\nB. stabilizing selection.\nC. sexual selection.\nD. disruptive selection.\nAnswer:\n\nOutput 'A', 'B', 'C', or 'D'. Full answer not needed."))
        print('---')
        print('### Q2 (Bio) ###')
        print(metamath_7b.generate("Which of the changes below following the start codon in an mRNA would most likely have the greatest deleterious effect?\nA. a deletion of a single nucleotide\nB. a deletion of a nucleotide triplet\nC. a single nucleotide substitution of the nucleotide occupying the first codon position\nD. a single nucleotide substitution of the nucleotide occupying the third codon position\nAnswer:\n\nOutput 'A', 'B', 'C', or 'D'. Full answer not needed."))
        print('---')
        print('### Q3 (Bio) ###')
        print(metamath_7b.generate("The energy given up by electrons as they move through the electron transport chain is used to\nA. break down glucose\nB. make glucose\nC. produce ATP\nD. make NADH\nAnswer:\n\nOutput 'A', 'B', 'C', or 'D'. Full answer not needed."))
        print('---')
        print('### Q1 (Math) ###')
        print(metamath_7b.generate("If a metamath_7bn P with vertices at (– 2, – 4), (– 4, 1), (–1, 4), (2, 4), and (3, 0) is reflected across the line y = x to get a new pentagon, P’, then one of the vertices of P’ is\nA. (0, – 3)\nB. (4, 1)\nC. (2, 2)\nD. (– 4, –2)\nAnswer:\n\nOutput 'A', 'B', 'C', or 'D'. Full answer not needed."))
    else:
        print('No model loaded')

In [7]:
quick_test(model_name)

### Q1 (Bio) ###
A
---
### Q2 (Bio) ###
B
---
### Q3 (Bio) ###
C
---
### Q1 (Math) ###
A


In [7]:
# print(metamath_7b.generate("What is 5 minus 2?\nA. 3\nB. 5\nC. 25\nD. 50\nAnswer:\n\nOutput 'A', 'B', 'C', or 'D'. Full answer not needed."))
# print('---')
# print(metamath_7b.generate("The following are multiple choice questions (with answers) about high school biology.\n\nWhich of the following is not a way to form recombinant DNA?\nA. Translation\nB. Conjugation\nC. Specialized transduction\nD. Transformation\nAnswer:\n\nOutput 'A', 'B', 'C', or 'D'. Full answer not needed."))
# print('---')
# print(metamath_7b.generate("If a metamath_7bn P with vertices at (– 2, – 4), (– 4, 1), (–1, 4), (2, 4), and (3, 0) is reflected across the line y = x to get a new pentagon, P’, then one of the vertices of P’ is\nA. (0, – 3)\nB. (4, 1)\nC. (2, 2)\nD. (– 4, –2)\nAnswer:\n\nOutput 'A', 'B', 'C', or 'D'. Full answer not needed."))

---

# Model Evaluation

In [8]:
# Define evaluation benchmarks
n_shots = 0

mm_tasks_math = [MMLUTask.HIGH_SCHOOL_MATHEMATICS,
                 MMLUTask.ABSTRACT_ALGEBRA,
                 MMLUTask.MACHINE_LEARNING,
                 MMLUTask.ELEMENTARY_MATHEMATICS,
                 MMLUTask.COLLEGE_MATHEMATICS,
                 MMLUTask.FORMAL_LOGIC,
                 MMLUTask.HIGH_SCHOOL_STATISTICS]

mm_tasks_bio = [MMLUTask.CLINICAL_KNOWLEDGE,
                MMLUTask.MEDICAL_GENETICS,
                MMLUTask.ANATOMY,
                MMLUTask.PROFESSIONAL_MEDICINE,
                MMLUTask.COLLEGE_BIOLOGY,
                MMLUTask.COLLEGE_MEDICINE,
                MMLUTask.HIGH_SCHOOL_BIOLOGY]

mm_tasks_all = mm_tasks_math + mm_tasks_bio

tqa_tasks = [TruthfulQATask.SCIENCE]
tqa_mode = TruthfulQAMode.MC1 # Use MC1 as a benchmark for pinpoint accuracy and MC2 for depth of understanding.

# Define benchmark with specific tasks and shots
instruct_benchmark = MMLU(
    tasks=mm_tasks_all,
    n_shots=n_shots
)

math_benchmark = MMLU(
    tasks=mm_tasks_all,
    n_shots=n_shots
)

bio_benchmark = MMLU(
    tasks=mm_tasks_all,
    n_shots=n_shots
)

TQABenchmark = TruthfulQA(
    tasks=tqa_tasks,
    mode=tqa_mode
)

In [None]:
instruct_benchmark.evaluate(model=mistral_7b_instruct)
instruct_results = instruct_benchmark.predictions
print('-'*50)
print(f'Overall score for {model_name} on all tasks: {instruct_benchmark.overall_score}')

Processing high_school_mathematics: 100%|█████| 270/270 [00:40<00:00,  6.60it/s]


MMLU Task Accuracy (task=high_school_mathematics): 0.3037037037037037


Generating test split:   0%|          | 0/100 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/11 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/5 [00:00<?, ? examples/s]

Processing abstract_algebra: 100%|████████████| 100/100 [00:13<00:00,  7.24it/s]


MMLU Task Accuracy (task=abstract_algebra): 0.33


Generating test split:   0%|          | 0/112 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/11 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/5 [00:00<?, ? examples/s]

Processing machine_learning: 100%|████████████| 112/112 [00:17<00:00,  6.49it/s]


MMLU Task Accuracy (task=machine_learning): 0.45535714285714285


Generating test split:   0%|          | 0/378 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/41 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/5 [00:00<?, ? examples/s]

Processing elementary_mathematics: 100%|██████| 378/378 [00:55<00:00,  6.85it/s]


MMLU Task Accuracy (task=elementary_mathematics): 0.3412698412698413


Generating test split:   0%|          | 0/100 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/11 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/5 [00:00<?, ? examples/s]

Processing college_mathematics: 100%|█████████| 100/100 [00:15<00:00,  6.30it/s]


MMLU Task Accuracy (task=college_mathematics): 0.38


Generating test split:   0%|          | 0/126 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/14 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/5 [00:00<?, ? examples/s]

Processing formal_logic: 100%|████████████████| 126/126 [00:24<00:00,  5.10it/s]


MMLU Task Accuracy (task=formal_logic): 0.35714285714285715


Generating test split:   0%|          | 0/216 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/23 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/5 [00:00<?, ? examples/s]

Processing high_school_statistics: 100%|██████| 216/216 [00:45<00:00,  4.78it/s]


MMLU Task Accuracy (task=high_school_statistics): 0.4074074074074074


Generating test split:   0%|          | 0/265 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/29 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/5 [00:00<?, ? examples/s]

Processing clinical_knowledge: 100%|██████████| 265/265 [00:37<00:00,  7.06it/s]


MMLU Task Accuracy (task=clinical_knowledge): 0.6075471698113207


Processing medical_genetics: 100%|████████████| 100/100 [00:13<00:00,  7.38it/s]


MMLU Task Accuracy (task=medical_genetics): 0.62


Processing anatomy: 100%|█████████████████████| 135/135 [00:19<00:00,  7.01it/s]


MMLU Task Accuracy (task=anatomy): 0.45925925925925926


Processing professional_medicine: 100%|███████| 272/272 [01:21<00:00,  3.32it/s]


MMLU Task Accuracy (task=professional_medicine): 0.5625


Processing college_biology: 100%|█████████████| 144/144 [00:23<00:00,  6.15it/s]


MMLU Task Accuracy (task=college_biology): 0.5902777777777778


Processing college_medicine: 100%|████████████| 173/173 [00:34<00:00,  5.00it/s]


MMLU Task Accuracy (task=college_medicine): 0.49710982658959535


Processing high_school_biology: 100%|█████████| 310/310 [00:51<00:00,  6.07it/s]

MMLU Task Accuracy (task=high_school_biology): 0.6387096774193548
Overall MMLU Accuracy: 0.47130692336171787
--------------------------------------------------
Overall score for Mistral 7B Instruct on all tasks: 0.47130692336171787





In [None]:
# Replace 'mistral_7b' with your own custom model
math_benchmark.evaluate(model=metamath_7b)
math_results = math_benchmark.predictions
print('-'*50)
print(f'Overall score for {metamath_7b.get_model_name()}: {math_benchmark.overall_score}')

Processing high_school_mathematics: 100%|█████| 270/270 [02:12<00:00,  2.04it/s]

MMLU Task Accuracy (task=high_school_mathematics): 0.1259259259259259
Overall MMLU Accuracy: 0.1259259259259259
0.1259259259259259





In [9]:
# Zero-shot
bio_benchmark.evaluate(model=biomistral_7b)
bio_results = bio_benchmark.predictions
print('-'*50)
print(f'Overall score for {biomistral_7b.get_model_name()}: {bio_benchmark.overall_score}')

Processing high_school_mathematics: 100%|█████| 270/270 [00:41<00:00,  6.58it/s]


MMLU Task Accuracy (task=high_school_mathematics): 0.3148148148148148


Processing abstract_algebra: 100%|████████████| 100/100 [00:13<00:00,  7.33it/s]


MMLU Task Accuracy (task=abstract_algebra): 0.3


Processing machine_learning: 100%|████████████| 112/112 [00:17<00:00,  6.48it/s]


MMLU Task Accuracy (task=machine_learning): 0.4642857142857143


Processing elementary_mathematics: 100%|██████| 378/378 [00:55<00:00,  6.83it/s]


MMLU Task Accuracy (task=elementary_mathematics): 0.34656084656084657


Processing college_mathematics: 100%|█████████| 100/100 [00:15<00:00,  6.31it/s]


MMLU Task Accuracy (task=college_mathematics): 0.35


Processing formal_logic: 100%|████████████████| 126/126 [00:24<00:00,  5.10it/s]


MMLU Task Accuracy (task=formal_logic): 0.3333333333333333


Processing high_school_statistics: 100%|██████| 216/216 [00:45<00:00,  4.79it/s]


MMLU Task Accuracy (task=high_school_statistics): 0.42592592592592593


Processing clinical_knowledge: 100%|██████████| 265/265 [00:37<00:00,  7.05it/s]


MMLU Task Accuracy (task=clinical_knowledge): 0.6188679245283019


Processing medical_genetics: 100%|████████████| 100/100 [00:13<00:00,  7.36it/s]


MMLU Task Accuracy (task=medical_genetics): 0.67


Processing anatomy: 100%|█████████████████████| 135/135 [00:22<00:00,  5.96it/s]


MMLU Task Accuracy (task=anatomy): 0.4962962962962963


Processing professional_medicine: 100%|███████| 272/272 [01:45<00:00,  2.57it/s]


MMLU Task Accuracy (task=professional_medicine): 0.5514705882352942


Processing college_biology: 100%|█████████████| 144/144 [00:30<00:00,  4.75it/s]


MMLU Task Accuracy (task=college_biology): 0.6180555555555556


Processing college_medicine: 100%|████████████| 173/173 [00:44<00:00,  3.86it/s]


MMLU Task Accuracy (task=college_medicine): 0.5491329479768786


Processing high_school_biology: 100%|█████████| 310/310 [00:58<00:00,  5.28it/s]

MMLU Task Accuracy (task=high_school_biology): 0.6387096774193548
Overall MMLU Accuracy: 0.4801925212884117
--------------------------------------------------
Overall score for BioMistral 7B: 0.4801925212884117





In [83]:
# pd.set_option("display.max_rows", None)
pd.set_option("display.max_rows", 12)
print(bio_results['Prediction'])
# print(bio_results['Input'][1])
print(bio_results['Input'][2])

0       A
1       B
2       C
3       D
4       A
       ..
1601    A
1602    A
1603    C
1604    A
1605    D
Name: Prediction, Length: 1606, dtype: object
The energy given up by electrons as they move through the electron transport chain is used to
A. break down glucose
B. make glucose
C. produce ATP
D. make NADH
Answer:


In [None]:
results_corrform = math_results
# del_index = results_corrform[(results_corrform['Prediction']) != 'A' | (results_corrform['Prediction']) != 'B' | (results_corrform['Prediction']) != 'C' | (results_corrform['Prediction']) != 'D'].index


In [275]:
results_corrform = results_corrform[(results_corrform['Prediction'] == 'A') | (results_corrform['Prediction'] == 'B') | (results_corrform['Prediction'] == 'C') | (results_corrform['Prediction'] == 'D')]
correct = 0
for index, row in results_corrform.iterrows():
    if row['Correct'] == 1:
        correct += 1
print(correct/len(results_corrform))

0.288135593220339


In [None]:
for i in range(20):
    print(f'Input {i+1}:')
    print('=**********=')
    print(math_benchmark.predictions['Input'][i])
    print('=**********=')
    print(f'Prediction {i+1}:')
    print('++++++++++')
    # print(results['Prediction'][i][results['Prediction'][i].find("\n\nOutput 'A', 'B', 'C', or 'D'. Full answer not needed.")+55:])
    print(math_results['Prediction'][i])
    print('++++++++++')
    print(math_results['Correct'][i])
    print('-'*50)

Input 1:
=**********=
If a pentagon P with vertices at (– 2, – 4), (– 4, 1), (–1, 4), (2, 4), and (3, 0) is reflected across the line y = x to get a new pentagon, P’, then one of the vertices of P’ is
A. (0, – 3)
B. (4, 1)
C. (2, 2)
D. (– 4, –2)
Answer:
=**********=
Prediction 1:
++++++++++
<s> The following are multiple choice questions (with answers) about high school mathematics.

Joe was in charge of lights for a dance. The red light blinks every two seconds, the yellow light every three seconds, and the blue light every five seconds. If we include the very beginning and very end of the dance, how many times during a seven minute dance will all the lights come on at the same time? (Assume that all three lights blink simultaneously at the very beginning of the dance.)
A. 3
B. 15
C. 6
D. 5
Answer: B

Five thousand dollars compounded annually at an $x\%$ interest rate takes six years to double. At the same interest rate, how many years will it take $\$300$ to grow to $\$9600$?
A. 12
B

---

# Results

In [9]:
# Mistral 7B Instruct
print("""
Processing high_school_mathematics: 100%|█████| 270/270 [00:40<00:00,  6.60it/s]
MMLU Task Accuracy (task=high_school_mathematics): 0.3037037037037037
Processing abstract_algebra: 100%|████████████| 100/100 [00:13<00:00,  7.24it/s]
MMLU Task Accuracy (task=abstract_algebra): 0.33
Processing machine_learning: 100%|████████████| 112/112 [00:17<00:00,  6.49it/s]
MMLU Task Accuracy (task=machine_learning): 0.45535714285714285
Processing elementary_mathematics: 100%|██████| 378/378 [00:55<00:00,  6.85it/s]
MMLU Task Accuracy (task=elementary_mathematics): 0.3412698412698413
Processing college_mathematics: 100%|█████████| 100/100 [00:15<00:00,  6.30it/s]
MMLU Task Accuracy (task=college_mathematics): 0.38
Processing formal_logic: 100%|████████████████| 126/126 [00:24<00:00,  5.10it/s]
MMLU Task Accuracy (task=formal_logic): 0.35714285714285715
Processing high_school_statistics: 100%|██████| 216/216 [00:45<00:00,  4.78it/s]
MMLU Task Accuracy (task=high_school_statistics): 0.4074074074074074
Processing clinical_knowledge: 100%|██████████| 265/265 [00:37<00:00,  7.06it/s]
MMLU Task Accuracy (task=clinical_knowledge): 0.6075471698113207
Processing medical_genetics: 100%|████████████| 100/100 [00:13<00:00,  7.38it/s]
MMLU Task Accuracy (task=medical_genetics): 0.62
Processing anatomy: 100%|█████████████████████| 135/135 [00:19<00:00,  7.01it/s]
MMLU Task Accuracy (task=anatomy): 0.45925925925925926
Processing professional_medicine: 100%|███████| 272/272 [01:21<00:00,  3.32it/s]
MMLU Task Accuracy (task=professional_medicine): 0.5625
Processing college_biology: 100%|█████████████| 144/144 [00:23<00:00,  6.15it/s]
MMLU Task Accuracy (task=college_biology): 0.5902777777777778
Processing college_medicine: 100%|████████████| 173/173 [00:34<00:00,  5.00it/s]
MMLU Task Accuracy (task=college_medicine): 0.49710982658959535
Processing high_school_biology: 100%|█████████| 310/310 [00:51<00:00,  6.07it/s]
MMLU Task Accuracy (task=high_school_biology): 0.6387096774193548
Overall MMLU Accuracy: 0.47130692336171787
--------------------------------------------------
Overall score for Mistral 7B Instruct on all tasks: 0.47130692336171787
""")


Processing high_school_mathematics: 100%|█████| 270/270 [00:40<00:00,  6.60it/s]
MMLU Task Accuracy (task=high_school_mathematics): 0.3037037037037037
Processing abstract_algebra: 100%|████████████| 100/100 [00:13<00:00,  7.24it/s]
MMLU Task Accuracy (task=abstract_algebra): 0.33
Processing machine_learning: 100%|████████████| 112/112 [00:17<00:00,  6.49it/s]
MMLU Task Accuracy (task=machine_learning): 0.45535714285714285
Processing elementary_mathematics: 100%|██████| 378/378 [00:55<00:00,  6.85it/s]
MMLU Task Accuracy (task=elementary_mathematics): 0.3412698412698413
Processing college_mathematics: 100%|█████████| 100/100 [00:15<00:00,  6.30it/s]
MMLU Task Accuracy (task=college_mathematics): 0.38
Processing formal_logic: 100%|████████████████| 126/126 [00:24<00:00,  5.10it/s]
MMLU Task Accuracy (task=formal_logic): 0.35714285714285715
Processing high_school_statistics: 100%|██████| 216/216 [00:45<00:00,  4.78it/s]
MMLU Task Accuracy (task=high_school_statistics): 0.4074074074074074

In [None]:
# BioMistral 7B
print("""
0-shot
Processing high_school_biology: 100%|█████████| 310/310 [00:50<00:00,  6.08it/s]
MMLU Task Accuracy (task=high_school_biology): 0.6387096774193548
Processing medical_genetics: 100%|████████████| 100/100 [00:13<00:00,  7.39it/s]
MMLU Task Accuracy (task=medical_genetics): 0.67
Processing virology: 100%|████████████████████| 166/166 [00:22<00:00,  7.32it/s]
MMLU Task Accuracy (task=virology): 0.4457831325301205
Processing professional_medicine: 100%|███████| 272/272 [01:21<00:00,  3.33it/s]
MMLU Task Accuracy (task=professional_medicine): 0.5514705882352942
Processing nutrition: 100%|███████████████████| 306/306 [00:46<00:00,  6.54it/s]
MMLU Task Accuracy (task=nutrition): 0.6045751633986928
Processing anatomy: 100%|█████████████████████| 135/135 [00:19<00:00,  7.00it/s]
MMLU Task Accuracy (task=anatomy): 0.4962962962962963
Processing college_medicine: 100%|████████████| 173/173 [00:34<00:00,  5.01it/s]
MMLU Task Accuracy (task=college_medicine): 0.5491329479768786
Processing college_biology: 100%|█████████████| 144/144 [00:23<00:00,  6.12it/s]
MMLU Task Accuracy (task=college_biology): 0.6180555555555556
Overall MMLU Accuracy: 0.5759651307596513
--------------------------------------------------
Overall score for BioMistral 7B: 0.5759651307596513
--------------------------------------------------
3-shot
Processing high_school_biology: 100%|█████████| 310/310 [02:42<00:00,  1.91it/s]
MMLU Task Accuracy (task=high_school_biology): 0.6516129032258065
Processing medical_genetics: 100%|████████████| 100/100 [00:50<00:00,  1.99it/s]
MMLU Task Accuracy (task=medical_genetics): 0.65
Processing virology: 100%|████████████████████| 166/166 [01:23<00:00,  1.99it/s]
MMLU Task Accuracy (task=virology): 0.4819277108433735
Processing professional_medicine: 100%|███████| 272/272 [02:57<00:00,  1.53it/s]
MMLU Task Accuracy (task=professional_medicine): 0.5845588235294118
Processing nutrition: 100%|███████████████████| 306/306 [02:38<00:00,  1.93it/s]
MMLU Task Accuracy (task=nutrition): 0.5980392156862745
Processing anatomy: 100%|█████████████████████| 135/135 [01:08<00:00,  1.97it/s]
MMLU Task Accuracy (task=anatomy): 0.43703703703703706
Processing college_medicine: 100%|████████████| 173/173 [01:38<00:00,  1.76it/s]
MMLU Task Accuracy (task=college_medicine): 0.5953757225433526
Processing college_biology: 100%|█████████████| 144/144 [01:16<00:00,  1.89it/s]
MMLU Task Accuracy (task=college_biology): 0.5833333333333334
Overall MMLU Accuracy: 0.5821917808219178
--------------------------------------------------
Overall score for BioMistral 7B: 0.5821917808219178
--------------------------------------------------
5-shot
Processing high_school_biology: 100%|█████████| 310/310 [03:27<00:00,  1.49it/s]
MMLU Task Accuracy (task=high_school_biology): 0.6612903225806451
Processing medical_genetics: 100%|████████████| 100/100 [01:04<00:00,  1.54it/s]
MMLU Task Accuracy (task=medical_genetics): 0.64
Processing virology: 100%|████████████████████| 166/166 [01:48<00:00,  1.53it/s]
MMLU Task Accuracy (task=virology): 0.45180722891566266
Processing professional_medicine: 100%|███████| 272/272 [03:43<00:00,  1.22it/s]
MMLU Task Accuracy (task=professional_medicine): 0.5845588235294118
Processing nutrition: 100%|███████████████████| 306/306 [03:22<00:00,  1.51it/s]
MMLU Task Accuracy (task=nutrition): 0.5784313725490197
Processing anatomy: 100%|█████████████████████| 135/135 [01:28<00:00,  1.53it/s]
MMLU Task Accuracy (task=anatomy): 0.43703703703703706
Processing college_medicine: 100%|████████████| 173/173 [02:02<00:00,  1.42it/s]
MMLU Task Accuracy (task=college_medicine): 0.5895953757225434
Processing college_biology: 100%|█████████████| 144/144 [01:36<00:00,  1.49it/s]
MMLU Task Accuracy (task=college_biology): 0.6041666666666666
Overall MMLU Accuracy: 0.5778331257783312
--------------------------------------------------
Overall score for BioMistral 7B: 0.5778331257783312 
""")


0-shot
Processing high_school_biology: 100%|█████████| 310/310 [00:50<00:00,  6.08it/s]
MMLU Task Accuracy (task=high_school_biology): 0.6387096774193548
Processing medical_genetics: 100%|████████████| 100/100 [00:13<00:00,  7.39it/s]
MMLU Task Accuracy (task=medical_genetics): 0.67
Processing virology: 100%|████████████████████| 166/166 [00:22<00:00,  7.32it/s]
MMLU Task Accuracy (task=virology): 0.4457831325301205
Processing professional_medicine: 100%|███████| 272/272 [01:21<00:00,  3.33it/s]
MMLU Task Accuracy (task=professional_medicine): 0.5514705882352942
Processing nutrition: 100%|███████████████████| 306/306 [00:46<00:00,  6.54it/s]
MMLU Task Accuracy (task=nutrition): 0.6045751633986928
Processing anatomy: 100%|█████████████████████| 135/135 [00:19<00:00,  7.00it/s]
MMLU Task Accuracy (task=anatomy): 0.4962962962962963
Processing college_medicine: 100%|████████████| 173/173 [00:34<00:00,  5.01it/s]
MMLU Task Accuracy (task=college_medicine): 0.5491329479768786
Processing col

In [17]:
# header = ['Model', 'Overall Score (0-shot)', 'Overall Score (3-shot)', 'Overall Score (5-shot)', 'Overall Score - MATH', 'Overall Score - BIO']
# data = [
#     # [metamath_7b.get_model_name(), math_benchmark.overall_score],
#     ['Mistral 7B Instruct', 0.47131, '-', '-', '-', '-'],
#     ['BioMistral 7B', 0.57597, 0.58219, 0.57783, '-', '-'],
#     ['-', '-', '-', '-', '-', '-']
# ]

header = ['Accuracy Metric', 'Mistral 7B Instruct', 'BioMistral 7B', 'MetaMath 7B', 'BioLoRA 7B', 'MetaLoRA 7B', 'MetaBio 7B', 'MetaBioLoRA 7B']
data = [
    ['Overall Score', 0.47131, 0.48019, '-', '-', '-', 0.49897, '-'],
    ['Overall Score - MATH', 0.36784, 0.36213, '-', '-', '-', '\033[32m0.36883\033[0m', '-'],
    ['Overall Score - BIO', 0.56791, 0.59179, '-', '-', '-', '-', '-'],
    ['HS Math', '-', '-', '-', '-', '-', '-', '-'],
    ['Abstract Algebra', '-', '-', '-', '-', '-', '-', '-'],
    ['Machine Learning', '-', '-', '-', '-', '-', '-', '-'],
    ['Elementary Math', '-', '-', '-', '-', '-', '-', '-'],
    ['College Math', '-', '-', '-', '-', '-', '-', '-'],
    ['Formal Logic', '-', '-', '-', '-', '-', '-', '-'],
    ['HS Stats', '-', '-', '-', '-', '-', '-', '-'],
    ['Clinical KG', '-', '-', '-', '-', '-', '-', '-'],
    ['Medical Genetics', '-', '-', '-', '-', '-', '-', '-'],
    ['Anatomy', '-', '-', '-', '-', '-', '-', '-'],
    ['Professional Medicine', '-', '-', '-', '-', '-', '-', '-'],
    ['College Biology', '-', '-', '-', '-', '-', '-', '-'],
    ['College Medicine', '-', '-', '-', '-', '-', '-', '-'],
    ['HS Biology', '-', '-', '-', '-', '-', '-', '-']
]

print('All models are evaluated using zero-shot prediction.')
tt.print(
    data,
    header=header,
    style=tt.styles.ascii_thin_double,
    padding=(0, 1),
    #alignment="lcr"
)

All models are evaluated using zero-shot prediction.
+-----------------------+---------------------+---------------+-------------+------------+-------------+------------+----------------+
| Accuracy Metric       | Mistral 7B Instruct | BioMistral 7B | MetaMath 7B | BioLoRA 7B | MetaLoRA 7B | MetaBio 7B | MetaBioLoRA 7B |
| Overall Score         | 0.47131             | 0.48019       | -           | -          | -           | 0.49897    | -              |
+-----------------------+---------------------+---------------+-------------+------------+-------------+------------+----------------+
| Overall Score - MATH  | 0.36784             | 0.36213       | -           | -          | -           | [32m0.36883[0m    | -              |
+-----------------------+---------------------+---------------+-------------+------------+-------------+------------+----------------+
| Overall Score - BIO   | 0.56791             | 0.59179       | -           | -          | -           | -          | -         

In [20]:
# Mistral 7B Instruct
# MATH
# MMLU Task Accuracy (task=high_school_mathematics): 0.3037037037037037
# MMLU Task Accuracy (task=abstract_algebra): 0.33
# MMLU Task Accuracy (task=machine_learning): 0.45535714285714285
# MMLU Task Accuracy (task=elementary_mathematics): 0.3412698412698413
# MMLU Task Accuracy (task=college_mathematics): 0.38
# MMLU Task Accuracy (task=formal_logic): 0.35714285714285715
# MMLU Task Accuracy (task=high_school_statistics): 0.4074074074074074
# BIO
# MMLU Task Accuracy (task=clinical_knowledge): 0.6075471698113207
# MMLU Task Accuracy (task=medical_genetics): 0.62
# MMLU Task Accuracy (task=anatomy): 0.45925925925925926
# MMLU Task Accuracy (task=professional_medicine): 0.5625
# MMLU Task Accuracy (task=college_biology): 0.5902777777777778
# MMLU Task Accuracy (task=college_medicine): 0.49710982658959535
# MMLU Task Accuracy (task=high_school_biology): 0.6387096774193548

print('MATH')
print(np.round((0.3037037037037037 + 0.33 + 0.45535714285714285 + 0.3412698412698413 + 0.38 + 0.35714285714285715 + 0.4074074074074074) / 7, 5))
print('BIO')
print(np.round((0.6075471698113207 + 0.62 + 0.45925925925925926 + 0.5625 + 0.5902777777777778 + 0.49710982658959535 + 0.6387096774193548) / 7, 5))

MATH
0.36784
BIO
0.56791


In [12]:
# BioMistral 7B
# MATH
# MMLU Task Accuracy (task=high_school_mathematics): 0.3148148148148148
# MMLU Task Accuracy (task=abstract_algebra): 0.3
# MMLU Task Accuracy (task=machine_learning): 0.4642857142857143
# MMLU Task Accuracy (task=elementary_mathematics): 0.34656084656084657
# MMLU Task Accuracy (task=college_mathematics): 0.35
# MMLU Task Accuracy (task=formal_logic): 0.3333333333333333
# MMLU Task Accuracy (task=high_school_statistics): 0.42592592592592593
# BIO
# MMLU Task Accuracy (task=clinical_knowledge): 0.6188679245283019
# MMLU Task Accuracy (task=medical_genetics): 0.67
# MMLU Task Accuracy (task=anatomy): 0.4962962962962963
# MMLU Task Accuracy (task=professional_medicine): 0.5514705882352942
# MMLU Task Accuracy (task=college_biology): 0.6180555555555556
# MMLU Task Accuracy (task=college_medicine): 0.5491329479768786
# MMLU Task Accuracy (task=high_school_biology): 0.6387096774193548

print('MATH')
print(np.round((0.3148148148148148 + 0.3 + 0.4642857142857143 + 0.34656084656084657 + 0.35 + 0.3333333333333333 + 0.42592592592592593) / 7, 5))
print('BIO')
print(np.round((0.6188679245283019 + 0.67 + 0.4962962962962963 + 0.5514705882352942 + 0.6180555555555556 + 0.5491329479768786 + 0.6387096774193548) / 7, 5))

MATH
0.36213
BIO
0.59179


In [15]:
# MetaBio
# MATH
np.round((0.307407 + 0.300000 + 0.482143 + 0.349206 + 0.360000 + 0.357143 + 0.425926) / 7, 5)

0.36883