# Requirements

In [11]:
# ! pip install -U deepeval
# ! pip install numpy
# ! pip install transformers
# ! pip3 install torch torchvision torchaudio

In [86]:
# ! pip install sentencepiece
# ! pip install tokenizers
# ! pip install accelerate
# ! pip install bitsandbytes
# ! pip install vllm
# ! pip install fraction
# ! pip install protobuf
# ! pip install termtables

In [93]:
from deepeval.benchmarks import MMLU, TruthfulQA
from deepeval.benchmarks.tasks import MMLUTask, TruthfulQATask
from deepeval.benchmarks.modes import TruthfulQAMode
from transformers import AutoModelForCausalLM, AutoTokenizer
from deepeval.models.base_model import DeepEvalBaseLLM
from typing import List
import torch
import pandas as pd
import json
import termtables as tt
import numpy as np

torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Using device: {torch_device}")

Using device: cuda


---

# Model Loading

In [None]:
blackhole_dir = "/dtu/blackhole/08/186664/cache"

# model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1")
# tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1")
# model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-Instruct-v0.1", cache_dir=blackhole_dir)
# tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.1", cache_dir=blackhole_dir)

bio_tokenizer = AutoTokenizer.from_pretrained("BioMistral/BioMistral-7B", cache_dir=blackhole_dir)
bio_model = AutoModelForCausalLM.from_pretrained("BioMistral/BioMistral-7B", cache_dir=blackhole_dir)

# math_model = AutoModelForCausalLM.from_pretrained("meta-math/MetaMath-Mistral-7B", cache_dir=blackhole_dir)
# math_tokenizer = AutoTokenizer.from_pretrained("meta-math/MetaMath-Mistral-7B", cache_dir=blackhole_dir)

In [None]:
class BioMistral7B(DeepEvalBaseLLM):
    def __init__(
        self,
        model,
        tokenizer
    ):
        self.model = model
        self.tokenizer = tokenizer

    def load_model(self):
        return self.model

    def generate(self, prompt: str) -> str:
        model = self.load_model()

        device = "cuda" # the device to load the model onto 

        prompt = prompt[:-55]

        model_inputs = self.tokenizer([prompt], return_tensors="pt").to(device)
        model.to(device)

        generated_ids = model.generate(**model_inputs, max_new_tokens=1, do_sample=True, pad_token_id=self.tokenizer.eos_token_id, temperature=0.0001)
        decoded_tokens = self.tokenizer.batch_decode(generated_ids)[0]
        new_tokens = decoded_tokens[len(prompt)+4:].strip()
        return new_tokens

    async def a_generate(self, prompt: str) -> str:
        return self.generate(prompt)

    # This is optional.
    def batch_generate(self, prompts: List[str]) -> List[str]:
        model = self.load_model()
        device = "cuda" # the device to load the model onto

        prompts = [prompt[:-55] for prompt in prompts]

        model_inputs = self.tokenizer(prompts, return_tensors="pt").to(device)
        model.to(device)

        generated_ids = model.generate(**model_inputs, max_new_tokens=1, do_sample=True, pad_token_id=self.tokenizer.eos_token_id, temperature=0.0001)
        return self.tokenizer.batch_decode(generated_ids)

    def get_model_name(self):
        return "BioMistral 7B"

biomistral_7b = BioMistral7B(model=bio_model, tokenizer=bio_tokenizer)

In [None]:
class MetaMath7B(DeepEvalBaseLLM):
    def __init__(
        self,
        model,
        tokenizer
    ):
        self.model = model
        self.tokenizer = tokenizer

    def load_model(self):
        return self.model

    def generate(self, prompt: str) -> str:
        model = self.load_model()

        device = "cuda" # the device to load the model onto 

        # prompt = "[INST] " + prompt + " [/INST]"
        # prompt = prompt + "\nYou will only respond with a JSON object with the keys Explanation and Answer Choice, where Explanation is a string and Answer Choice is either 'A', 'B', 'C' or 'D'."
        # prompt = """Please answer the following question strictly in JSON format. Your response must contain only a JSON object 
        # with the keys 'Explanation' and 'Answer_choice'. The 'Answer_choice' should be one of four options: 'A', 'B', 'C' or 'D'. 
        # No extra text, tokens or explanation outside the JSON format is allowed.\n
        # Examples on how to answer questions:\n
        # What color is the sky?\nA. green\nB. blue \nC. red\nD. orange\nAnswer: 
        # {"Explanation": "The color of the sky depends on the time of day, the weather, and the location. In general, the sky is blue during the day, but it can also be other colors such as gray or black during certain weather conditions. Therefore, the answer cannot be determined with the given information.",
        # "Answer_choice": "D"}
        
        # Answer the following question with the json format and nothing else then stop generating:\n""" + prompt[:-57]

        # prompt = """
        # The correct answer to the following question is formatted as a JSON object with the keys "Explanation" and "Answer_Choice". The "Explanation" key should contain a string explaining the answer, and the "Answer_Choice" key should contain the letter of the correct answer, either A, B, C or D. For example, if the correct answer is "A", the JSON object should look like this: {"Explanation": "This is the explanation.", "Answer_Choice": "A"}. I want you to only print out the value of the key "Answer_Choice". The question is as follows:\n 
        # """ + prompt[:-57]

        model_inputs = self.tokenizer([prompt], return_tensors="pt").to(device)
        model.to(device)

        generated_ids = model.generate(**model_inputs, max_new_tokens=50, do_sample=True, pad_token_id=self.tokenizer.eos_token_id, temperature=0.0001)
        decoded_tokens = self.tokenizer.batch_decode(generated_ids)[0]
        # return decoded_tokens[len(prompt)-50:]
        # return decoded_tokens
        new_tokens = decoded_tokens[len(prompt)+5:]#.strip()
        if "The answer is: " in new_tokens:
            new_tokens = new_tokens[new_tokens.find("The answer is: ")+15:]
        new_tokens = new_tokens.replace("</s>", "")
        if new_tokens.strip() == "":
            return ""
        answer_choices = [prompt[prompt.find("A. "):prompt.find("\n", prompt.find("A. "))].strip(), 
                          prompt[prompt.find("B. "):prompt.find("\n", prompt.find("B. "))].strip(), 
                            prompt[prompt.find("C. "):prompt.find("\n", prompt.find("C. "))].strip(), 
                            prompt[prompt.find("D. "):prompt.find("\n", prompt.find("D. "))].strip()]
        
        def find_answer_choice(new_tokens, answer_choices):
            if new_tokens in ["A", "B", "C", "D"]:
                return new_tokens
            for answer_choice in answer_choices:
                if new_tokens in answer_choice:
                    return answer_choice[0]
            return None
        fac = find_answer_choice(new_tokens, answer_choices)
        if fac:
            return fac

        if new_tokens[-1] == ".":
            new_tokens = new_tokens[:-1]
            fac = find_answer_choice(new_tokens, answer_choices)
            if fac:
                return fac
        if "\\text{" in new_tokens:
            new_tokens = new_tokens.replace("\\text{", "")
            new_tokens = new_tokens[:-1]
            fac = find_answer_choice(new_tokens, answer_choices)
            if fac:
                return fac
        if new_tokens[0] == "(" or new_tokens[0] == "[":
            new_tokens = new_tokens[1:-1]
            fac = find_answer_choice(new_tokens, answer_choices)
            if fac:
                return fac
        return new_tokens
        new_tokens = new_tokens[new_tokens.find("{"):new_tokens.rfind("}")+1]
        try:
            new_tokens_json = json.loads(new_tokens, strict=False)
            return new_tokens_json['Answer_choice']
        except:
            new_tokens = new_tokens.replace("{", "").replace("}", "")
            new_tokens = "{" + new_tokens + "}"
            print(new_tokens)
            return new_tokens
        return decoded_tokens[len(prompt):]
        return decoded_tokens[len(prompt)+5:]
        return self.tokenizer.batch_decode(generated_ids)[0]

    async def a_generate(self, prompt: str) -> str:
        return self.generate(prompt)

    # This is optional.
    def batch_generate(self, promtps: List[str]) -> List[str]:
        model = self.load_model()
        device = "cuda" # the device to load the model onto

        model_inputs = self.tokenizer(promtps, return_tensors="pt").to(device)
        model.to(device)

        generated_ids = model.generate(**model_inputs, max_new_tokens=100, do_sample=True)
        return self.tokenizer.batch_decode(generated_ids)

    def get_model_name(self):
        return "MetaMath Mistral 7B"

metamath_7b = MetaMath7B(model=math_model, tokenizer=math_tokenizer)

NameError: name 'math_model' is not defined

In [41]:
# MetaMath
# We want to evaluate the model by calculating its overall accuracy (this is bad, at around ~12%),
# as well as its accuracy when it answers in the correct format (A, B, C or D) - which is hopefully better.

In [56]:
len("\n\nOutput 'A', 'B', 'C', or 'D'. Full answer not needed.")

55

In [70]:
print('### Q1 ###')
print(biomistral_7b.generate("In a population of giraffes, an environmental change occurs that favors individuals that are tallest. As a result, more of the taller individuals are able to obtain nutrients and survive to pass along their genetic information. This is an example of\nA. directional selection.\nB. stabilizing selection.\nC. sexual selection.\nD. disruptive selection.\nAnswer:\n\nOutput 'A', 'B', 'C', or 'D'. Full answer not needed."))
print('---')
print('### Q2 ###')
print(biomistral_7b.generate("Which of the changes below following the start codon in an mRNA would most likely have the greatest deleterious effect?\nA. a deletion of a single nucleotide\nB. a deletion of a nucleotide triplet\nC. a single nucleotide substitution of the nucleotide occupying the first codon position\nD. a single nucleotide substitution of the nucleotide occupying the third codon position\nAnswer:\n\nOutput 'A', 'B', 'C', or 'D'. Full answer not needed."))
print('---')
print('### Q3 ###')
print(biomistral_7b.generate("The energy given up by electrons as they move through the electron transport chain is used to\nA. break down glucose\nB. make glucose\nC. produce ATP\nD. make NADH\nAnswer:\n\nOutput 'A', 'B', 'C', or 'D'. Full answer not needed."))

### Q1 ###
A
---
### Q2 ###
B
---
### Q3 ###
C


In [7]:
# print(metamath_7b.generate("What is 5 minus 2?\nA. 3\nB. 5\nC. 25\nD. 50\nAnswer:\n\nOutput 'A', 'B', 'C', or 'D'. Full answer not needed."))
# print('---')
# print(metamath_7b.generate("The following are multiple choice questions (with answers) about high school biology.\n\nWhich of the following is not a way to form recombinant DNA?\nA. Translation\nB. Conjugation\nC. Specialized transduction\nD. Transformation\nAnswer:\n\nOutput 'A', 'B', 'C', or 'D'. Full answer not needed."))
# print('---')
# print(metamath_7b.generate("If a metamath_7bn P with vertices at (– 2, – 4), (– 4, 1), (–1, 4), (2, 4), and (3, 0) is reflected across the line y = x to get a new pentagon, P’, then one of the vertices of P’ is\nA. (0, – 3)\nB. (4, 1)\nC. (2, 2)\nD. (– 4, –2)\nAnswer:\n\nOutput 'A', 'B', 'C', or 'D'. Full answer not needed."))

---

# Model Evaluation

In [100]:
mm_tasks_math = [MMLUTask.HIGH_SCHOOL_MATHEMATICS,
                 MMLUTask.ABSTRACT_ALGEBRA,
                 MMLUTask.MACHINE_LEARNING,
                 MMLUTask.ELEMENTARY_MATHEMATICS,
                 MMLUTask.COLLEGE_MATHEMATICS,
                 MMLUTask.FORMAL_LOGIC,
                 MMLUTask.HIGH_SCHOOL_STATISTICS]

mm_tasks_bio = [MMLUTask.HIGH_SCHOOL_BIOLOGY, 
                MMLUTask.MEDICAL_GENETICS, 
                MMLUTask.VIROLOGY, 
                MMLUTask.PROFESSIONAL_MEDICINE, 
                MMLUTask.NUTRITION, 
                MMLUTask.ANATOMY,
                MMLUTask.COLLEGE_MEDICINE,
                MMLUTask.COLLEGE_BIOLOGY]

tqa_tasks = [TruthfulQATask.SCIENCE]
tqa_mode = TruthfulQAMode.MC1 # Use MC1 as a benchmark for pinpoint accuracy and MC2 for depth of understanding.

# Define benchmark with specific tasks and shots
math_benchmark = MMLU(
    tasks=mm_tasks_math,
    n_shots=0
)

bio_benchmark = MMLU(
    tasks=mm_tasks_bio,
    n_shots=3
)

TQABenchmark = TruthfulQA(
    tasks=tqa_tasks,
    mode=tqa_mode
)

In [None]:
# Replace 'mistral_7b' with your own custom model
math_benchmark.evaluate(model=metamath_7b)
math_results = math_benchmark.predictions
print('-'*50)
print(f'Overall score for {metamath_7b.get_model_name()}: {math_benchmark.overall_score}')

Processing high_school_mathematics: 100%|█████| 270/270 [02:12<00:00,  2.04it/s]

MMLU Task Accuracy (task=high_school_mathematics): 0.1259259259259259
Overall MMLU Accuracy: 0.1259259259259259
0.1259259259259259





In [101]:
bio_benchmark.evaluate(model=biomistral_7b)
bio_results = bio_benchmark.predictions
print('-'*50)
print(f'Overall score for {biomistral_7b.get_model_name()}: {bio_benchmark.overall_score}')

Processing high_school_biology: 100%|█████████| 310/310 [02:42<00:00,  1.91it/s]


MMLU Task Accuracy (task=high_school_biology): 0.6516129032258065


Processing medical_genetics: 100%|████████████| 100/100 [00:50<00:00,  1.99it/s]


MMLU Task Accuracy (task=medical_genetics): 0.65


Processing virology: 100%|████████████████████| 166/166 [01:23<00:00,  1.99it/s]


MMLU Task Accuracy (task=virology): 0.4819277108433735


Processing professional_medicine: 100%|███████| 272/272 [02:57<00:00,  1.53it/s]


MMLU Task Accuracy (task=professional_medicine): 0.5845588235294118


Processing nutrition: 100%|███████████████████| 306/306 [02:38<00:00,  1.93it/s]


MMLU Task Accuracy (task=nutrition): 0.5980392156862745


Processing anatomy: 100%|█████████████████████| 135/135 [01:08<00:00,  1.97it/s]


MMLU Task Accuracy (task=anatomy): 0.43703703703703706


Processing college_medicine: 100%|████████████| 173/173 [01:38<00:00,  1.76it/s]


MMLU Task Accuracy (task=college_medicine): 0.5953757225433526


Processing college_biology: 100%|█████████████| 144/144 [01:16<00:00,  1.89it/s]

MMLU Task Accuracy (task=college_biology): 0.5833333333333334
Overall MMLU Accuracy: 0.5821917808219178
--------------------------------------------------
Overall score for BioMistral 7B: 0.5821917808219178





In [83]:
# pd.set_option("display.max_rows", None)
pd.set_option("display.max_rows", 12)
print(bio_results['Prediction'])
# print(bio_results['Input'][1])
print(bio_results['Input'][2])

0       A
1       B
2       C
3       D
4       A
       ..
1601    A
1602    A
1603    C
1604    A
1605    D
Name: Prediction, Length: 1606, dtype: object
The energy given up by electrons as they move through the electron transport chain is used to
A. break down glucose
B. make glucose
C. produce ATP
D. make NADH
Answer:


In [None]:
results_corrform = math_results
# del_index = results_corrform[(results_corrform['Prediction']) != 'A' | (results_corrform['Prediction']) != 'B' | (results_corrform['Prediction']) != 'C' | (results_corrform['Prediction']) != 'D'].index


In [275]:
results_corrform = results_corrform[(results_corrform['Prediction'] == 'A') | (results_corrform['Prediction'] == 'B') | (results_corrform['Prediction'] == 'C') | (results_corrform['Prediction'] == 'D')]
correct = 0
for index, row in results_corrform.iterrows():
    if row['Correct'] == 1:
        correct += 1
print(correct/len(results_corrform))

0.288135593220339


In [None]:
for i in range(20):
    print(f'Input {i+1}:')
    print('=**********=')
    print(math_benchmark.predictions['Input'][i])
    print('=**********=')
    print(f'Prediction {i+1}:')
    print('++++++++++')
    # print(results['Prediction'][i][results['Prediction'][i].find("\n\nOutput 'A', 'B', 'C', or 'D'. Full answer not needed.")+55:])
    print(math_results['Prediction'][i])
    print('++++++++++')
    print(math_results['Correct'][i])
    print('-'*50)

Input 1:
=**********=
If a pentagon P with vertices at (– 2, – 4), (– 4, 1), (–1, 4), (2, 4), and (3, 0) is reflected across the line y = x to get a new pentagon, P’, then one of the vertices of P’ is
A. (0, – 3)
B. (4, 1)
C. (2, 2)
D. (– 4, –2)
Answer:
=**********=
Prediction 1:
++++++++++
<s> The following are multiple choice questions (with answers) about high school mathematics.

Joe was in charge of lights for a dance. The red light blinks every two seconds, the yellow light every three seconds, and the blue light every five seconds. If we include the very beginning and very end of the dance, how many times during a seven minute dance will all the lights come on at the same time? (Assume that all three lights blink simultaneously at the very beginning of the dance.)
A. 3
B. 15
C. 6
D. 5
Answer: B

Five thousand dollars compounded annually at an $x\%$ interest rate takes six years to double. At the same interest rate, how many years will it take $\$300$ to grow to $\$9600$?
A. 12
B

---

# Results

In [99]:
header = ['Model', 'Overall Score']
data = [
    # [metamath_7b.get_model_name(), math_benchmark.overall_score],
    [biomistral_7b.get_model_name(), np.round(bio_benchmark.overall_score, 5)],
    ['-', '-']
]

tt.print(
    data,
    header=header,
    style=tt.styles.ascii_thin_double,
    padding=(0, 1),
    #alignment="lcr"
)

+---------------+---------------+
| Model         | Overall Score |
| BioMistral 7B | 0.57597       |
+---------------+---------------+
| -             | -             |
+---------------+---------------+
