In [2]:
#install deepeval and huggingface dependencies
!pip install transformers
!pip install -U deepeval

Collecting transformers
  Downloading transformers-4.45.2-py3-none-any.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.4/44.4 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
Collecting safetensors>=0.4.1 (from transformers)
  Downloading safetensors-0.4.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.8 kB)
Collecting tokenizers<0.21,>=0.20 (from transformers)
  Downloading tokenizers-0.20.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Downloading transformers-4.45.2-py3-none-any.whl (9.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.9/9.9 MB[0m [31m84.6 MB/s[0m eta [36m0:00:00[0m:00:01[0m0:01[0m
[?25hDownloading safetensors-0.4.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (435 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m435.0/435.0 kB[0m [31m55.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading tokenizers-0.20.1-cp310-cp310-

In [None]:
# log into deepeval
# do it in terminal
# deepeval login

In [None]:
# log into huggingface
# huggingface-cli login

In [2]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from deepeval.models.base_model import DeepEvalBaseLLM
from deepeval.benchmarks import HellaSwag
from deepeval.benchmarks.tasks import HellaSwagTask

In [3]:
# Define benchmark with specific tasks and shots
benchmark = HellaSwag(
    tasks=[HellaSwagTask.TRIMMING_BRANCHES_OR_HEDGES, HellaSwagTask.BATON_TWIRLING],
    n_shots=5
)

In [16]:
class Mistral7B(DeepEvalBaseLLM):
    def __init__(
        self,
        model,
        tokenizer
    ):
        self.model = model
        self.tokenizer = tokenizer

    def load_model(self):
        return self.model

    def generate(self, prompt: str) -> str:
        model = self.load_model()

        device = "cuda" # the device to load the model onto

        model_inputs = self.tokenizer([prompt], return_tensors="pt").to(device)
        model.to(device)

        generated_ids = model.generate(**model_inputs, max_new_tokens=100, do_sample=True)
        return self.tokenizer.batch_decode(generated_ids)[0]

    async def a_generate(self, prompt: str) -> str:
        return self.generate(prompt)

    # This is optional.
    def batch_generate(self, prompts: list[str]) -> list[str]:
        model = self.load_model()
        device = "cuda" # the device to load the model onto

        model_inputs = self.tokenizer(promtps, return_tensors="pt").to(device)
        model.to(device)

        generated_ids = model.generate(**model_inputs, max_new_tokens=100, do_sample=True)
        return self.tokenizer.batch_decode(generated_ids)

    def get_model_name(self):
        return "Mistral 7B"

In [7]:
model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-Instruct-v0.2")
tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.2")

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.10k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

In [17]:
mistral_7b = Mistral7B(model=model, tokenizer=tokenizer)

In [18]:
print(mistral_7b.generate("Write me a joke"))

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


<s> Write me a jokeGenerate only one letter choice for the following word:

loath \_ o _ th

Theletter is h. The correct answer is "loath" is an archaic word that means to feel intense aversion or dislike, and the correct spelling is "loath". However, due to its similarity to "laugh" and the context of the prompt, the most common mistake is to add an "h" at the beginning, creating the incorrect word "loathing" which


In [19]:
results = benchmark.evaluate(model=mistral_7b, batch_size=5)
print("Task-specific Scores: ", benchmark.task_scores)
print("Detailed Predictions: ", benchmark.predictions)
print(benchmark.overall_score)

Processing Trimming branches or hedges:   0%|          | 0/4 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Trimming branches or hedges:  25%|██▌       | 1/4 [00:03<00:11,  3.89s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Trimming branches or hedges:  50%|█████     | 2/4 [00:04<00:04,  2.05s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Trimming branches or hedges:  75%|███████▌  | 3/4 [00:09<00:03,  3.10s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Trimming branches or hedges: 100%|██████████| 4/4 [00:13<00:00,  3.34s/it]


HellaSwag Task Accuracy (task=Trimming branches or hedges): 0.0


Processing Baton twirling:   0%|          | 0/14 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Baton twirling:   7%|▋         | 1/14 [00:04<00:56,  4.36s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Baton twirling:  14%|█▍        | 2/14 [00:08<00:52,  4.40s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Baton twirling:  21%|██▏       | 3/14 [00:13<00:48,  4.41s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Baton twirling:  29%|██▊       | 4/14 [00:17<00:43,  4.38s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Baton twirling:  36%|███▌      | 5/14 [00:21<00:39,  4.40s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Baton twirling:  43%|████▎     | 6/14 [00:26<00:35,  4.39s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


HellaSwag Task Accuracy (task=Baton twirling): 0.0
Overall HellaSwag Accuracy: 0.0
Task-specific Scores:                            Task  Score
0  Trimming branches or hedges    0.0
1               Baton twirling    0.0
Detailed Predictions:                             Task  \
0   Trimming branches or hedges   
1   Trimming branches or hedges   
2   Trimming branches or hedges   
3   Trimming branches or hedges   
4                Baton twirling   
5                Baton twirling   
6                Baton twirling   
7                Baton twirling   
8                Baton twirling   
9                Baton twirling   
10               Baton twirling   
11               Baton twirling   
12               Baton twirling   
13               Baton twirling   
14               Baton twirling   
15               Baton twirling   
16               Baton twirling   
17               Baton twirling   

                                                Input  \
0   A man is shown working in an o


