In [1]:
#install deepeval and huggingface dependencies
!pip install transformers
!pip install -U deepeval


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [None]:
# log into deepeval
# do it in terminal
# deepeval login

In [None]:
# log into huggingface
# huggingface-cli login

In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from deepeval.models.base_model import DeepEvalBaseLLM
from deepeval.benchmarks import HellaSwag
from deepeval.benchmarks.tasks import HellaSwagTask

In [2]:
# Define benchmark with specific tasks and shots
benchmark = HellaSwag(
    tasks=[HellaSwagTask.TRIMMING_BRANCHES_OR_HEDGES, HellaSwagTask.BATON_TWIRLING],
    n_shots=5
)

In [11]:
class Mistral7B(DeepEvalBaseLLM):
    def __init__(
        self,
        model,
        tokenizer
    ):
        self.model = model
        self.tokenizer = tokenizer

    def load_model(self):
        return self.model

    def generate(self, prompt: str) -> str:
        model = self.load_model()

        device = "cuda" # the device to load the model onto

        model_inputs = self.tokenizer([prompt], return_tensors="pt").to(device)
        model.to(device)

        generated_ids = model.generate(**model_inputs, max_new_tokens=100, do_sample=True)
        return self.tokenizer.batch_decode(generated_ids)[0]

    def generate(self, prompt: str) -> str:
        model = self.load_model()

        device = "cuda" # the device to load the model onto

        model_inputs = self.tokenizer([prompt], return_tensors="pt").to(device)
        model.to(device)

        generated_ids = model.generate(**model_inputs, max_new_tokens=100, do_sample=True)
        return self.tokenizer.batch_decode(generated_ids)[0]

    async def a_generate(self, prompt: str) -> str:
        return self.generate(prompt)

    # This is optional.
    def batch_generate(self, promtps: [str]) -> [str]:
        model = self.load_model()
        device = "cuda" # the device to load the model onto

        model_inputs = self.tokenizer(
            promtps,
            padding=True,    # Ensure equal-length inputs
            truncation=True, # Truncate inputs that exceed max_length
            max_length=512,
            return_tensors="pt").to(device)
        model.to(device)

        generated_ids = model.generate(**model_inputs, max_new_tokens=100, do_sample=True)
        return self.tokenizer.batch_decode(generated_ids)

    def get_model_name(self):
        return "Mistral 7B"

In [4]:
model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1")
tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [12]:
mistral_7b = Mistral7B(model=model, tokenizer=tokenizer)

In [13]:
print(mistral_7b.generate("Write me a joke"))

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


<s> Write me a joke about ______

There are a million jokes about ______

The more specific you are, the better

The funnier it is

Because that means the joke is not just about ______, but your ______.

What makes you ______?

What makes you unique to the world?

Write about this.

Write about this with a hearty laugh.

Because while you cannot change yourself, it is only in


In [14]:
results = benchmark.evaluate(model=mistral_7b, batch_size=10)
print("Task-specific Scores: ", benchmark.task_scores)
print("Detailed Predictions: ", benchmark.predictions)
print(benchmark.overall_score)

Processing Trimming branches or hedges:   0%|          | 0/4 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Trimming branches or hedges:  25%|██▌       | 1/4 [00:04<00:14,  4.93s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Trimming branches or hedges:  50%|█████     | 2/4 [00:05<00:04,  2.43s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Trimming branches or hedges:  75%|███████▌  | 3/4 [00:10<00:03,  3.50s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Trimming branches or hedges: 100%|██████████| 4/4 [00:15<00:00,  3.78s/it]

HellaSwag Task Accuracy (task=Trimming branches or hedges): 0.0





Filter:   0%|          | 0/10042 [00:00<?, ? examples/s]

Processing Baton twirling:   0%|          | 0/14 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Baton twirling:   7%|▋         | 1/14 [00:04<01:01,  4.76s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Baton twirling:  14%|█▍        | 2/14 [00:08<00:49,  4.13s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Baton twirling:  21%|██▏       | 3/14 [00:13<00:49,  4.48s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Baton twirling:  29%|██▊       | 4/14 [00:18<00:45,  4.58s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Baton twirling:  36%|███▌      | 5/14 [00:22<00:41,  4.65s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Baton twirling:  43%|████▎     | 6/14 [00:24<00:28,  3.58s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


HellaSwag Task Accuracy (task=Baton twirling): 0.0
Overall HellaSwag Accuracy: 0.0
Task-specific Scores:                            Task  Score
0  Trimming branches or hedges    0.0
1               Baton twirling    0.0
Detailed Predictions:                             Task  \
0   Trimming branches or hedges   
1   Trimming branches or hedges   
2   Trimming branches or hedges   
3   Trimming branches or hedges   
4                Baton twirling   
5                Baton twirling   
6                Baton twirling   
7                Baton twirling   
8                Baton twirling   
9                Baton twirling   
10               Baton twirling   
11               Baton twirling   
12               Baton twirling   
13               Baton twirling   
14               Baton twirling   
15               Baton twirling   
16               Baton twirling   
17               Baton twirling   

                                                Input  \
0   A man is shown working in an o


