In [1]:
#install deepeval and huggingface dependencies
!pip install transformers
!pip install -U deepeval


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [None]:
# Log in to DeepEval before running the benchmark.
# Run this in a terminal rather than in the notebook:
#   deepeval login

In [None]:
# Log in to Hugging Face with the CLI:
#   huggingface-cli login

In [29]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from deepeval.models.base_model import DeepEvalBaseLLM
from deepeval.benchmarks import HellaSwag
from deepeval.benchmarks.tasks import HellaSwagTask
import re

In [23]:
# HellaSwag benchmark limited to two tasks, evaluated with 5-shot prompts.
benchmark = HellaSwag(
    n_shots=5,
    tasks=[
        HellaSwagTask.TRIMMING_BRANCHES_OR_HEDGES,
        HellaSwagTask.BATON_TWIRLING,
    ],
)

In [34]:
class Mistral7B(DeepEvalBaseLLM):
    """DeepEval model wrapper that answers multiple-choice prompts with a causal LM.

    Each prompt is reduced to a single blank-line-separated section, the model
    generates a continuation, and the first ``Answer: <A-D>`` occurrence in the
    decoded text is returned as the prediction.

    Args:
        model: An already-loaded causal language model exposing ``generate``
            and ``to`` (e.g. from ``AutoModelForCausalLM.from_pretrained``).
        tokenizer: The tokenizer matching ``model``.
        device: Device the model and inputs are moved to. Defaults to
            ``"cuda"``, the value that was previously hard-coded.
        max_new_tokens: Upper bound on generated tokens per prompt. Defaults
            to ``100``, the value that was previously hard-coded.
    """

    def __init__(
        self,
        model,
        tokenizer,
        device="cuda",
        max_new_tokens=100,
    ):
        self.model = model
        self.tokenizer = tokenizer
        self.device = device
        self.max_new_tokens = max_new_tokens

    def load_model(self):
        """Return the model object passed to the constructor."""
        return self.model

    @staticmethod
    def _question_section(prompt: str) -> str:
        """Pick the section of ``prompt`` that is sent to the model.

        The prompt is split on blank lines and the second-to-last non-empty
        section is used.
        NOTE(review): presumably the final section is an output instruction
        appended by the benchmark and the one before it is the question ending
        in "Answer:" -- confirm against the benchmark's prompt template.

        Prompts with fewer than two sections previously raised ``IndexError``;
        they now fall back to the only section (or the raw prompt when there
        are no non-empty sections at all).
        """
        sections = [section.strip() for section in prompt.split("\n\n") if section.strip()]
        if len(sections) >= 2:
            return sections[-2]
        if sections:
            return sections[-1]
        return prompt

    @staticmethod
    def _extract_answer(decoded: str) -> str:
        """Return the letter after the first ``Answer:`` in ``decoded``, or ``'N/A'``."""
        match = re.search(r"Answer:\s*([A-D])", decoded)
        return match.group(1) if match else 'N/A'

    def generate(self, prompt: str) -> str:
        """Generate a prediction for one prompt.

        Returns:
            ``"A"``-``"D"`` when the decoded text contains ``Answer: <letter>``,
            otherwise ``'N/A'``.
        """
        question = self._question_section(prompt)

        model = self.load_model()

        model_inputs = self.tokenizer([question], return_tensors="pt").to(self.device)
        model.to(self.device)

        # Sampling is enabled, so repeated calls may return different answers.
        generated_ids = model.generate(
            **model_inputs, max_new_tokens=self.max_new_tokens, do_sample=True
        )

        # The decoded text contains the prompt followed by the continuation.
        decoded = self.tokenizer.batch_decode(generated_ids)[0]

        return self._extract_answer(decoded)

    async def a_generate(self, prompt: str) -> str:
        """Async entry point; delegates to the synchronous ``generate``."""
        return self.generate(prompt)

    # This is optional.
    def batch_generate(self, promtps: "list[str]") -> "list[str]":
        """Generate predictions for several prompts in one padded batch.

        Args:
            promtps: Prompts to answer. The (misspelled) parameter name is kept
                unchanged so existing callers are unaffected.

        Returns:
            One prediction per prompt, in input order; each is ``"A"``-``"D"``
            or ``'N/A'``. An empty input yields an empty list.

        Side effects:
            Sets ``tokenizer.pad_token`` to the EOS token when the tokenizer has
            no pad token, and switches ``tokenizer.padding_side`` to ``"left"``.
        """
        if not promtps:
            return []

        model = self.load_model()

        # Fixed: the loop previously read an undefined name `prompt` instead of
        # the loop variable, raising NameError (or using a stale global).
        questions = [self._question_section(p) for p in promtps]

        # `padding=True` needs a pad token; some tokenizers define none, in
        # which case the EOS token is reused.
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token
        # Decoder-only models continue from the last position, so padding must
        # sit on the left for the continuation to follow the real prompt text.
        self.tokenizer.padding_side = "left"

        model_inputs = self.tokenizer(
            questions,
            padding=True,    # Ensure equal-length inputs
            truncation=True, # Truncate inputs that exceed max_length
            max_length=512,
            return_tensors="pt").to(self.device)
        model.to(self.device)

        # Sampling is enabled, so repeated calls may return different answers.
        generated_ids = model.generate(
            **model_inputs, max_new_tokens=self.max_new_tokens, do_sample=True
        )
        decoded_responses = self.tokenizer.batch_decode(generated_ids, skip_special_tokens=True)

        return [self._extract_answer(decoded) for decoded in decoded_responses]

    def get_model_name(self):
        """Return the display name of this model."""
        return "Mistral 7B"

In [25]:
# Fetch the tokenizer and the causal-LM weights for the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1")
model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [35]:
# Wrap the loaded model and tokenizer in the DeepEval-compatible adapter.
mistral_7b = Mistral7B(
    model=model,
    tokenizer=tokenizer,
)

In [37]:
# Run the benchmark on the wrapped model, then report per-task scores,
# per-example predictions and the overall score.
results = benchmark.evaluate(model=mistral_7b, batch_size=10)
for _label, _table in (
    ("Task-specific Scores: ", benchmark.task_scores),
    ("Detailed Predictions: ", benchmark.predictions),
):
    print(_label, _table)
print(benchmark.overall_score)

Processing Trimming branches or hedges:   0%|          | 0/4 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Trimming branches or hedges:  25%|██▌       | 1/4 [00:04<00:13,  4.40s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Trimming branches or hedges:  50%|█████     | 2/4 [00:08<00:08,  4.42s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Trimming branches or hedges:  75%|███████▌  | 3/4 [00:13<00:04,  4.41s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Trimming branches or hedges: 100%|██████████| 4/4 [00:17<00:00,  4.41s/it]


HellaSwag Task Accuracy (task=Trimming branches or hedges): 1.0


Processing Baton twirling:   0%|          | 0/14 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Baton twirling:   7%|▋         | 1/14 [00:04<00:57,  4.45s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Baton twirling:  14%|█▍        | 2/14 [00:04<00:23,  1.97s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Baton twirling:  21%|██▏       | 3/14 [00:04<00:13,  1.18s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Baton twirling:  29%|██▊       | 4/14 [00:09<00:25,  2.50s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Baton twirling:  36%|███▌      | 5/14 [00:13<00:28,  3.18s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Baton twirling:  43%|████▎     | 6/14 [00:18<00:28,  3.60s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


HellaSwag Task Accuracy (task=Baton twirling): 0.35714285714285715
Overall HellaSwag Accuracy: 0.5
Task-specific Scores:                            Task     Score
0  Trimming branches or hedges  1.000000
1               Baton twirling  0.357143
Detailed Predictions:                             Task  \
0   Trimming branches or hedges   
1   Trimming branches or hedges   
2   Trimming branches or hedges   
3   Trimming branches or hedges   
4                Baton twirling   
5                Baton twirling   
6                Baton twirling   
7                Baton twirling   
8                Baton twirling   
9                Baton twirling   
10               Baton twirling   
11               Baton twirling   
12               Baton twirling   
13               Baton twirling   
14               Baton twirling   
15               Baton twirling   
16               Baton twirling   
17               Baton twirling   

                                                Input Prediction  


