In [1]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

In [2]:
# Checkpoint identifier passed to both `from_pretrained` calls. Keeping it in a
# single constant guarantees the tokenizer and the weights always come from the
# same checkpoint when the id is changed.
MODEL_ID = "meta-llama/Llama-3.1-8B"

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForCausalLM.from_pretrained(MODEL_ID)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [3]:
from deepeval.models.base_model import DeepEvalBaseLLM
from deepeval.benchmarks import HellaSwag
from deepeval.benchmarks.tasks import HellaSwagTask
import re

In [24]:
class Llama3(DeepEvalBaseLLM):
    """DeepEval model wrapper that answers multiple-choice prompts with a causal LM.

    Each incoming prompt is split into blank-line-separated sections and only
    the second-to-last one (the final question, which ends in ``Answer:`` in the
    prompt layout used in this notebook) is sent to the model.  The first
    ``Answer: <letter>`` found in the decoded text is returned as the prediction.

    Args:
        model: Causal language model exposing ``.to()`` and ``.generate()``.
        tokenizer: Tokenizer matching ``model``.
        device: Device the model and inputs are moved to before generation.
            Defaults to ``"cuda"``, the value that was previously hard-coded.
        max_new_tokens: Generation budget per prompt.  Defaults to ``100``,
            the value that was previously hard-coded.
    """

    # First letter choice that follows an "Answer:" marker.
    _ANSWER_RE = re.compile(r"Answer:\s*([A-D])")
    # Returned when the decoded text contains no parsable letter choice.
    _NO_ANSWER = 'N/A'

    def __init__(
        self,
        model,
        tokenizer,
        device: str = "cuda",
        max_new_tokens: int = 100,
    ):
        self.model = model
        self.tokenizer = tokenizer
        self.device = device
        self.max_new_tokens = max_new_tokens

    def load_model(self):
        """Return the already-instantiated model passed to the constructor."""
        return self.model

    @staticmethod
    def _extract_question(prompt: str) -> str:
        """Return the section of ``prompt`` that should be fed to the model.

        Sections are separated by blank lines.  The second-to-last section is
        used because, in the prompt layout used here, the last section is the
        trailing output instruction and the one before it is the unanswered
        question.  Prompts with fewer than two sections fall back to the whole
        stripped prompt instead of raising ``IndexError``.
        """
        sections = [part.strip() for part in prompt.split("\n\n") if part.strip()]
        if len(sections) < 2:
            return prompt.strip()
        return sections[-2]

    @classmethod
    def _parse_answer(cls, text: str) -> str:
        """Return the first ``Answer: <A-D>`` letter in ``text``, else ``'N/A'``."""
        match = cls._ANSWER_RE.search(text)
        return match.group(1) if match else cls._NO_ANSWER

    def _complete(self, questions: list[str], **tokenizer_kwargs) -> list[str]:
        """Tokenize ``questions``, generate continuations and decode them.

        The decoded strings contain the prompt followed by the continuation,
        which is what ``_parse_answer`` relies on (the ``Answer:`` marker lives
        in the prompt).
        """
        model = self.load_model()
        model.to(self.device)  # no-op once the model already lives on the device

        model_inputs = self.tokenizer(
            questions, return_tensors="pt", **tokenizer_kwargs
        ).to(self.device)

        # Pass the pad id explicitly; otherwise `generate` falls back to the
        # EOS id on its own and emits a warning on every call.
        pad_token_id = self.tokenizer.pad_token_id
        if pad_token_id is None:
            pad_token_id = self.tokenizer.eos_token_id

        generated_ids = model.generate(
            **model_inputs,
            max_new_tokens=self.max_new_tokens,
            use_cache=True,
            pad_token_id=pad_token_id,
        )
        # Special tokens (BOS, padding) are dropped so they cannot sit between
        # "Answer:" and the generated letter.
        return self.tokenizer.batch_decode(generated_ids, skip_special_tokens=True)

    def generate(self, prompt: str) -> str:
        """Answer a single multiple-choice prompt.

        Returns:
            One of ``"A"``-``"D"``, or ``'N/A'`` when no answer letter is found.
        """
        question = self._extract_question(prompt)
        return self._parse_answer(self._complete([question])[0])

    async def a_generate(self, prompt: str) -> str:
        """Async entry point; delegates to the synchronous (blocking) ``generate``."""
        return self.generate(prompt)

    # This is optional.
    def batch_generate(self, promtps: list[str]) -> list[str]:
        """Answer several multiple-choice prompts in one padded batch.

        Args:
            promtps: Full benchmark prompts.  The misspelt parameter name is
                kept as-is so existing keyword callers keep working.

        Returns:
            One prediction per prompt, in input order; each is ``"A"``-``"D"``
            or ``'N/A'``.

        Note:
            Configures ``self.tokenizer`` for batching (pad token and left
            padding); this mutates the tokenizer object that was passed in.
        """
        if not promtps:
            return []

        tokenizer = self.tokenizer
        # Batching needs a pad token; reuse EOS when the tokenizer defines none.
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token
        # Decoder-only models continue from the last position, so padding must
        # go on the left or the continuation would follow pad tokens.
        tokenizer.padding_side = "left"

        questions = [self._extract_question(p) for p in promtps]
        decoded_responses = self._complete(
            questions,
            padding=True,     # Ensure equal-length inputs
            truncation=True,  # Truncate inputs that exceed max_length
            max_length=512,
        )
        return [self._parse_answer(text) for text in decoded_responses]

    def get_model_name(self):
        """Return the display name reported for this model."""
        return "Llama 3"


# Wrap the already-loaded model and tokenizer so they can be passed to the benchmark.
llama3 = Llama3(model=model, tokenizer=tokenizer)

# Hand-written smoke-test prompt: a header line, five answered examples, one
# unanswered question ending in "Answer:", and a trailing output instruction.
# The blank lines between sections matter: Llama3.generate splits on them and
# keeps only the second-to-last section, i.e. the unanswered question.
# NOTE(review): presumably mirrors the prompt layout the HellaSwag benchmark
# produces — confirm against deepeval.
test = """

The following are multiple choice questions (with answers) are sentence completion problems about Applying sunscreen.

Then, the man writes over the snow covering the window of a car, and a woman wearing winter clothes smiles. then
A., the man adds wax to the windshield and cuts it.
B., a person board a ski lift, while two men supporting the head of the person wearing winter clothes snow as the we girls sled.
C., the man puts on a christmas coat, knitted with netting.
D., the man continues removing the snow on his car.
Answer: D

A female chef in white uniform shows a stack of baking pans in a large kitchen presenting them. the pans
A. contain egg yolks and baking soda.
B. are then sprinkled with brown sugar.
C. are placed in a strainer on the counter.
D. are filled with pastries and loaded into the oven.
Answer: D

The man in the center is demonstrating a hairstyle on the person wearing the blue shirt. the man in the blue shirt
A. is standing on the sponge cutting the hair of the person wearing the blue shirt.
B. is doing the hairstyle with his hand and the hairspray.
C. sits on the chair next to the sink.
D. is being shown eye to eye.
Answer: C

Two bodybuilder women are seated at a table. they
A. are talking about diving techniques, bribing each other with muscle' n strength.
B. are working out on exercise bikes.
C. are arm wrestling, vieing to win.
D. are shown on parallel bars.
Answer: C

This is a tutorial on how to start a campfire. it
A. shows how to light the fire by rubbing a lid on it.
B. is supposed to be a fire log, but your dad said that he might have burned it, and that if he catches fire it will hurt him.
C. shows the campfire burning on the ground.
D. is a green and red sweet and the recipe is to make it hot and then puts it in a pan to simmer.
Answer: C

A woman puts some lotion on her hand. She rubs the lotion onto her face. a cartoon demonstration
A. is shown with a curling brush.
B. is then shown of a woman crying.
C. is shown on the screen.
D. of a cat is shown.
Answer:

Output A, B, C, or D. Full answer not needed.
"""

# Smoke test: run the single-prompt path once and print the parsed answer
# (a letter A-D, or 'N/A' when no answer letter is found).
print(llama3.generate(test))

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


C


In [27]:
# Build the HellaSwag benchmark with n_shots=3.
# `tasks` is deliberately left unset, so the benchmark's default task selection
# applies. For a quicker run, pass a subset instead, e.g.
#   tasks=[HellaSwagTask.APPLYING_SUNSCREEN, HellaSwagTask.SKATEBOARDING]
benchmark = HellaSwag(n_shots=3)

In [None]:
# Run the benchmark against the wrapped model, then report the scores and the
# per-example predictions the benchmark object exposes afterwards.
# NOTE(review): batch_size=5 presumably routes prompts through
# Llama3.batch_generate in groups of five — confirm against deepeval.
results = benchmark.evaluate(model=llama3, batch_size=5)
print("Task-specific Scores: ", benchmark.task_scores)
print("Detailed Predictions: ", benchmark.predictions)
print(benchmark.overall_score)

In [None]:
# Spot-check one result: the "Prediction" entry at positional index 1
# (the second row) of the benchmark's predictions.
# NOTE(review): assumes `benchmark.predictions` is a pandas DataFrame — confirm.
print(benchmark.predictions["Prediction"].iloc[1])