In [3]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

In [4]:
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B")
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.1-8B")

tokenizer_config.json:   0%|          | 0.00/50.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/73.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/826 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/185 [00:00<?, ?B/s]

In [66]:
from deepeval.models.base_model import DeepEvalBaseLLM
from deepeval.benchmarks import HellaSwag
from deepeval.benchmarks.tasks import HellaSwagTask
import re

In [200]:
class Llama3(DeepEvalBaseLLM):
    def __init__(
        self,
        model,
        tokenizer
    ):
        self.model = model
        self.tokenizer = tokenizer

    def load_model(self):
        return self.model

    def generate(self, prompt: str) -> str:
        sections = [section.strip() for section in prompt.split("\n\n") if section.strip()]

        # Take the last section, including "Answer:" for context
        prompt = sections[-2]

        model = self.load_model()

        device = "cuda" # the device to load the model onto

        model_inputs = self.tokenizer([prompt], return_tensors="pt").to(device)
        model.to(device)

        generated_ids = model.generate(
            **model_inputs, 
            max_new_tokens=100, 
            use_cache=True)
        
        ans = self.tokenizer.batch_decode(generated_ids)[0]

        match = re.search(r"Answer:\s*([A-D])", ans)

        if match:
            answer = match.group(1)
        else:
            answer = 'N/A'

        return answer

    async def a_generate(self, prompt: str) -> str:
        return self.generate(prompt)

    # This is optional.
    def batch_generate(self, promtps: list[str]) -> list[str]:
        model = self.load_model()
        device = "cuda" # the device to load the model onto

        model_inputs = self.tokenizer(
            promtps,
            padding=True,    # Ensure equal-length inputs
            truncation=True, # Truncate inputs that exceed max_length
            max_length=512,
            return_tensors="pt").to(device)
        model.to(device)

        generated_ids = model.generate(
            **model_inputs,
            max_new_tokens=100, use_cache=True)
        decoded_responses = self.tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
        return decoded_responses

    def get_model_name(self):
        return "Llama 3"


llama3 = Llama3(model=model, tokenizer=tokenizer)

test = """

The following are multiple choice questions (with answers) are sentence completion problems about Applying sunscreen.

Then, the man writes over the snow covering the window of a car, and a woman wearing winter clothes smiles. then
A., the man adds wax to the windshield and cuts it.
B., a person board a ski lift, while two men supporting the head of the person wearing winter clothes snow as the we girls sled.
C., the man puts on a christmas coat, knitted with netting.
D., the man continues removing the snow on his car.
Answer: D

A female chef in white uniform shows a stack of baking pans in a large kitchen presenting them. the pans
A. contain egg yolks and baking soda.
B. are then sprinkled with brown sugar.
C. are placed in a strainer on the counter.
D. are filled with pastries and loaded into the oven.
Answer: D

The man in the center is demonstrating a hairstyle on the person wearing the blue shirt. the man in the blue shirt
A. is standing on the sponge cutting the hair of the person wearing the blue shirt.
B. is doing the hairstyle with his hand and the hairspray.
C. sits on the chair next to the sink.
D. is being shown eye to eye.
Answer: C

Two bodybuilder women are seated at a table. they
A. are talking about diving techniques, bribing each other with muscle' n strength.
B. are working out on exercise bikes.
C. are arm wrestling, vieing to win.
D. are shown on parallel bars.
Answer: C

This is a tutorial on how to start a campfire. it
A. shows how to light the fire by rubbing a lid on it.
B. is supposed to be a fire log, but your dad said that he might have burned it, and that if he catches fire it will hurt him.
C. shows the campfire burning on the ground.
D. is a green and red sweet and the recipe is to make it hot and then puts it in a pan to simmer.
Answer: C

A woman puts some lotion on her hand. She rubs the lotion onto her face. a cartoon demonstration
A. is shown with a curling brush.
B. is then shown of a woman crying.
C. is shown on the screen.
D. of a cat is shown.
Answer:

Output A, B, C, or D. Full answer not needed.
"""

# Call the generate method
print(llama3.generate(test))

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


C


In [203]:
benchmark = HellaSwag(
    # tasks=[HellaSwagTask.APPLYING_SUNSCREEN, HellaSwagTask.SKATEBOARDING],
    n_shots=5)

In [None]:
results = benchmark.evaluate(model=llama3)
print("Task-specific Scoress: ", benchmark.task_scores)
print("Detailed Predictions: ", benchmark.predictions)
print(benchmark.overall_score)

Processing Applying sunscreen:   0%|          | 0/20 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Applying sunscreen:   5%|▌         | 1/20 [00:00<00:03,  4.75it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Applying sunscreen:  10%|█         | 2/20 [00:00<00:02,  6.26it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Applying sunscreen:  15%|█▌        | 3/20 [00:00<00:02,  6.00it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Applying sunscreen:  20%|██        | 4/20 [00:00<00:02,  5.89it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Applying sunscreen:  25%|██▌       | 5/20 [00:00<00:02,  5.84it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Applying sunscreen:  30%|███       | 6/20 [00:01<00:02,  5.82it/s]Setting `pad_token_id` to `eos_token_id`:No

HellaSwag Task Accuracy (task=Applying sunscreen): 0.35


Processing Trimming branches or hedges:   0%|          | 0/4 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Trimming branches or hedges:  25%|██▌       | 1/4 [00:00<00:00,  6.66it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Trimming branches or hedges:  50%|█████     | 2/4 [00:00<00:00,  6.07it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Trimming branches or hedges:  75%|███████▌  | 3/4 [00:00<00:00,  6.29it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Trimming branches or hedges: 100%|██████████| 4/4 [00:00<00:00,  6.35it/s]


HellaSwag Task Accuracy (task=Trimming branches or hedges): 0.75


Processing Disc dog:   0%|          | 0/32 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Disc dog:   3%|▎         | 1/32 [00:00<00:04,  6.68it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Disc dog:   6%|▋         | 2/32 [00:00<00:04,  6.62it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Disc dog:   9%|▉         | 3/32 [00:04<00:53,  1.83s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Disc dog:  12%|█▎        | 4/32 [00:04<00:32,  1.17s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Disc dog:  16%|█▌        | 5/32 [00:04<00:21,  1.24it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Disc dog:  19%|█▉        | 6/32 [00:04<00:15,  1.72it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Disc dog:  22%|██▏       | 7/32

HellaSwag Task Accuracy (task=Disc dog): 0.5


Processing Wakeboarding:   0%|          | 0/13 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Wakeboarding:   8%|▊         | 1/13 [00:00<00:01,  6.64it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Wakeboarding:  15%|█▌        | 2/13 [00:00<00:01,  7.40it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Wakeboarding:  23%|██▎       | 3/13 [00:00<00:01,  7.00it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Wakeboarding:  31%|███       | 4/13 [00:00<00:01,  6.43it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Wakeboarding:  38%|███▊      | 5/13 [00:00<00:01,  6.48it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Wakeboarding:  46%|████▌     | 6/13 [00:00<00:01,  6.51it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Wak

HellaSwag Task Accuracy (task=Wakeboarding): 0.6923076923076923


Processing Skateboarding:   0%|          | 0/9 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Skateboarding:  11%|█         | 1/9 [00:00<00:01,  6.64it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Skateboarding:  22%|██▏       | 2/9 [00:00<00:01,  6.59it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Skateboarding:  33%|███▎      | 3/9 [00:00<00:00,  6.12it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Skateboarding:  44%|████▍     | 4/9 [00:04<00:08,  1.60s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Skateboarding:  56%|█████▌    | 5/9 [00:04<00:04,  1.07s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Skateboarding:  67%|██████▋   | 6/9 [00:04<00:02,  1.32it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Ska

HellaSwag Task Accuracy (task=Skateboarding): 0.3333333333333333


Processing Waterskiing:   0%|          | 0/19 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Waterskiing:   5%|▌         | 1/19 [00:00<00:02,  6.56it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Waterskiing:  11%|█         | 2/19 [00:00<00:02,  6.57it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Waterskiing:  16%|█▌        | 3/19 [00:00<00:02,  6.56it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Waterskiing:  21%|██        | 4/19 [00:00<00:02,  6.55it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Waterskiing:  26%|██▋       | 5/19 [00:00<00:02,  6.53it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Waterskiing:  32%|███▏      | 6/19 [00:00<00:01,  6.51it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Waterskiin

HellaSwag Task Accuracy (task=Waterskiing): 0.47368421052631576


Processing Washing hands:   0%|          | 0/5 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Washing hands:  20%|██        | 1/5 [00:00<00:00,  6.62it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Washing hands:  40%|████      | 2/5 [00:00<00:00,  6.57it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Washing hands:  60%|██████    | 3/5 [00:00<00:00,  6.55it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Washing hands:  80%|████████  | 4/5 [00:00<00:00,  6.55it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Washing hands: 100%|██████████| 5/5 [00:00<00:00,  6.37it/s]


HellaSwag Task Accuracy (task=Washing hands): 0.2


Processing Sailing:   0%|          | 0/4 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Sailing:  25%|██▌       | 1/4 [00:00<00:00,  4.89it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Sailing:  50%|█████     | 2/4 [00:00<00:00,  5.76it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Sailing:  75%|███████▌  | 3/4 [00:00<00:00,  6.09it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Sailing: 100%|██████████| 4/4 [00:00<00:00,  6.02it/s]


HellaSwag Task Accuracy (task=Sailing): 0.25


Processing Playing congas:   0%|          | 0/8 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Playing congas:  12%|█▎        | 1/8 [00:00<00:01,  5.75it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Playing congas:  25%|██▌       | 2/8 [00:00<00:00,  6.21it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Playing congas:  38%|███▊      | 3/8 [00:00<00:00,  6.36it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Playing congas:  50%|█████     | 4/8 [00:00<00:00,  6.44it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Playing congas:  62%|██████▎   | 5/8 [00:00<00:00,  6.46it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Playing congas:  75%|███████▌  | 6/8 [00:00<00:00,  6.93it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Process

HellaSwag Task Accuracy (task=Playing congas): 0.25


Processing Ballet:   0%|          | 0/7 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Ballet:  14%|█▍        | 1/7 [00:00<00:00,  6.47it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Ballet:  29%|██▊       | 2/7 [00:04<00:11,  2.36s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Ballet:  43%|████▎     | 3/7 [00:04<00:05,  1.41s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Ballet:  57%|█████▋    | 4/7 [00:04<00:02,  1.10it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Ballet:  71%|███████▏  | 5/7 [00:04<00:01,  1.44it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Ballet:  86%|████████▌ | 6/7 [00:04<00:00,  1.96it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Ballet: 100%|██████████| 7/7 [00:05<00:00,  1.37it/s

HellaSwag Task Accuracy (task=Ballet): 0.42857142857142855


Processing Roof shingle removal:   0%|          | 0/14 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Roof shingle removal:   7%|▋         | 1/14 [00:00<00:01,  8.18it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Roof shingle removal:  14%|█▍        | 2/14 [00:00<00:01,  7.15it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Roof shingle removal:  21%|██▏       | 3/14 [00:00<00:01,  6.89it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Roof shingle removal:  29%|██▊       | 4/14 [00:00<00:01,  6.75it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Roof shingle removal:  36%|███▌      | 5/14 [00:00<00:01,  6.25it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Roof shingle removal:  43%|████▎     | 6/14 [00:03<00:09,  1.17s/it]Setting `pad_token_id` to `eo

HellaSwag Task Accuracy (task=Roof shingle removal): 0.5714285714285714


Processing Hand car wash:   0%|          | 0/23 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Hand car wash:   4%|▍         | 1/23 [00:00<00:03,  5.78it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Hand car wash:   9%|▊         | 2/23 [00:00<00:03,  5.76it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Hand car wash:  13%|█▎        | 3/23 [00:00<00:03,  6.64it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Hand car wash:  17%|█▋        | 4/23 [00:00<00:02,  7.06it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Hand car wash:  22%|██▏       | 5/23 [00:00<00:02,  6.87it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Hand car wash:  26%|██▌       | 6/23 [00:00<00:02,  7.25it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Process

HellaSwag Task Accuracy (task=Hand car wash): 0.30434782608695654


Processing Kite flying:   0%|          | 0/2 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Kite flying:  50%|█████     | 1/2 [00:00<00:00,  8.14it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Kite flying: 100%|██████████| 2/2 [00:00<00:00,  7.22it/s]


HellaSwag Task Accuracy (task=Kite flying): 0.5


Processing Playing pool:   0%|          | 0/26 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Playing pool:   4%|▍         | 1/26 [00:00<00:03,  6.62it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Playing pool:   8%|▊         | 2/26 [00:00<00:03,  6.58it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Playing pool:  12%|█▏        | 3/26 [00:00<00:03,  7.23it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Playing pool:  15%|█▌        | 4/26 [00:00<00:03,  6.93it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Playing pool:  19%|█▉        | 5/26 [00:00<00:03,  6.76it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Playing pool:  23%|██▎       | 6/26 [00:00<00:02,  7.15it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Pla

HellaSwag Task Accuracy (task=Playing pool): 0.5


Processing Playing lacrosse:   0%|          | 0/15 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Playing lacrosse:   7%|▋         | 1/15 [00:00<00:04,  3.29it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Playing lacrosse:  13%|█▎        | 2/15 [00:00<00:02,  4.38it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Playing lacrosse:  20%|██        | 3/15 [00:00<00:02,  5.55it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Playing lacrosse:  27%|██▋       | 4/15 [00:00<00:02,  5.05it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Playing lacrosse:  33%|███▎      | 5/15 [00:00<00:01,  5.85it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Playing lacrosse:  40%|████      | 6/15 [00:01<00:01,  5.48it/s]Setting `pad_token_id` to `eos_token_id`:None for open-en

HellaSwag Task Accuracy (task=Playing lacrosse): 0.13333333333333333


Processing Layup drill in basketball:   0%|          | 0/111 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Layup drill in basketball:   1%|          | 1/111 [00:00<00:17,  6.41it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Layup drill in basketball:   2%|▏         | 2/111 [00:00<00:20,  5.41it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Layup drill in basketball:   3%|▎         | 3/111 [00:00<00:21,  5.13it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Layup drill in basketball:   4%|▎         | 4/111 [00:00<00:19,  5.36it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Layup drill in basketball:   5%|▍         | 5/111 [00:00<00:18,  5.74it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Layup drill in basketball:   5%|▌         | 6/111 [00:01<00:1

HellaSwag Task Accuracy (task=Layup drill in basketball): 0.4594594594594595


Processing Home and Garden:   0%|          | 0/390 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Home and Garden:   0%|          | 1/390 [00:04<26:30,  4.09s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Home and Garden:   1%|          | 2/390 [00:08<25:50,  4.00s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Home and Garden:   1%|          | 3/390 [00:11<25:31,  3.96s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Home and Garden:   1%|          | 4/390 [00:15<25:21,  3.94s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Home and Garden:   1%|▏         | 5/390 [00:19<25:15,  3.94s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Home and Garden:   2%|▏         | 6/390 [00:23<25:11,  3.93s/it]Setting `pad_token_id` to `eos_token_id`:None for open-en

In [124]:
print(benchmark.predictions['Prediction'].iloc[1])

<|begin_of_text|>The following are multiple choice questions (with answers) are sentence completion problems about Applying sunscreen.

Then, the man writes over the snow covering the window of a car, and a woman wearing winter clothes smiles. then
A., the man adds wax to the windshield and cuts it.
B., a person board a ski lift, while two men supporting the head of the person wearing winter clothes snow as the we girls sled.
C., the man puts on a christmas coat, knitted with netting.
D., the man continues removing the snow on his car.
Answer: D

A female chef in white uniform shows a stack of baking pans in a large kitchen presenting them. the pans
A. contain egg yolks and baking soda.
B. are then sprinkled with brown sugar.
C. are placed in a strainer on the counter.
D. are filled with pastries and loaded into the oven.
Answer: D

The man in the center is demonstrating a hairstyle on the person wearing the blue shirt. the man in the blue shirt
A. is standing on the sponge cutting the