In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from deepeval.models.base_model import DeepEvalBaseLLM
from deepeval.benchmarks import HellaSwag
from deepeval.benchmarks.tasks import HellaSwagTask

In [2]:
model = AutoModelForCausalLM.from_pretrained("microsoft/phi-2")
tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-2")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [3]:
# https://huggingface.co/docs/transformers/model_doc/phi
inputs = tokenizer('Can you help me write a formal email to a potential business partner proposing a joint venture?', return_tensors="pt", return_attention_mask=False)
outputs = model.generate(**inputs, max_length=30)
text = tokenizer.batch_decode(outputs)[0]
print(text)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)


Can you help me write a formal email to a potential business partner proposing a joint venture?
Input: Company A: ABC Inc.
Company B


In [4]:
prompt = "If I were an AI that had just achieved"
tokens = tokenizer(prompt, return_tensors="pt")
# use the model to generate new tokens.
generated_output = model.generate(**tokens, use_cache=True, max_new_tokens=10)
tokenizer.batch_decode(generated_output)[0]

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


'If I were an AI that had just achieved consciousness, I would be very confused. I would'

In [21]:
# Define benchmark with specific tasks and shots
benchmark = HellaSwag()

In [22]:
class Phi2(DeepEvalBaseLLM):
    def __init__(
        self,
        model,
        tokenizer
    ):
        self.model = model
        self.tokenizer = tokenizer

    def load_model(self):
        return self.model

    def generate(self, prompt: str) -> str:
        model = self.load_model()

        device = "cuda" # the device to load the model onto

        model_inputs = self.tokenizer([prompt], return_tensors="pt").to(device)
        model.to(device)

        generated_ids = model.generate(**model_inputs, max_new_tokens=100, use_cache=True)
        return self.tokenizer.batch_decode(generated_ids)[0]

    async def a_generate(self, prompt: str) -> str:
        return self.generate(prompt)

    # This is optional.
    def batch_generate(self, promtps: list[str]) -> list[str]:
        model = self.load_model()
        device = "cuda" # the device to load the model onto

        model_inputs = self.tokenizer(promtps, return_tensors="pt").to(device)
        model.to(device)

        generated_ids = model.generate(**model_inputs, max_new_tokens=100, use_cache=True)
        return self.tokenizer.batch_decode(generated_ids)

    def get_model_name(self):
        return "Phi 2"

In [23]:
phi2 = Phi2(model=model, tokenizer=tokenizer)

In [24]:
print(phi2.generate("Write me a joke"))

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Write me a joke that makes fun of our teacher. Teacher: Mr. Smith.
Assistant: Joke: What do you call a teacher who wears glasses and a tie? Mr. Smith-tory.
<|endoftext|>


In [25]:
benchmark.evaluate(model=phi2)
print(benchmark.overall_score)

Filter:   0%|          | 0/10042 [00:00<?, ? examples/s]

Processing Applying sunscreen:   0%|          | 0/20 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Applying sunscreen:   5%|▌         | 1/20 [00:03<00:57,  3.04s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Applying sunscreen:  10%|█         | 2/20 [00:05<00:53,  2.99s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Applying sunscreen:  15%|█▌        | 3/20 [00:09<00:51,  3.00s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Applying sunscreen:  20%|██        | 4/20 [00:12<00:47,  3.00s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Applying sunscreen:  25%|██▌       | 5/20 [00:14<00:44,  2.99s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Applying sunscreen:  30%|███       | 6/20 [00:17<00:41,  2.99s/it]Setting `pad_token_id` to `eos_token_id`:No

HellaSwag Task Accuracy (task=Applying sunscreen): 0.0


Processing Trimming branches or hedges:   0%|          | 0/4 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Trimming branches or hedges:  25%|██▌       | 1/4 [00:03<00:09,  3.03s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Trimming branches or hedges:  50%|█████     | 2/4 [00:06<00:06,  3.01s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Trimming branches or hedges:  75%|███████▌  | 3/4 [00:09<00:02,  2.99s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Trimming branches or hedges: 100%|██████████| 4/4 [00:11<00:00,  3.00s/it]

HellaSwag Task Accuracy (task=Trimming branches or hedges): 0.0





Filter:   0%|          | 0/10042 [00:00<?, ? examples/s]

Processing Disc dog:   0%|          | 0/32 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Disc dog:   3%|▎         | 1/32 [00:02<01:32,  2.97s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Disc dog:   6%|▋         | 2/32 [00:05<01:29,  2.98s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Disc dog:   9%|▉         | 3/32 [00:08<01:26,  2.99s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Disc dog:  12%|█▎        | 4/32 [00:11<01:24,  3.01s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Disc dog:  16%|█▌        | 5/32 [00:14<01:21,  3.00s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Disc dog:  19%|█▉        | 6/32 [00:17<01:17,  3.00s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Disc dog:  22%|██▏       | 7/32

HellaSwag Task Accuracy (task=Disc dog): 0.0





Filter:   0%|          | 0/10042 [00:00<?, ? examples/s]

Processing Wakeboarding:   0%|          | 0/13 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Wakeboarding:   8%|▊         | 1/13 [00:02<00:35,  2.98s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Wakeboarding:  15%|█▌        | 2/13 [00:05<00:32,  3.00s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Wakeboarding:  23%|██▎       | 3/13 [00:08<00:29,  2.99s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Wakeboarding:  31%|███       | 4/13 [00:11<00:26,  2.99s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Wakeboarding:  38%|███▊      | 5/13 [00:14<00:23,  2.99s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Wakeboarding:  46%|████▌     | 6/13 [00:17<00:20,  2.98s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Wak

HellaSwag Task Accuracy (task=Wakeboarding): 0.0





Filter:   0%|          | 0/10042 [00:00<?, ? examples/s]

Processing Skateboarding:   0%|          | 0/9 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Skateboarding:  11%|█         | 1/9 [00:02<00:23,  2.99s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Skateboarding:  22%|██▏       | 2/9 [00:05<00:20,  2.99s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Skateboarding:  33%|███▎      | 3/9 [00:08<00:17,  2.99s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Skateboarding:  44%|████▍     | 4/9 [00:11<00:14,  2.98s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Skateboarding:  56%|█████▌    | 5/9 [00:14<00:11,  2.98s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Skateboarding:  67%|██████▋   | 6/9 [00:17<00:08,  2.99s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Ska

HellaSwag Task Accuracy (task=Skateboarding): 0.0





Filter:   0%|          | 0/10042 [00:00<?, ? examples/s]

Processing Waterskiing:   0%|          | 0/19 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Waterskiing:   5%|▌         | 1/19 [00:02<00:53,  2.99s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Waterskiing:  11%|█         | 2/19 [00:05<00:50,  2.99s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Waterskiing:  16%|█▌        | 3/19 [00:08<00:47,  2.99s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Waterskiing:  21%|██        | 4/19 [00:11<00:44,  2.99s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Waterskiing:  26%|██▋       | 5/19 [00:14<00:41,  2.99s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Waterskiing:  32%|███▏      | 6/19 [00:17<00:38,  2.99s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Waterskiin

HellaSwag Task Accuracy (task=Waterskiing): 0.0





Filter:   0%|          | 0/10042 [00:00<?, ? examples/s]

Processing Washing hands:   0%|          | 0/5 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Washing hands:  20%|██        | 1/5 [00:02<00:11,  3.00s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Washing hands:  40%|████      | 2/5 [00:06<00:09,  3.01s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Washing hands:  60%|██████    | 3/5 [00:09<00:06,  3.00s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Washing hands:  80%|████████  | 4/5 [00:11<00:02,  3.00s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Washing hands: 100%|██████████| 5/5 [00:15<00:00,  3.00s/it]

HellaSwag Task Accuracy (task=Washing hands): 0.0





Filter:   0%|          | 0/10042 [00:00<?, ? examples/s]

Processing Sailing:   0%|          | 0/4 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Sailing:  25%|██▌       | 1/4 [00:03<00:09,  3.01s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Sailing:  50%|█████     | 2/4 [00:05<00:05,  2.99s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Sailing:  75%|███████▌  | 3/4 [00:08<00:02,  2.98s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Sailing: 100%|██████████| 4/4 [00:11<00:00,  2.99s/it]

HellaSwag Task Accuracy (task=Sailing): 0.0





Filter:   0%|          | 0/10042 [00:00<?, ? examples/s]

Processing Playing congas:   0%|          | 0/8 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Playing congas:  12%|█▎        | 1/8 [00:02<00:20,  2.99s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Playing congas:  25%|██▌       | 2/8 [00:05<00:17,  2.99s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Playing congas:  38%|███▊      | 3/8 [00:08<00:14,  3.00s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Playing congas:  50%|█████     | 4/8 [00:12<00:12,  3.03s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Playing congas:  62%|██████▎   | 5/8 [00:15<00:09,  3.01s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Playing congas:  75%|███████▌  | 6/8 [00:18<00:05,  3.00s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Process

HellaSwag Task Accuracy (task=Playing congas): 0.0





Filter:   0%|          | 0/10042 [00:00<?, ? examples/s]

Processing Ballet:   0%|          | 0/7 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Ballet:  14%|█▍        | 1/7 [00:02<00:17,  2.99s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Ballet:  29%|██▊       | 2/7 [00:06<00:15,  3.02s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Ballet:  43%|████▎     | 3/7 [00:09<00:12,  3.01s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Ballet:  57%|█████▋    | 4/7 [00:12<00:09,  3.01s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Ballet:  71%|███████▏  | 5/7 [00:15<00:06,  3.01s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Ballet:  86%|████████▌ | 6/7 [00:18<00:03,  3.00s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Ballet: 100%|██████████| 7/7 [00:21<00:00,  3.00s/it

HellaSwag Task Accuracy (task=Ballet): 0.0





Filter:   0%|          | 0/10042 [00:00<?, ? examples/s]

Processing Roof shingle removal:   0%|          | 0/14 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Roof shingle removal:   7%|▋         | 1/14 [00:03<00:39,  3.02s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Roof shingle removal:  14%|█▍        | 2/14 [00:06<00:36,  3.00s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Roof shingle removal:  21%|██▏       | 3/14 [00:06<00:20,  1.82s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Roof shingle removal:  29%|██▊       | 4/14 [00:09<00:22,  2.28s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Roof shingle removal:  36%|███▌      | 5/14 [00:09<00:14,  1.61s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Roof shingle removal:  43%|████▎     | 6/14 [00:10<00:09,  1.21s/it]Setting `pad_token_id` to `eo

HellaSwag Task Accuracy (task=Roof shingle removal): 0.0





Filter:   0%|          | 0/10042 [00:00<?, ? examples/s]

Processing Hand car wash:   0%|          | 0/23 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Hand car wash:   4%|▍         | 1/23 [00:03<01:06,  3.01s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Hand car wash:   9%|▊         | 2/23 [00:06<01:04,  3.05s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Hand car wash:  13%|█▎        | 3/23 [00:09<01:00,  3.02s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Hand car wash:  17%|█▋        | 4/23 [00:12<00:57,  3.01s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Hand car wash:  22%|██▏       | 5/23 [00:15<00:53,  3.00s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Hand car wash:  26%|██▌       | 6/23 [00:18<00:50,  2.99s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Process

HellaSwag Task Accuracy (task=Hand car wash): 0.0





Filter:   0%|          | 0/10042 [00:00<?, ? examples/s]

Processing Kite flying:   0%|          | 0/2 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Kite flying:  50%|█████     | 1/2 [00:03<00:03,  3.01s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Kite flying: 100%|██████████| 2/2 [00:06<00:00,  3.01s/it]

HellaSwag Task Accuracy (task=Kite flying): 0.0





Filter:   0%|          | 0/10042 [00:00<?, ? examples/s]

Processing Playing pool:   0%|          | 0/26 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Playing pool:   4%|▍         | 1/26 [00:03<01:15,  3.02s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Playing pool:   8%|▊         | 2/26 [00:06<01:12,  3.02s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Playing pool:  12%|█▏        | 3/26 [00:09<01:09,  3.02s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Playing pool:  15%|█▌        | 4/26 [00:12<01:06,  3.02s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Playing pool:  19%|█▉        | 5/26 [00:15<01:03,  3.03s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Playing pool:  23%|██▎       | 6/26 [00:18<01:00,  3.03s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Pla

HellaSwag Task Accuracy (task=Playing pool): 0.0





Filter:   0%|          | 0/10042 [00:00<?, ? examples/s]

Processing Playing lacrosse:   0%|          | 0/15 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Playing lacrosse:   7%|▋         | 1/15 [00:02<00:41,  3.00s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Playing lacrosse:  13%|█▎        | 2/15 [00:06<00:39,  3.01s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Playing lacrosse:  20%|██        | 3/15 [00:09<00:36,  3.01s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Playing lacrosse:  27%|██▋       | 4/15 [00:12<00:33,  3.02s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Playing lacrosse:  33%|███▎      | 5/15 [00:15<00:30,  3.01s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Playing lacrosse:  40%|████      | 6/15 [00:18<00:27,  3.02s/it]Setting `pad_token_id` to `eos_token_id`:None for open-en

HellaSwag Task Accuracy (task=Playing lacrosse): 0.0





Filter:   0%|          | 0/10042 [00:00<?, ? examples/s]

Processing Layup drill in basketball:   0%|          | 0/111 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Layup drill in basketball:   1%|          | 1/111 [00:02<05:29,  2.99s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Layup drill in basketball:   2%|▏         | 2/111 [00:05<05:25,  2.99s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Layup drill in basketball:   3%|▎         | 3/111 [00:08<05:24,  3.00s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Layup drill in basketball:   4%|▎         | 4/111 [00:12<05:21,  3.00s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Layup drill in basketball:   5%|▍         | 5/111 [00:14<05:18,  3.00s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Layup drill in basketball:   5%|▌         | 6/111 [00:15<03:4

HellaSwag Task Accuracy (task=Layup drill in basketball): 0.0





Filter:   0%|          | 0/10042 [00:00<?, ? examples/s]

Processing Home and Garden:   0%|          | 0/390 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Home and Garden:   0%|          | 1/390 [00:03<19:50,  3.06s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Home and Garden:   1%|          | 2/390 [00:06<19:46,  3.06s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Home and Garden:   1%|          | 3/390 [00:09<19:43,  3.06s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Home and Garden:   1%|          | 4/390 [00:12<19:38,  3.05s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Home and Garden:   1%|▏         | 5/390 [00:15<19:33,  3.05s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Home and Garden:   2%|▏         | 6/390 [00:18<19:35,  3.06s/it]Setting `pad_token_id` to `eos_token_id`:None for open-en

HellaSwag Task Accuracy (task=Home and Garden): 0.0





Filter:   0%|          | 0/10042 [00:00<?, ? examples/s]

Processing Playing beach volleyball:   0%|          | 0/17 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Playing beach volleyball:   6%|▌         | 1/17 [00:03<00:48,  3.01s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Playing beach volleyball:  12%|█▏        | 2/17 [00:06<00:45,  3.00s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Playing beach volleyball:  18%|█▊        | 3/17 [00:09<00:42,  3.02s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Playing beach volleyball:  24%|██▎       | 4/17 [00:12<00:39,  3.01s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Playing beach volleyball:  29%|██▉       | 5/17 [00:15<00:35,  3.00s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Playing beach volleyball:  35%|███▌      | 6/17 [00:18<00:32,  3.00s/it]S

HellaSwag Task Accuracy (task=Playing beach volleyball): 0.0





Filter:   0%|          | 0/10042 [00:00<?, ? examples/s]

Processing Calf roping:   0%|          | 0/27 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Calf roping:   4%|▎         | 1/27 [00:02<01:17,  3.00s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Calf roping:   7%|▋         | 2/27 [00:05<01:14,  2.99s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Calf roping:  11%|█         | 3/27 [00:08<01:11,  3.00s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Calf roping:  15%|█▍        | 4/27 [00:12<01:09,  3.00s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Calf roping:  19%|█▊        | 5/27 [00:15<01:06,  3.01s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Calf roping:  22%|██▏       | 6/27 [00:18<01:02,  3.00s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Calf ropin

HellaSwag Task Accuracy (task=Calf roping): 0.0





Filter:   0%|          | 0/10042 [00:00<?, ? examples/s]

Processing Scuba diving:   0%|          | 0/23 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Scuba diving:   4%|▍         | 1/23 [00:03<01:06,  3.02s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Scuba diving:   9%|▊         | 2/23 [00:06<01:03,  3.02s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Scuba diving:  13%|█▎        | 3/23 [00:09<01:00,  3.03s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Scuba diving:  17%|█▋        | 4/23 [00:12<00:57,  3.01s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Scuba diving:  22%|██▏       | 5/23 [00:15<00:54,  3.01s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Scuba diving:  26%|██▌       | 6/23 [00:18<00:51,  3.01s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Scu

HellaSwag Task Accuracy (task=Scuba diving): 0.0





Filter:   0%|          | 0/10042 [00:00<?, ? examples/s]

Processing Mixing drinks:   0%|          | 0/13 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Mixing drinks:   8%|▊         | 1/13 [00:03<00:36,  3.02s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Mixing drinks:  15%|█▌        | 2/13 [00:06<00:33,  3.03s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Mixing drinks:  23%|██▎       | 3/13 [00:09<00:30,  3.02s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Mixing drinks:  31%|███       | 4/13 [00:12<00:27,  3.01s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Mixing drinks:  38%|███▊      | 5/13 [00:15<00:24,  3.02s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Mixing drinks:  46%|████▌     | 6/13 [00:18<00:21,  3.01s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Process

HellaSwag Task Accuracy (task=Mixing drinks): 0.0





Filter:   0%|          | 0/10042 [00:00<?, ? examples/s]

Processing Putting on shoes:   0%|          | 0/2 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Putting on shoes:  50%|█████     | 1/2 [00:03<00:03,  3.01s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Putting on shoes: 100%|██████████| 2/2 [00:06<00:00,  3.01s/it]

HellaSwag Task Accuracy (task=Putting on shoes): 0.0





Filter:   0%|          | 0/10042 [00:00<?, ? examples/s]

Processing Making a lemonade:   0%|          | 0/32 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Making a lemonade:   3%|▎         | 1/32 [00:03<01:33,  3.02s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Making a lemonade:   6%|▋         | 2/32 [00:06<01:30,  3.01s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Making a lemonade:   9%|▉         | 3/32 [00:09<01:27,  3.00s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Making a lemonade:  12%|█▎        | 4/32 [00:12<01:24,  3.01s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Making a lemonade:  16%|█▌        | 5/32 [00:15<01:21,  3.00s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Making a lemonade:  19%|█▉        | 6/32 [00:18<01:18,  3.00s/it]Setting `pad_token_id` to `eos_token_id`:None for 

HellaSwag Task Accuracy (task=Making a lemonade): 0.0





Filter:   0%|          | 0/10042 [00:00<?, ? examples/s]

Processing Uncategorized:   0%|          | 0/7 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Uncategorized:  14%|█▍        | 1/7 [00:03<00:18,  3.12s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Uncategorized:  29%|██▊       | 2/7 [00:06<00:15,  3.10s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Uncategorized:  43%|████▎     | 3/7 [00:09<00:12,  3.07s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Uncategorized:  57%|█████▋    | 4/7 [00:12<00:09,  3.05s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Uncategorized:  71%|███████▏  | 5/7 [00:15<00:06,  3.05s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Uncategorized:  86%|████████▌ | 6/7 [00:15<00:02,  2.16s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Unc

HellaSwag Task Accuracy (task=Uncategorized): 0.0





Filter:   0%|          | 0/10042 [00:00<?, ? examples/s]

Processing Zumba:   0%|          | 0/10 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Zumba:  10%|█         | 1/10 [00:03<00:27,  3.02s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Zumba:  20%|██        | 2/10 [00:06<00:24,  3.02s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Zumba:  30%|███       | 3/10 [00:09<00:21,  3.02s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Zumba:  40%|████      | 4/10 [00:12<00:18,  3.03s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Zumba:  50%|█████     | 5/10 [00:15<00:15,  3.03s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Zumba:  60%|██████    | 6/10 [00:18<00:12,  3.02s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Zumba:  70%|███████   | 7/10 [00:21<00:09,  3.01s/it

HellaSwag Task Accuracy (task=Zumba): 0.0





Filter:   0%|          | 0/10042 [00:00<?, ? examples/s]

Processing Playing badminton:   0%|          | 0/15 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Playing badminton:   7%|▋         | 1/15 [00:03<00:42,  3.00s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Playing badminton:  13%|█▎        | 2/15 [00:05<00:38,  3.00s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Playing badminton:  20%|██        | 3/15 [00:08<00:35,  3.00s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Playing badminton:  27%|██▋       | 4/15 [00:11<00:33,  3.00s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Playing badminton:  33%|███▎      | 5/15 [00:12<00:20,  2.07s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Playing badminton:  40%|████      | 6/15 [00:15<00:21,  2.41s/it]Setting `pad_token_id` to `eos_token_id`:None for 

HellaSwag Task Accuracy (task=Playing badminton): 0.0





Filter:   0%|          | 0/10042 [00:00<?, ? examples/s]

Processing Playing bagpipes:   0%|          | 0/10 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Playing bagpipes:  10%|█         | 1/10 [00:03<00:27,  3.02s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Playing bagpipes:  20%|██        | 2/10 [00:06<00:24,  3.02s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Playing bagpipes:  30%|███       | 3/10 [00:09<00:21,  3.01s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Playing bagpipes:  40%|████      | 4/10 [00:12<00:18,  3.01s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Playing bagpipes:  50%|█████     | 5/10 [00:15<00:15,  3.01s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Playing bagpipes:  60%|██████    | 6/10 [00:18<00:12,  3.01s/it]Setting `pad_token_id` to `eos_token_id`:None for open-en

HellaSwag Task Accuracy (task=Playing bagpipes): 0.0





Filter:   0%|          | 0/10042 [00:00<?, ? examples/s]

Processing Food and Entertaining:   0%|          | 0/500 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Food and Entertaining:   0%|          | 1/500 [00:03<25:20,  3.05s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Food and Entertaining:   0%|          | 2/500 [00:06<25:12,  3.04s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Food and Entertaining:   1%|          | 3/500 [00:09<25:12,  3.04s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Food and Entertaining:   1%|          | 4/500 [00:12<25:13,  3.05s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Food and Entertaining:   1%|          | 5/500 [00:12<17:29,  2.12s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Food and Entertaining:   1%|          | 6/500 [00:13<12:47,  1.55s/it]Setting `pad_to

HellaSwag Task Accuracy (task=Food and Entertaining): 0.0





Filter:   0%|          | 0/10042 [00:00<?, ? examples/s]

Processing Personal Care and Style:   0%|          | 0/2627 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Personal Care and Style:   0%|          | 1/2627 [00:03<2:20:29,  3.21s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Personal Care and Style:   0%|          | 2/2627 [00:06<2:21:12,  3.23s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Personal Care and Style:   0%|          | 3/2627 [00:09<2:18:09,  3.16s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Personal Care and Style:   0%|          | 4/2627 [00:09<1:31:24,  2.09s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Personal Care and Style:   0%|          | 5/2627 [00:10<1:05:35,  1.50s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Personal Care and Style:   0%|          | 6/2627 [00:13<1

HellaSwag Task Accuracy (task=Personal Care and Style): 0.0





Filter:   0%|          | 0/10042 [00:00<?, ? examples/s]

Processing Cricket:   0%|          | 0/5 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Cricket:  20%|██        | 1/5 [00:02<00:11,  3.00s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Cricket:  40%|████      | 2/5 [00:06<00:09,  3.04s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Cricket:  60%|██████    | 3/5 [00:09<00:06,  3.02s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Cricket:  80%|████████  | 4/5 [00:12<00:03,  3.01s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Cricket: 100%|██████████| 5/5 [00:15<00:00,  3.01s/it]

HellaSwag Task Accuracy (task=Cricket): 0.0





Filter:   0%|          | 0/10042 [00:00<?, ? examples/s]

Processing Shoveling snow:   0%|          | 0/18 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Shoveling snow:   6%|▌         | 1/18 [00:02<00:50,  3.00s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Shoveling snow:  11%|█         | 2/18 [00:06<00:48,  3.03s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Shoveling snow:  17%|█▋        | 3/18 [00:09<00:45,  3.03s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Shoveling snow:  22%|██▏       | 4/18 [00:12<00:42,  3.03s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Shoveling snow:  28%|██▊       | 5/18 [00:15<00:39,  3.02s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Shoveling snow:  33%|███▎      | 6/18 [00:18<00:36,  3.02s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


HellaSwag Task Accuracy (task=Shoveling snow): 0.0





Filter:   0%|          | 0/10042 [00:00<?, ? examples/s]

Processing Ping-pong:   0%|          | 0/9 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Ping-pong:  11%|█         | 1/9 [00:03<00:24,  3.02s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Ping-pong:  22%|██▏       | 2/9 [00:06<00:21,  3.02s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Ping-pong:  33%|███▎      | 3/9 [00:09<00:18,  3.03s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Ping-pong:  44%|████▍     | 4/9 [00:12<00:15,  3.05s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Ping-pong:  56%|█████▌    | 5/9 [00:15<00:12,  3.05s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Ping-pong:  67%|██████▋   | 6/9 [00:18<00:09,  3.03s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Ping-pong:  78%|███████▊  | 7/9

HellaSwag Task Accuracy (task=Ping-pong): 0.0





Filter:   0%|          | 0/10042 [00:00<?, ? examples/s]

Processing Holidays and Traditions:   0%|          | 0/38 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Holidays and Traditions:   3%|▎         | 1/38 [00:00<00:16,  2.20it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Holidays and Traditions:   5%|▌         | 2/38 [00:03<01:11,  1.99s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Holidays and Traditions:   8%|▊         | 3/38 [00:03<00:45,  1.29s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Holidays and Traditions:  11%|█         | 4/38 [00:07<01:07,  2.00s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Holidays and Traditions:  13%|█▎        | 5/38 [00:07<00:47,  1.44s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Holidays and Traditions:  16%|█▌        | 6/38 [00:07<00:35,  1.11s/it]Setting 

HellaSwag Task Accuracy (task=Holidays and Traditions): 0.0





Filter:   0%|          | 0/10042 [00:00<?, ? examples/s]

Processing Ice fishing:   0%|          | 0/127 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Ice fishing:   1%|          | 1/127 [00:03<06:22,  3.03s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Ice fishing:   2%|▏         | 2/127 [00:06<06:18,  3.03s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Ice fishing:   2%|▏         | 3/127 [00:09<06:16,  3.04s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Ice fishing:   3%|▎         | 4/127 [00:12<06:12,  3.03s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Ice fishing:   4%|▍         | 5/127 [00:15<06:11,  3.05s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Ice fishing:   5%|▍         | 6/127 [00:18<06:07,  3.04s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Ice

KeyboardInterrupt: 