In [7]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from deepeval.models.base_model import DeepEvalBaseLLM
from deepeval.benchmarks import HellaSwag
from deepeval.benchmarks.tasks import HellaSwagTask
import re

In [2]:
model = AutoModelForCausalLM.from_pretrained("microsoft/phi-2")
tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-2")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [3]:
# https://huggingface.co/docs/transformers/model_doc/phi
inputs = tokenizer('Can you help me write a formal email to a potential business partner proposing a joint venture?', return_tensors="pt", return_attention_mask=False)
outputs = model.generate(**inputs, max_length=30)
text = tokenizer.batch_decode(outputs)[0]
print(text)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)


Can you help me write a formal email to a potential business partner proposing a joint venture?
Input: Company A: ABC Inc.
Company B


In [4]:
prompt = "If I were an AI that had just achieved"
tokens = tokenizer(prompt, return_tensors="pt")
# use the model to generate new tokens.
generated_output = model.generate(**tokens, use_cache=True, max_new_tokens=10)
tokenizer.batch_decode(generated_output)[0]

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


'If I were an AI that had just achieved consciousness, I would be very confused. I would'

In [3]:
# Define benchmark with specific tasks and shots
benchmark = HellaSwag(
    tasks=[HellaSwagTask.TRIMMING_BRANCHES_OR_HEDGES, HellaSwagTask.BATON_TWIRLING],
    n_shots=5
)

In [4]:
class Phi2(DeepEvalBaseLLM):
    def __init__(
        self,
        model,
        tokenizer
    ):
        self.model = model
        self.tokenizer = tokenizer

    def load_model(self):
        return self.model

    def generate(self, prompt: str) -> str:
        sections = [section.strip() for section in prompt.split("\n\n") if section.strip()]

        # Take the last section, including "Answer:" for context
        prompt = sections[-2]

        model = self.load_model()

        device = "cuda" # the device to load the model onto

        model_inputs = self.tokenizer([prompt], return_tensors="pt").to(device)
        model.to(device)

        generated_ids = model.generate(
            **model_inputs, 
            max_new_tokens=100, 
            use_cache=True)
        
        ans = self.tokenizer.batch_decode(generated_ids)[0]

        match = re.search(r"Answer:\s*([A-D])", ans)

        if match:
            answer = match.group(1)
        else:
            answer = 'N/A'

        return answer

    async def a_generate(self, prompt: str) -> str:
        return self.generate(prompt)

    # This is optional.
    def batch_generate(self, promtps: list[str]) -> list[str]:
        model = self.load_model()
        device = "cuda" # the device to load the model onto

        new_p = []
        for p in promtps:
            sections = [section.strip() for section in prompt.split("\n\n") if section.strip()]
            new_p.append(sections[-2])
            
        model_inputs = self.tokenizer(
            new_p,
            padding=True,    # Ensure equal-length inputs
            truncation=True, # Truncate inputs that exceed max_length
            max_length=512,
            return_tensors="pt").to(device)
        model.to(device)

        generated_ids = model.generate(
            **model_inputs,
            max_new_tokens=100, use_cache=True)
        decoded_responses = self.tokenizer.batch_decode(generated_ids, skip_special_tokens=True)

        res = []
        for ans in decoded_responses:
            match = re.search(r"Answer:\s*([A-D])", ans)
    
            if match:
                res.append(match.group(1))
            else:
                res.append('N/A')
        return res

    def get_model_name(self):
        return "Phi 2"

In [5]:
phi2 = Phi2(model=model, tokenizer=tokenizer)

In [24]:
print(phi2.generate("Write me a joke"))

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Write me a joke that makes fun of our teacher. Teacher: Mr. Smith.
Assistant: Joke: What do you call a teacher who wears glasses and a tie? Mr. Smith-tory.
<|endoftext|>


In [8]:
benchmark.evaluate(model=phi2, batch_size=10)
print(benchmark.overall_score)

Processing Trimming branches or hedges:   0%|          | 0/4 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Trimming branches or hedges:  25%|██▌       | 1/4 [00:00<00:01,  2.54it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Trimming branches or hedges:  50%|█████     | 2/4 [00:03<00:04,  2.22s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Trimming branches or hedges:  75%|███████▌  | 3/4 [00:06<00:02,  2.61s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Trimming branches or hedges: 100%|██████████| 4/4 [00:07<00:00,  1.86s/it]


HellaSwag Task Accuracy (task=Trimming branches or hedges): 0.75


Processing Baton twirling:   0%|          | 0/14 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Baton twirling:   7%|▋         | 1/14 [00:03<00:39,  3.07s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Baton twirling:  14%|█▍        | 2/14 [00:06<00:36,  3.07s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Baton twirling:  21%|██▏       | 3/14 [00:09<00:34,  3.10s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Baton twirling:  29%|██▊       | 4/14 [00:12<00:31,  3.12s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Baton twirling:  36%|███▌      | 5/14 [00:15<00:28,  3.13s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Baton twirling:  43%|████▎     | 6/14 [00:18<00:24,  3.11s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


HellaSwag Task Accuracy (task=Baton twirling): 0.6428571428571429
Overall HellaSwag Accuracy: 0.6666666666666666
0.6666666666666666



