In [1]:
import os 
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["CUDA_VISIBLE_DEVICES"] = "0,5"
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
model_list = [
    "microsoft/Phi-3-mini-128k-instruct",
    "microsoft/Phi-3-small-128k-instruct",
    "microsoft/Phi-3-medium-128k-instruct",
]

In [3]:
question = "How many prime numbers less than 100 have a units digit of 3?"
system_message = "please reason step by step"

In [4]:
phi3mini_format = "<|system|>\n{system_message}<|end|>\n<|user|>\n{input}<|end|>\n<|assistant|>\n"
    
phi3small_format = "<|endoftext|><|system|>\n{system_message}<|end|>\n<|user|>\n{input}<|end|>\n<|assistant|>\n"

phi3medium_format = "<|user|>\n{input}<|end|>\n<|assistant|>\n"

In [5]:
tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-medium-128k-instruct", trust_remote_code=True)
tokenizer.chat_template

"{% for message in messages %}{% if (message['role'] == 'user') %}{{'<|user|>' + '\n' + message['content'] + '<|end|>' + '\n' + '<|assistant|>' + '\n'}}{% elif (message['role'] == 'assistant') %}{{message['content'] + '<|end|>' + '\n'}}{% endif %}{% endfor %}"

In [6]:
tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-128k-instruct", trust_remote_code=True)
tokenizer.chat_template

"{% for message in messages %}{% if message['role'] == 'system' %}{{'<|system|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'user' %}{{'<|user|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'assistant' %}{{'<|assistant|>\n' + message['content'] + '<|end|>\n'}}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% else %}{{ eos_token }}{% endif %}"

In [7]:
tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-small-128k-instruct", trust_remote_code=True)
tokenizer.chat_template

"{{ bos_token }}{% for message in messages %}{{'<|' + message['role'] + '|>' + '\n' + message['content'] + '<|end|>\n' }}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% else %}{{ eos_token }}{% endif %}"

In [7]:
tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-small-128k-instruct", trust_remote_code=True)

phi3small_format = phi3small_format.format(system_message=system_message, input=question)
tokenized_input = tokenizer(phi3small_format, return_tensors="pt")

model = AutoModelForCausalLM.from_pretrained("microsoft/Phi-3-small-128k-instruct", trust_remote_code=True)
device = torch.cuda.current_device()
model = model.to(device)
tokenized_input = tokenized_input.to(device)
output = model.generate(**tokenized_input, max_new_tokens=1024)
output = tokenizer.decode(output[0], skip_special_tokens=True)
print(output)

Loading checkpoint shards: 100%|██████████| 4/4 [00:01<00:00,  2.38it/s]
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
  x = [xi.to_sparse_csr() for xi in x]


OutOfResources: out of resource: shared memory, Required: 180224, Hardware limit: 166912. Reducing block sizes or `num_stages` may help.

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

torch.random.manual_seed(0)
model_id = "microsoft/Phi-3-small-128k-instruct"
model = AutoModelForCausalLM.from_pretrained(
    model_id, 
    torch_dtype="auto", 
    trust_remote_code=True, 
)
assert torch.cuda.is_available(), "This model needs a GPU to run ..."
device = torch.cuda.current_device()
model = model.to(device)
tokenizer = AutoTokenizer.from_pretrained(model_id)

messages = [
    {"role": "user", "content": "Can you provide ways to eat combinations of bananas and dragonfruits?"},
    {"role": "assistant", "content": "Sure! Here are some ways to eat bananas and dragonfruits together: 1. Banana and dragonfruit smoothie: Blend bananas and dragonfruits together with some milk and honey. 2. Banana and dragonfruit salad: Mix sliced bananas and dragonfruits together with some lemon juice and honey."},
    {"role": "user", "content": "What about solving an 2x + 3 = 7 equation?"},
]

pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    device=device
)

generation_args = {
    "max_new_tokens": 500,
    "return_full_text": False,
    "temperature": 0.0,
    "do_sample": False,
}

output = pipe(messages, **generation_args)
print(output[0]['generated_text'])
