In [5]:
# #| label: auth
# # Authenticate to Hugging Face
# from huggingface_hub import login

# login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from trl import setup_chat_format
import torch

# SmolLM2 Chat Template

In [2]:
device = "mps" if torch.backends.mps.is_available() else "cpu"

model_name = "HuggingFaceTB/SmolLM2-135M"

model = AutoModelForCausalLM.from_pretrained(model_name).to(device)
tokenizer = AutoTokenizer.from_pretrained(model_name)
model, tokenizer = setup_chat_format(model=model, tokenizer=tokenizer)

In [3]:
# Define messages for SmolLM2
messages = [
    {"role": "user", "content": "Hello, how are you?"},
    {
        "role": "assistant",
        "content": "I'm doing well, thank you! How can I assist you today?",
    },
]

# Apply chat template without tokenization

In [4]:
input_text = tokenizer.apply_chat_template(messages, tokenize=False)

print("Conversation with template:", input_text)

Conversation with template: <|im_start|>user
Hello, how are you?<|im_end|>
<|im_start|>assistant
I'm doing well, thank you! How can I assist you today?<|im_end|>



In [5]:
# add_generation_prompt adds the "assistant" role to the end of the conversation
input_text = tokenizer.apply_chat_template(
    messages, tokenize=True, add_generation_prompt=True
)

print("Conversation decoded:", tokenizer.decode(token_ids=input_text))

Conversation decoded: <|im_start|>user
Hello, how are you?<|im_end|>
<|im_start|>assistant
I'm doing well, thank you! How can I assist you today?<|im_end|>
<|im_start|>assistant



In [6]:
input_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True)

print("Conversation tokenized:", input_text)

Conversation tokenized: [1, 4093, 198, 19556, 28, 638, 359, 346, 47, 2, 198, 1, 520, 9531, 198, 57, 5248, 2567, 876, 28, 9984, 346, 17, 1073, 416, 339, 4237, 346, 1834, 47, 2, 198, 1, 520, 9531, 198]


# Exercise: Convert datasets into appropriate format for supervised fine tuning (SFT)

In [7]:
from IPython.core.display import display, HTML

display(
    HTML(
        """<iframe
  src="https://huggingface.co/datasets/HuggingFaceTB/smoltalk/embed/viewer/all/train?row=0"
  frameborder="0"
  width="100%"
  height="360px"
></iframe>
"""
    )
)

  from IPython.core.display import display, HTML


In [8]:
from datasets import load_dataset

ds = load_dataset("HuggingFaceTB/smoltalk", "everyday-conversations")

README.md:   0%|          | 0.00/9.72k [00:00<?, ?B/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


train-00000-of-00001.parquet:   0%|          | 0.00/946k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/52.6k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2260 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/119 [00:00<?, ? examples/s]

In [19]:
ds['train'][0]['messages'][1]

{'content': 'Hello! How can I help you today?', 'role': 'assistant'}

In [21]:
tokenizer.apply_chat_template(ds['train'][0]['messages'], tokenize=False, add_generation_prompt=True)

"<|im_start|>user\nHi there<|im_end|>\n<|im_start|>assistant\nHello! How can I help you today?<|im_end|>\n<|im_start|>user\nI'm looking for a beach resort for my next vacation. Can you recommend some popular ones?<|im_end|>\n<|im_start|>assistant\nSome popular beach resorts include Maui in Hawaii, the Maldives, and the Bahamas. They're known for their beautiful beaches and crystal-clear waters.<|im_end|>\n<|im_start|>user\nThat sounds great. Are there any resorts in the Caribbean that are good for families?<|im_end|>\n<|im_start|>assistant\nYes, the Turks and Caicos Islands and Barbados are excellent choices for family-friendly resorts in the Caribbean. They offer a range of activities and amenities suitable for all ages.<|im_end|>\n<|im_start|>user\nOkay, I'll look into those. Thanks for the recommendations!<|im_end|>\n<|im_start|>assistant\nYou're welcome. I hope you find the perfect resort for your vacation.<|im_end|>\n<|im_start|>assistant\n"

In [23]:
def process_dataset(sample):
    # TODO: 🐢 Convert the sample into a chat format
    # use the tokenizer's method to apply the chat template
    sample['chatml'] = tokenizer.apply_chat_template(sample['messages'], tokenize=False, add_generation_prompt=True)
    return sample

ds = ds.map(process_dataset)

Map:   0%|          | 0/2260 [00:00<?, ? examples/s]

Map:   0%|          | 0/119 [00:00<?, ? examples/s]

In [24]:
ds['train'][0]

{'full_topic': 'Travel/Vacation destinations/Beach resorts',
 'messages': [{'content': 'Hi there', 'role': 'user'},
  {'content': 'Hello! How can I help you today?', 'role': 'assistant'},
  {'content': "I'm looking for a beach resort for my next vacation. Can you recommend some popular ones?",
   'role': 'user'},
  {'content': "Some popular beach resorts include Maui in Hawaii, the Maldives, and the Bahamas. They're known for their beautiful beaches and crystal-clear waters.",
   'role': 'assistant'},
  {'content': 'That sounds great. Are there any resorts in the Caribbean that are good for families?',
   'role': 'user'},
  {'content': 'Yes, the Turks and Caicos Islands and Barbados are excellent choices for family-friendly resorts in the Caribbean. They offer a range of activities and amenities suitable for all ages.',
   'role': 'assistant'},
  {'content': "Okay, I'll look into those. Thanks for the recommendations!",
   'role': 'user'},
  {'content': "You're welcome. I hope you find

In [25]:
display(
    HTML(
        """<iframe
  src="https://huggingface.co/datasets/openai/gsm8k/embed/viewer/main/train"
  frameborder="0"
  width="100%"
  height="360px"
></iframe>
"""
    )
)

In [26]:
# Define messages for gsm8k
messages = [
    {"role": "question", "content": "Hello, how are you?"},
    {
        "role": "answer",
        "content": "I'm doing well, thank you! How can I assist you today?",
    },
]

In [27]:
ds = load_dataset("openai/gsm8k", "main")


def process_dataset(sample):
    # TODO: 🐕 Convert the sample into a chat format

    # 1. create a message format with the role and content

    # 2. apply the chat template to the samples using the tokenizer's method
    temp = [
        {"role": "question", "content": sample['question']},
        {"role": "answer", "content": sample['answer']},
    ]

    sample['chatml'] = tokenizer.apply_chat_template(temp, tokenize=False, add_generation_prompt=True)

    return sample


ds = ds.map(process_dataset)

README.md:   0%|          | 0.00/7.94k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/2.31M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/419k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/7473 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1319 [00:00<?, ? examples/s]

Map:   0%|          | 0/7473 [00:00<?, ? examples/s]

Map:   0%|          | 0/1319 [00:00<?, ? examples/s]

In [30]:
ds['train']['chatml'][0]

'<|im_start|>question\nNatalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?<|im_end|>\n<|im_start|>answer\nNatalia sold 48/2 = <<48/2=24>>24 clips in May.\nNatalia sold 48+24 = <<48+24=72>>72 clips altogether in April and May.\n#### 72<|im_end|>\n<|im_start|>assistant\n'