In [1]:
import os

from huggingface_hub import login

# Never commit an access token to a notebook: anyone who can read the file can
# use it. The token is read from the HF_TOKEN environment variable instead; if
# the variable is unset or empty, `token` is None and `login` prompts for it.
# NOTE(review): the token that used to be hardcoded in this cell is exposed and
# should be revoked in the Hugging Face account settings.
login(token=os.environ.get("HF_TOKEN") or None)

## Chat Templates 예시

In [2]:
from transformers import AutoTokenizer

# Sample multi-turn conversation, expanded from (role, content) pairs into the
# list-of-dicts message format passed to `apply_chat_template` below.
chat = [
    {"role": role, "content": content}
    for role, content in (
        ("user", "Hello, how are you?"),
        ("assistant", "I'm doing great. How can I help you today?"),
        ("user", "I'd like to show off how chat templating works!"),
    )
]

In [3]:
# Load the Mistral instruct tokenizer and render `chat` with its chat template.
# tokenize=False asks for the formatted prompt string rather than token ids.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path="mistralai/Mistral-7B-Instruct-v0.1"
)

tokenizer.apply_chat_template(chat, tokenize=False)

"<s> [INST] Hello, how are you? [/INST] I'm doing great. How can I help you today?</s> [INST] I'd like to show off how chat templating works! [/INST]"

In [4]:
# Same conversation, rendered with the Zephyr tokenizer's chat template so the
# two prompt formats can be compared. Note: this rebinds `tokenizer`.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path="HuggingFaceH4/zephyr-7b-beta"
)

tokenizer.apply_chat_template(chat, tokenize=False)

"<|user|>\nHello, how are you?</s>\n<|assistant|>\nI'm doing great. How can I help you today?</s>\n<|user|>\nI'd like to show off how chat templating works!</s>\n"

## SmolLM2 Chat Templates

In [5]:
import torch

from trl import setup_chat_format
from transformers import AutoModelForCausalLM, AutoTokenizer

# Choose the compute device in priority order: CUDA first, then MPS, and
# finally the CPU when neither backend reports itself as available.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

In [6]:
model_name = "HuggingFaceTB/SmolLM2-135M"

# Load the tokenizer and the causal LM from the same checkpoint; the model is
# moved onto the device selected above.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)
model = model.to(device)

# Set up the chat format with default 'chatml' format
model, tokenizer = setup_chat_format(model, tokenizer)

In [7]:
# Two-turn example conversation (one user turn, one assistant reply) used to
# demonstrate the chat template in the following cells.
messages = [
    dict(role="user", content="Hello, how are you?"),
    dict(
        role="assistant",
        content="I'm doing well, thank you! How can I assist you today?",
    ),
]

In [8]:
# Render the conversation through the chat template as text (no token ids).
input_text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
)

print(
    "Conversation with template (tokenize=False):\n",
    input_text,
)

Conversation with template (tokenize=False):
 <|im_start|>user
Hello, how are you?<|im_end|>
<|im_start|>assistant
I'm doing well, thank you! How can I assist you today?<|im_end|>



In [9]:
# Same call with tokenize=True: the result printed below is a list of token
# ids, so `input_text` no longer holds a string after this cell.
input_text = tokenizer.apply_chat_template(
    messages,
    tokenize=True,
)

print(
    "Conversation with template (tokenize=True):\n",
    input_text,
)

Conversation with template (tokenize=True):
 [1, 4093, 198, 19556, 28, 638, 359, 346, 47, 2, 198, 1, 520, 9531, 198, 57, 5248, 2567, 876, 28, 9984, 346, 17, 1073, 416, 339, 4237, 346, 1834, 47, 2, 198]


## HuggingFaceTB/smoltalk to ChatML format

In [10]:
from datasets import load_dataset

# Load the "everyday-conversations" configuration of the smoltalk dataset.
dataset = load_dataset(
    path="HuggingFaceTB/smoltalk",
    name="everyday-conversations",
)

In [11]:
# Display the DatasetDict summary: its splits, feature names and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['full_topic', 'messages'],
        num_rows: 2260
    })
    test: Dataset({
        features: ['full_topic', 'messages'],
        num_rows: 119
    })
})

In [12]:
# Index the row first, then the field: this reads a single example instead of
# pulling the entire column just to take its first element. The printed values
# are the same as with column-first indexing.
print(dataset["train"][0]["messages"])
print(dataset["train"][0]["full_topic"])

[{'content': 'Hi there', 'role': 'user'}, {'content': 'Hello! How can I help you today?', 'role': 'assistant'}, {'content': "I'm looking for a beach resort for my next vacation. Can you recommend some popular ones?", 'role': 'user'}, {'content': "Some popular beach resorts include Maui in Hawaii, the Maldives, and the Bahamas. They're known for their beautiful beaches and crystal-clear waters.", 'role': 'assistant'}, {'content': 'That sounds great. Are there any resorts in the Caribbean that are good for families?', 'role': 'user'}, {'content': 'Yes, the Turks and Caicos Islands and Barbados are excellent choices for family-friendly resorts in the Caribbean. They offer a range of activities and amenities suitable for all ages.', 'role': 'assistant'}, {'content': "Okay, I'll look into those. Thanks for the recommendations!", 'role': 'user'}, {'content': "You're welcome. I hope you find the perfect resort for your vacation.", 'role': 'assistant'}]
Travel/Vacation destinations/Beach resor

In [13]:
def process_smoltalk_dataset(sample):
    """Render one example's conversation with the tokenizer's chat template.

    Args:
        sample: A dataset row; its "messages" entry is handed to
            ``tokenizer.apply_chat_template``.

    Returns:
        A dict with the single key "chatml_format_no_tokenize" holding the
        result of applying the template with ``tokenize=False``.
    """
    # Uses the notebook-level `tokenizer`; it is not passed in as an argument.
    return {
        "chatml_format_no_tokenize": tokenizer.apply_chat_template(
            sample["messages"], tokenize=False
        )
    }

# Run the conversion over every example of each split; the key returned by the
# function shows up as a new column in the result displayed below.
chat_dataset = dataset.map(function=process_smoltalk_dataset)
chat_dataset

Map:   0%|          | 0/2260 [00:00<?, ? examples/s]

Map:   0%|          | 0/119 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['full_topic', 'messages', 'chatml_format_no_tokenize'],
        num_rows: 2260
    })
    test: Dataset({
        features: ['full_topic', 'messages', 'chatml_format_no_tokenize'],
        num_rows: 119
    })
})

In [14]:
# Slice the first five rows before selecting the column, so only those rows are
# read rather than the full column. The displayed list is the same.
chat_dataset["train"][:5]["chatml_format_no_tokenize"]

["<|im_start|>user\nHi there<|im_end|>\n<|im_start|>assistant\nHello! How can I help you today?<|im_end|>\n<|im_start|>user\nI'm looking for a beach resort for my next vacation. Can you recommend some popular ones?<|im_end|>\n<|im_start|>assistant\nSome popular beach resorts include Maui in Hawaii, the Maldives, and the Bahamas. They're known for their beautiful beaches and crystal-clear waters.<|im_end|>\n<|im_start|>user\nThat sounds great. Are there any resorts in the Caribbean that are good for families?<|im_end|>\n<|im_start|>assistant\nYes, the Turks and Caicos Islands and Barbados are excellent choices for family-friendly resorts in the Caribbean. They offer a range of activities and amenities suitable for all ages.<|im_end|>\n<|im_start|>user\nOkay, I'll look into those. Thanks for the recommendations!<|im_end|>\n<|im_start|>assistant\nYou're welcome. I hope you find the perfect resort for your vacation.<|im_end|>\n",
 "<|im_start|>user\nHi<|im_end|>\n<|im_start|>assistant\nHel

## openai/gsm8k to ChatML format

In [15]:
# Load the "main" configuration of GSM8K. Note: this rebinds `dataset`, which
# previously referred to the smoltalk data.
dataset = load_dataset(path="openai/gsm8k", name="main")
dataset

DatasetDict({
    train: Dataset({
        features: ['question', 'answer'],
        num_rows: 7473
    })
    test: Dataset({
        features: ['question', 'answer'],
        num_rows: 1319
    })
})

In [16]:
# Slice the first five rows once and pair their question/answer fields, instead
# of reading both full columns and slicing each afterwards.
first_rows = dataset["train"][:5]
for q, a in zip(first_rows["question"], first_rows["answer"]):
    print(q, " *** ", a)

Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?  ***  Natalia sold 48/2 = <<48/2=24>>24 clips in May.
Natalia sold 48+24 = <<48+24=72>>72 clips altogether in April and May.
#### 72
Weng earns $12 an hour for babysitting. Yesterday, she just did 50 minutes of babysitting. How much did she earn?  ***  Weng earns 12/60 = $<<12/60=0.2>>0.2 per minute.
Working 50 minutes, she earned 0.2 x 50 = $<<0.2*50=10>>10.
#### 10
Betty is saving money for a new wallet which costs $100. Betty has only half of the money she needs. Her parents decided to give her $15 for that purpose, and her grandparents twice as much as her parents. How much more money does Betty need to buy the wallet?  ***  In the beginning, Betty has only 100 / 2 = $<<100/2=50>>50.
Betty's grandparents gave her 15 * 2 = $<<15*2=30>>30.
This means, Betty needs 100 - 50 - 30 - 15 = $<<100-50-30-15=5>>5 more.
#### 5
Julie is read

In [17]:
'''
messages = [
    {"role": "user", "content": "Hello, how are you?"},
    {
        "role": "assistant",
        "content": "I'm doing well, thank you! How can I assist you today?",
    },
]
'''
def process_gsm8k_dataset(sample):
    """Turn one GSM8K row into a two-message conversation.

    Args:
        sample: A mapping with "question" and "answer" entries.

    Returns:
        A dict with the single key "chatml_format_no_tokenize" whose value is a
        list of two message dicts: the question as the "user" turn followed by
        the answer as the "assistant" turn. Despite the key name, no chat
        template is applied here; the value is the raw message list.

    Raises:
        KeyError: If "question" or "answer" is missing from ``sample``.
    """
    turns = (
        ("user", sample["question"]),
        ("assistant", sample["answer"]),
    )
    return {
        "chatml_format_no_tokenize": [
            {"role": role, "content": text} for role, text in turns
        ]
    }

In [18]:
# Convert every GSM8K example in each split; the returned key is added as a new
# column. Note: this rebinds `chat_dataset` from the smoltalk result.
chat_dataset = dataset.map(function=process_gsm8k_dataset)
chat_dataset

Map:   0%|          | 0/7473 [00:00<?, ? examples/s]

Map:   0%|          | 0/1319 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['question', 'answer', 'chatml_format_no_tokenize'],
        num_rows: 7473
    })
    test: Dataset({
        features: ['question', 'answer', 'chatml_format_no_tokenize'],
        num_rows: 1319
    })
})

In [19]:
# Slice the first five rows before selecting the column, so only those rows are
# read rather than the full column. The displayed list is the same.
chat_dataset["train"][:5]["chatml_format_no_tokenize"]

[[{'content': 'Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?',
   'role': 'user'},
  {'content': 'Natalia sold 48/2 = <<48/2=24>>24 clips in May.\nNatalia sold 48+24 = <<48+24=72>>72 clips altogether in April and May.\n#### 72',
   'role': 'assistant'}],
 [{'content': 'Weng earns $12 an hour for babysitting. Yesterday, she just did 50 minutes of babysitting. How much did she earn?',
   'role': 'user'},
  {'content': 'Weng earns 12/60 = $<<12/60=0.2>>0.2 per minute.\nWorking 50 minutes, she earned 0.2 x 50 = $<<0.2*50=10>>10.\n#### 10',
   'role': 'assistant'}],
 [{'content': 'Betty is saving money for a new wallet which costs $100. Betty has only half of the money she needs. Her parents decided to give her $15 for that purpose, and her grandparents twice as much as her parents. How much more money does Betty need to buy the wallet?',
   'role': 'user'},
  {'content': "In the be