In [15]:
from datasets import load_dataset

model_name = "HuggingFaceTB/SmolLM2-135M"
dataset_path = "HuggingFaceTB/smoltalk"
dataset_name = "everyday-conversations"

In [5]:
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained(model_name)

In [77]:
tokenizer.special_tokens_map

{'bos_token': '<|endoftext|>',
 'eos_token': '<|endoftext|>',
 'unk_token': '<|endoftext|>',
 'additional_special_tokens': ['<|endoftext|>',
  '<|im_start|>',
  '<|im_end|>',
  '<repo_name>',
  '<reponame>',
  '<file_sep>',
  '<filename>',
  '<gh_stars>',
  '<issue_start>',
  '<issue_comment>',
  '<issue_closed>',
  '<jupyter_start>',
  '<jupyter_text>',
  '<jupyter_code>',
  '<jupyter_output>',
  '<jupyter_script>',
  '<empty_output>']}

In [16]:
dataset = load_dataset(dataset_path, dataset_name)
dataset

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


DatasetDict({
    train: Dataset({
        features: ['full_topic', 'messages'],
        num_rows: 2260
    })
    test: Dataset({
        features: ['full_topic', 'messages'],
        num_rows: 119
    })
})

In [19]:
dataset["train"][0]["messages"]

[{'content': 'Hi there', 'role': 'user'},
 {'content': 'Hello! How can I help you today?', 'role': 'assistant'},
 {'content': "I'm looking for a beach resort for my next vacation. Can you recommend some popular ones?",
  'role': 'user'},
 {'content': "Some popular beach resorts include Maui in Hawaii, the Maldives, and the Bahamas. They're known for their beautiful beaches and crystal-clear waters.",
  'role': 'assistant'},
 {'content': 'That sounds great. Are there any resorts in the Caribbean that are good for families?',
  'role': 'user'},
 {'content': 'Yes, the Turks and Caicos Islands and Barbados are excellent choices for family-friendly resorts in the Caribbean. They offer a range of activities and amenities suitable for all ages.',
  'role': 'assistant'},
 {'content': "Okay, I'll look into those. Thanks for the recommendations!",
  'role': 'user'},
 {'content': "You're welcome. I hope you find the perfect resort for your vacation.",
  'role': 'assistant'}]

In [85]:
dataset["train"].features

{'full_topic': Value(dtype='string', id=None),
 'messages': [{'content': Value(dtype='string', id=None),
   'role': Value(dtype='string', id=None)}]}

In [106]:
dataset["train"][0]

{'full_topic': 'Travel/Vacation destinations/Beach resorts',
 'messages': [{'content': 'Hi there', 'role': 'user'},
  {'content': 'Hello! How can I help you today?', 'role': 'assistant'},
  {'content': "I'm looking for a beach resort for my next vacation. Can you recommend some popular ones?",
   'role': 'user'},
  {'content': "Some popular beach resorts include Maui in Hawaii, the Maldives, and the Bahamas. They're known for their beautiful beaches and crystal-clear waters.",
   'role': 'assistant'},
  {'content': 'That sounds great. Are there any resorts in the Caribbean that are good for families?',
   'role': 'user'},
  {'content': 'Yes, the Turks and Caicos Islands and Barbados are excellent choices for family-friendly resorts in the Caribbean. They offer a range of activities and amenities suitable for all ages.',
   'role': 'assistant'},
  {'content': "Okay, I'll look into those. Thanks for the recommendations!",
   'role': 'user'},
  {'content': "You're welcome. I hope you find

In [27]:
from transformers import AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

In [28]:
tokenizer.vocab_size

49152

In [29]:
print(":", tokenizer.decode(49151))
print(":", tokenizer.decode(49152))
print(":", tokenizer.decode(49153))

: ectable
: 
: 


In [30]:
tokenizer("<|pad|>")

{'input_ids': [44, 108, 16384, 108, 46], 'attention_mask': [1, 1, 1, 1, 1]}

In [31]:
tokenizer.special_tokens_map

{'bos_token': '<|endoftext|>',
 'eos_token': '<|endoftext|>',
 'unk_token': '<|endoftext|>',
 'additional_special_tokens': ['<|endoftext|>',
  '<|im_start|>',
  '<|im_end|>',
  '<repo_name>',
  '<reponame>',
  '<file_sep>',
  '<filename>',
  '<gh_stars>',
  '<issue_start>',
  '<issue_comment>',
  '<issue_closed>',
  '<jupyter_start>',
  '<jupyter_text>',
  '<jupyter_code>',
  '<jupyter_output>',
  '<jupyter_script>',
  '<empty_output>']}

In [32]:
len(tokenizer)

49152

In [33]:
result = tokenizer.add_special_tokens({'pad_token': '<|pad|>'})

In [34]:
len(tokenizer)

49153

In [35]:
tokenizer.vocab_size

49152

In [36]:
tokenizer.special_tokens_map

{'bos_token': '<|endoftext|>',
 'eos_token': '<|endoftext|>',
 'unk_token': '<|endoftext|>',
 'pad_token': '<|pad|>',
 'additional_special_tokens': ['<|endoftext|>',
  '<|im_start|>',
  '<|im_end|>',
  '<repo_name>',
  '<reponame>',
  '<file_sep>',
  '<filename>',
  '<gh_stars>',
  '<issue_start>',
  '<issue_comment>',
  '<issue_closed>',
  '<jupyter_start>',
  '<jupyter_text>',
  '<jupyter_code>',
  '<jupyter_output>',
  '<jupyter_script>',
  '<empty_output>']}

In [37]:
tokens = tokenizer("hello world")

In [38]:
print(tokenizer.decode(tokens["input_ids"][0]))
print(tokenizer.decode(tokens["input_ids"][1]))

hello
 world


In [39]:
print(tokenizer.convert_ids_to_tokens(tokens["input_ids"][0]))
print(tokenizer.convert_ids_to_tokens(tokens["input_ids"][1]))

hello
Ġworld


In [40]:
tokens = tokenizer(
    [
        "hello world I'm hugging face tokenizer",
        "hello world I'm hugging face tokenizer padding paddding",
    ],
    padding=True,
    truncation=True,
)
print(tokens)

{'input_ids': [[28120, 905, 339, 5248, 294, 19712, 2715, 46119, 49152, 49152, 49152], [28120, 905, 339, 5248, 294, 19712, 2715, 46119, 16809, 10567, 3443]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}

In [41]:
print(tokenizer.decode(tokens["input_ids"][0]))

hello world I'm hugging face tokenizer<|pad|><|pad|><|pad|>


In [42]:
sample_data = dataset["train"][0]["messages"]
print(sample_data)

[{'content': 'Hi there', 'role': 'user'}, {'content': 'Hello! How can I help you today?', 'role': 'assistant'}, {'content': "I'm looking for a beach resort for my next vacation. Can you recommend some popular ones?", 'role': 'user'}, {'content': "Some popular beach resorts include Maui in Hawaii, the Maldives, and the Bahamas. They're known for their beautiful beaches and crystal-clear waters.", 'role': 'assistant'}, {'content': 'That sounds great. Are there any resorts in the Caribbean that are good for families?', 'role': 'user'}, {'content': 'Yes, the Turks and Caicos Islands and Barbados are excellent choices for family-friendly resorts in the Caribbean. They offer a range of activities and amenities suitable for all ages.', 'role': 'assistant'}, {'content': "Okay, I'll look into those. Thanks for the recommendations!", 'role': 'user'}, {'content': "You're welcome. I hope you find the perfect resort for your vacation.", 'role': 'assistant'}]


In [43]:
print(":", tokenizer.chat_template)

: None


In [44]:
from trl import setup_chat_format

model, tokenizer = setup_chat_format(model, tokenizer)

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


In [45]:
print(tokenizer.chat_template)

{% for message in messages %}{{'<|im_start|>' + message['role'] + '
' + message['content'] + '<|im_end|>' + '
'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant
' }}{% endif %}


In [46]:
print(tokenizer.apply_chat_template(sample_data, tokenize=False))

<|im_start|>user
Hi there<|im_end|>
<|im_start|>assistant
Hello! How can I help you today?<|im_end|>
<|im_start|>user
I'm looking for a beach resort for my next vacation. Can you recommend some popular ones?<|im_end|>
<|im_start|>assistant
Some popular beach resorts include Maui in Hawaii, the Maldives, and the Bahamas. They're known for their beautiful beaches and crystal-clear waters.<|im_end|>
<|im_start|>user
That sounds great. Are there any resorts in the Caribbean that are good for families?<|im_end|>
<|im_start|>assistant
Yes, the Turks and Caicos Islands and Barbados are excellent choices for family-friendly resorts in the Caribbean. They offer a range of activities and amenities suitable for all ages.<|im_end|>
<|im_start|>user
Okay, I'll look into those. Thanks for the recommendations!<|im_end|>
<|im_start|>assistant
You're welcome. I hope you find the perfect resort for your vacation.<|im_end|>



In [47]:
dataset["train"][0]

{'full_topic': 'Travel/Vacation destinations/Beach resorts',
 'messages': [{'content': 'Hi there', 'role': 'user'},
  {'content': 'Hello! How can I help you today?', 'role': 'assistant'},
  {'content': "I'm looking for a beach resort for my next vacation. Can you recommend some popular ones?",
   'role': 'user'},
  {'content': "Some popular beach resorts include Maui in Hawaii, the Maldives, and the Bahamas. They're known for their beautiful beaches and crystal-clear waters.",
   'role': 'assistant'},
  {'content': 'That sounds great. Are there any resorts in the Caribbean that are good for families?',
   'role': 'user'},
  {'content': 'Yes, the Turks and Caicos Islands and Barbados are excellent choices for family-friendly resorts in the Caribbean. They offer a range of activities and amenities suitable for all ages.',
   'role': 'assistant'},
  {'content': "Okay, I'll look into those. Thanks for the recommendations!",
   'role': 'user'},
  {'content': "You're welcome. I hope you find

In [48]:
# To keep the data as a dataset, use the Dataset.map() method.
# HF datasets are Apace Arrow files. It keeps data on the disk and only loads the samples in memory when needed.

dataset = dataset.map(
    lambda x: {"input_ids": tokenizer.apply_chat_template(x["messages"])},
    batched=True,
)

Map:   0%|          | 0/2260 [00:00<?, ? examples/s]

Map:   0%|          | 0/119 [00:00<?, ? examples/s]

In [51]:
print(dataset["train"][0]["input_ids"])

[1, 4093, 198, 26843, 665, 2, 198, 1, 520, 9531, 198, 19556, 17, 1073, 416, 339, 724, 346, 1834, 47, 2, 198, 1, 4093, 198, 57, 5248, 3012, 327, 253, 10724, 14500, 327, 957, 1867, 17584, 30, 1978, 346, 3730, 634, 2378, 2911, 47, 2, 198, 1, 520, 9531, 198, 4449, 2378, 10724, 36088, 1453, 48326, 281, 14126, 28, 260, 48148, 898, 28, 284, 260, 44057, 30, 1069, 2316, 1343, 327, 480, 3953, 16351, 284, 13253, 29, 10086, 5656, 30, 2, 198, 1, 4093, 198, 5195, 4598, 1109, 30, 4184, 665, 750, 36088, 281, 260, 11981, 338, 359, 1123, 327, 3168, 47, 2, 198, 1, 520, 9531, 198, 10539, 28, 260, 25518, 284, 7784, 48096, 10015, 284, 47557, 395, 359, 5412, 4975, 327, 1564, 29, 9263, 36088, 281, 260, 11981, 30, 1069, 2626, 253, 1845, 282, 2123, 284, 32255, 5712, 327, 511, 6399, 30, 2, 198, 1, 4093, 198, 39122, 28, 339, 3060, 1492, 618, 967, 30, 10090, 327, 260, 7400, 17, 2, 198, 1, 520, 9531, 198, 2683, 2316, 10668, 30, 339, 3826, 346, 1042, 260, 3468, 14500, 327, 469, 17584, 30, 2, 198]


In [52]:
print(tokenizer.decode(dataset["train"][0]["input_ids"]))

<|im_start|>user
Hi there<|im_end|>
<|im_start|>assistant
Hello! How can I help you today?<|im_end|>
<|im_start|>user
I'm looking for a beach resort for my next vacation. Can you recommend some popular ones?<|im_end|>
<|im_start|>assistant
Some popular beach resorts include Maui in Hawaii, the Maldives, and the Bahamas. They're known for their beautiful beaches and crystal-clear waters.<|im_end|>
<|im_start|>user
That sounds great. Are there any resorts in the Caribbean that are good for families?<|im_end|>
<|im_start|>assistant
Yes, the Turks and Caicos Islands and Barbados are excellent choices for family-friendly resorts in the Caribbean. They offer a range of activities and amenities suitable for all ages.<|im_end|>
<|im_start|>user
Okay, I'll look into those. Thanks for the recommendations!<|im_end|>
<|im_start|>assistant
You're welcome. I hope you find the perfect resort for your vacation.<|im_end|>



In [53]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [54]:
batch_sample = dataset["train"][:2]
batch_sample = {k: v for k, v in batch_sample.items() if k in ["input_ids"]}
print([len(v) for v in batch_sample["input_ids"]])

[185, 224]


In [55]:
batch = data_collator(batch_sample)
{k: v.shape for k, v in batch.items()}

{'input_ids': torch.Size([2, 224]), 'attention_mask': torch.Size([2, 224])}

In [56]:
# Full code for Trainer API

# from torch.utils.data import DataLoader
# from transformers import AdamW, get_scheduler

# train_dataloader = DataLoader(
#     tokenized_datasets["train"], shuffle=True, batch_size=8, collate_fn=data_collator
# )

# optimizer = AdamW(model.parameters(), lr=3e-5)

# lr_scheduler = get_scheduler(
#     "linear",
#     optimizer=optimizer,
#     num_warmup_steps=0,
#     num_training_steps=num_training_steps,
# )

# model.train()
# for batch in train_dataloader:
#     batch = {k: v.to(device) for k, v in batch.items()}
#     outputs = model(**batch)

#     loss = outputs.loss
#     loss.backward()

#     optimizer.step()
#     lr_scheduler.step()
#     optimizer.zero_grad()

In [53]:
interview_questions = [
    {
        "role": "user",
        "content": "What insights did Alain Connes share in his talk with G. B. Khosrovshahi?",
    },
    {
        "role": "user",
        "content": "Can you summarize Alain Connes' key points from his interview with G. B. Khosrovshahi?",
    },
    {
        "role": "user",
        "content": "What were the main topics discussed between Alain Connes and G. B. Khosrovshahi?",
    },
    {
        "role": "user",
        "content": "What did Alain Connes mention in his conversation with G. B. Khosrovshahi?",
    },
    {
        "role": "user",
        "content": "Could you provide an overview of what Alain Connes said during his interview with G. B. Khosrovshahi?",
    },
    {
        "role": "user",
        "content": "What points did Alain Connes focus on in his interview with G. B. Khosrovshahi?",
    },
    {
        "role": "user",
        "content": "What subjects did Alain Connes cover when speaking with G. B. Khosrovshahi?",
    },
    {
        "role": "user",
        "content": "What ideas did Alain Connes express in his discussion with G. B. Khosrovshahi?",
    },
    {
        "role": "user",
        "content": "What can you tell me about the interview Alain Connes had with G. B. Khosrovshahi?",
    },
    {
        "role": "user",
        "content": "What statements did Alain Connes make in his interview with G. B. Khosrovshahi?",
    },
    {
        "role": "user",
        "content": "What were the key points Alain Connes discussed in his interview with G. B. Khosrovshahi?",
    },
    {
        "role": "user",
        "content": "How did Alain Connes explain his mathematical journey during the Khosrovshahi interview?",
    },
    {
        "role": "user",
        "content": "What insights about non-commutative geometry did Connes share in the Khosrovshahi conversation?",
    },
    {
        "role": "user",
        "content": "Can you summarize Connes' views on mathematics education from his interview?",
    },
    {
        "role": "user",
        "content": "What philosophical perspectives did Alain Connes express to Khosrovshahi?",
    },
    {
        "role": "user",
        "content": "How did Connes describe his research methodology in the interview?",
    },
    {
        "role": "user",
        "content": "What were Connes' thoughts on the future of mathematics according to the interview?",
    },
    {
        "role": "user",
        "content": "How did Alain Connes reflect on his major mathematical discoveries?",
    },
    {
        "role": "user",
        "content": "What advice did Connes give to young mathematicians in his interview?",
    },
    {
        "role": "user",
        "content": "How did Connes explain the relationship between physics and mathematics?",
    },
]

In [59]:
all_prompts = [
    tokenizer.apply_chat_template(
        [question],
        tokenize=False,
        generation_prompt=True,
    )
    for question in interview_questions
]

In [70]:
import torch

tokens = tokenizer(all_prompts, padding=True)["input_ids"]

# print shape of tokens list. list object has not shape so we need to convert it to numpy array
tokens = torch.tensor(tokens)
print(tokens.shape)

torch.Size([20, 31])


In [74]:
embedding_matrix = model.get_input_embeddings().weight
print(embedding_matrix.shape)

torch.Size([49153, 576])
