In [1]:
from datasets import load_dataset

model_name = "HuggingFaceTB/SmolLM2-135M"
dataset_path = "HuggingFaceTB/smoltalk"
dataset_name = "everyday-conversations"

In [2]:
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name).to("mps")

In [7]:
# embeddying layer

embedding_layer = model.model.embed_tokens
embedding_layer.weight.shape

torch.Size([49152, 576])

In [22]:
import torch
test = torch.tensor([[1,2,3],[4,5,6]])
print(test)
print(test.shape)
print(test[])

tensor([[1, 2, 3],
        [4, 5, 6]])
torch.Size([2, 3])
tensor([[4, 5, 6]])


In [8]:
embedding_layer.weight[0]

tensor([-1.1768e-01,  2.7832e-02,  4.8096e-02, -7.9346e-03, -5.6152e-02,
        -5.2002e-02,  1.6479e-02, -1.3379e-01,  1.0791e-01, -2.2949e-01,
         3.1982e-02, -4.6631e-02,  2.2852e-01, -3.3398e-01, -3.1836e-01,
         2.7832e-02, -1.3611e-02,  6.3965e-02, -1.2109e-01, -5.1758e-02,
         3.1250e-02,  2.2461e-01, -6.6406e-02,  8.2031e-02, -8.3008e-03,
        -1.0620e-02, -6.7871e-02,  5.3223e-02,  1.6406e-01,  1.3672e-01,
         7.2266e-02, -2.0020e-01, -1.0059e-01, -1.5137e-01,  2.2095e-02,
        -3.1128e-03,  4.5410e-02,  8.3008e-02,  6.3477e-02, -1.0205e-01,
         8.4473e-02,  1.4160e-01,  3.2471e-02,  2.4048e-02,  9.5703e-02,
        -5.8594e-02,  1.4746e-01, -1.9629e-01,  3.8086e-02, -1.4844e-01,
         1.8652e-01, -1.1719e-01,  9.9609e-02, -1.3184e-01,  2.5513e-02,
         1.1133e-01,  4.0527e-02,  1.8164e-01, -7.0312e-02,  1.6724e-02,
         9.7046e-03,  2.1515e-03,  1.2891e-01, -1.0010e-02, -8.9844e-02,
        -1.1572e-01,  1.0254e-01, -1.8158e-03, -1.5

In [5]:
model.resize_token_embeddings(len(tokenizer)+8)

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


NotImplementedError: The operator 'aten::_linalg_eigvals' is not currently implemented for the MPS device. If you want this op to be added in priority during the prototype phase of this feature, please comment on https://github.com/pytorch/pytorch/issues/77764. As a temporary fix, you can set the environment variable `PYTORCH_ENABLE_MPS_FALLBACK=1` to use the CPU as a fallback for this op. WARNING: this will be slower than running natively on MPS.

# Exploring Tokenizer and Dataset

In [122]:
tokenizer.special_tokens_map

{'bos_token': '<|endoftext|>',
 'eos_token': '<|endoftext|>',
 'unk_token': '<|endoftext|>',
 'additional_special_tokens': ['<|endoftext|>',
  '<|im_start|>',
  '<|im_end|>',
  '<repo_name>',
  '<reponame>',
  '<file_sep>',
  '<filename>',
  '<gh_stars>',
  '<issue_start>',
  '<issue_comment>',
  '<issue_closed>',
  '<jupyter_start>',
  '<jupyter_text>',
  '<jupyter_code>',
  '<jupyter_output>',
  '<jupyter_script>',
  '<empty_output>']}

In [123]:
dataset = load_dataset(dataset_path, dataset_name)
dataset

DatasetDict({
    train: Dataset({
        features: ['full_topic', 'messages'],
        num_rows: 2260
    })
    test: Dataset({
        features: ['full_topic', 'messages'],
        num_rows: 119
    })
})

In [124]:
dataset["train"][0]["messages"]

[{'content': 'Hi there', 'role': 'user'},
 {'content': 'Hello! How can I help you today?', 'role': 'assistant'},
 {'content': "I'm looking for a beach resort for my next vacation. Can you recommend some popular ones?",
  'role': 'user'},
 {'content': "Some popular beach resorts include Maui in Hawaii, the Maldives, and the Bahamas. They're known for their beautiful beaches and crystal-clear waters.",
  'role': 'assistant'},
 {'content': 'That sounds great. Are there any resorts in the Caribbean that are good for families?',
  'role': 'user'},
 {'content': 'Yes, the Turks and Caicos Islands and Barbados are excellent choices for family-friendly resorts in the Caribbean. They offer a range of activities and amenities suitable for all ages.',
  'role': 'assistant'},
 {'content': "Okay, I'll look into those. Thanks for the recommendations!",
  'role': 'user'},
 {'content': "You're welcome. I hope you find the perfect resort for your vacation.",
  'role': 'assistant'}]

In [125]:
dataset["train"].features

{'full_topic': Value(dtype='string', id=None),
 'messages': [{'content': Value(dtype='string', id=None),
   'role': Value(dtype='string', id=None)}]}

In [126]:
dataset["train"][0]

{'full_topic': 'Travel/Vacation destinations/Beach resorts',
 'messages': [{'content': 'Hi there', 'role': 'user'},
  {'content': 'Hello! How can I help you today?', 'role': 'assistant'},
  {'content': "I'm looking for a beach resort for my next vacation. Can you recommend some popular ones?",
   'role': 'user'},
  {'content': "Some popular beach resorts include Maui in Hawaii, the Maldives, and the Bahamas. They're known for their beautiful beaches and crystal-clear waters.",
   'role': 'assistant'},
  {'content': 'That sounds great. Are there any resorts in the Caribbean that are good for families?',
   'role': 'user'},
  {'content': 'Yes, the Turks and Caicos Islands and Barbados are excellent choices for family-friendly resorts in the Caribbean. They offer a range of activities and amenities suitable for all ages.',
   'role': 'assistant'},
  {'content': "Okay, I'll look into those. Thanks for the recommendations!",
   'role': 'user'},
  {'content': "You're welcome. I hope you find

In [127]:
from transformers import AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

In [128]:
tokenizer.vocab_size

49152

In [129]:
print(":", tokenizer.decode(49151))
print(":", tokenizer.decode(49152))
print(":", tokenizer.decode(49153))

: ectable
: 
: 


In [130]:
tokenizer("<|pad|>")

{'input_ids': [44, 108, 16384, 108, 46], 'attention_mask': [1, 1, 1, 1, 1]}

In [131]:
tokenizer.special_tokens_map

{'bos_token': '<|endoftext|>',
 'eos_token': '<|endoftext|>',
 'unk_token': '<|endoftext|>',
 'additional_special_tokens': ['<|endoftext|>',
  '<|im_start|>',
  '<|im_end|>',
  '<repo_name>',
  '<reponame>',
  '<file_sep>',
  '<filename>',
  '<gh_stars>',
  '<issue_start>',
  '<issue_comment>',
  '<issue_closed>',
  '<jupyter_start>',
  '<jupyter_text>',
  '<jupyter_code>',
  '<jupyter_output>',
  '<jupyter_script>',
  '<empty_output>']}

In [132]:
len(tokenizer)

49152

In [133]:
result = tokenizer.add_special_tokens({'pad_token': '<|pad|>'})

In [134]:
len(tokenizer)

49153

In [135]:
tokenizer.vocab_size

49152

In [136]:
tokenizer.special_tokens_map

{'bos_token': '<|endoftext|>',
 'eos_token': '<|endoftext|>',
 'unk_token': '<|endoftext|>',
 'pad_token': '<|pad|>',
 'additional_special_tokens': ['<|endoftext|>',
  '<|im_start|>',
  '<|im_end|>',
  '<repo_name>',
  '<reponame>',
  '<file_sep>',
  '<filename>',
  '<gh_stars>',
  '<issue_start>',
  '<issue_comment>',
  '<issue_closed>',
  '<jupyter_start>',
  '<jupyter_text>',
  '<jupyter_code>',
  '<jupyter_output>',
  '<jupyter_script>',
  '<empty_output>']}

In [137]:
tokens = tokenizer("hello world")

In [138]:
print(tokenizer.decode(tokens["input_ids"][0]))
print(tokenizer.decode(tokens["input_ids"][1]))

hello
 world


In [139]:
print(tokenizer.convert_ids_to_tokens(tokens["input_ids"][0]))
print(tokenizer.convert_ids_to_tokens(tokens["input_ids"][1]))

hello
Ġworld


In [140]:
tokens = tokenizer(
    [
        "hello world I'm hugging face tokenizer",
        "hello world I'm hugging face tokenizer padding paddding",
    ],
    padding=True,
    truncation=True,
)
print(tokens)

{'input_ids': [[28120, 905, 339, 5248, 294, 19712, 2715, 46119, 49152, 49152, 49152], [28120, 905, 339, 5248, 294, 19712, 2715, 46119, 16809, 10567, 3443]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}


In [141]:
print(tokenizer.decode(tokens["input_ids"][0]))

hello world I'm hugging face tokenizer<|pad|><|pad|><|pad|>


In [142]:
sample_data = dataset["train"][0]["messages"]
print(sample_data)

[{'content': 'Hi there', 'role': 'user'}, {'content': 'Hello! How can I help you today?', 'role': 'assistant'}, {'content': "I'm looking for a beach resort for my next vacation. Can you recommend some popular ones?", 'role': 'user'}, {'content': "Some popular beach resorts include Maui in Hawaii, the Maldives, and the Bahamas. They're known for their beautiful beaches and crystal-clear waters.", 'role': 'assistant'}, {'content': 'That sounds great. Are there any resorts in the Caribbean that are good for families?', 'role': 'user'}, {'content': 'Yes, the Turks and Caicos Islands and Barbados are excellent choices for family-friendly resorts in the Caribbean. They offer a range of activities and amenities suitable for all ages.', 'role': 'assistant'}, {'content': "Okay, I'll look into those. Thanks for the recommendations!", 'role': 'user'}, {'content': "You're welcome. I hope you find the perfect resort for your vacation.", 'role': 'assistant'}]


In [143]:
print(":", tokenizer.chat_template)

: None


In [144]:
from trl import setup_chat_format

model, tokenizer = setup_chat_format(model, tokenizer)

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


In [145]:
print(tokenizer.chat_template)

{% for message in messages %}{{'<|im_start|>' + message['role'] + '
' + message['content'] + '<|im_end|>' + '
'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant
' }}{% endif %}


In [146]:
print(tokenizer.apply_chat_template(sample_data, tokenize=False))

<|im_start|>user
Hi there<|im_end|>
<|im_start|>assistant
Hello! How can I help you today?<|im_end|>
<|im_start|>user
I'm looking for a beach resort for my next vacation. Can you recommend some popular ones?<|im_end|>
<|im_start|>assistant
Some popular beach resorts include Maui in Hawaii, the Maldives, and the Bahamas. They're known for their beautiful beaches and crystal-clear waters.<|im_end|>
<|im_start|>user
That sounds great. Are there any resorts in the Caribbean that are good for families?<|im_end|>
<|im_start|>assistant
Yes, the Turks and Caicos Islands and Barbados are excellent choices for family-friendly resorts in the Caribbean. They offer a range of activities and amenities suitable for all ages.<|im_end|>
<|im_start|>user
Okay, I'll look into those. Thanks for the recommendations!<|im_end|>
<|im_start|>assistant
You're welcome. I hope you find the perfect resort for your vacation.<|im_end|>



In [147]:
dataset["train"][0]

{'full_topic': 'Travel/Vacation destinations/Beach resorts',
 'messages': [{'content': 'Hi there', 'role': 'user'},
  {'content': 'Hello! How can I help you today?', 'role': 'assistant'},
  {'content': "I'm looking for a beach resort for my next vacation. Can you recommend some popular ones?",
   'role': 'user'},
  {'content': "Some popular beach resorts include Maui in Hawaii, the Maldives, and the Bahamas. They're known for their beautiful beaches and crystal-clear waters.",
   'role': 'assistant'},
  {'content': 'That sounds great. Are there any resorts in the Caribbean that are good for families?',
   'role': 'user'},
  {'content': 'Yes, the Turks and Caicos Islands and Barbados are excellent choices for family-friendly resorts in the Caribbean. They offer a range of activities and amenities suitable for all ages.',
   'role': 'assistant'},
  {'content': "Okay, I'll look into those. Thanks for the recommendations!",
   'role': 'user'},
  {'content': "You're welcome. I hope you find

In [148]:
# To keep the data as a dataset, use the Dataset.map() method.
# HF datasets are Apace Arrow files. It keeps data on the disk and only loads the samples in memory when needed.

dataset = dataset.map(
    lambda x: {"input_ids": tokenizer.apply_chat_template(x["messages"])},
    batched=True,
)

Map:   0%|          | 0/119 [00:00<?, ? examples/s]

In [149]:
print(dataset["train"][0]["input_ids"])

[1, 4093, 198, 26843, 665, 2, 198, 1, 520, 9531, 198, 19556, 17, 1073, 416, 339, 724, 346, 1834, 47, 2, 198, 1, 4093, 198, 57, 5248, 3012, 327, 253, 10724, 14500, 327, 957, 1867, 17584, 30, 1978, 346, 3730, 634, 2378, 2911, 47, 2, 198, 1, 520, 9531, 198, 4449, 2378, 10724, 36088, 1453, 48326, 281, 14126, 28, 260, 48148, 898, 28, 284, 260, 44057, 30, 1069, 2316, 1343, 327, 480, 3953, 16351, 284, 13253, 29, 10086, 5656, 30, 2, 198, 1, 4093, 198, 5195, 4598, 1109, 30, 4184, 665, 750, 36088, 281, 260, 11981, 338, 359, 1123, 327, 3168, 47, 2, 198, 1, 520, 9531, 198, 10539, 28, 260, 25518, 284, 7784, 48096, 10015, 284, 47557, 395, 359, 5412, 4975, 327, 1564, 29, 9263, 36088, 281, 260, 11981, 30, 1069, 2626, 253, 1845, 282, 2123, 284, 32255, 5712, 327, 511, 6399, 30, 2, 198, 1, 4093, 198, 39122, 28, 339, 3060, 1492, 618, 967, 30, 10090, 327, 260, 7400, 17, 2, 198, 1, 520, 9531, 198, 2683, 2316, 10668, 30, 339, 3826, 346, 1042, 260, 3468, 14500, 327, 469, 17584, 30, 2, 198]


In [150]:
print(tokenizer.decode(dataset["train"][0]["input_ids"]))

<|im_start|>user
Hi there<|im_end|>
<|im_start|>assistant
Hello! How can I help you today?<|im_end|>
<|im_start|>user
I'm looking for a beach resort for my next vacation. Can you recommend some popular ones?<|im_end|>
<|im_start|>assistant
Some popular beach resorts include Maui in Hawaii, the Maldives, and the Bahamas. They're known for their beautiful beaches and crystal-clear waters.<|im_end|>
<|im_start|>user
That sounds great. Are there any resorts in the Caribbean that are good for families?<|im_end|>
<|im_start|>assistant
Yes, the Turks and Caicos Islands and Barbados are excellent choices for family-friendly resorts in the Caribbean. They offer a range of activities and amenities suitable for all ages.<|im_end|>
<|im_start|>user
Okay, I'll look into those. Thanks for the recommendations!<|im_end|>
<|im_start|>assistant
You're welcome. I hope you find the perfect resort for your vacation.<|im_end|>



# Collator

In [151]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [153]:
batch_sample = dataset["train"][1]
batch_sample = {k: v for k, v in batch_sample.items() if k in ["input_ids"]}
# print([len(v) for v in batch_sample["input_ids"]])

In [155]:
batch = data_collator(batch_sample)
{k: v.shape for k, v in batch.items()}

{'input_ids': torch.Size([224]), 'attention_mask': torch.Size([224])}

# Loss 

In [105]:
import torch

input = "Hi, how are you feeling now?"

batch = tokenizer(input, return_tensors="pt").to("mps")

In [106]:
batch

{'input_ids': tensor([[26843,    28,   638,   359,   346,  4330,  1209,    47]],
       device='mps:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1]], device='mps:0')}

In [107]:
batch["labels"] = batch["input_ids"].clone()

In [108]:
model.train()

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(49152, 576)
    (layers): ModuleList(
      (0-29): 30 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=576, out_features=576, bias=False)
          (k_proj): Linear(in_features=576, out_features=192, bias=False)
          (v_proj): Linear(in_features=576, out_features=192, bias=False)
          (o_proj): Linear(in_features=576, out_features=576, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=576, out_features=1536, bias=False)
          (up_proj): Linear(in_features=576, out_features=1536, bias=False)
          (down_proj): Linear(in_features=1536, out_features=576, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((576,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((576,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm((576,), eps=1e-05)
    (rotary_emb): LlamaRotaryEm

In [109]:
outputs = model(**batch)

In [113]:
outputs

CausalLMOutputWithPast(loss=tensor(2.4997, device='mps:0', grad_fn=<NllLossBackward0>), logits=tensor([[[ 19.0812,   7.8402,   7.9945,  ...,  14.6980,  17.8165,  15.4678],
         [  3.3337, -13.5207, -13.4428,  ...,  -5.5125,   3.0314,  -4.4235],
         [ 13.7915,  -3.0126,  -2.8766,  ...,   6.5830,  10.6706,   5.3863],
         ...,
         [  7.8826,  -8.5522,  -8.5129,  ...,  -2.4695,   6.1667,  -3.0178],
         [  3.6068, -11.8111, -11.7401,  ...,  -5.4752,   2.4249,  -6.6690],
         [  6.6683, -11.8877, -11.8476,  ...,  -5.0822,   2.9220,  -3.3430]]],
       device='mps:0', grad_fn=<LinearBackward0>), past_key_values=DynamicCache(), hidden_states=None, attentions=None)

In [101]:
print(outputs["logits"][0].shape)

tensor([[ 19.0812,   7.8402,   7.9945,  ...,  14.6980,  17.8165,  15.4678],
        [  3.3337, -13.5207, -13.4428,  ...,  -5.5125,   3.0314,  -4.4235],
        [ 13.7915,  -3.0126,  -2.8766,  ...,   6.5830,  10.6706,   5.3863],
        ...,
        [  7.8826,  -8.5522,  -8.5129,  ...,  -2.4695,   6.1667,  -3.0178],
        [  3.6068, -11.8111, -11.7401,  ...,  -5.4752,   2.4249,  -6.6690],
        [  6.6683, -11.8877, -11.8476,  ...,  -5.0822,   2.9220,  -3.3430]],
       device='mps:0', grad_fn=<SelectBackward0>)
torch.Size([8, 49152])


In [102]:
print(f"{'Input':30} | {'GroundTruth':12} | {'Prediction':10} | {'Top10':35}")
print("-" * 100)

for i, logits in enumerate(ouputs["logits"][0]):
    if i + 1 >= len(batch["input_ids"][0]):
        break
    values, token_ids = torch.topk(logits, k=10)
    input = (
        tokenizer.decode(batch["input_ids"][0][: i + 1]).strip().replace("\n", "\\n")
    )
    truth = tokenizer.decode(batch["input_ids"][0][i + 1]).strip().replace("\n", "\\n")
    pred = tokenizer.decode(token_ids[0]).strip().replace("\n", "\\n")
    top_5 = tokenizer.decode(token_ids).strip().replace("\n", "\\n")

    print(f"{input:30} | {truth:12} | {pred:10} | {top_5:35}")

Input                          | GroundTruth  | Prediction | Top10                              
----------------------------------------------------------------------------------------------------
Hi                             | ,            | ,          | ,.!- to?:);]                       
Hi,                            | how          | I          | I you it that Mr\n this my how thanks
Hi, how                        | are          | can        | can are do did about many coulddy much is
Hi, how are                    | you          | you        | you we the your things they my these YOU ya
Hi, how are you                | feeling      | doing      | doing? feeling?", going today?” getting all
Hi, how are you feeling        | now          | ?          | ? today?",?” about this now right so
Hi, how are you feeling now    | ?            | ?          | ??",?”!.?' that?! and              


In [103]:
# last logit is for the prediction after the last input token. 
# we don't have ground truth for it.

tokenizer.decode(torch.argmax(ouputs["logits"][0][-1]))

'\n'

Loss is a single float value, meaning it has been summed up losses across all tokens and averaged them. 
When this loss is backpropagated, the loss for each token will be applied separately though. Keep in mind that this loss value is a sum of all weights and input embedding vector's multiplications. When derivate this w.r.t certain parameter, you are going to apply that to all losses of the tokens. 

In [114]:
outputs.loss

tensor(2.4997, device='mps:0', grad_fn=<NllLossBackward0>)

# Full code for training


In [56]:

from torch.utils.data import DataLoader
from transformers import AdamW, get_scheduler

train_dataloader = DataLoader(
    tokenized_datasets["train"], shuffle=True, batch_size=8, collate_fn=data_collator
)

optimizer = AdamW(model.parameters(), lr=3e-5)

lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

model.train()
for batch in train_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    outputs = model(**batch)

    loss = outputs.loss
    loss.backward()

    optimizer.step()
    lr_scheduler.step()
    optimizer.zero_grad()