In [2]:
from datasets import load_from_disk
from tokenizers import Tokenizer

from chat_template import chat_template
from special_tokens import special_tokens

In [3]:
# Raw chats: each conversation is a list of {"role", "content"} turns.
conversations = [
    [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Hello."}
    ],
    [
        {"role": "system", "content": "You are a helpful assistant who talks like a robot."},
        {"role": "user", "content": "Hello, I'm a robot."}
    ]
]

# Render every conversation to a single prompt string.
# The raw turns live under their own name so this cell can be re-run safely:
# rebinding `messages` in place would feed already-rendered strings back into
# chat_template on the second execution.
messages = [chat_template(conversation) for conversation in conversations]
messages

['<|system|>You are a helpful assistant.<|endofturn|>\n<|user|>Hello.<|endofturn|>\n',
 "<|system|>You are a helpful assistant who talks like a robot.<|endofturn|>\n<|user|>Hello, I'm a robot.<|endofturn|>\n"]

In [4]:
# Path of the serialized tokenizer definition, relative to the notebook.
file_name = "./tokenizer.json"
# Rebuild the tokenizer from that JSON file.
tokenizer = Tokenizer.from_file(file_name)

In [5]:
# Resolve the padding token and its id from the loaded vocabulary.
pad_token = special_tokens["pad"]
pad_id = tokenizer.token_to_id(pad_token)

# token_to_id returns None for a token that is not in the vocabulary; stop here
# instead of configuring padding with an invalid id.
assert pad_id is not None, f"pad token {pad_token!r} is not in the tokenizer vocabulary"

# Pad on the right of each sequence. pad_token is passed explicitly because
# enable_padding otherwise labels padded positions with its default "[PAD]"
# string, independent of which token pad_id actually refers to.
tokenizer.enable_padding(direction="right", pad_id=pad_id, pad_token=pad_token)

In [6]:
# Encode the first rendered chat and show the resulting token strings.
encoding = tokenizer.encode(messages[0])
print(encoding.tokens)

# Decode the ids back to text. skip_special_tokens=True is the library default,
# written out here because it is why the role/bos/eos markers are absent below.
decoding = tokenizer.decode(encoding.ids, skip_special_tokens=True)
print(decoding)

['<|bos|>', '<|system|>', 'You', 'Ġare', 'Ġa', 'Ġhelpful', 'Ġassistant', '.', '<|endofturn|>', 'Ċ', '<|user|>', 'Hello', '.', '<|endofturn|>', 'Ċ', '<|eos|>']
You are a helpful assistant.
Hello.



In [7]:
# Encode both chats together; with padding enabled the shorter one is padded
# up to the length of the longest sequence in the batch.
tokenized_messages = tokenizer.encode_batch(messages)
for encoded_chat in tokenized_messages:
    print(encoded_chat.tokens)

['<|bos|>', '<|system|>', 'You', 'Ġare', 'Ġa', 'Ġhelpful', 'Ġassistant', '.', '<|endofturn|>', 'Ċ', '<|user|>', 'Hello', '.', '<|endofturn|>', 'Ċ', '<|eos|>', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]']
['<|bos|>', '<|system|>', 'You', 'Ġare', 'Ġa', 'Ġhelpful', 'Ġassistant', 'Ġwho', 'Ġtalks', 'Ġlike', 'Ġa', 'Ġrobot', '.', '<|endofturn|>', 'Ċ', '<|user|>', 'Hello', ',', 'ĠI', "'m", 'Ġa', 'Ġrobot', '.', '<|endofturn|>', 'Ċ', '<|eos|>']


In [8]:
# One mask per encoded chat: 1 marks a real token, 0 marks a padded position.
for encoded_chat in tokenized_messages:
    print(encoded_chat.attention_mask)

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


In [9]:
# Cap every encoding at 10 tokens, trimming from the left side of the sequence
# so that the end of the chat is what gets kept.
tokenizer.enable_truncation(max_length=10, direction="left")

In [10]:
# Re-encode the same chats now that truncation is active and inspect the tokens.
tokenized_messages = tokenizer.encode_batch(messages)
for encoded_chat in tokenized_messages:
    print(encoded_chat.tokens)

['<|bos|>', '.', '<|endofturn|>', 'Ċ', '<|user|>', 'Hello', '.', '<|endofturn|>', 'Ċ', '<|eos|>']
['<|bos|>', ',', 'ĠI', "'m", 'Ġa', 'Ġrobot', '.', '<|endofturn|>', 'Ċ', '<|eos|>']


In [11]:
# Load the pre-tokenized test split from disk and keep a 10-row sample.
ds = load_from_disk("tokenized_data/robots_test").take(10)
# Pull the "tokens" column out of each row (lists of token ids, as decoded below).
tokenized_messages = [row["tokens"] for row in ds]

In [12]:
# Decode the first sample to text; special tokens are skipped (the default).
tokenizer.decode(tokenized_messages[0], skip_special_tokens=True)

'Aster is a chatbot who answers questions with rhymes.\nWhere did chocolate originate?\nChocolate is 4000 years old/Mexico is where it was first sold\nWhere was milk chocolate invented?\nSwitzerland was the first to add milk/To make their chocolate smooth as silk\nWhat are some good desserts that use chocolate?\nPie, tart, cookies, and cake/Chocolate is great to bake\n'

In [13]:
# Map each id of the first sample to its token string, special tokens included.
[tokenizer.id_to_token(token_id) for token_id in tokenized_messages[0]]

['<|bos|>',
 '<|system|>',
 'A',
 'ster',
 'Ġis',
 'Ġa',
 'Ġchat',
 'bot',
 'Ġwho',
 'Ġanswers',
 'Ġquestions',
 'Ġwith',
 'Ġrh',
 'ym',
 'es',
 '.',
 '<|endofturn|>',
 'Ċ',
 '<|user|>',
 'Where',
 'Ġdid',
 'Ġchocolate',
 'Ġoriginate',
 '?',
 '<|endofturn|>',
 'Ċ',
 '<|assistant|>',
 'Ch',
 'ocolate',
 'Ġis',
 'Ġ4',
 '000',
 'Ġyears',
 'Ġold',
 '/',
 'Mex',
 'ico',
 'Ġis',
 'Ġwhere',
 'Ġit',
 'Ġwas',
 'Ġfirst',
 'Ġsold',
 '<|endofturn|>',
 'Ċ',
 '<|user|>',
 'Where',
 'Ġwas',
 'Ġmilk',
 'Ġchocolate',
 'Ġinvented',
 '?',
 '<|endofturn|>',
 'Ċ',
 '<|assistant|>',
 'Sw',
 'itzerland',
 'Ġwas',
 'Ġthe',
 'Ġfirst',
 'Ġto',
 'Ġadd',
 'Ġmilk',
 '/',
 'To',
 'Ġmake',
 'Ġtheir',
 'Ġchocolate',
 'Ġsmooth',
 'Ġas',
 'Ġsilk',
 '<|endofturn|>',
 'Ċ',
 '<|user|>',
 'What',
 'Ġare',
 'Ġsome',
 'Ġgood',
 'Ġdess',
 'erts',
 'Ġthat',
 'Ġuse',
 'Ġchocolate',
 '?',
 '<|endofturn|>',
 'Ċ',
 '<|assistant|>',
 'P',
 'ie',
 ',',
 'Ġt',
 'art',
 ',',
 'Ġcookies',
 ',',
 'Ġand',
 'Ġcake',
 '/',
 'Ch',
 'ocolate

In [14]:
# Token count of every sample, one per line.
for token_ids in tokenized_messages:
    print(len(token_ids))

108
648
279
433
337
134
1807
93
103
328


In [15]:
# Length of the longest sample; used as the common padded length.
max_length = max(map(len, tokenized_messages))
max_length

1807

In [16]:
def pad_array(arr, target_size, pad_element):
    """Return a copy of ``arr`` that is exactly ``target_size`` elements long.

    Shorter inputs are extended on the right with ``pad_element``; longer
    inputs are cut down to their first ``target_size`` elements. The input
    list itself is never modified.

    Args:
        arr: List to pad or trim.
        target_size: Desired length of the result; must be non-negative.
        pad_element: Value appended to fill the missing positions.

    Returns:
        A new list of length ``target_size``.

    Raises:
        ValueError: If ``target_size`` is negative.
    """
    # A negative size would otherwise fall into the trim branch, where
    # arr[:negative] silently drops elements from the end instead of failing.
    if target_size < 0:
        raise ValueError(f"target_size must be non-negative, got {target_size}")

    missing = target_size - len(arr)
    if missing <= 0:
        return arr[:target_size]  # Already long enough: keep the leading part
    return arr + [pad_element] * missing

In [17]:
# Bring every sample to the common length max_length, filling with pad_id.
tokenized_messages = [pad_array(token_ids, max_length, pad_id) for token_ids in tokenized_messages]
tokenized_messages

[[2,
  6,
  41,
  1470,
  263,
  214,
  19034,
  13763,
  524,
  23377,
  5846,
  326,
  9283,
  2125,
  228,
  22,
  1,
  176,
  4,
  17737,
  1503,
  13332,
  21409,
  39,
  1,
  176,
  5,
  1947,
  11369,
  263,
  666,
  1256,
  856,
  1585,
  23,
  23195,
  2134,
  263,
  851,
  339,
  273,
  535,
  2030,
  1,
  176,
  4,
  17737,
  273,
  8411,
  13332,
  9105,
  39,
  1,
  176,
  5,
  11740,
  7232,
  273,
  219,
  535,
  248,
  1103,
  8411,
  23,
  4580,
  1243,
  512,
  13332,
  7227,
  294,
  18875,
  1,
  176,
  4,
  5815,
  358,
  769,
  1907,
  18660,
  10734,
  349,
  845,
  13332,
  39,
  1,
  176,
  5,
  56,
  478,
  20,
  213,
  366,
  20,
  12666,
  20,
  239,
  8609,
  23,
  1947,
  11369,
  263,
  1796,
  248,
  242,
  629,
  1,
  176,
  3,
  7,
  7,
  7,
  7,
  7,
  7,
  7,
  7,
  7,
  7,
  7,
  7,
  7,
  7,
  7,
  7,
  7,
  7,
  7,
  7,
  7,
  7,
  7,
  7,
  7,
  7,
  7,
  7,
  7,
  7,
  7,
  7,
  7,
  7,
  7,
  7,
  7,
  7,
  7,
  7,
  7,
  7,
  7,
  7,
  7,
  7,

In [18]:
# Token strings of the first sample after padding.
[tokenizer.id_to_token(token_id) for token_id in tokenized_messages[0]]

['<|bos|>',
 '<|system|>',
 'A',
 'ster',
 'Ġis',
 'Ġa',
 'Ġchat',
 'bot',
 'Ġwho',
 'Ġanswers',
 'Ġquestions',
 'Ġwith',
 'Ġrh',
 'ym',
 'es',
 '.',
 '<|endofturn|>',
 'Ċ',
 '<|user|>',
 'Where',
 'Ġdid',
 'Ġchocolate',
 'Ġoriginate',
 '?',
 '<|endofturn|>',
 'Ċ',
 '<|assistant|>',
 'Ch',
 'ocolate',
 'Ġis',
 'Ġ4',
 '000',
 'Ġyears',
 'Ġold',
 '/',
 'Mex',
 'ico',
 'Ġis',
 'Ġwhere',
 'Ġit',
 'Ġwas',
 'Ġfirst',
 'Ġsold',
 '<|endofturn|>',
 'Ċ',
 '<|user|>',
 'Where',
 'Ġwas',
 'Ġmilk',
 'Ġchocolate',
 'Ġinvented',
 '?',
 '<|endofturn|>',
 'Ċ',
 '<|assistant|>',
 'Sw',
 'itzerland',
 'Ġwas',
 'Ġthe',
 'Ġfirst',
 'Ġto',
 'Ġadd',
 'Ġmilk',
 '/',
 'To',
 'Ġmake',
 'Ġtheir',
 'Ġchocolate',
 'Ġsmooth',
 'Ġas',
 'Ġsilk',
 '<|endofturn|>',
 'Ċ',
 '<|user|>',
 'What',
 'Ġare',
 'Ġsome',
 'Ġgood',
 'Ġdess',
 'erts',
 'Ġthat',
 'Ġuse',
 'Ġchocolate',
 '?',
 '<|endofturn|>',
 'Ċ',
 '<|assistant|>',
 'P',
 'ie',
 ',',
 'Ġt',
 'art',
 ',',
 'Ġcookies',
 ',',
 'Ġand',
 'Ġcake',
 '/',
 'Ch',
 'ocolate