In [17]:
from datasets import load_from_disk
from datasets.formatting.formatting import LazyBatch

from special_tokens import special_tokens

In [18]:
# On-disk locations of the tokenized splits that later cells load.
train_dataset = "tokenized_data/robots_train"
test_dataset = "tokenized_data/robots_test"

# Knobs passed to the batched `.map(...)` calls further down.
processes = 8
batch_size = 2_000

In [19]:
# Import from the public package rather than the private extension module.
from tokenizers import Tokenizer

tokenizer = Tokenizer.from_file("tokenizer.json")

# Ids of the marker tokens that open and close an assistant turn.
# NOTE: the name `eot_it` is read by `enrich_chat`, so it is kept as-is.
assistant_id = tokenizer.token_to_id(special_tokens["assistant"])
eot_it = tokenizer.token_to_id(special_tokens["end_of_turn"])

# `token_to_id` returns None for a token that is not in the vocabulary.
# With a None id the mask comparison never matches and every mask would
# come out all-False without any error, so fail loudly here instead.
if assistant_id is None or eot_it is None:
    raise ValueError(
        "tokenizer.json does not contain the 'assistant' and/or "
        "'end_of_turn' special tokens"
    )

In [20]:
def inspect_dataset(name):
    """Print the first row of the dataset saved at `name`, one token per line.

    Each printed line holds the row's `assistant_mask` value for that
    position followed by the token string looked up through the
    module-level `tokenizer`.
    """
    first_row = next(iter(load_from_disk(name).take(1)))
    ids = first_row["tokens"]
    flags = first_row["assistant_mask"]
    # Resolve every id to its token string up front, then print pairwise.
    pieces = [tokenizer.id_to_token(token_id) for token_id in ids]
    for position in range(len(pieces)):
        print(flags[position], pieces[position])

In [21]:
def enrich_chat(batch: LazyBatch):
    """Compute a per-token assistant mask for every sequence in the batch.

    For each sequence in ``batch["tokens"]`` a list of booleans of the same
    length is built. A position is True from an `assistant_id` token up to
    and including the next `eot_it` token; every other position is False
    (an `eot_it` seen outside an assistant span is therefore False).

    Returns:
        A dict with the single key ``"assistant_mask"`` holding one mask
        list per input sequence, in the same order.
    """
    masks = []
    for sequence in batch["tokens"]:
        mask = []
        in_assistant_turn = False
        for token_id in sequence:
            if token_id == assistant_id:
                # The opening marker itself belongs to the assistant span.
                in_assistant_turn = True
                mask.append(True)
            elif token_id == eot_it:
                # The closing marker inherits the current state, then ends the span.
                mask.append(in_assistant_turn)
                in_assistant_turn = False
            else:
                mask.append(in_assistant_turn)
        masks.append(mask)
    return {"assistant_mask": masks}

In [22]:
output = "tokenized_data/test_chats"

# Add the `assistant_mask` column to the test split, then persist it.
test_chats = load_from_disk(test_dataset).map(
    enrich_chat,
    batched=True,
    batch_size=batch_size,
    num_proc=processes,
)
test_chats.save_to_disk(output)

Map (num_proc=8):   0%|          | 0/500 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/500 [00:00<?, ? examples/s]

In [23]:
# Spot-check: print the first row of the dataset at `output`, token by token with its mask flag.
inspect_dataset(output)

False <|bos|>
False <|system|>
False A
False ster
False Ġis
False Ġa
False Ġchat
False bot
False Ġwho
False Ġanswers
False Ġquestions
False Ġwith
False Ġrh
False ym
False es
False .
False <|endofturn|>
False Ċ
False <|user|>
False Where
False Ġdid
False Ġchocolate
False Ġoriginate
False ?
False <|endofturn|>
False Ċ
True <|assistant|>
True Ch
True ocolate
True Ġis
True Ġ4
True 000
True Ġyears
True Ġold
True /
True Mex
True ico
True Ġis
True Ġwhere
True Ġit
True Ġwas
True Ġfirst
True Ġsold
True <|endofturn|>
False Ċ
False <|user|>
False Where
False Ġwas
False Ġmilk
False Ġchocolate
False Ġinvented
False ?
False <|endofturn|>
False Ċ
True <|assistant|>
True Sw
True itzerland
True Ġwas
True Ġthe
True Ġfirst
True Ġto
True Ġadd
True Ġmilk
True /
True To
True Ġmake
True Ġtheir
True Ġchocolate
True Ġsmooth
True Ġas
True Ġsilk
True <|endofturn|>
False Ċ
False <|user|>
False What
False Ġare
False Ġsome
False Ġgood
False Ġdess
False erts
False Ġthat
False Ġuse
False Ġchocolate
False ?
False <|en

In [24]:
output = "tokenized_data/train_chats"

# Add the `assistant_mask` column to the train split, then persist it.
train_chats = load_from_disk(train_dataset).map(
    enrich_chat,
    batched=True,
    batch_size=batch_size,
    num_proc=processes,
)
train_chats.save_to_disk(output)

# Spot-check the first saved row.
inspect_dataset(output)

Map (num_proc=8):   0%|          | 0/9500 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/9500 [00:00<?, ? examples/s]

False <|bos|>
False <|user|>
False P
False lease
False Ġsummar
False ize
False Ġthe
False Ġgoals
False Ġfor
False Ġscientists
False Ġin
False Ġthis
False Ġtext
False :
False Ċ
False Ċ
False With
False in
False Ġthree
False Ġdays
False ,
False Ġthe
False Ġinter
False tw
False ined
False Ġcup
False Ġnest
False Ġof
False Ġgrass
False es
False Ġwas
False Ġcomplete
False ,
False Ġfeaturing
False Ġa
False Ġcan
False opy
False Ġof
False Ġover
False hang
False ing
False Ġgrass
False es
False Ġto
False Ġconce
False al
False Ġit
False .
False ĠAnd
False Ġdecades
False Ġlater
False ,
False Ġit
False Ġserved
False Ġas
False ĠR
False ink
False ert
False âĢĻ
False s
False Ġport
False al
False Ġto
False Ġthe
False Ġpast
False Ġinside
False Ġthe
False ĠCalifornia
False ĠAcademy
False Ġof
False ĠSciences
False .
False ĠInformation
False Ġg
False lean
False ed
False Ġfrom
False Ġsuch
False Ġnests
False ,
False Ġw
False oven
False Ġlong
False Ġago
False Ġfrom
False Ġspecies
False Ġin
False Ġplant
False Ġ