In [1]:
import json
import shutil

import streaming
from streaming import MDSWriter
from streaming import StreamingDataset, LocalDataset
from tqdm import tqdm
from transformers import AutoTokenizer, DataCollatorForLanguageModeling

In [2]:
# Load the tokenizer published under the 'mesolitica/mallam-5B-4096' checkpoint name.
tokenizer = AutoTokenizer.from_pretrained('mesolitica/mallam-5B-4096')

In [3]:
# Ask the tokenizer not to add BOS/EOS on its own: generate_and_tokenize_prompt
# writes the literal '<s>' and '</s>' markers into the prompt text itself.
tokenizer.add_bos_token = False
tokenizer.add_eos_token = False
# Pad on the right-hand side whenever sequences are padded.
tokenizer.padding_side = "right"

In [4]:
# MDS schema: every sample carries a single string field named 'text'.
columns = {'text': 'str'}

# Hash algorithm names handed to MDSWriter via its `hashes` argument.
hashes = ('sha1', 'xxh64')

In [5]:
# Remove any output left over from a previous run so the writer cell below
# starts from a clean directory. Pure-Python replacement for `!rm -rf`, so the
# cell also works where no POSIX shell is available; a missing directory is
# silently ignored.
shutil.rmtree('mosaic-instructions-no-ultrachat', ignore_errors=True)

In [7]:
# Convert the shuffled JSONL file into MDS shards, one sample per JSONL line.
# Each line is stored verbatim (still JSON-encoded) under the 'text' column and
# is decoded later with json.loads when a sample is read.
with MDSWriter(
    out='mosaic-instructions-no-ultrachat',
    columns=columns,
    compression=None,
    hashes=hashes,
) as out, open('shuf-combine-malay-multitasks-no-ultrachat.jsonl', encoding='utf-8') as fopen:
    # encoding is explicit so the result does not depend on the platform locale.
    for line in tqdm(fopen):
        # A blank line is not a JSON document and would make json.loads fail
        # when the sample is read back, so it is not written as a sample.
        if not line.strip():
            continue
        out.write({'text': line})

1413385it [00:16, 87439.30it/s] 


In [8]:
# Open the MDS directory written above so samples can be read back by index.
dataset = LocalDataset('mosaic-instructions-no-ultrachat')

In [9]:
# Sanity check: decode the first stored sample's 'text' field back into a dict.
json.loads(dataset[0]['text'])

{'prompt_input': None,
 'input': 'Tulis satu butiran SQL untuk menyenaraikan nama 3 pelajar terbaik yang mencapai markah purata tertinggi dalam semua peperiksaan, sambil mengecualikan mana-mana pelajar yang mendapat markah di bawah 60% dalam mana-mana peperiksaan individu. Bagi ajaran Malaysia, tulis satu pertanyaan SQL untuk menyenaraikan nama 3 pelajar terbaik yang mempunyai kehadiran sekolah tertinggi dalam semua subjek, selain daripada mana-mana pelajar yang mempunyai kehadiran sekolah kurang daripada 90% dalam mana-mana subjek individu.',
 'output': 'Untuk menyenaraikan nama-nama 3 pelajar terbaik yang mencapai markah purata tertinggi dalam semua peperiksaan, sambil mengecualikan pelajar yang memperoleh markah di bawah 60% dalam mana-mana peperiksaan individu, query SQL yang boleh digunakan adalah:\n\n```\nSELECT pelajar.nama\nFROM pelajar\nWHERE pelajar.id IN (\n    SELECT markah.pelajar_id\n    FROM markah\n    GROUP BY markah.pelajar_id\n    HAVING AVG(markah.markah) >= 60\n)\n

In [10]:
import torch

# Token cap per sample: passed as max_length (with truncation=True) when
# DatasetFixed.__getitem__ tokenizes a prompt.
# NOTE(review): the tokenizer checkpoint name ends in "4096" while this value
# is 20480 — confirm the intended context length.
block_size = 20480

def generate_and_tokenize_prompt(row):
    """Render one instruction record as a single ``[INST]``-style prompt string.

    Despite the name, no tokenization happens here; the caller tokenizes the
    returned text.

    Args:
        row: Mapping with ``'input'`` and ``'output'`` entries and an optional
            ``'function_call'`` entry. When ``'output'`` is ``None`` and
            ``'input'`` contains ``'<bot>:'``, ``'input'`` is parsed as a
            multi-turn transcript delimited by ``'<manusia>:'`` and
            ``'<bot>:'`` markers. Otherwise the record is a single turn.
            A ``None`` input or output is treated as empty text.

    Returns:
        ``{'text': prompt}`` where ``prompt`` starts with ``'<s>'``, is
        optionally followed by a ``[FUNCTIONCALL]`` section, and then contains
        one ``'[INST] {user} [/INST] {assistant}</s>'`` segment per turn.

    Raises:
        KeyError: If ``row`` has no ``'input'`` or ``'output'`` entry.
    """
    texts = ['<s>']

    if 'function_call' in row:
        t = row['function_call']
        texts.append(f'\n[FUNCTIONCALL]\n{t}\n')

    # Normalise None to '' up front so neither the marker test nor .strip()
    # can fail on a record with a missing side.
    raw_input = row['input'] if row['input'] is not None else ''
    raw_output = row['output']

    if raw_output is None and '<bot>:' in raw_input:
        inputs, outputs = [], []
        segments = raw_input.split('<bot>:')
        # segments[i] holds the user text that precedes bot reply i + 1.
        for i in range(len(segments) - 1):
            if i == 0:
                human = segments[0].replace('<manusia>:', '')
            else:
                parts = segments[i].split('<manusia>:')
                if len(parts) < 2:
                    # Two bot replies with no user marker in between: there is
                    # no user turn to pair with the next reply, so skip it.
                    continue
                human = parts[1]
            # The reply ends where the next user marker (if any) begins.
            bot = segments[i + 1].split('<manusia>:')[0]
            inputs.append(human.strip())
            outputs.append(bot.strip())
    else:
        inputs = [raw_input]
        outputs = [raw_output if raw_output is not None else '']

    for u, a in zip(inputs, outputs):
        texts.append(f'[INST] {u.strip()} [/INST] {a.strip()}</s>')

    prompt = ''.join(texts)
    return {'text': prompt}

class DatasetFixed(torch.utils.data.Dataset):
    """Map-style dataset that reads JSON records from an MDS directory and
    tokenizes the rendered prompt on access."""

    def __init__(self, remote):
        """Open the MDS directory at ``remote`` through ``LocalDataset``.

        Stale streaming shared memory is cleaned up before the dataset is opened.
        """
        streaming.base.util.clean_stale_shared_memory()
        self.dataset = LocalDataset(local=remote)

    def __len__(self):
        """Return the number of samples in the underlying dataset."""
        return len(self.dataset)

    def __getitem__(self, idx):
        """Return ``input_ids`` and ``attention_mask`` for sample ``idx``.

        The stored 'text' field is decoded from JSON, rendered into a prompt,
        and tokenized without padding, truncated to ``block_size`` tokens.
        """
        record = json.loads(self.dataset[idx]['text'])
        prompt_text = generate_and_tokenize_prompt(record)['text']
        encoded = tokenizer(
            prompt_text,
            max_length=block_size,
            truncation=True,
            padding=False,
            return_length=False,
            return_overflowing_tokens=False,
        )
        return {key: encoded[key] for key in ("input_ids", "attention_mask")}

In [12]:
# Wrap the MDS directory in the tokenizing dataset class defined above.
# Note: this rebinds `dataset`, which previously held the raw LocalDataset.
dataset = DatasetFixed('mosaic-instructions-no-ultrachat')

In [14]:
# Number of samples exposed by the wrapped dataset.
len(dataset)

1413385

In [15]:
# Collator with masked-LM disabled (mlm=False). In the sample output below it
# pads the batch and emits `labels` that mirror `input_ids`, with padded
# positions set to -100.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [16]:
# Tokenize the first five samples to smoke-test the collator.
batch = [dataset[i] for i in range(5)]

In [17]:
# Collate the five samples into padded tensors and display the result.
data_collator(batch)

You're using a PreTrainedTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'input_ids': tensor([[   1, 1588,  395,  ...,    0,    0,    0],
        [   1, 1588,  395,  ...,    0,    0,    0],
        [   1, 1588,  395,  ...,    0,    0,    0],
        [   1, 1588,  395,  ...,    0,    0,    0],
        [   1, 1588,  395,  ..., 2246,   17,    2]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1]]), 'labels': tensor([[   1, 1588,  395,  ..., -100, -100, -100],
        [   1, 1588,  395,  ..., -100, -100, -100],
        [   1, 1588,  395,  ..., -100, -100, -100],
        [   1, 1588,  395,  ..., -100, -100, -100],
        [   1, 1588,  395,  ..., 2246,   17,    2]])}