In [1]:
# !wget https://huggingface.co/datasets/mesolitica/instructions-dataset/resolve/main/shuf-combine-malay-no-alignment-multitasks-v5.jsonl

In [2]:
from transformers import AutoTokenizer

# Load the instruction-tuned Gemma 2B tokenizer; its chat template is what
# `apply_chat_template` uses to render conversations further down.
tokenizer = AutoTokenizer.from_pretrained('google/gemma-2b-it')

# Padding configuration: pad on the right and reuse the unknown token as the
# padding token.
tokenizer.padding_side = "right"
tokenizer.pad_token = tokenizer.unk_token

# Do not let the tokenizer add BOS/EOS on its own when encoding text.
# NOTE(review): the chat template already starts with `{{ bos_token }}`, so
# leaving `add_bos_token` enabled would presumably duplicate it -- confirm.
tokenizer.add_bos_token = False
tokenizer.add_eos_token = False

In [3]:
# Display the chat template string that `apply_chat_template` renders with.
tokenizer.chat_template

"{{ bos_token }}{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '<start_of_turn>' + role + '\n' + message['content'] | trim + '<end_of_turn>\n' }}{% endfor %}{% if add_generation_prompt %}{{'<start_of_turn>model\n'}}{% endif %}"

In [4]:
def generate_and_tokenize_prompt(row):
    """
    Render one dataset row as chat-template text.

    Two row layouts are handled:

    * Multi-turn: ``row['output']`` is ``None`` and ``row['input']`` holds the
      whole conversation, with ``<manusia>:`` marking user turns and
      ``<bot>:`` marking model turns. The text is split into
      (user, model) pairs.
    * Single-turn: ``row['input']`` is the user message and ``row['output']``
      is the model reply.

    Despite the name, no tokenization happens here; only the prompt string is
    built with the module-level ``tokenizer``'s chat template.

    Parameters
    ----------
    row : mapping
        Must provide the keys ``'input'`` and ``'output'``.

    Returns
    -------
    dict
        ``{'text': prompt}`` where ``prompt`` is the rendered conversation.

    Raises
    ------
    AttributeError
        If ``row['output']`` is ``None`` but ``row['input']`` contains no
        ``<bot>:`` marker (``None.strip()`` is attempted).
    """
    if '<bot>:' in row['input'] and row['output'] is None:
        inputs, outputs = [], []
        splitted = row['input'].split('<bot>:')
        # Segment i ends with the user turn; segment i + 1 starts with the
        # model's answer to it.
        for i in range(len(splitted) - 1):
            if i == 0:
                # The first segment is only the opening user message.
                human = splitted[i].replace('<manusia>:', '')
            else:
                # Later segments look like "<model reply><manusia>:<user turn>".
                parts = splitted[i].split('<manusia>:')
                if len(parts) < 2:
                    # No user marker in this segment: skip the malformed turn.
                    # Only this expected condition is skipped, so
                    # KeyboardInterrupt / SystemExit are no longer swallowed.
                    continue
                human = parts[1]
            bot = splitted[i + 1].split('<manusia>:')[0]
            inputs.append(human)
            outputs.append(bot)
    else:
        inputs = [row['input']]
        outputs = [row['output']]

    chat = []
    for user_text, model_text in zip(inputs, outputs):
        chat.extend([
            {'role': 'user', 'content': user_text.strip()},
            {'role': 'assistant', 'content': model_text.strip()},
        ])
    prompt = tokenizer.apply_chat_template(chat, tokenize=False)
    return {'text': prompt}

In [5]:
# Delete any previous `packing-gemma` directory before it is written again below.
!rm -rf packing-gemma

In [6]:
from streaming import MDSWriter
from streaming.base.format.mds.encodings import Encoding, _encodings
from streaming import LocalDataset
import streaming
import numpy as np
from tqdm import tqdm
import json

class UInt32(Encoding):
    """
    MDS column encoding that stores integer arrays as raw uint32 bytes.

    Only the flat element data is serialised: shape is not recorded, so
    `decode` always yields a one-dimensional array.
    """

    def encode(self, obj) -> bytes:
        """
        Serialise `obj` as the raw bytes of a uint32 array.

        `obj` is coerced to ``np.uint32`` first so the bytes always match what
        `decode` expects; an array that is already uint32 is serialised
        unchanged. Other integer dtypes are cast by NumPy's usual rules.
        """
        return np.asarray(obj, dtype=np.uint32).tobytes()

    def decode(self, data: bytes):
        """Reinterpret `data` as a flat array of uint32 values."""
        return np.frombuffer(data, dtype=np.uint32)

# Register the encoding under the name used in the `columns` spec.
_encodings['uint32'] = UInt32

# Single column holding the packed token ids, stored with the custom
# 'uint32' encoding registered above.
columns = {
    'input_ids': 'uint32',
}
hashes = 'sha1', 'xxh64'

# Number of token ids per stored sample.
block_size = 16384
# Rolling buffer of token ids that have not yet filled a whole block.
temp = []
with MDSWriter(out='packing-gemma', columns=columns, compression=None, hashes=hashes) as out:
    # Read the JSONL explicitly as UTF-8: the platform default encoding is
    # locale-dependent and the dataset contains non-ASCII text.
    with open('shuf-combine-malay-no-alignment-multitasks-v5.jsonl', encoding='utf-8') as fopen:
        for line in tqdm(fopen):
            row = json.loads(line)
            element = generate_and_tokenize_prompt(row)
            outputs = tokenizer(element['text'])
            temp.extend(outputs['input_ids'])
            # Flush every complete block; rows are concatenated back to back,
            # so a block may start or end in the middle of a conversation.
            while len(temp) >= block_size:
                block = temp[:block_size]
                temp = temp[block_size:]
                out.write({
                    'input_ids': np.array(block, dtype=np.uint32)
                })
# Whatever remains in `temp` here (fewer than `block_size` ids) is not written.

3408182it [1:43:47, 547.28it/s]


In [7]:
# Open the freshly written `packing-gemma` directory for reading.
dataset = LocalDataset('packing-gemma')

In [8]:
# Inspect the first stored sample.
dataset[0]

{'input_ids': array([    2,   106,  1645, ..., 39944, 59832, 17698], dtype=uint32)}

In [9]:
# Decode the first sample's ids back to text to check the packed contents by eye.
tokenizer.decode(dataset[0]['input_ids'])

'<bos><start_of_turn>user\nteks: 1) 现在搭公交的群众肯定会少于到超市的群众；  2）怕危险的话，可以选择别搭公交， 但几乎无法选择不去超市。  不是每个人有车，不搭公交怎么去上班啊？\nterjemah ke melayu baku<end_of_turn>\n<start_of_turn>model\n1) Jumlah orang yang menggunakan pengangkutan awam sekarang pasti lebih sedikit daripada jumlah orang yang pergi ke pasaraya; 2) Jika anda takut bahaya, anda boleh memilih untuk tidak menggunakan pengangkutan awam, tetapi hampir mustahil untuk memilih untuk tidak pergi ke pasaraya. Tidak semua orang memiliki kereta, jadi bagaimana mereka boleh pergi bekerja tanpa menggunakan pengangkutan awam?<end_of_turn>\n<bos><start_of_turn>user\nWhat is da perimeter of a triangle wif sides of length 3, 4, n 5?<end_of_turn>\n<start_of_turn>model\nPerimeter segitiga ialah jumlah panjang sisinya. Dalam kes ini, sisi mempunyai panjang 3, 4, dan 5. Untuk mencari perimeter, kami hanya menambah panjang ini bersama-sama:\n\nPerimeter = Sisi 1 + Sisi 2 + Sisi 3\n           = 3 + 4 + 5\n           = 12\n\nJadi, perimeter segi tiga itu ialah

In [10]:
# Re-open the `packing-gemma` directory (same call as in the earlier cell).
dataset = LocalDataset('packing-gemma')

In [11]:
# Total token ids stored: number of samples times ids per sample.
len(dataset) * block_size

2592653312