In [7]:
# !wget https://huggingface.co/datasets/mesolitica/instructions-dataset/resolve/main/shuf-combine-malay-no-alignment-multitasks-v5.jsonl

In [8]:
from transformers import AutoTokenizer

# Load the pretrained tokenizer for the 'mesolitica/mistral-7b-4096-fpf' checkpoint.
tokenizer = AutoTokenizer.from_pretrained('mesolitica/mistral-7b-4096-fpf')
# Reuse the unknown token as the padding token.
tokenizer.pad_token = tokenizer.unk_token
# Turn off automatic BOS/EOS insertion: the chat template below already writes
# bos_token at the start of the conversation and eos_token after each assistant
# turn, so the rendered text carries them explicitly.
tokenizer.add_bos_token = False
tokenizer.add_eos_token = False
tokenizer.padding_side = "right"
# Jinja chat template: emits bos_token once, wraps each user message as
# '[INST] ... [/INST]', appends eos_token after each assistant message, and
# raises if roles do not strictly alternate user/assistant or if any other
# role is present.
tokenizer.chat_template = "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}"

In [9]:
def generate_and_tokenize_prompt(row):
    """Render one dataset row as chat-template text.

    Despite the name, this function does not tokenize: it only builds the
    prompt string via ``tokenizer.apply_chat_template(..., tokenize=False)``.

    Two row layouts are handled:

    * Multi-turn: ``row['output']`` is ``None`` and ``row['input']`` contains
      ``'<bot>:'`` markers. The input is split into alternating
      ``<manusia>:`` (user) / ``<bot>:`` (assistant) turns.
    * Single-turn: ``row['input']`` is the user message and ``row['output']``
      is the assistant reply.

    Args:
        row: Mapping with at least the keys ``'input'`` (str) and ``'output'``
            (str or ``None``).

    Returns:
        ``{'text': prompt}`` where ``prompt`` is the rendered conversation.

    Raises:
        AttributeError: if a single-turn row has ``output`` set to ``None``
            (``None.strip()``); such rows are not handled here.
    """
    if '<bot>:' in row['input'] and row['output'] is None:
        inputs, outputs = [], []
        segments = row['input'].split('<bot>:')
        for i in range(len(segments) - 1):
            if i == 0:
                # The first segment is the opening user turn; drop its marker.
                human = segments[i].replace('<manusia>:', '')
            else:
                try:
                    # A later segment looks like '<bot reply> <manusia>: <user turn>';
                    # keep the text following the first user marker.
                    human = segments[i].split('<manusia>:')[1]
                except IndexError:
                    # No user marker in this segment (two consecutive bot
                    # turns): skip the pair. Only IndexError is expected here;
                    # catching BaseException would also swallow
                    # KeyboardInterrupt / SystemExit during a long run.
                    continue
            # The assistant reply is whatever precedes the next user marker.
            bot = segments[i + 1].split('<manusia>:')[0]
            inputs.append(human)
            outputs.append(bot)
    else:
        inputs = [row['input']]
        outputs = [row['output']]

    chat = []
    for user_text, bot_text in zip(inputs, outputs):
        chat.extend([
            {'role': 'user', 'content': user_text.strip()},
            {'role': 'assistant', 'content': bot_text.strip()},
        ])
    prompt = tokenizer.apply_chat_template(chat, tokenize=False)
    return {'text': prompt}

In [10]:
# Delete the 'packing-mistral' output directory from any previous run so the
# writer cell below starts from an empty location.
!rm -rf packing-mistral

In [11]:
# Preview the first 10 JSONL records to check the available fields
# before processing the whole file.
!head -n 10 shuf-combine-malay-no-alignment-multitasks-v5.jsonl

{"prompt_input": null, "input": "teks: 1) \u73b0\u5728\u642d\u516c\u4ea4\u7684\u7fa4\u4f17\u80af\u5b9a\u4f1a\u5c11\u4e8e\u5230\u8d85\u5e02\u7684\u7fa4\u4f17\uff1b  2\uff09\u6015\u5371\u9669\u7684\u8bdd\uff0c\u53ef\u4ee5\u9009\u62e9\u522b\u642d\u516c\u4ea4\uff0c \u4f46\u51e0\u4e4e\u65e0\u6cd5\u9009\u62e9\u4e0d\u53bb\u8d85\u5e02\u3002  \u4e0d\u662f\u6bcf\u4e2a\u4eba\u6709\u8f66\uff0c\u4e0d\u642d\u516c\u4ea4\u600e\u4e48\u53bb\u4e0a\u73ed\u554a\uff1f\nterjemah ke melayu baku", "output": "1) Jumlah orang yang menggunakan pengangkutan awam sekarang pasti lebih sedikit daripada jumlah orang yang pergi ke pasaraya; 2) Jika anda takut bahaya, anda boleh memilih untuk tidak menggunakan pengangkutan awam, tetapi hampir mustahil untuk memilih untuk tidak pergi ke pasaraya. Tidak semua orang memiliki kereta, jadi bagaimana mereka boleh pergi bekerja tanpa menggunakan pengangkutan awam?"}
{"prompt_input": null, "input": "What is da perimeter of a triangle wif sides of length 3, 4, n 5?", "output": 

In [12]:
from streaming import MDSWriter
from streaming.base.format.mds.encodings import Encoding, _encodings
from streaming import LocalDataset
import streaming
import numpy as np
from tqdm import tqdm
import json

class UInt16(Encoding):
    """Column encoding that stores an array as its raw bytes.

    ``encode`` dumps the object's buffer via ``tobytes()``; ``decode`` reads
    the stored bytes back as a flat ``numpy.uint16`` array.
    """

    def encode(self, obj) -> bytes:
        """Return the raw byte representation of ``obj``."""
        raw = obj.tobytes()
        return raw

    def decode(self, data: bytes):
        """Interpret ``data`` as a one-dimensional ``uint16`` array."""
        return np.frombuffer(data, dtype=np.uint16)


# Register the encoding under the 'uint16' key of the MDS encodings table so
# it can be referenced by name in a column spec.
_encodings['uint16'] = UInt16

# One column per sample: a packed block of token ids stored as uint16.
columns = {
    'input_ids': 'uint16',
}
hashes = 'sha1', 'xxh64'

# Number of token ids per packed sample.
block_size = 16384
# Largest id that can be stored without wrapping in the uint16 cast below.
max_token_id = np.iinfo(np.uint16).max
# Rolling buffer of token ids that have not yet filled a complete block.
temp = []
with MDSWriter(out='packing-mistral', columns=columns, compression=None, hashes=hashes) as out:
    # Explicit UTF-8 so decoding does not depend on the platform default.
    with open('shuf-combine-malay-no-alignment-multitasks-v5.jsonl', encoding='utf-8') as fopen:
        for line in tqdm(fopen):
            # Tolerate blank lines (e.g. a trailing newline) instead of
            # crashing in json.loads.
            if not line.strip():
                continue
            row = json.loads(line)
            element = generate_and_tokenize_prompt(row)
            outputs = tokenizer(element['text'])
            temp.extend(outputs['input_ids'])
            # Flush as many full blocks as the buffer currently holds; the
            # remainder stays in `temp` and is prefixed to the next document.
            while len(temp) >= block_size:
                block = temp[:block_size]
                temp = temp[block_size:]
                ids = np.array(block)
                # astype(np.uint16) would silently wrap larger ids, corrupting
                # the data; fail loudly instead.
                if ids.max() > max_token_id:
                    raise ValueError(
                        f'token id {int(ids.max())} does not fit in uint16'
                    )
                out.write({
                    'input_ids': ids.astype(np.uint16)
                })
# Any ids still in `temp` here (fewer than block_size) are not written.

3408182it [1:55:08, 493.34it/s]


In [14]:
# Re-open the 'packing-mistral' directory written above as a local dataset.
dataset = LocalDataset('packing-mistral')

In [15]:
# Total number of packed token ids: every written sample holds exactly
# block_size ids, so this is samples x block_size.
len(dataset) * block_size

4411342848