In [1]:
# !wget https://gist.githubusercontent.com/huseinzol05/98974ae8c6c7a65d4bc0af9f5003786a/raw/2e06e71ef7349a57bc58cc9913ae6bae1f9f8447/mp.py
# !wget https://huggingface.co/datasets/mesolitica/instructions-dataset/resolve/main/shuf-combine-malay-no-alignment-multitasks-v5.jsonl

In [2]:
# !split -l 213000 -d --additional-suffix=.splitted shuf-combine-malay-no-alignment-multitasks-v5.jsonl shuf-combine-malay-no-alignment-multitasks-v5.jsonl

In [4]:
# 4.34.0

In [5]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('mesolitica/tinyllama-1.1b-4096-fpf')
tokenizer.pad_token = tokenizer.unk_token
tokenizer.add_bos_token = False
tokenizer.add_eos_token = False
tokenizer.padding_side = "right"
tokenizer.chat_template = "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}"

In [6]:
def generate_and_tokenize_prompt(row):
    if '<bot>:' in row['input'] and row['output'] is None:
        inputs, outputs = [], []
        splitted = row['input'].split('<bot>:')
        for i in range(len(splitted) - 1):
            if i == 0:
                human = splitted[i].replace('<manusia>:', '')
            else:
                try:
                    human = splitted[i].split('<manusia>:')[1]
                except BaseException:
                    continue
            bot = splitted[i + 1].split('<manusia>:')[0]
            inputs.append(human)
            outputs.append(bot)
    else:
        inputs = [row['input']]
        outputs = [row['output']]

    chat = []
    for input, output in zip(inputs, outputs):
        chat.extend([
            {'role': 'user', 'content': input.strip()},
            {'role': 'assistant', 'content': output.strip()},
        ])
    prompt = tokenizer.apply_chat_template(chat, tokenize=False)
    return {'text': prompt}

In [9]:
from streaming import MDSWriter
from streaming.base.format.mds.encodings import Encoding, _encodings
from streaming import LocalDataset
import streaming
import numpy as np
from tqdm import tqdm
from glob import glob
import os
import json

class UInt16(Encoding):
    def encode(self, obj) -> bytes:
        return obj.tobytes()

    def decode(self, data: bytes):
        return np.frombuffer(data, np.uint16)

_encodings['uint16'] = UInt16

columns = {
    'input_ids': 'uint16',
}
hashes = 'sha1', 'xxh64'

In [10]:
!mkdir tokenized_tinyllama

mkdir: cannot create directory ‘tokenized_tinyllama’: File exists


In [11]:
def loop(files, block_size = 16384):
    files, index = files
    out_root = f'tokenized_tinyllama/tokenized-{index}'
    os.system(f'rm -rf {out_root}')
    temp = []
    with MDSWriter(out=out_root, columns=columns, compression=None, hashes=hashes) as out:
        for f in files:
            with open(f) as fopen:
                for l in tqdm(fopen):
                    row = json.loads(l)
                    element = generate_and_tokenize_prompt(row)
                    outputs = tokenizer(element['text'])
                    temp.extend(outputs['input_ids'])
                    done = False
                    while len(temp) >= block_size:
                        block = temp[:block_size]
                        temp = temp[block_size:]
                        if len(block) == block_size:
                            out.write({
                                'input_ids': np.array(block).astype(np.uint16)
                            })

In [12]:
files = sorted(glob('shuf-combine-malay-no-alignment-multitasks-v5.jsonl*.splitted'), key = lambda x: int(x.split('jsonl')[-1].split('.')[0]))
files

['shuf-combine-malay-no-alignment-multitasks-v5.jsonl00.splitted',
 'shuf-combine-malay-no-alignment-multitasks-v5.jsonl01.splitted',
 'shuf-combine-malay-no-alignment-multitasks-v5.jsonl02.splitted',
 'shuf-combine-malay-no-alignment-multitasks-v5.jsonl03.splitted',
 'shuf-combine-malay-no-alignment-multitasks-v5.jsonl04.splitted',
 'shuf-combine-malay-no-alignment-multitasks-v5.jsonl05.splitted',
 'shuf-combine-malay-no-alignment-multitasks-v5.jsonl06.splitted',
 'shuf-combine-malay-no-alignment-multitasks-v5.jsonl07.splitted',
 'shuf-combine-malay-no-alignment-multitasks-v5.jsonl08.splitted',
 'shuf-combine-malay-no-alignment-multitasks-v5.jsonl09.splitted',
 'shuf-combine-malay-no-alignment-multitasks-v5.jsonl10.splitted',
 'shuf-combine-malay-no-alignment-multitasks-v5.jsonl11.splitted',
 'shuf-combine-malay-no-alignment-multitasks-v5.jsonl12.splitted',
 'shuf-combine-malay-no-alignment-multitasks-v5.jsonl13.splitted',
 'shuf-combine-malay-no-alignment-multitasks-v5.jsonl14.splitt

In [13]:
import mp
mp.multiprocessing(files, loop, cores = min(len(files), 30), returned = False)

196786it [06:30, 503.93it/s]
213000it [06:48, 521.90it/s]
213000it [06:49, 520.71it/s]
213000it [06:51, 517.47it/s]
213000it [06:52, 516.38it/s]
213000it [06:53, 515.73it/s]
213000it [06:53, 515.33it/s]
213000it [06:54, 514.41it/s]
206044it [06:54, 411.14it/s]
213000it [06:54, 513.70it/s]
213000it [06:55, 512.45it/s]
213000it [06:57, 510.69it/s]
213000it [06:57, 510.29it/s]
213000it [06:58, 508.69it/s]
213000it [06:59, 507.96it/s]
213000it [07:00, 507.11it/s]
213000it [07:00, 506.88it/s]
213000it [07:03, 502.90it/s]
213000it [07:05, 500.84it/s]
213000it [07:10, 495.24it/s]


In [15]:
folders = sorted(glob('tokenized_tinyllama/tokenized-*'), key = lambda x: int(x.split('-')[-1]))
folders

['tokenized_tinyllama/tokenized-0',
 'tokenized_tinyllama/tokenized-1',
 'tokenized_tinyllama/tokenized-2',
 'tokenized_tinyllama/tokenized-3',
 'tokenized_tinyllama/tokenized-4',
 'tokenized_tinyllama/tokenized-5',
 'tokenized_tinyllama/tokenized-6',
 'tokenized_tinyllama/tokenized-7',
 'tokenized_tinyllama/tokenized-8',
 'tokenized_tinyllama/tokenized-9',
 'tokenized_tinyllama/tokenized-10',
 'tokenized_tinyllama/tokenized-11',
 'tokenized_tinyllama/tokenized-12',
 'tokenized_tinyllama/tokenized-13',
 'tokenized_tinyllama/tokenized-14',
 'tokenized_tinyllama/tokenized-15',
 'tokenized_tinyllama/tokenized-16',
 'tokenized_tinyllama/tokenized-17',
 'tokenized_tinyllama/tokenized-18',
 'tokenized_tinyllama/tokenized-19']

In [16]:
!rm -rf packing-tinyllama

In [17]:
with MDSWriter(out='packing-tinyllama', columns=columns, compression=None, hashes=hashes) as out:
    for f in folders:
        try:
            dataset = LocalDataset(local=f)
            for i in tqdm(range(len(dataset))):
                out.write(dataset[i])
        except Exception as e:
            print(e)
            pass

100%|██████████| 15682/15682 [00:01<00:00, 11686.89it/s]
100%|██████████| 15678/15678 [00:01<00:00, 11200.33it/s]
100%|██████████| 15670/15670 [00:01<00:00, 11687.52it/s]
100%|██████████| 15711/15711 [00:01<00:00, 10857.97it/s]
100%|██████████| 15728/15728 [00:01<00:00, 10298.85it/s]
100%|██████████| 15709/15709 [00:01<00:00, 10351.95it/s]
100%|██████████| 15738/15738 [00:01<00:00, 11067.77it/s]
100%|██████████| 15773/15773 [00:01<00:00, 10775.88it/s]
100%|██████████| 15683/15683 [00:01<00:00, 10755.28it/s]
100%|██████████| 15698/15698 [00:01<00:00, 10729.15it/s]
100%|██████████| 15737/15737 [00:01<00:00, 11011.11it/s]
100%|██████████| 15744/15744 [00:01<00:00, 11174.23it/s]
100%|██████████| 15726/15726 [00:01<00:00, 12028.46it/s]
100%|██████████| 15745/15745 [00:01<00:00, 11100.35it/s]
100%|██████████| 15678/15678 [00:01<00:00, 10956.40it/s]
100%|██████████| 15712/15712 [00:01<00:00, 11947.17it/s]
100%|██████████| 15669/15669 [00:01<00:00, 10939.29it/s]
100%|██████████| 15704/15704 [0

In [20]:
dataset = LocalDataset('packing-tinyllama')

In [23]:
len(dataset) * 16384

5128011776

In [22]:
dataset[0]

{'input_ids': array([    1,   518, 25580, ...,   574, 26024,  1045], dtype=uint16)}