In [1]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('mesolitica/tinyllama-1.1b-4096-fpf')
tokenizer.pad_token = tokenizer.unk_token
tokenizer.add_bos_token = False
tokenizer.add_eos_token = False
tokenizer.padding_side = "right"
tokenizer.chat_template = "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}"



In [2]:
def generate_and_tokenize_prompt(row):
    if '<bot>:' in row['input'] and row['output'] is None:
        inputs, outputs = [], []
        splitted = row['input'].split('<bot>:')
        for i in range(len(splitted) - 1):
            if i == 0:
                human = splitted[i].replace('<manusia>:', '')
            else:
                try:
                    human = splitted[i].split('<manusia>:')[1]
                except BaseException:
                    continue
            bot = splitted[i + 1].split('<manusia>:')[0]
            inputs.append(human)
            outputs.append(bot)
    else:
        inputs = [row['input']]
        outputs = [row['output']]

    chat = []
    for input, output in zip(inputs, outputs):
        chat.extend([
            {'role': 'user', 'content': input.strip()},
            {'role': 'assistant', 'content': output.strip()},
        ])
    prompt = tokenizer.apply_chat_template(chat, tokenize=False)
    return {'text': prompt}

In [3]:
!rm -rf packing-tinyllama

In [4]:
from streaming import MDSWriter
from streaming.base.format.mds.encodings import Encoding, _encodings
from streaming import LocalDataset
import streaming
import numpy as np
from tqdm import tqdm
import json

class UInt16(Encoding):
    def encode(self, obj) -> bytes:
        return obj.tobytes()

    def decode(self, data: bytes):
        return np.frombuffer(data, np.uint16)

_encodings['uint16'] = UInt16

columns = {
    'input_ids': 'uint16',
}
hashes = 'sha1', 'xxh64'

block_size = 16384
temp = []
with MDSWriter(out='packing-tinyllama', columns=columns, compression=None, hashes=hashes) as out:
    with open('shuf-combine-malay-no-alignment-multitasks-v4.jsonl') as fopen:
        for l in tqdm(fopen):
            row = json.loads(l)
            element = generate_and_tokenize_prompt(row)
            outputs = tokenizer(element['text'])
            temp.extend(outputs['input_ids'])
            while len(temp) >= block_size:
                block = temp[:block_size]
                temp = temp[block_size:]
                if len(block) == block_size:
                    out.write({
                        'input_ids': np.array(block).astype(np.uint16)
                    })

3713793it [1:08:45, 900.24it/s] 


In [5]:
!du -hs packing-tinyllama

8.4G	packing-tinyllama


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [6]:
!cp packing-tinyllama/* ~/ssd3/mosaic-chat-instruction-v5-tinyllama-16k

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
