In [1]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('canopylabs/orpheus-3b-0.1-ft')

In [2]:
from streaming import MDSWriter
from streaming.base.format.mds.encodings import Encoding, _encodings
from streaming import LocalDataset
import streaming
import numpy as np
from tqdm import tqdm
from glob import glob
import os
import json

class UInt32(Encoding):
    def encode(self, obj) -> bytes:
        return obj.tobytes()

    def decode(self, data: bytes):
        return np.frombuffer(data, np.uint32)

_encodings['uint32'] = UInt32

columns = {
    'input_ids': 'uint32',
    'position_ids': 'uint32',
    'attention_mask': 'uint32',
}
hashes = 'sha1', 'xxh64'

In [3]:
from datasets import load_dataset

dataset = load_dataset('mesolitica/TTS-Combined')['train']
rows = dataset.to_list()

In [4]:
rows[0]

{'audio_filename': 'husein-chatbot/husein-chatbot-politics-normalized-v2-5674.mp3',
 'prompt': 'Husein',
 'transcription': 'Berikut ialah beberapa cara langkah - langkah ini boleh dilaksanakan dengan berkesan ,'}

In [5]:
import gc

def new_path(f):
    f = f.replace('.mp3', '.snac')
    splitted = f.split('/')
    base_folder = splitted[0] + '_snac'
    splitted = '/'.join([base_folder] + splitted[1:])
    return splitted

def collator(batch, batch_position_ids):
    input_ids = []
    position_ids = []
    masks = []
    for i in range(len(batch)):
        l = len(batch[i])
        input_ids.extend(batch[i])
        position_ids.extend(batch_position_ids[i])
        masks.append(l)
    
    return {
        'input_ids': np.array(input_ids).astype(np.uint32),
        'position_ids': np.array(position_ids).astype(np.uint32),
        'attention_mask': np.array(masks).astype(np.uint32),
    }

def slice_and_balance(nested_list, size):
    first = []
    balance = []
    current_size = 0

    for sublist in nested_list:
        if current_size < size:
            remaining_space = size - current_size
            if len(sublist) <= remaining_space:
                first.append(sublist)
                current_size += len(sublist)
            else:
                first.append(sublist[:remaining_space])
                balance.append(sublist[remaining_space:])
                current_size = size
        else:
            balance.append(sublist)
    
    return first, balance

In [6]:
!mkdir tokenized-3k

mkdir: cannot create directory ‘tokenized-3k’: File exists


In [7]:
import time

block_size = 3072

def loop(rows, block_size = block_size):
    rows, index = rows
    out_root = f'tokenized-3k/tokenized-{index}'
    os.system(f'rm -rf {out_root}')
    count = 0
    temp = []
    position_ids = []
    last_block, last_position_block = None, None
    with MDSWriter(out=out_root, columns=columns, compression=None, hashes=hashes) as out:
        for row in tqdm(rows):
            speaker = row['prompt']
            text = row['transcription']
            with open(new_path(row['audio_filename'])) as fopen:
                myts = json.load(fopen)
            prompt = f'<custom_token_3><|begin_of_text|>{speaker}: {text}<|eot_id|><custom_token_4><custom_token_5><custom_token_1>'
            outputs = tokenizer(prompt, add_special_tokens = False, return_attention_mask = False)
            outputs = outputs['input_ids'] + myts + [128258]
            temp.append(outputs)
            position_ids.append(range(len(outputs)))
            count += len(outputs)
            while count >= block_size:
                block, temp = slice_and_balance(temp, block_size)
                block_position, position_ids = slice_and_balance(position_ids, block_size)
                count = count - block_size
                o = collator(block, block_position)
                last_block = block
                last_position_block = block_position
                out.write(o)
                
        block, _ = slice_and_balance(last_block, block_size - count)
        block_position, _ = slice_and_balance(last_position_block, block_size - count)

        block.extend(temp)
        block_position.extend(position_ids)

        o = collator(block, block_position)
        if len(o['input_ids']) == block_size:
            out.write(o)
            return o

In [8]:
loop((rows[:1000], 0))

100%|████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:00<00:00, 6590.97it/s]


{'input_ids': array([148634, 149446, 154746, ..., 152348, 154697, 128258], dtype=uint32),
 'position_ids': array([491, 492, 493, ..., 471, 472, 473], dtype=uint32),
 'attention_mask': array([228, 613, 450, 109, 811, 387, 474], dtype=uint32)}

In [9]:
local_dataset = LocalDataset('tokenized-3k/tokenized-0')
local_dataset[0]

{'attention_mask': array([ 483,  876, 1177,  536], dtype=uint32),
 'input_ids': array([128259, 128000,     39, ..., 134495, 139889, 141470], dtype=uint32),
 'position_ids': array([  0,   1,   2, ..., 533, 534, 535], dtype=uint32)}

In [10]:
from multiprocess import Pool

def chunks(l, n):
    for i in range(0, len(l), n):
        yield (l[i: i + n], i // n)

chunks = chunks(rows, 50000)
pool = Pool(10)
pooled = pool.map(loop, chunks)
pool.close()
pool.join()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [11]:
folders = sorted(glob('tokenized-3k/tokenized-*'), key = lambda x: int(x.split('-')[-1]))
folders

['tokenized-3k/tokenized-0',
 'tokenized-3k/tokenized-1',
 'tokenized-3k/tokenized-2',
 'tokenized-3k/tokenized-3',
 'tokenized-3k/tokenized-4',
 'tokenized-3k/tokenized-5',
 'tokenized-3k/tokenized-6',
 'tokenized-3k/tokenized-7',
 'tokenized-3k/tokenized-8',
 'tokenized-3k/tokenized-9',
 'tokenized-3k/tokenized-10',
 'tokenized-3k/tokenized-11',
 'tokenized-3k/tokenized-12']

In [12]:
!rm -rf packing-3k

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [13]:
with MDSWriter(out='packing-3k', columns=columns, compression=None, hashes=hashes) as out:
    for f in folders:
        try:
            dataset = LocalDataset(local=f)
            for i in tqdm(range(len(dataset))):
                out.write(dataset[i])
        except Exception as e:
            print(e)
            pass

100%|█████████████████████████████████████████████████████████████████████████████████████████| 10304/10304 [00:00<00:00, 19355.50it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████| 10277/10277 [00:00<00:00, 16557.81it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████| 10282/10282 [00:00<00:00, 16591.21it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████| 10274/10274 [00:00<00:00, 16502.49it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████| 10289/10289 [00:00<00:00, 19333.32it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████| 10271/10271 [00:00<00:00, 16599.21it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████| 10278/10278 [00:00<00:00, 16075.18it/s]
100%|███████████████████████████████████████████

In [14]:
dataset = LocalDataset('packing-3k')
(len(dataset) * block_size) / 1e9

0.408419328

In [2]:
from huggingface_hub import HfApi
api = HfApi()

api.upload_folder(
    folder_path="packing-3k",
    repo_id="huseinzol05/orpheus-3k-multipacking",
    repo_type="dataset",
)