In [1]:
from transformers import AutoTokenizer, AutoConfig
from transformers import AddedToken
import os
import numpy as np

tokenizer = AutoTokenizer.from_pretrained('HuggingFaceTB/SmolLM2-135M')
new = ['<|speech_start|>', '<|speech_end|>', '<|text_start|>', '<|text_end|>']
new = [AddedToken(t) for t in new]
tokenizer.add_tokens(new)
speech_tokens = [AddedToken(f"<|{i}|>") for i in range(1024)]
tokenizer.add_tokens(speech_tokens)

1024

In [2]:
import pandas as pd

df = pd.read_parquet('/home/husein/ssd3/verify-text.parquet').to_dict(orient = 'records')
len(df)

2438225

In [3]:
t = df[0]['transcription']
splitted = df[0]['audio'].split('/')
new_f = '/'.join([splitted[0] + '_vqgan'] + splitted[1:]).replace('.mp3', '.npy')
new_f = os.path.join('/home/husein/ssd3', new_f)
speech_t = np.load(new_f)
speech_t = ''.join([f'<|{t}|>' for t in speech_t.tolist()])
tts = f'<|text_start|>{t}<|text_end|><|speech_start|>{speech_t}<|speech_end|>'
stt = f'<|speech_start|>{speech_t}<|speech_end|><|text_start|>{t}<|text_end|>'

In [4]:
from streaming import MDSWriter
from streaming.base.format.mds.encodings import Encoding, _encodings
from streaming import LocalDataset
import streaming
import numpy as np
from tqdm import tqdm
from glob import glob
import os
import json

class UInt32(Encoding):
    def encode(self, obj) -> bytes:
        return obj.tobytes()

    def decode(self, data: bytes):
        return np.frombuffer(data, np.uint32)

_encodings['uint32'] = UInt32

columns = {
    'input_ids': 'uint32',
    'position_ids': 'uint32',
    'attention_mask': 'uint32',
}
hashes = 'sha1', 'xxh64'

In [5]:
import gc

def collator(batch, batch_position_ids):
    input_ids = []
    position_ids = []
    masks = []
    for i in range(len(batch)):
        l = len(batch[i])
        input_ids.extend(batch[i])
        position_ids.extend(batch_position_ids[i])
        masks.append(l)
    
    return {
        'input_ids': np.array(input_ids).astype(np.uint32),
        'position_ids': np.array(position_ids).astype(np.uint32),
        'attention_mask': np.array(masks).astype(np.uint32),
    }

def slice_and_balance(nested_list, size):
    first = []
    balance = []
    current_size = 0

    for sublist in nested_list:
        if current_size < size:
            remaining_space = size - current_size
            if len(sublist) <= remaining_space:
                first.append(sublist)
                current_size += len(sublist)
            else:
                first.append(sublist[:remaining_space])
                balance.append(sublist[remaining_space:])
                current_size = size
        else:
            balance.append(sublist)
    
    return first, balance

In [6]:
!rm -rf tokenized-2048
!mkdir tokenized-2048

In [7]:
import time

def loop(files, block_size = 2048):
    rows, index = files
    out_root = f'tokenized-2048/tokenized-{index}'
    os.system(f'rm -rf {out_root}')
    count = 0
    temp = []
    position_ids = []
    last_block, last_position_block = None, None
    with MDSWriter(out=out_root, columns=columns, compression=None, hashes=hashes) as out:
        for row in tqdm(rows):
            
            t = row['transcription']
            splitted = row['audio'].split('/')
            new_f = '/'.join([splitted[0] + '_vqgan'] + splitted[1:]).replace('.mp3', '.npy')
            new_f = os.path.join('/home/husein/ssd3', new_f)
            if not os.path.exists(new_f):
                continue
                
            speech_t = np.load(new_f)
            speech_t = ''.join([f'<|{t}|>' for t in speech_t.tolist()])
            tts = f'<|text_start|>{t}<|text_end|><|speech_start|>{speech_t}<|speech_end|>'
            
            outputs = tokenizer(tts, add_special_tokens = False)
            temp.append(outputs['input_ids'])
            position_ids.append(range(len(outputs['input_ids'])))
            count += len(outputs['input_ids'])
            
            while count >= block_size:
                block, temp = slice_and_balance(temp, block_size)
                block_position, position_ids = slice_and_balance(position_ids, block_size)
                count = count - block_size
                o = collator(block, block_position)
                last_block = block
                last_position_block = block_position
                out.write(o)
                
        block, _ = slice_and_balance(last_block, block_size - count)
        block_position, _ = slice_and_balance(last_position_block, block_size - count)

        block.extend(temp)
        block_position.extend(position_ids)

        o = collator(block, block_position)
        if len(o['input_ids']) == block_size:
            out.write(o)
            return o

In [8]:
loop((df[:1000], 0))

100%|██████████████████████████████████████| 1000/1000 [00:01<00:00, 644.54it/s]


{'input_ids': array([50107, 49691, 49166, ..., 49863, 49689, 49153], dtype=uint32),
 'position_ids': array([1782, 1783, 1784, ...,  852,  853,  854], dtype=uint32),
 'attention_mask': array([ 145, 1452,   47,  404], dtype=uint32)}

In [9]:
from multiprocess import Pool
import mp

chunks = mp.chunks(df, 100000)
pool = Pool(10)
pooled = pool.map(loop, chunks)
pool.close()
pool.join()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [10]:
folders = sorted(glob('tokenized-2048/tokenized-*'), key = lambda x: int(x.split('-')[-1]))
folders

['tokenized-2048/tokenized-0',
 'tokenized-2048/tokenized-1',
 'tokenized-2048/tokenized-2',
 'tokenized-2048/tokenized-3',
 'tokenized-2048/tokenized-4',
 'tokenized-2048/tokenized-5',
 'tokenized-2048/tokenized-6',
 'tokenized-2048/tokenized-7',
 'tokenized-2048/tokenized-8',
 'tokenized-2048/tokenized-9',
 'tokenized-2048/tokenized-10',
 'tokenized-2048/tokenized-11',
 'tokenized-2048/tokenized-12',
 'tokenized-2048/tokenized-13',
 'tokenized-2048/tokenized-14',
 'tokenized-2048/tokenized-15',
 'tokenized-2048/tokenized-16',
 'tokenized-2048/tokenized-17',
 'tokenized-2048/tokenized-18',
 'tokenized-2048/tokenized-19',
 'tokenized-2048/tokenized-20',
 'tokenized-2048/tokenized-21',
 'tokenized-2048/tokenized-22',
 'tokenized-2048/tokenized-23',
 'tokenized-2048/tokenized-24']

In [11]:
dataset = LocalDataset(folders[0])

In [12]:
!rm -rf smollm2-speech-semantic-multipack-2048

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [13]:
with MDSWriter(
    out='smollm2-speech-semantic-multipack-2048', columns=columns, compression=None, hashes=hashes) as out:
    for f in folders:
        try:
            dataset = LocalDataset(local=f)
            for i in tqdm(range(len(dataset))):
                out.write(dataset[i])
        except Exception as e:
            print(e)
            pass

100%|██████████████████████████████████| 90078/90078 [00:04<00:00, 21220.53it/s]
100%|██████████████████████████████████| 91347/91347 [00:04<00:00, 22195.75it/s]
100%|██████████████████████████████████| 89677/89677 [00:04<00:00, 21200.47it/s]
100%|██████████████████████████████████| 91201/91201 [00:07<00:00, 11595.93it/s]
100%|██████████████████████████████████| 90920/90920 [00:04<00:00, 19928.08it/s]
100%|██████████████████████████████████| 90876/90876 [00:04<00:00, 19986.43it/s]
100%|██████████████████████████████████| 91056/91056 [00:05<00:00, 16781.18it/s]
100%|███████████████████████████████████| 90230/90230 [00:11<00:00, 7993.23it/s]
100%|██████████████████████████████████| 90289/90289 [00:04<00:00, 20804.48it/s]
100%|██████████████████████████████████| 90723/90723 [00:04<00:00, 20546.19it/s]
100%|██████████████████████████████████| 91015/91015 [00:07<00:00, 11646.02it/s]
100%|███████████████████████████████████| 91343/91343 [00:15<00:00, 5742.13it/s]
100%|███████████████████████

In [14]:
dataset = LocalDataset('smollm2-speech-semantic-multipack-2048')

In [16]:
(len(dataset) * 2048) / 1e9

4.449196032

In [21]:
dataset[0]

{'attention_mask': array([1364,  684], dtype=uint32),
 'input_ids': array([49154,    51,  4075, ..., 49385, 49840, 50075], dtype=uint32),
 'position_ids': array([  0,   1,   2, ..., 681, 682, 683], dtype=uint32)}

In [22]:
dataset[1]

{'attention_mask': array([2048], dtype=uint32),
 'input_ids': array([49440, 49427, 49595, ..., 49697, 49837, 49491], dtype=uint32),
 'position_ids': array([ 684,  685,  686, ..., 2729, 2730, 2731], dtype=uint32)}

In [24]:
dataset[3]

{'attention_mask': array([ 544, 1504], dtype=uint32),
 'input_ids': array([49579, 49576, 49509, ..., 49647, 49995, 49401], dtype=uint32),
 'position_ids': array([4780, 4781, 4782, ..., 1501, 1502, 1503], dtype=uint32)}

In [1]:
from huggingface_hub import create_repo, delete_repo

try:
    delete_repo(repo_id="mesolitica/smollm2-speech-semantic-multipack-2048", repo_type="dataset")
except:
    pass
create_repo("mesolitica/smollm2-speech-semantic-multipack-2048", repo_type="dataset", private = True)

RepoUrl('https://huggingface.co/datasets/mesolitica/smollm2-speech-semantic-multipack-2048', endpoint='https://huggingface.co', repo_type='dataset', repo_id='mesolitica/smollm2-speech-semantic-multipack-2048')

In [None]:
from huggingface_hub import HfApi
api = HfApi()

api.upload_folder(
    folder_path="smollm2-speech-semantic-multipack-2048",
    repo_id="mesolitica/smollm2-speech-semantic-multipack-2048",
    repo_type="dataset",
)

shard.00002.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00001.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00003.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

Upload 532 LFS files:   0%|          | 0/532 [00:00<?, ?it/s]

shard.00000.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00004.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00005.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00006.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00007.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00008.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00009.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00010.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00011.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00012.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00013.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00014.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00015.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00016.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00017.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00018.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00019.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00020.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00021.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00022.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00023.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00024.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00025.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00026.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00027.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00028.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00029.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00030.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00031.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00032.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00033.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00034.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00035.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00036.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00037.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00038.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00039.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00040.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00041.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00042.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00043.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00044.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00045.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00046.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00047.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00048.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00049.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00050.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00051.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00052.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00053.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00054.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00055.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00056.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00057.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00058.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00059.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00060.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00061.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00062.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00063.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00064.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00065.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00066.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00067.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00068.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00069.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00070.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00071.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00072.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00073.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00074.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00075.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00076.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00077.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00078.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00079.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00080.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00081.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00082.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00083.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00084.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00085.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00086.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00087.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00088.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00089.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00090.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00091.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00092.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00093.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00094.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00095.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00096.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00097.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00098.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00099.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00100.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00101.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]