In [1]:
from glob import glob
from streaming import MDSWriter
from streaming import LocalDataset
from transformers import default_data_collator, DataCollatorForLanguageModeling
from tqdm import tqdm
import numpy as np

In [7]:
def _part_index(path):
    """Return the integer that follows the last '-' in ``path``."""
    return int(path.rsplit('-', 1)[-1])


# Collect the per-part tokenized folders and order them by their numeric
# suffix, so 'tokenized-10' comes after 'tokenized-9' rather than after
# 'tokenized-1' as a plain string sort would give.
folders = sorted(glob('tokenized_indexes/tokenized-*'), key=_part_index)

In [8]:
# Display the list to check that the folders are in numeric order.
folders

['tokenized_indexes/tokenized-0',
 'tokenized_indexes/tokenized-1',
 'tokenized_indexes/tokenized-2',
 'tokenized_indexes/tokenized-3',
 'tokenized_indexes/tokenized-4',
 'tokenized_indexes/tokenized-5',
 'tokenized_indexes/tokenized-6',
 'tokenized_indexes/tokenized-7',
 'tokenized_indexes/tokenized-8',
 'tokenized_indexes/tokenized-9',
 'tokenized_indexes/tokenized-10',
 'tokenized_indexes/tokenized-11',
 'tokenized_indexes/tokenized-12']

In [9]:
from streaming.base.format.mds.encodings import Encoding, _encodings

class UInt16(Encoding):
    """MDS column codec that stores an integer array as raw ``uint16`` bytes.

    ``decode`` always reinterprets the stored bytes as native ``uint16``, so
    ``encode`` makes sure that is what it writes: arrays of any other dtype
    are range-checked and cast first instead of being dumped verbatim (which
    would silently decode to garbage).
    """

    def encode(self, obj) -> bytes:
        """Serialize ``obj`` as native ``uint16`` bytes.

        Args:
            obj: Array-like of integers, each within ``[0, 65535]``.

        Returns:
            The values as a ``uint16`` byte string (2 bytes per element).

        Raises:
            ValueError: If ``obj`` is non-empty and either not integer-typed
                or holds a value that does not fit in ``uint16``.
        """
        arr = np.asarray(obj)
        if arr.dtype != np.uint16:
            if arr.size:
                if not np.issubdtype(arr.dtype, np.integer):
                    raise ValueError(
                        f'uint16 encoding expects integer data, got dtype {arr.dtype}')
                limit = np.iinfo(np.uint16).max
                if arr.min() < 0 or arr.max() > limit:
                    raise ValueError(
                        f'uint16 encoding expects values in [0, {limit}], '
                        f'got range [{arr.min()}, {arr.max()}]')
            # Also normalizes non-native byte order (e.g. '>u2') to what
            # decode() reads back.
            arr = arr.astype(np.uint16)
        return arr.tobytes()

    def decode(self, data: bytes):
        """Reinterpret ``data`` as a 1-D ``uint16`` array.

        ``np.frombuffer`` does not copy: the result is backed by ``data`` and
        is read-only when ``data`` is an immutable ``bytes`` object, so copy
        it before mutating in place.
        """
        return np.frombuffer(data, dtype=np.uint16)

# Register the codec under the name 'uint16', the encoding name used for the
# `input_ids` column in this notebook.
# NOTE(review): `_encodings` is an underscore-prefixed (private) name in
# streaming, so this hook may not be stable across library versions — confirm.
_encodings['uint16'] = UInt16

In [10]:
# Output schema: a single `input_ids` column stored with the 'uint16' encoding.
columns = dict(input_ids='uint16')

# Writer options.
# NOTE(review): the MDSWriter call in this notebook passes compression=None,
# so `compression` is not used by it — confirm which one is intended.
compression = 'zstd'
hashes = ('sha1', 'xxh64')

In [11]:
# Delete any existing 'combine-all' directory (the writer cell's output path).
!rm -rf combine-all

In [12]:
# Merge every per-part dataset in `folders` into one MDS dataset under
# 'combine-all', copying samples one by one in folder order.
# NOTE(review): compression is passed as None although `compression = 'zstd'`
# is defined in the config cell — confirm uncompressed shards are intended.
failed_folders = []
with MDSWriter(out='combine-all', columns=columns, compression=None, hashes=hashes) as out:
    for f in folders:
        try:
            dataset = LocalDataset(local=f)
            for i in tqdm(range(len(dataset))):
                out.write(dataset[i])
        except Exception as e:
            # Best-effort merge: carry on with the remaining folders, but
            # record which one failed. Samples written before the error stay
            # in the output, so a failed folder may be only partially copied.
            failed_folders.append(f)
            print(f'failed while copying {f}: {e!r}')

# Make an incomplete merge impossible to miss among the progress bars.
if failed_folders:
    print(f'WARNING: {len(failed_folders)} folder(s) not fully merged: {failed_folders}')

100%|██████████| 917963/917963 [00:44<00:00, 20796.81it/s]
100%|██████████| 77395/77395 [00:06<00:00, 12001.85it/s]
100%|██████████| 78232/78232 [00:03<00:00, 23822.57it/s]
100%|██████████| 78239/78239 [00:06<00:00, 11667.44it/s]
100%|██████████| 78388/78388 [00:09<00:00, 7949.49it/s] 
100%|██████████| 78370/78370 [00:04<00:00, 19453.48it/s]
100%|██████████| 78304/78304 [00:10<00:00, 7511.84it/s] 
100%|██████████| 78044/78044 [00:07<00:00, 10382.35it/s]
100%|██████████| 226206/226206 [00:24<00:00, 9398.62it/s] 
100%|██████████| 391726/391726 [00:30<00:00, 13022.95it/s]
100%|██████████| 105720/105720 [00:09<00:00, 11343.67it/s]
100%|██████████| 530659/530659 [00:37<00:00, 14269.17it/s]
100%|██████████| 409162/409162 [00:30<00:00, 13255.96it/s]


In [13]:
# List the files in the merged output directory with human-readable sizes.
!ls -lh combine-all

total 24G
-rw-r--r-- 1 ubuntu ubuntu 145K Dec 28 10:28 index.json
-rw-r--r-- 1 ubuntu ubuntu  64M Dec 28 10:24 shard.00000.mds
-rw-r--r-- 1 ubuntu ubuntu  64M Dec 28 10:24 shard.00001.mds
-rw-r--r-- 1 ubuntu ubuntu  64M Dec 28 10:24 shard.00002.mds
-rw-r--r-- 1 ubuntu ubuntu  64M Dec 28 10:24 shard.00003.mds
-rw-r--r-- 1 ubuntu ubuntu  64M Dec 28 10:24 shard.00004.mds
-rw-r--r-- 1 ubuntu ubuntu  64M Dec 28 10:24 shard.00005.mds
-rw-r--r-- 1 ubuntu ubuntu  64M Dec 28 10:24 shard.00006.mds
-rw-r--r-- 1 ubuntu ubuntu  64M Dec 28 10:24 shard.00007.mds
-rw-r--r-- 1 ubuntu ubuntu  64M Dec 28 10:24 shard.00008.mds
-rw-r--r-- 1 ubuntu ubuntu  64M Dec 28 10:24 shard.00009.mds
-rw-r--r-- 1 ubuntu ubuntu  64M Dec 28 10:24 shard.00010.mds
-rw-r--r-- 1 ubuntu ubuntu  64M Dec 28 10:24 shard.00011.mds
-rw-r--r-- 1 ubuntu ubuntu  64M Dec 28 10:24 shard.00012.mds
-rw-r--r-- 1 ubuntu ubuntu  64M Dec 28 10:24 shard.00013.mds
-rw-r--r-- 1 ubuntu ubuntu  64M Dec 28 10:24 shard.00014.mds
-r

In [14]:
# Re-open the merged output as a dataset. This rebinds `dataset`, which the
# merge loop also used for each per-folder dataset.
dataset = LocalDataset(local='combine-all')

In [17]:
# Size of the merged dataset in units of 1e9, counting 4096 per sample.
# NOTE(review): 4096 is presumably the fixed token length of every sample,
# which would make this "billions of tokens" — confirm against tokenization.
sample_count = len(dataset)
sample_count * 4096 / 1e9

12.813959168

In [19]:
# Clone the Hugging Face dataset repository, then copy every file from the
# merged output directory into the clone.
# NOTE(review): no git add/commit/push is visible here, and large shard files
# presumably need git-lfs to be pushed — confirm how the upload is completed.
!git clone https://huggingface.co/datasets/malaysia-ai/mosaic-solar
!cp combine-all/* mosaic-solar

Cloning into 'mosaic-solar'...
remote: Enumerating objects: 3, done.
remote: Total 3 (delta 0), reused 0 (delta 0), pack-reused 3
Unpacking objects: 100% (3/3), 516 bytes | 516.00 KiB/s, done.
