In [1]:
import shutil
from glob import glob

import numpy as np
from streaming import MDSWriter
from streaming import LocalDataset, StreamingDataset
from tqdm import tqdm
from transformers import default_data_collator, DataCollatorForLanguageModeling

In [2]:
# Collect the tokenized index shard folders, then order them by the integer
# after the last '-' so that e.g. tokenized-10 comes after tokenized-9.
folders = glob('tokenized_indexes/tokenized-*')
folders.sort(key=lambda path: int(path.rsplit('-', 1)[-1]))

In [3]:
# Append the extra shard folders after the index shards, ordered by the integer
# after the last '-'. Paths already in `folders` are skipped so that re-running
# this cell does not add the same folders (and later the same samples) twice.
extra_folders = sorted(glob('tokenized_extra/tokenized-*'), key = lambda x: int(x.split('-')[-1]))
folders.extend([path for path in extra_folders if path not in folders])

In [4]:
# Display the combined folder list to check its contents and ordering.
folders

['tokenized_indexes/tokenized-0',
 'tokenized_indexes/tokenized-1',
 'tokenized_indexes/tokenized-2',
 'tokenized_indexes/tokenized-3',
 'tokenized_indexes/tokenized-4',
 'tokenized_extra/tokenized-0',
 'tokenized_extra/tokenized-1',
 'tokenized_extra/tokenized-2',
 'tokenized_extra/tokenized-3',
 'tokenized_extra/tokenized-4',
 'tokenized_extra/tokenized-5',
 'tokenized_extra/tokenized-6',
 'tokenized_extra/tokenized-7',
 'tokenized_extra/tokenized-8',
 'tokenized_extra/tokenized-9',
 'tokenized_extra/tokenized-10',
 'tokenized_extra/tokenized-11',
 'tokenized_extra/tokenized-12']

In [5]:
from streaming.base.format.mds.encodings import Encoding, _encodings

class UInt32(Encoding):
    """MDS column encoding that stores a sample as raw ``uint32`` values.

    ``decode`` always interprets the stored bytes as ``np.uint32``, so
    ``encode`` coerces its input to that dtype to keep the round trip exact.
    """

    def encode(self, obj) -> bytes:
        """Serialize ``obj`` (an array-like of integers) as raw uint32 bytes."""
        # Calling tobytes() directly on an array of another dtype (e.g. int64)
        # would be read back by decode() as a different number of uint32
        # values. The coercion leaves arrays that are already uint32 unchanged.
        return np.asarray(obj, dtype=np.uint32).tobytes()

    def decode(self, data: bytes):
        """Return the uint32 values stored in ``data`` as a 1-D array."""
        return np.frombuffer(data, np.uint32)


# Register the codec under the 'uint32' name so column specs can refer to it.
_encodings['uint32'] = UInt32

In [6]:
# Column schema passed to MDSWriter: every sample has a single 'input_ids'
# field stored with the 'uint32' encoding.
columns = dict(input_ids='uint32')

# Compression codec name and the hash algorithm names for the written shards.
compression = 'zstd'
hashes = ('sha1', 'xxh64')

In [7]:
# Remove any previous output directory so the merge below starts from scratch.
# shutil.rmtree does not depend on a POSIX shell being available, and
# ignore_errors=True keeps the "missing directory is fine" behaviour of `rm -rf`.
shutil.rmtree('combine-all', ignore_errors=True)

In [8]:
# Merge every per-folder MDS dataset into a single dataset under 'combine-all'.
# NOTE(review): compression is passed as None here, so the `compression = 'zstd'`
# value assigned earlier in this notebook is not used — confirm this is intended.
failed_folders = []
with MDSWriter(out='combine-all', columns=columns, compression=None, hashes=hashes) as out:
    for f in folders:
        try:
            dataset = StreamingDataset(local=f)
            for i in tqdm(range(len(dataset)), desc=f):
                out.write(dataset[i])
        except Exception as e:
            # Best-effort merge: keep going with the remaining folders, but
            # record which one failed so a partially copied folder is not
            # silently mistaken for a complete one.
            failed_folders.append(f)
            print(f'failed to copy {f}: {e!r}')

if failed_folders:
    print(f'{len(failed_folders)} folder(s) were not copied completely: {failed_folders}')

Because `predownload` was not specified, it will default to 8*batch_size if batch_size is not None, otherwise 64. Prior to Streaming v0.7.0, `predownload` defaulted to max(batch_size, 256 * batch_size // num_canonical_nodes).
100%|██████████| 395538/395538 [00:41<00:00, 9537.06it/s] 
Because `predownload` was not specified, it will default to 8*batch_size if batch_size is not None, otherwise 64. Prior to Streaming v0.7.0, `predownload` defaulted to max(batch_size, 256 * batch_size // num_canonical_nodes).
100%|██████████| 148712/148712 [00:28<00:00, 5188.04it/s]
Because `predownload` was not specified, it will default to 8*batch_size if batch_size is not None, otherwise 64. Prior to Streaming v0.7.0, `predownload` defaulted to max(batch_size, 256 * batch_size // num_canonical_nodes).
100%|██████████| 48630/48630 [00:05<00:00, 9434.79it/s]
Because `predownload` was not specified, it will default to 8*batch_size if batch_size is not None, otherwise 64. Prior to Streaming v0.7.0, `predown

In [10]:
# Open the merged output directory for inspection. This rebinds `dataset`,
# which the merge loop above also used for the per-folder datasets.
dataset = LocalDataset('combine-all')

In [18]:
# Rough dataset size in billions of tokens: sample count times 8192 tokens per
# sample, divided by 1e9. Assumes every sample holds 8192 token ids; only the
# last sample's length is checked in this notebook — TODO confirm for all samples.
(len(dataset) * 8192) / 1e9

8.809668608

In [12]:
from transformers import AutoTokenizer

# Load the 'google/gemma-2b' tokenizer, used below to decode token ids back
# into text. Presumably the same tokenizer that produced the ids — TODO confirm.
tokenizer = AutoTokenizer.from_pretrained('google/gemma-2b')

In [15]:
# Ratio of whitespace-separated words to token ids in the last sample.
len(tokenizer.decode(dataset[-1]['input_ids']).split()) / len(dataset[-1]['input_ids'])

0.6605224609375

In [16]:
# Decode the last sample back to text as a sanity check of the merged data.
tokenizer.decode(dataset[-1]['input_ids'])

'0.1 cm (height) and the average of two measurements was used in the analyses. The age of the children was calculated in months from their birth dates (from birth certificates or mother’s memory) to the day of data collection. The age, weight and height of the children were then translated into three indices-weight-for-age (HAZ) and weight-for-height (WHZ). The three anthropometric indices were then expressed in terms of Z scores using the ANTHRO programme. The Z scores for all the indices (WAZ, HAZ and WHZ) were also categorized into the following (WHO, 1983): Significant underweight, stunting or wasting: <-2SD of the NCHS median for WAZ, HAZ or WHZ Mildly underweight, stunting or wasting: ≤ -2 SD ≤ x < -1 SD of the NCHS median for WAZ, HAZ or WHZ Zalilah Mohd. Sharif and Ang Merlin Normal: -1 SD ≤ x ≤ 2 SD of the NCHS median for WAZ, HAZ or WHZ High: > 2 SD of the NCHS median for WAZ, HAZ or WHZ Questionnaire A structured questionnaire was used to collect the demographic, socioeconom

In [17]:
# Number of token ids stored in the last sample.
len(dataset[-1]['input_ids'])

8192