In [1]:
from glob import glob
from streaming import MDSWriter
from streaming import LocalDataset, StreamingDataset
from transformers import default_data_collator, DataCollatorForLanguageModeling
from tqdm import tqdm
import numpy as np

In [2]:
# Gather the primary tokenized shard folders and order them by the integer
# that follows the last '-' in each path (so tokenized-10 sorts after tokenized-9).
folders = glob('tokenized_indexes/tokenized-*')
folders.sort(key=lambda path: int(path.rsplit('-', 1)[1]))

In [3]:
# Append the extra shard folders after the primary ones, also ordered by
# their numeric suffix.
# Only paths that are not already in `folders` are added, so re-running this
# cell does not list a folder twice (a duplicated folder would be written
# twice by the merge loop that iterates over `folders`).
extra_folders = sorted(glob('tokenized_extra/tokenized-*'), key = lambda x: int(x.split('-')[-1]))
folders.extend(path for path in extra_folders if path not in folders)

In [4]:
# Display the ordered list of folders that the merge loop will iterate over.
folders

['tokenized_indexes/tokenized-0',
 'tokenized_indexes/tokenized-1',
 'tokenized_indexes/tokenized-2',
 'tokenized_indexes/tokenized-3',
 'tokenized_indexes/tokenized-4',
 'tokenized_indexes/tokenized-5',
 'tokenized_indexes/tokenized-6',
 'tokenized_indexes/tokenized-7',
 'tokenized_indexes/tokenized-8',
 'tokenized_indexes/tokenized-9',
 'tokenized_indexes/tokenized-10',
 'tokenized_indexes/tokenized-11',
 'tokenized_indexes/tokenized-12',
 'tokenized_indexes/tokenized-13',
 'tokenized_indexes/tokenized-14',
 'tokenized_indexes/tokenized-15',
 'tokenized_indexes/tokenized-16',
 'tokenized_indexes/tokenized-17',
 'tokenized_indexes/tokenized-18',
 'tokenized_indexes/tokenized-19',
 'tokenized_indexes/tokenized-20',
 'tokenized_indexes/tokenized-21',
 'tokenized_indexes/tokenized-22',
 'tokenized_indexes/tokenized-23',
 'tokenized_indexes/tokenized-24',
 'tokenized_indexes/tokenized-25',
 'tokenized_indexes/tokenized-26',
 'tokenized_indexes/tokenized-27',
 'tokenized_indexes/tokenized-

In [5]:
from streaming.base.format.mds.encodings import Encoding, _encodings

class UInt32(Encoding):
    """MDS column encoding that stores a sample as the raw bytes of a uint32 array.

    ``encode`` serialises with ``tobytes()`` and ``decode`` reinterprets the
    buffer with ``np.frombuffer(..., np.uint32)``, so the two only round-trip
    correctly when the encoded array really is uint32. ``encode`` therefore
    validates and coerces its input instead of trusting the caller's dtype.
    """

    def encode(self, obj) -> bytes:
        """Return the raw uint32 bytes for ``obj``.

        Args:
            obj: Array-like of integers that fit in an unsigned 32-bit value.

        Returns:
            The bytes of ``obj`` viewed as a uint32 array.

        Raises:
            TypeError: If ``obj`` is non-empty and does not have an integer dtype.
            ValueError: If any value lies outside the uint32 range.
        """
        arr = np.asarray(obj)
        if arr.dtype != np.uint32:
            # decode() always reads 4-byte unsigned items, so any other dtype
            # (e.g. int64 token ids) must be converted before serialising;
            # otherwise the bytes would be silently misinterpreted on read.
            if arr.size == 0:
                arr = arr.astype(np.uint32)
            elif arr.dtype.kind not in 'iu':
                raise TypeError(
                    f'UInt32 encoding expects an integer array, got dtype {arr.dtype}')
            else:
                limits = np.iinfo(np.uint32)
                # Compare as Python ints so narrow dtypes cannot overflow
                # during the range check itself.
                lowest, highest = int(arr.min()), int(arr.max())
                if lowest < limits.min or highest > limits.max:
                    raise ValueError(
                        f'values in [{lowest}, {highest}] do not fit in uint32')
                arr = arr.astype(np.uint32)
        return arr.tobytes()

    def decode(self, data: bytes):
        """Reinterpret ``data`` as a one-dimensional uint32 array.

        Args:
            data: Bytes previously produced by ``encode``.

        Returns:
            The array built by ``np.frombuffer(data, np.uint32)``.
        """
        return np.frombuffer(data, np.uint32)

# Register the encoding under the name used in the `columns` spec.
_encodings['uint32'] = UInt32

In [6]:
# Column schema for the writer: one column, 'input_ids', stored with the
# 'uint32' encoding.
columns = dict(input_ids='uint32')

# NOTE(review): the MDSWriter call in the visible notebook passes
# compression=None, so this value appears to be unused — confirm intent.
compression = 'zstd'

# Hash algorithm names handed to the writer.
hashes = ('sha1', 'xxh64')

In [7]:
# Remove any previous 'combine-all' output directory before writing it again.
# NOTE(review): presumably needed so the writer starts from an empty directory — confirm.
!rm -rf combine-all

In [8]:
# Merge every dataset listed in `folders` into a single MDS dataset at
# 'combine-all', copying samples one by one in folder order.
#
# The merge is best-effort: a folder that raises is reported and skipped so the
# remaining folders are still processed. Samples written before the failure
# stay in the output, so each failure is recorded together with the number of
# samples already copied from that folder.
#
# NOTE(review): `compression` is set to 'zstd' in the config cell, but None is
# passed here — confirm which one is intended.
failed_folders = []
with MDSWriter(out='combine-all', columns=columns, compression=None, hashes=hashes) as out:
    for folder in folders:
        written = 0
        try:
            dataset = StreamingDataset(local=folder)
            for i in tqdm(range(len(dataset)), desc=folder):
                out.write(dataset[i])
                written += 1
        except Exception as e:
            # Keep going, but say which folder failed and how far it got
            # instead of printing a bare, context-free message.
            failed_folders.append((folder, written, e))
            print(f'{folder}: failed after {written} samples: {e!r}')

if failed_folders:
    print(f'{len(failed_folders)} of {len(folders)} folders were not merged completely:')
    for folder, written, error in failed_folders:
        print(f'  {folder} ({written} samples written): {error!r}')

Because `predownload` was not specified, it will default to 8*batch_size if batch_size is not None, otherwise 64. Prior to Streaming v0.7.0, `predownload` defaulted to max(batch_size, 256 * batch_size // num_canonical_nodes).
100%|██████████| 11787/11787 [00:01<00:00, 10289.32it/s]
Because `predownload` was not specified, it will default to 8*batch_size if batch_size is not None, otherwise 64. Prior to Streaming v0.7.0, `predownload` defaulted to max(batch_size, 256 * batch_size // num_canonical_nodes).
100%|██████████| 12557/12557 [00:01<00:00, 10118.58it/s]
Because `predownload` was not specified, it will default to 8*batch_size if batch_size is not None, otherwise 64. Prior to Streaming v0.7.0, `predownload` defaulted to max(batch_size, 256 * batch_size // num_canonical_nodes).
100%|██████████| 28925/28925 [00:03<00:00, 8495.45it/s] 
Because `predownload` was not specified, it will default to 8*batch_size if batch_size is not None, otherwise 64. Prior to Streaming v0.7.0, `predownlo

In [9]:
# Reopen the merged 'combine-all' output so it can be inspected below.
dataset = LocalDataset('combine-all')

In [10]:
# Estimated corpus size in billions of tokens.
# Assumes every sample holds 8192 token ids — only the last sample's length is
# checked in this notebook; TODO confirm for the rest.
tokens_per_sample = 8192
len(dataset) * tokens_per_sample / 1e9

9.487917056

In [14]:
from transformers import AutoTokenizer

# Load the tokenizer used to decode stored token ids back into text.
tokenizer = AutoTokenizer.from_pretrained('meta-llama/Meta-Llama-3-8B')

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [15]:
# Sanity check: decode the last merged sample back into text.
last_input_ids = dataset[-1]['input_ids']
tokenizer.decode(last_input_ids)

" parking lot occupancy across various levels of analysis. The utilization of distribution or mapping maps notably simplifies the tracking of intricate data sets. Therefore, the formulation of distribution maps for parking lot occupancy proves imperative in addressing traffic congestion and related inconveniences. Taufiq et. al., Malaysian Journal of Computing, 8 (2): 1639-1651, 2023 1640 Employing a GIS-based approach to parking spot identification could also contribute insights into the realms of traffic congestion and pedestrian safety within urban settings. Keywords: Availability, Geographical Information System (GIS), Parking Spaces. Received for review: 02-10-2022; Accepted: 18-09-2023; Published: 10-10-2023 DOI: 10.24191/mjoc.v8i2.24075 1. Introduction According to a press release issued on April 16, 2014, by market research agency Nielsen, Malaysia has the third-highest rate of car ownership in the world, with 93 percent of households owning a car. That is to say, only 7 percen

In [16]:
# Number of token ids stored in the last merged sample.
final_input_ids = dataset[-1]['input_ids']
len(final_input_ids)

8192