In [1]:
# !pip3 install torch --index-url https://download.pytorch.org/whl/cpu --user
# !pip3 install mosaicml-streaming --user
# !pip3 install transformers --user

In [10]:
from glob import glob
from tqdm import tqdm
import json
import os
import numpy as np
from streaming import MDSWriter
from transformers import AutoTokenizer

In [11]:
from streaming.base.format.mds.encodings import Encoding, _encodings

class UInt16(Encoding):
    """
    MDS column codec that stores a NumPy array as its raw ``uint16`` bytes.

    `decode` always reinterprets the stored bytes as ``uint16``, so `encode`
    makes sure that is really what gets written.
    """

    def encode(self, obj) -> bytes:
        """
        Serialize `obj` to raw uint16 bytes.

        Parameters
        ----------
        obj : array-like of integers
            Values to store. A ``uint16`` ndarray is written as-is; any other
            integer input is range-checked and converted first.

        Returns
        -------
        bytes
            The uint16 buffer of the (possibly converted) array.

        Raises
        ------
        TypeError
            If `obj` does not hold integers.
        ValueError
            If any value falls outside the uint16 range.
        """
        arr = np.asarray(obj)
        if arr.dtype != np.uint16:
            # Writing e.g. int64 bytes and reading them back as uint16 would
            # silently produce garbage, so convert explicitly and loudly.
            if not np.issubdtype(arr.dtype, np.integer):
                raise TypeError(f'UInt16 encoding expects integer data, got dtype {arr.dtype}')
            limit = np.iinfo(np.uint16).max
            if arr.size and (arr.min() < 0 or arr.max() > limit):
                raise ValueError(f'values do not fit in uint16 (allowed range 0..{limit})')
            arr = arr.astype(np.uint16)
        return arr.tobytes()

    def decode(self, data: bytes):
        """
        Rebuild the array from bytes produced by `encode`.

        Returns a 1-D ``uint16`` array. `np.frombuffer` does not copy, so the
        result is a view over `data`; copy it before mutating.
        """
        return np.frombuffer(data, np.uint16)

# Store the codec class in streaming's `_encodings` table under the key 'uint16',
# replacing whatever entry may already exist under that key. The key is the same
# string used as the column type in `columns` further down.
# NOTE(review): `_encodings` is a private name of the streaming package; presumably this is
# how MDSWriter / StreamingDataset resolve the column type — confirm against the installed version.
_encodings['uint16'] = UInt16

In [3]:
block_size = 256

def read_dataset(train_file, block_size = block_size, tokenizer = None):
    """
    Stream fixed-length blocks of token ids from a JSON-lines file.

    Each non-blank line of `train_file` is decoded with `json.loads` and the
    decoded value is handed straight to the tokenizer. The resulting
    `input_ids` of all lines are concatenated, in file order, into one running
    buffer; whenever the buffer holds at least `block_size` ids, full blocks
    are cut off the front and yielded. Documents are therefore packed back to
    back, and a block may span several lines.

    Tokens left over at the end of the file (fewer than `block_size`) are
    dropped.

    Parameters
    ----------
    train_file : str
        Path to a UTF-8 JSON-lines file.
    block_size : int
        Number of token ids per yielded block. Must be positive.
    tokenizer : callable, optional
        Called as ``tokenizer(record)['input_ids']``. When omitted, the
        'malaysia-ai/bpe-tokenizer' checkpoint is loaded through
        `AutoTokenizer`, with `add_bos_token` / `add_eos_token` set to False.
        A tokenizer supplied by the caller is used untouched.

    Yields
    ------
    numpy.ndarray
        1-D array of dtype ``uint16`` and length `block_size`.

    Raises
    ------
    ValueError
        If `block_size` is not positive, or a token id does not fit in uint16.
    """
    if block_size <= 0:
        # A non-positive size would make the chunking loop spin forever on empty blocks.
        raise ValueError(f'block_size must be positive, got {block_size}')

    if tokenizer is None:
        tokenizer = AutoTokenizer.from_pretrained(
            'malaysia-ai/bpe-tokenizer',
        )
        # Presumably stops the tokenizer from inserting BOS/EOS ids around every line — TODO confirm.
        tokenizer.add_bos_token = False
        tokenizer.add_eos_token = False

    max_token_id = np.iinfo(np.uint16).max
    buffer = []
    # Explicit encoding: the corpus contains non-ASCII text and must not depend on the locale.
    with open(train_file, encoding = 'utf-8') as fopen:
        for line in fopen:
            if not line.strip():
                # Tolerate blank / trailing empty lines instead of failing in json.loads.
                continue
            token_ids = list(tokenizer(json.loads(line))['input_ids'])
            if token_ids and (min(token_ids) < 0 or max(token_ids) > max_token_id):
                # Casting to uint16 would wrap around silently and corrupt the dataset.
                raise ValueError(f'token id outside the uint16 range 0..{max_token_id}')
            buffer.extend(token_ids)

            # Emit every complete block, then drop the consumed prefix once
            # (instead of re-slicing the whole buffer after each block).
            n_full = len(buffer) // block_size
            for i in range(n_full):
                yield np.array(buffer[i * block_size:(i + 1) * block_size], dtype = np.uint16)
            if n_full:
                del buffer[:n_full * block_size]

In [6]:
# !wget https://huggingface.co/datasets/malaysia-ai/dedup-text-dataset/resolve/main/wikipedia-2023-10-01.jsonl

In [7]:
# Smoke test: pull only the first block from the generator and display it.
# Expects 'wikipedia-2023-10-01.jsonl' in the working directory (see the wget line above).
next(read_dataset('wikipedia-2023-10-01.jsonl'))

array([ 3536,  3146,   352,    55, 13338, 29575,    29, 30871,  1512,
       27499,  7928,   779,    30,   383,   884,   272,    29, 18963,
         101,   119,   170,   102,   227,   170,   101,   124, 18963,
         101,   120,   170,   102,   239,   170,   101,   126,   170,
         101,   127,   170,   102,   234,    12,  2150,  2200,   939,
         918,  2309,    16,  9584,  3146,    16, 10115,  4978,   359,
         295,  1676,  2802,  2309,  7724,  1424,    15,   313,  1427,
        2309,  7159,   295, 21329,    15,  1627,    15,   726,   323,
        9291,    15,  1313,   847,   327,  3814,   295,  4528,   287,
         413,   661,    72,   323,  7148,  6774,   295,   332,   371,
       29887,  1056, 19997,   323,  8466,    17,  7902, 29589,  2309,
        3146, 18769,    76,   822,   918,   411,  2129,  2655, 29589,
         352,  5090,  1810, 25920,  2655,   743,  1168, 10643,  2309,
        1627,    12,   510, 17182,  2604,  6514, 12048,  4225, 13335,
          17,   224,

In [9]:
# Settings handed to MDSWriter below.
# One column per sample: 'input_ids', typed with the 'uint16' encoding key.
columns = dict(input_ids='uint16')
# Compression name and hash algorithm names passed through to the writer.
compression = 'zstd'
hashes = ('sha1', 'xxh64')

In [10]:
# Output location passed to MDSWriter as `out`; a relative path, so it resolves
# against the notebook's working directory.
out_root = 'indexed'

In [11]:
# Tokenize the corpus lazily and write every block as one MDS sample.
# The generator does no work until the loop starts pulling from it.
token_blocks = read_dataset(train_file = 'wikipedia-2023-10-01.jsonl')
with MDSWriter(out=out_root, columns=columns, compression=compression, hashes=hashes) as out:
    for block in tqdm(token_blocks):
        # Each sample has a single field matching the `columns` spec.
        out.write({'input_ids': block})

249563it [04:28, 929.51it/s] 


In [5]:
import os

# Set the S3 endpoint override for this process before the dataset is opened.
# NOTE(review): presumably read by streaming's S3 download path so that `s3://` URLs
# go to the MinIO service at this address instead of AWS — confirm. The host name
# 'minio' only resolves inside the environment this notebook was run in.
os.environ['S3_ENDPOINT_URL'] = 'http://minio:9000'

In [29]:
import streaming

# Call streaming's shared-memory cleanup helper before creating a StreamingDataset.
# NOTE(review): presumably needed after an interrupted / re-run kernel left shared-memory
# segments behind — confirm. Relies on `streaming.base.util` being reachable as an
# attribute after a plain `import streaming`.
streaming.base.util.clean_stale_shared_memory()

In [30]:
from streaming import StreamingDataset

# Remote location the dataset is read from.
# NOTE(review): the writer cell above writes to the local directory 'indexed'; no cell shown
# here uploads it to this bucket, so that step presumably happens outside the notebook — confirm.
remote_dir = 's3://train/indexed'

# Local directory passed to StreamingDataset as `local` (relative to the working directory).
local_dir = 'local_dir'
dataset = StreamingDataset(local=local_dir, remote=remote_dir)