In [None]:
# Use the %pip magic (not !pip) so the package is installed into the kernel's own environment.
%pip install datasets

In [2]:
import os
import multiprocessing as mp
import numpy as np
from datasets import load_dataset
from tqdm import tqdm
import json

In [3]:
class TrieNode:
    """A single node of the token trie.

    A node marks the end of a vocabulary entry when ``token`` is set; in that
    case ``token_id`` holds the id stored for that entry.
    """

    def __init__(self):
        self.children = {}     # next character -> child TrieNode
        self.token_id = None   # id of the token ending here, if any
        self.token = None      # full token string ending here, if any


class Trie:
    """Character trie used for greedy longest-prefix token lookup."""

    def __init__(self):
        self.root = TrieNode()

    def insert(self, token, token_id):
        """Register ``token`` under ``token_id``, creating nodes as needed."""
        node = self.root
        for char in token:
            if char not in node.children:
                node.children[char] = TrieNode()
            node = node.children[char]
        node.token_id = token_id
        node.token = token

    def search(self, text, start_pos):
        """Find the longest token that starts at ``text[start_pos]``.

        Returns:
            ``(token_id, token)`` of the longest match, or ``(None, None)``
            when no token matches, including when ``start_pos`` is at or past
            the end of ``text`` (e.g. for an empty string).
        """
        match_token, match_token_id = None, None
        node = self.root
        # Iterating over a range keeps every index in bounds, so an empty text
        # or an end-of-text start position yields "no match" instead of an
        # IndexError.
        for pos in range(start_pos, len(text)):
            node = node.children.get(text[pos])
            if node is None:
                break
            if node.token:
                # remember the longest complete token seen so far
                match_token = node.token
                match_token_id = node.token_id
        return match_token_id, match_token

def bytes_to_unicode():
    """Build a mapping from each of the 256 byte values to a single character.

    Bytes in the three ranges ``!``-``~``, ``¡``-``¬`` and ``®``-``ÿ`` map to
    the character with the same code point. Every other byte is assigned, in
    ascending byte order, the next unused code point starting at 256.
    """
    self_mapped = [
        *range(ord("!"), ord("~") + 1),
        *range(ord("¡"), ord("¬") + 1),
        *range(ord("®"), ord("ÿ") + 1),
    ]
    table = {byte: chr(byte) for byte in self_mapped}
    shifted = 0
    for byte in range(2**8):
        if byte in table:
            continue
        # remaining bytes get consecutive code points above the byte range
        table[byte] = chr(2**8 + shifted)
        shifted += 1
    return table


def pack_token(id, space, upper):
    """Combine a base token id with its two variant flags into one integer.

    The id is shifted left by two bits; ``space`` is added at bit 1 and
    ``upper`` at bit 0.
    """
    id_part = id << 2
    space_part = space << 1
    upper_part = upper << 0
    return id_part + space_part + upper_part

def upper_first(text):
    """Return ``text`` with its first character upper-cased.

    Slicing (rather than indexing) is used so that an empty string is returned
    unchanged instead of raising ``IndexError``.
    """
    return text[:1].upper() + text[1:]

def expand_vocab(vocab, max_vocab_size):
    """Expand the first ``max_vocab_size`` entries of ``vocab`` into four variants each.

    For every ``token -> id`` entry the result maps packed ids (see
    ``pack_token``) to the token string in four forms: capitalised,
    ``Ġ`` + capitalised, as-is, and ``Ġ`` + as-is.
    """
    expanded = {}
    for index, (token, id) in enumerate(vocab.items()):
        if index >= max_vocab_size:
            break
        # capitalised variants are inserted first, then the unchanged ones
        for upper, surface in ((True, upper_first(token)), (False, token)):
            expanded[pack_token(id, space=False, upper=upper)] = f"{surface}"
            expanded[pack_token(id, space=True, upper=upper)] = f"Ġ{surface}"
    return expanded


class SpaceTokenizer():
    """Greedy longest-match tokenizer over an expanded vocabulary.

    Text is first converted to its UTF-8 bytes, each byte is mapped to a
    character via ``bytes_to_unicode``, and the resulting string is split by
    repeatedly taking the longest vocabulary entry found in a trie.
    """

    def __init__(self, vocab_config, vocab_size=None):
        """Build the lookup tables and the trie.

        Args:
            vocab_config: mapping of base token string -> base token id.
            vocab_size: number of leading entries of ``vocab_config`` to use;
                defaults to all of them.
        """
        self.byte_encoder = bytes_to_unicode()
        self.byte_decoder = {v:k for k, v in self.byte_encoder.items()}

        vocab_size = len(vocab_config) if vocab_size is None else vocab_size
        self.vocab_decode = expand_vocab(vocab_config, max_vocab_size=vocab_size)
        # Inverse map; if several ids share the same string, the last one
        # inserted into vocab_decode wins.
        self.vocab = {v:k for k,v in self.vocab_decode.items()}

        self.trie = Trie()
        for token, token_id in self.vocab.items():
            self.trie.insert(token, token_id)

    def encode(self, text, return_token_tuple=False):
        """Tokenize ``text``.

        Returns:
            The list of token ids, or ``(ids, tokens)`` when
            ``return_token_tuple`` is true. Empty input yields empty lists.

        Raises:
            Exception: if no vocabulary entry matches at some position.
        """
        text = ''.join(self.byte_encoder[b] for b in text.encode('utf-8'))
        pos = 0
        ids, tokens = [], []
        # Checking the bound up front means empty input never reaches the trie.
        while pos < len(text):
            id, token = self.trie.search(text, pos)
            if id is None or token is None:
                raise Exception(f"Error encoding {text[pos:pos+16]}")
            ids.append(id)
            tokens.append(token)
            pos += len(token)
        return (ids, tokens) if return_token_tuple else ids

    def decode(self, ids):
        """Turn token ids back into text.

        Bytes that do not form valid UTF-8 are replaced rather than raising.

        Raises:
            Exception: if an id is not part of the vocabulary.
        """
        pieces = []
        for id in ids:
            if id not in self.vocab_decode:
                raise Exception(f"Error decoding {id}")
            pieces.append(self.vocab_decode[id])
        out = "".join(pieces)
        return bytearray([self.byte_decoder[c] for c in out]).decode('utf-8', errors="replace")


In [4]:
# Read the tokenizer vocabulary (JSON) from the mounted Drive folder.
with open(
    '/content/drive/MyDrive/Colab Notebooks/space/tokenizer-space-20k-rs.json',
    'r',
    encoding='utf-8',
) as f:
    tokenizer_config = json.load(f)

# Number of base vocabulary entries to keep before expansion.
# NOTE(review): presumably 20k learned tokens plus byte/special tokens — confirm.
vocab_size = 20000 + 257
tokenizer = SpaceTokenizer(tokenizer_config, vocab_size)
# Packed id of the document separator, used when tokenizing documents.
eot = tokenizer.vocab["<|endoftext|>"]

In [12]:
# Alternative reference tokenizer loaded from a tokenizer JSON file.
# NOTE: this rebinds `eot`, which the SpaceTokenizer cell above also sets, so
# whichever of the two cells ran last decides the value used when tokenizing.
from tokenizers import Tokenizer
tokenizer_raw = Tokenizer.from_file("/content/drive/MyDrive/Colab Notebooks/space/tokenizer-ref-20k.json")
eot = tokenizer_raw.token_to_id("<|endoftext|>")

class GPTTokenizer():
    """Adapter exposing the same ``encode``/``decode`` interface as ``SpaceTokenizer``.

    All work is delegated to the wrapped ``tokenizer`` object.
    """

    def __init__(self, tokenizer):
        self.tokenizer = tokenizer

    def encode(self, text, return_token_tuple=False):
        """Return the ids for ``text``, or ``(ids, tokens)`` if ``return_token_tuple`` is true."""
        encoding = self.tokenizer.encode(text)
        if return_token_tuple:
            return (encoding.ids, encoding.tokens)
        return encoding.ids

    def decode(self, ids):
        """Return the text the wrapped tokenizer produces for ``ids``."""
        decoded_text = self.tokenizer.decode(ids)
        return decoded_text

# NOTE: rebinds the global `tokenizer`; run this cell only when the reference
# tokenizer (not the SpaceTokenizer instance) should be used by later cells.
tokenizer = GPTTokenizer(tokenizer_raw)

In [5]:
# Round-trip check: encode a sample sentence, decode the ids and compare with the input.
text = "The quick brown Fox jumps 1234 OVER the lazy Dog."
ids, tokens = tokenizer.encode(text, return_token_tuple=True)
decoded = tokenizer.decode(ids)

# one value per line: input, ids, token strings, decoded text, equality flag
print(text, ids, tokens, decoded, decoded == text, sep="\n")

The quick brown Fox jumps 1234 OVER the lazy Dog.
[1029, 14478, 3922, 6483, 57934, 32402, 80, 319, 345, 28560, 1030, 27862, 356, 8403, 56]
['The', 'Ġquick', 'Ġbrown', 'ĠFox', 'Ġjumps', 'Ġ123', '4', 'ĠO', 'V', 'ER', 'Ġthe', 'Ġlaz', 'y', 'ĠDog', '.']
The quick brown Fox jumps 1234 OVER the lazy Dog.
True


In [None]:
# Which dataset to download and how large each output shard should be.
remote_path = "HuggingFaceFW/fineweb-edu" # "stas/openwebtext-10k"
remote_name = "sample-10BT" # None
local_dir = "edu_fineweb10B"
shard_size = 100_000_000 # 100M tokens per shard

# Fetch the training split and report how many documents it contains.
dataset = load_dataset(remote_path, name=remote_name, split="train")
print(len(dataset))

README.md:   0%|          | 0.00/23.3k [00:00<?, ?B/s]

Resolving data files:   0%|          | 0/1630 [00:00<?, ?it/s]

000_00000.parquet:   0%|          | 0.00/2.15G [00:00<?, ?B/s]

001_00000.parquet:   0%|          | 0.00/2.15G [00:00<?, ?B/s]

002_00000.parquet:   0%|          | 0.00/2.15G [00:00<?, ?B/s]

003_00000.parquet:   0%|          | 0.00/2.15G [00:00<?, ?B/s]

004_00000.parquet:   0%|          | 0.00/2.15G [00:00<?, ?B/s]

005_00000.parquet:   0%|          | 0.00/2.15G [00:00<?, ?B/s]

006_00000.parquet:   0%|          | 0.00/2.15G [00:00<?, ?B/s]

007_00000.parquet:   0%|          | 0.00/2.15G [00:00<?, ?B/s]

008_00000.parquet:   0%|          | 0.00/2.15G [00:00<?, ?B/s]

In [7]:
# Keep only the first quarter of the documents for experimental tokenizations.
# NOTE: this rebinds `dataset`, so re-running the cell shrinks it again;
# reload the dataset first if the cell has to be executed a second time.
dataset = dataset.select(range(len(dataset) // 4))

In [None]:
# From https://github.com/karpathy/build-nanogpt/blob/master/fineweb.py

# Directory that receives the shard files written by the sharding loop below.
DATA_CACHE_DIR = "/content/data/"
# exist_ok=True keeps this cell re-runnable when the directory already exists.
os.makedirs(DATA_CACHE_DIR, exist_ok=True)

def tokenize(doc):
    """Tokenize a single document into a numpy array of uint32 token ids.

    The ids produced by the global ``tokenizer`` for ``doc["text"]`` are
    prefixed with the global ``eot`` id, which delimits documents.

    Raises:
        AssertionError: if any id falls outside the uint32 range.
    """
    tokens = [eot] # the special <|endoftext|> token delimits all documents
    tokens.extend(tokenizer.encode(doc["text"]))
    tokens_np = np.array(tokens)
    # Range-check before casting so an out-of-range id cannot wrap around silently.
    assert (0 <= tokens_np).all() and (tokens_np < 2**32).all(), "token dictionary too large for uint32"
    tokens_np_uint32 = tokens_np.astype(np.uint32)
    return tokens_np_uint32

def write_datafile(filename, tokens_np):
    """Print the dtype of ``tokens_np`` and store the array via ``np.save``.

    ``np.save`` appends the ``.npy`` extension when ``filename`` lacks it.
    """
    array_dtype = tokens_np.dtype
    print(array_dtype)
    np.save(filename, tokens_np)

# tokenize all documents and write output shards, each of shard_size tokens (last shard has remainder)

def _shard_filename(shard_index):
    # shard 0 is written as the "val" split, every later shard as "train"
    split = "val" if shard_index == 0 else "train"
    return os.path.join(DATA_CACHE_DIR, f"edufineweb_{split}_{shard_index:06d}")

def _new_progress_bar(shard_index):
    # one progress bar per shard, counting tokens up to shard_size
    return tqdm(total=shard_size, unit="tokens", desc=f"Shard {shard_index}", mininterval=10)

# os.cpu_count() may return None; fall back to a single worker in that case
nprocs = max(1, (os.cpu_count() or 1)//2)
with mp.Pool(nprocs) as pool:
    shard_index = 0
    # preallocate buffer to hold current shard
    all_tokens_np = np.empty((shard_size,), dtype=np.uint32)
    token_count = 0
    # invariant: progress_bar is None only while the current shard is still empty
    progress_bar = None
    for tokens in pool.imap(tokenize, dataset, chunksize=16):
        # While the document does not fit, top up the current shard, write it
        # and carry on with the rest. Looping (instead of a single split) also
        # handles a document that is longer than a whole shard.
        while token_count + len(tokens) >= shard_size:
            if progress_bar is None:
                # a shard boundary can be reached before any bar was created
                progress_bar = _new_progress_bar(shard_index)
            remainder = shard_size - token_count
            all_tokens_np[token_count:token_count+remainder] = tokens[:remainder]
            progress_bar.update(remainder)
            progress_bar.close()
            progress_bar = None
            write_datafile(_shard_filename(shard_index), all_tokens_np)
            shard_index += 1
            # the leftovers of the current doc go to the next shard
            tokens = tokens[remainder:]
            token_count = 0

        if len(tokens) > 0:
            # simply append tokens to current shard
            if progress_bar is None:
                progress_bar = _new_progress_bar(shard_index)
            all_tokens_np[token_count:token_count+len(tokens)] = tokens
            token_count += len(tokens)
            progress_bar.update(len(tokens))

    # write any remaining tokens as the last shard
    if token_count != 0:
        write_datafile(_shard_filename(shard_index), all_tokens_np[:token_count])
    if progress_bar is not None:
        progress_bar.close()

In [21]:
# Bundle the shard directory into one archive; -r recurses, so hidden
# sub-directories such as .ipynb_checkpoints/ are included as well.
!zip -r /content/dataset-space-20k-rs-10BT.zip /content/data

  adding: content/data/ (stored 0%)
  adding: content/data/edufineweb_train_000007.npy (deflated 60%)
  adding: content/data/edufineweb_train_000005.npy (deflated 60%)
  adding: content/data/edufineweb_train_000006.npy (deflated 60%)
  adding: content/data/edufineweb_train_000003.npy (deflated 60%)
  adding: content/data/edufineweb_train_000023.npy (deflated 60%)
  adding: content/data/edufineweb_train_000019.npy (deflated 60%)
  adding: content/data/edufineweb_train_000018.npy (deflated 60%)
  adding: content/data/edufineweb_train_000002.npy (deflated 60%)
  adding: content/data/edufineweb_train_000012.npy (deflated 60%)
  adding: content/data/edufineweb_train_000024.npy (deflated 60%)
  adding: content/data/.ipynb_checkpoints/ (stored 0%)
  adding: content/data/edufineweb_train_000021.npy (deflated 60%)
  adding: content/data/edufineweb_train_000009.npy (deflated 60%)
  adding: content/data/edufineweb_train_000014.npy (deflated 60%)
  adding: content/data/edufineweb_train_000013.npy 

In [23]:
# Copy the archive into the Drive folder that also holds the tokenizer files.
!cp "/content/dataset-space-20k-rs-10BT.zip" "/content/drive/MyDrive/Colab Notebooks/space/"