<a href="https://colab.research.google.com/github/omid-sar/End_to_End_GPT2/blob/main/src/GPT2/research/tokenize_and_shard_wikitext_dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install --q tiktoken
!pip install --q datasets
!pip install --q tqdm

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.8/40.8 MB[0m [31m12.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.9/64.9 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m14.0 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of 

In [4]:
import os
import multiprocessing as mp
import numpy as np
import tiktoken
from datasets import load_dataset # pip install datasets
from tqdm import tqdm # pip install tqdm

# ------------------------------------------
local_dir = "WIKI"
remote_name = "wikitext-2-raw-v1" #"sample-10BT"
shard_size = int(1e6) # 100M tokens per shard, total of 100 shards

# create the cache the local directory if it doesn't exist yet
DATA_CACHE_DIR = os.path.join(os.getcwd(), local_dir)
os.makedirs(DATA_CACHE_DIR, exist_ok=True)

# download the dataset
fw = load_dataset("wikitext", name=remote_name, split="train")

# init the tokenizer
enc = tiktoken.get_encoding("gpt2")
eot = enc._special_tokens['<|endoftext|>'] # end of text token
def tokenize(doc):
    # tokenizes a single document and returns a numpy array of uint16 tokens
    tokens = [eot] # the special <|endoftext|> token delimits all documents
    tokens.extend(enc.encode_ordinary(doc["text"]))
    tokens_np = np.array(tokens)
    assert (0 <= tokens_np).all() and (tokens_np < 2**16).all(), "token dictionary too large for uint16"
    tokens_np_uint16 = tokens_np.astype(np.uint16)
    return tokens_np_uint16

def write_datafile(filename, tokens_np):
    np.save(filename, tokens_np)

# tokenize all documents and write output shards, each of shard_size tokens (last shard has remainder)
nprocs = max(1, os.cpu_count()//2)
with mp.Pool(nprocs) as pool:
    shard_index = 0
    # preallocate buffer to hold current shard
    all_tokens_np = np.empty((shard_size,), dtype=np.uint16)
    token_count = 0
    progress_bar = None
    for tokens in pool.imap(tokenize, fw, chunksize=16):

        # is there enough space in the current shard for the new tokens?
        if token_count + len(tokens) < shard_size:
            # simply append tokens to current shard
            all_tokens_np[token_count:token_count+len(tokens)] = tokens
            token_count += len(tokens)
            # update progress bar
            if progress_bar is None:
                progress_bar = tqdm(total=shard_size, unit="tokens", desc=f"Shard {shard_index}")
            progress_bar.update(len(tokens))
        else:
            # write the current shard and start a new one
            split = "val" if shard_index == 0 else "train"
            filename = os.path.join(DATA_CACHE_DIR, f"edufineweb_{split}_{shard_index:06d}")
            # split the document into whatever fits in this shard; the remainder goes to next one
            remainder = shard_size - token_count
            progress_bar.update(remainder)
            all_tokens_np[token_count:token_count+remainder] = tokens[:remainder]
            write_datafile(filename, all_tokens_np)
            shard_index += 1
            progress_bar = None
            # populate the next shard with the leftovers of the current doc
            all_tokens_np[0:len(tokens)-remainder] = tokens[remainder:]
            token_count = len(tokens)-remainder

    # write any remaining tokens as the last shard
    if token_count != 0:
        split = "val" if shard_index == 0 else "train"
        filename = os.path.join(DATA_CACHE_DIR, f"edufineweb_{split}_{shard_index:06d}")
        write_datafile(filename, all_tokens_np[:token_count])


Downloading data:   0%|          | 0.00/733k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/6.36M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/657k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/4358 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/36718 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3760 [00:00<?, ? examples/s]

Shard 0:  98%|█████████▊| 983218/1000000 [00:01<00:00, 570179.42tokens/s]
Shard 1:   0%|          | 0/1000000 [00:00<?, ?tokens/s][A
Shard 0: 100%|██████████| 1000000/1000000 [00:01<00:00, 507907.05tokens/s]

Shard 1:  12%|█▏        | 122190/1000000 [00:00<00:01, 541129.05tokens/s][A
Shard 1:  18%|█▊        | 180501/1000000 [00:00<00:01, 537142.70tokens/s][A
Shard 1:  24%|██▍       | 243450/1000000 [00:00<00:01, 562751.29tokens/s][A
Shard 1:  30%|███       | 300038/1000000 [00:00<00:01, 558931.71tokens/s][A
Shard 1:  36%|███▌      | 356186/1000000 [00:00<00:01, 556650.86tokens/s][A
Shard 1:  42%|████▏     | 421495/1000000 [00:00<00:00, 586972.76tokens/s][A
Shard 1:  48%|████▊     | 480469/1000000 [00:00<00:00, 577668.71tokens/s][A
Shard 1:  54%|█████▍    | 543390/1000000 [00:00<00:00, 591365.75tokens/s][A
Shard 1:  60%|██████    | 602703/1000000 [00:01<00:00, 549206.24tokens/s][A
Shard 1:  66%|██████▋   | 662845/1000000 [00:01<00:00, 560907.76tokens/s][A
Shard 1:  73%|██████

In [3]:
import os
import multiprocessing as mp
import numpy as np
import tiktoken
from datasets import load_dataset
from tqdm import tqdm
from pathlib import Path




class Config:
    def __init__(self):
        self.root_dir = 'artifacts/data_transformation'
        self.dataset_name = 'wikitext-2-raw-v1'
        self.dataset = 'wikitext'
        self.downloaded_files = 'artifacts/data_ingestion/data'
        self.local_data_file = 'artifacts/data_transformation/data'
        self.shard_size = 1000000  # 1M tokens per shard

class DataTokenizer:
    def __init__(self, config):
        self.config = config
        self.transformed_file_path = Path(os.path.join(self.config.local_data_file, self.config.dataset_name))
        file_path = Path(os.path.join(config.downloaded_files, config.dataset_name))

        try:
            print(f"Loading dataset from cache at {file_path}")
            self.dataset = load_dataset(config.dataset, name=config.dataset_name, split="train", cache_dir=str(file_path))
        except Exception as e:
            print(f"Error loading cached data: {e}")
            return None

        self.enc = tiktoken.get_encoding('gpt2')
        self.eot = self.enc._special_tokens['<|endoftext|>']  # end of text token

    def check_existing_tokenized_data(self):
        if not os.path.exists(self.transformed_file_path):
            print(f"No tokenized data directory found at {self.transformed_file_path}. Starting tokenization.")
            return False
        files = [os.path.join(self.transformed_file_path, f) for f in os.listdir(self.transformed_file_path) if f.endswith('.npy')]
        if files:
            print(f"Found {len(files)} pre-tokenized shards in {self.transformed_file_path}. Skipping tokenization.")
            return True
        return False

    def tokenize(self, doc):
        tokens = [self.eot]  # the special <|endoftext|> token delimits all documents
        tokens.extend(self.enc.encode_ordinary(doc["text"]))
        tokens_np = np.array(tokens)
        assert (0 <= tokens_np).all() and (tokens_np < 2**16).all(), "token dictionary too large for uint16"
        tokens_np_uint16 = tokens_np.astype(np.uint16)
        return tokens_np_uint16

    def write_datafile(self, filename, tokens_np):
        np.save(filename, tokens_np)

    def process_documents(self):
        if self.check_existing_tokenized_data():
            return  # Skip documents tokenizer if already exist!

        os.makedirs(self.transformed_file_path, exist_ok=True)
        print(f"Created directory at: {self.transformed_file_path}")

        nprocs = max(1, os.cpu_count()//2)
        with mp.Pool(nprocs) as pool:
            shard_index = 0
            all_tokens_np = np.empty((self.config.shard_size,), dtype=np.uint16)
            token_count = 0
            progress_bar = None

            for tokens in pool.imap(self.tokenize, self.dataset, chunksize=16):
                if token_count + len(tokens) < self.config.shard_size:
                    all_tokens_np[token_count:token_count+len(tokens)] = tokens
                    token_count += len(tokens)
                    if progress_bar is None:
                        progress_bar = tqdm(total=self.config.shard_size, unit="tokens", desc=f"Shard {shard_index}")
                    progress_bar.update(len(tokens))
                else:
                    split = "val" if shard_index == 0 else "train"
                    filename = os.path.join(self.transformed_file_path, f"edufineweb_{split}_{shard_index:06d}")
                    remainder = self.config.shard_size - token_count
                    progress_bar.update(remainder)
                    all_tokens_np[token_count:token_count+remainder] = tokens[:remainder]
                    self.write_datafile(filename, all_tokens_np)
                    shard_index += 1
                    progress_bar = None
                    all_tokens_np[0:len(tokens)-remainder] = tokens[remainder:]
                    token_count = len(tokens)-remainder

            if token_count != 0:
                split = "val" if shard_index == 0 else "train"
                filename = os.path.join(self.transformed_file_path, f"edufineweb_{split}_{shard_index:06d}")
                self.write_datafile(filename, all_tokens_np[:token_count])

def main():
    config = Config()
    tokenizer = DataTokenizer(config=config)
    tokenizer.process_documents()

if __name__ == '__main__':
    mp.freeze_support()
    main()

Loading dataset from cache at artifacts/data_ingestion/data/wikitext-2-raw-v1
No tokenized data directory found at artifacts/data_transformation/data/wikitext-2-raw-v1. Starting tokenization.
Created directory at: artifacts/data_transformation/data/wikitext-2-raw-v1


Shard 0: 100%|██████████| 1000000/1000000 [00:02<00:00, 393652.39tokens/s]
Shard 1: 100%|█████████▉| 999713/1000000 [00:02<00:00, 405556.23tokens/s]
Shard 2:  43%|████▎     | 428539/1000000 [00:00<00:01, 438279.51tokens/s]


In [2]:
import os
import multiprocessing as mp
import numpy as np
import tiktoken
from datasets import load_dataset
from tqdm import tqdm
from pathlib import Path




class Config:
    def __init__(self):
        self.root_dir = 'artifacts/data_transformation'
        self.dataset_name = 'wikitext-2-raw-v1'
        self.dataset = 'wikitext'
        self.downloaded_files = 'artifacts/data_ingestion/data'
        self.local_data_file = 'artifacts/data_transformation/data'
        self.shard_size = 1000000  # 1M tokens per shard

class DataTokenizer:
    def __init__(self, config):
        self.config = config
        self.transformed_file_path = Path(os.path.join(self.config.local_data_file, self.config.dataset_name))
        file_path = Path(os.path.join(config.downloaded_files, config.dataset_name))

        try:
            print(f"Loading dataset from cache at {file_path}")
            self.dataset = load_dataset(config.dataset, name=config.dataset_name, split="train", cache_dir=str(file_path))
        except Exception as e:
            print(f"Error loading cached data: {e}")
            return None

        self.enc = tiktoken.get_encoding('gpt2')
        self.eot = self.enc._special_tokens['<|endoftext|>']  # end of text token

    def check_existing_tokenized_data(self):
        if not os.path.exists(self.transformed_file_path):
            print(f"No tokenized data directory found at {self.transformed_file_path}. Starting tokenization.")
            return False
        files = [os.path.join(self.transformed_file_path, f) for f in os.listdir(self.transformed_file_path) if f.endswith('.npy')]
        if files:
            print(f"Found {len(files)} pre-tokenized shards in {self.transformed_file_path}. Skipping tokenization.")
            return True
        return False

    def tokenize(self, doc):
        tokens = [self.eot]  # the special <|endoftext|> token delimits all documents
        tokens.extend(self.enc.encode_ordinary(doc["text"]))
        tokens_np = np.array(tokens)
        assert (0 <= tokens_np).all() and (tokens_np < 2**16).all(), "token dictionary too large for uint16"
        tokens_np_uint16 = tokens_np.astype(np.uint16)
        return tokens_np_uint16

    def write_datafile(self, filename, tokens_np):
        np.save(filename, tokens_np)

    def process_documents(self):
        if self.check_existing_tokenized_data():
            return  # Skip documents tokenizer if already exist!

        os.makedirs(self.transformed_file_path, exist_ok=True)
        print(f"Created directory at: {self.transformed_file_path}")

        nprocs = max(1, os.cpu_count()//2)
        with mp.Pool(nprocs) as pool:
            shard_index = 0
            all_tokens_np = np.empty((self.config.shard_size,), dtype=np.uint16)
            token_count = 0
            progress_bar = None

            for tokens in pool.imap(self.tokenize, self.dataset, chunksize=16):
                if token_count + len(tokens) < self.config.shard_size:
                    all_tokens_np[token_count:token_count+len(tokens)] = tokens
                    token_count += len(tokens)
                    if progress_bar is None:
                        progress_bar = tqdm(total=self.config.shard_size, unit="tokens", desc=f"Shard {shard_index}")
                    progress_bar.update(len(tokens))
                else:
                    split = "val" if shard_index == 0 else "train"
                    filename = os.path.join(self.transformed_file_path, f"edufineweb_{split}_{shard_index:06d}")
                    remainder = self.config.shard_size - token_count
                    progress_bar.update(remainder)
                    all_tokens_np[token_count:token_count+remainder] = tokens[:remainder]
                    self.write_datafile(filename, all_tokens_np)
                    shard_index += 1
                    progress_bar = None
                    all_tokens_np[0:len(tokens)-remainder] = tokens[remainder:]
                    token_count = len(tokens)-remainder

            if token_count != 0:
                split = "val" if shard_index == 0 else "train"
                filename = os.path.join(self.transformed_file_path, f"edufineweb_{split}_{shard_index:06d}")
                self.write_datafile(filename, all_tokens_np[:token_count])


In [3]:
def main():
    config = Config()
    tokenizer = DataTokenizer(config=config)
    tokenizer.process_documents()

main()

Loading dataset from cache at artifacts/data_ingestion/data/wikitext-2-raw-v1


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/733k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/6.36M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/657k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/4358 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/36718 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3760 [00:00<?, ? examples/s]

No tokenized data directory found at artifacts/data_transformation/data/wikitext-2-raw-v1. Starting tokenization.
Created directory at: artifacts/data_transformation/data/wikitext-2-raw-v1


Shard 0: 100%|██████████| 1000000/1000000 [00:03<00:00, 300129.95tokens/s]
Shard 1: 100%|█████████▉| 999713/1000000 [00:04<00:00, 219832.49tokens/s]
Shard 2:  43%|████▎     | 428539/1000000 [00:01<00:01, 409158.45tokens/s]
