### Dolma Dataset Fine-Grained Sharding

We use this notebook to further shard the dataset created by running `create_dolma_dataset.py`.

The original dataset consists of 100 shards; this notebook splits it into 10,000 smaller shards for more efficient processing and distribution.

- **Original dataset**: 100 shards with 2,048,000 samples per shard
- **New dataset**: 10,000 shards with 20,480 samples per shard (100× more granular)
- The notebook creates these new shards locally and uploads them to Hugging Face Hub as a new dataset: `pico-lm/pretokenized-dolma_v2`

In [1]:
# Setting up the Hugging Face token used to authenticate against the Hub
import os


def read_hf_token(env_path=".env"):
    """Return the Hugging Face token, or ``None`` when none is configured.

    The first non-empty ``HF_TOKEN=<value>`` entry in ``env_path`` wins.
    If the file does not exist or has no such entry, the ``HF_TOKEN``
    environment variable is used instead.

    Args:
        env_path: Path to a dotenv-style file with ``KEY=VALUE`` lines.

    Returns:
        The token string with surrounding whitespace and quotes removed,
        or ``None`` if neither the file nor the environment provides one.
    """
    try:
        with open(env_path, "r", encoding="utf-8") as env_file:
            for raw_line in env_file:
                # partition() splits on the FIRST "=" only, so a value that
                # itself contains "=" is kept intact, and a line without any
                # "=" simply yields an empty separator instead of raising.
                key, separator, value = raw_line.partition("=")
                # Compare the whole key so e.g. HF_TOKEN_OLD is not picked up.
                if not separator or key.strip() != "HF_TOKEN":
                    continue
                token = value.strip().strip("\"'")
                if token:
                    return token
    except FileNotFoundError:
        # No .env file: fall through to the environment variable below.
        pass
    return os.environ.get("HF_TOKEN")


# Always defined, so later cells never hit a NameError; may be None.
HF_TOKEN = read_hf_token()

In [None]:
from datasets import load_dataset
import os

original_num_shards = 100
new_num_shards = 10000

original_num_samples_per_shard = 2_048_000
new_num_samples_per_shard = 20_480

# Every original shard is split into the same number of new shards
# (100 with the values above). Fail early if the constants disagree, since the
# `idx` column and the output file names below are derived from them.
if new_num_shards % original_num_shards != 0:
    raise ValueError("new_num_shards must be a multiple of original_num_shards")
new_shards_per_original_shard = new_num_shards // original_num_shards
if new_shards_per_original_shard * new_num_samples_per_shard != original_num_samples_per_shard:
    raise ValueError(
        "new_num_samples_per_shard is inconsistent with original_num_samples_per_shard"
    )

# NOTE: this writes the new parquet files locally; they still need to be uploaded to HF.
# Output goes to tmp_data/new_data/shard_<original shard index>/ and the
# tmp_data/cache directory is used as the `datasets` cache.
os.makedirs("tmp_data/cache", exist_ok=True)

# Iterate over each shard of the original dataset and split it into new shards
for original_shard_idx in range(original_num_shards):
    print(f"Processing shard {original_shard_idx}")
    original_shard_idx_str = str(original_shard_idx).zfill(3)
    ds = load_dataset(
        "pico-lm/pretokenized-dolma",
        split="train",
        data_files=f"data/train-{original_shard_idx_str}-of-{original_num_shards}.parquet",
        cache_dir="tmp_data/cache",
        num_proc=10,
    )

    curr_shard_dir = f"tmp_data/new_data/shard_{original_shard_idx}"
    os.makedirs(curr_shard_dir, exist_ok=True)

    for new_shard_idx in range(new_shards_per_original_shard):
        # NOTE(review): `contiguous` is left at the library default here; that
        # default appears to differ between `datasets` releases, so pin it
        # explicitly once the intended row layout is confirmed.
        dataset_shard = ds.shard(num_shards=new_shards_per_original_shard, index=new_shard_idx)

        # The `idx` range below assumes exactly new_num_samples_per_shard rows.
        if len(dataset_shard) != new_num_samples_per_shard:
            raise ValueError(
                f"Shard {original_shard_idx}/{new_shard_idx} has {len(dataset_shard)} rows, "
                f"expected {new_num_samples_per_shard}"
            )

        # Running sample id that is unique across all new shards.
        idx_start = original_shard_idx * original_num_samples_per_shard + new_shard_idx * new_num_samples_per_shard
        shard = dataset_shard.add_column("idx", range(idx_start, idx_start + new_num_samples_per_shard))

        # Global index of the new shard, zero-padded for lexicographic ordering.
        shard_id = str(original_shard_idx * new_shards_per_original_shard + new_shard_idx).zfill(5)

        shard_file_name = f"train-{shard_id}-of-{new_num_shards}.parquet"
        shard_file_path = os.path.join(curr_shard_dir, shard_file_name)

        shard.to_parquet(shard_file_path)

In [None]:
from huggingface_hub import HfApi

HF_REPO = "pico-lm/pretokenized-dolma_v2"

# NOTE: We originally named this new dataset `pretokenized-dolma_v2`. Later we manually
# renamed it to `pretokenized-dolma` on Hugging Face and deleted the version created by
# `create_dolma_dataset.py`. In other words, the output of this notebook is the dataset
# that you see on the Hugging Face Hub.

# One local folder per original shard, as laid out by the sharding cell.
num_shard_dirs = 100

api = HfApi()
api.create_repo(HF_REPO, exist_ok=True, token=HF_TOKEN, repo_type="dataset")

# Upload the new shards to the hub, one folder (one original shard) per call
for shard_idx in range(num_shard_dirs):
    api.upload_folder(
        # Same relative location the sharding cell writes to, so the notebook
        # is not tied to one user's home directory.
        folder_path=f"tmp_data/new_data/shard_{shard_idx}",
        path_in_repo="data",
        repo_id=HF_REPO,
        token=HF_TOKEN,
        repo_type="dataset",
    )