In [1]:
with open(".env", "r") as file:
    for line in file:
        if line.startswith("HF_TOKEN"):
            HF_TOKEN = line.split("=")[1].strip()
            break

In [None]:
from datasets import load_dataset
import os

original_num_shards = 100
new_num_shards = 10000

original_num_samples_per_shard = 2_048_000
new_num_samples_per_shard = 20_480

# NOTE: create a new dataset parquet files that we then need to upload to HF
# stores data in tmp_data/new_data and uses the tmp_data/cache directory for caching

os.makedirs("tmp_data/cache", exist_ok=True)

for original_shard_idx in range(original_num_shards):
    print(f"Processing shard {original_shard_idx}")
    original_shard_idx_str = str(original_shard_idx).zfill(3)
    ds = load_dataset("pico-lm/pretokenized-dolma", split="train", data_files=f"data/train-{original_shard_idx_str}-of-100.parquet", cache_dir="tmp_data/cache", num_proc=10)

    curr_shard_dir = f"tmp_data/new_data/shard_{original_shard_idx}"
    os.makedirs(curr_shard_dir, exist_ok=True)

    for new_shard_idx in range(new_num_shards//original_num_shards):

        dataset_shard = ds.shard(num_shards=100, index=new_shard_idx)

        idx_start = original_shard_idx * original_num_samples_per_shard + new_shard_idx * new_num_samples_per_shard
        shard = dataset_shard.add_column("idx", range(idx_start, idx_start + new_num_samples_per_shard))

        shard_id = str(original_shard_idx * 100 + new_shard_idx).zfill(5)

        shard_file_name = f"train-{shard_id}-of-10000.parquet"

        hf_file_path = f"data/{shard_file_name}"
        shard_file_path = os.path.join(curr_shard_dir, shard_file_name)

        shard.to_parquet(shard_file_path)

In [5]:
from huggingface_hub import HfApi

HF_REPO = "pico-lm/pretokenized-dolma_v2"

api = HfApi()
api.create_repo(HF_REPO, exist_ok=True, token=HF_TOKEN, repo_type="dataset")

for shard_idx in range(0, 100):
    api.upload_folder(
        folder_path=f"/home/rd654/pico-dataset/tmp_data/new_data/shard_{shard_idx}",
        path_in_repo="data",
        repo_id="pico-lm/pretokenized-dolma_v2",
        token=HF_TOKEN,
        repo_type="dataset",
    )

train-00000-of-10000.parquet:   0%|          | 0.00/78.0M [00:00<?, ?B/s]
[A

[A[A


[A[A[A



train-00000-of-10000.parquet:   0%|          | 377k/78.0M [00:00<00:20, 3.73MB/s]
[A

[A[A



[A[A[A[A
[A



train-00000-of-10000.parquet:   6%|▌         | 4.42M/78.0M [00:00<00:05, 14.6MB/s]
[A



[A[A[A[A

[A[A



[A[A[A[A

[A[A
train-00000-of-10000.parquet:  15%|█▌        | 11.8M/78.0M [00:00<00:03, 17.0MB/s]
[A

[A[A



train-00000-of-10000.parquet:  21%|██        | 16.0M/78.0M [00:01<00:04, 12.8MB/s]
[A



[A[A[A[A

train-00000-of-10000.parquet:  34%|███▍      | 26.9M/78.0M [00:01<00:01, 25.9MB/s]
[A



[A[A[A[A

train-00000-of-10000.parquet:  41%|████      | 32.0M/78.0M [00:01<00:01, 24.2MB/s]
[A



train-00000-of-10000.parquet:  49%|████▉     | 38.1M/78.0M [00:01<00:01, 27.8MB/s]

[A[A



[A[A[A[A
train-00000-of-10000.parquet:  54%|█████▍    | 42.0M/78.0M [00:01<00:01, 26.1MB/s]

[A[A
train-00000-of-10000.parquet:  58%|█████▊    | 45.3M/

RuntimeError: Error while uploading 'data/train-02236-of-10000.parquet' to the Hub.