In [3]:
#!/usr/bin/env python3
"""
Sample a subset of C4 (allenai/c4, "en" config) using Hugging Face datasets in streaming mode.

Steps:
 1. Install datasets library: pip install datasets
 2. Run this script to stream a subset of C4 from Hugging Face.
 3. The result is saved to 'C4_subset.json' as a list of document strings.
"""

import json
from datasets import load_dataset

def sample_c4_docs(sample_size=1000):
    """
    Collect the text of the first 'sample_size' docs from the c4/en train split.

    The dataset is opened in streaming mode, so documents are fetched lazily
    and the full corpus (~306GB per the original note) is never downloaded.

    Args:
        sample_size: Maximum number of documents to collect. Zero or a
            negative value yields an empty list without touching the network.

    Returns:
        A list of document strings (the 'text' field of each record), in
        stream order. Shorter than 'sample_size' only if the stream runs out.
    """
    # Local import so this definition does not depend on edits to the import cell.
    from itertools import islice

    # Nothing requested: return early instead of opening the remote stream.
    # (The previous append-then-check loop returned one doc for sample_size=0.)
    if sample_size <= 0:
        return []

    # c4 is ~306GB total, we avoid full download by streaming
    dataset = load_dataset("allenai/c4", "en", streaming=True)

    # Only the 'train' split is sampled; each record is a dict such as
    # {'text': "...", 'timestamp': "...", ...}.
    train_stream = dataset["train"]

    # islice stops after 'sample_size' records without pulling an extra one.
    return [record["text"] for record in islice(train_stream, sample_size)]



In [5]:

sample_size = 2_000_000    # Adjust as desired
out_file = "C4_subset.json"
# The corpus streamed by sample_c4_docs is C4, so the log line names C4
# (it previously said "The Pile", which did not match the data source).
print(f"[INFO] Sampling {sample_size} docs from C4 (streaming mode).")
# NOTE: every sampled document is held in memory until the dump below.
docs = sample_c4_docs(sample_size)
print(f"[INFO] Collected {len(docs)} documents.")
# Save to JSON as a single top-level list of document strings.
with open(out_file, "w", encoding="utf-8") as f:
    json.dump(docs, f, indent=2)
print(f"[INFO] Wrote {len(docs)} docs to {out_file}.")

[INFO] Sampling 2000000 docs from The Pile (streaming mode).


Resolving data files:   0%|          | 0/1024 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/1024 [00:00<?, ?it/s]

'(ReadTimeoutError("HTTPSConnectionPool(host='cdn-lfs.hf.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: ab43a5cc-607c-4a7b-b0a6-31282eedc7f2)')' thrown while requesting GET https://huggingface.co/datasets/allenai/c4/resolve/1588ec454efa1a09f29cd18ddd04fe05fc8653a2/en/c4-train.00000-of-01024.json.gz
Retrying in 1s [Retry 1/5].
'HTTPSConnectionPool(host='cdn-lfs.hf.co', port=443): Read timed out.' thrown while requesting GET https://huggingface.co/datasets/allenai/c4/resolve/1588ec454efa1a09f29cd18ddd04fe05fc8653a2/en/c4-train.00000-of-01024.json.gz
Retrying in 1s [Retry 1/5].
'(ReadTimeoutError("HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: f0cde7a5-9642-4f04-9a0b-2ff196430fcc)')' thrown while requesting GET https://huggingface.co/datasets/allenai/c4/resolve/1588ec454efa1a09f29cd18ddd04fe05fc8653a2/en/c4-train.00000-of-01024.json.gz
Retrying in 2s [Retry 2/5].


[INFO] Collected 2000000 documents.
[INFO] Wrote 2000000 docs to C4_subset.json.
