# Upload Corpus to HF

In [1]:
import os
from datasets import load_dataset, DatasetDict
from huggingface_hub import create_repo, HfApi

In [3]:
# ==== CONSTANTS ====
# Plain-text corpus, path relative to this notebook (the dataset card below
# describes it as one claim per line).
INPUT_FILE   = "../data/1-corpus-all-claims/corpus.txt"
# Single seed reused by the shuffle and by both train/test splits so the
# resulting partition is reproducible.
SEED         = 42

# split fractions (must sum to 1 -- enforced by the assert below)
TRAIN_SPLIT  = 0.96
VAL_SPLIT    = 0.02
TEST_SPLIT   = 0.02
assert abs(TRAIN_SPLIT + VAL_SPLIT + TEST_SPLIT - 1.0) < 1e-8, \
    "TRAIN_SPLIT + VAL_SPLIT + TEST_SPLIT must sum to 1"

# Hugging Face Hub
REPO_ID      = "mhurhangee/ep-patent-all-claims"
PRIVATE      = True
# Read from the process environment only: nothing in this notebook loads a
# .env file, so export HF_TOKEN before starting the kernel. os.getenv returns
# None when the variable is unset.
HF_TOKEN     = os.getenv("HF_TOKEN")

# README dataset card content.
# Percentages use the `.0%` format spec (rounds) rather than int(x * 100)
# (truncates), so fractions such as 0.29 are not reported as 28%.
README = f"""---
license: cc-by-4.0
language:
- en
tags:
- patents
- claims
- legal
- gpt-training
---

# Patent All Claims

EP granted claim sets (independent and dependent claims). One claim per line. Claims from 20210915-20250806

## Splits
- Train: {TRAIN_SPLIT:.0%}
- Validation: {VAL_SPLIT:.0%}
- Test: {TEST_SPLIT:.0%}

## Usage
```python
from datasets import load_dataset
ds = load_dataset("{REPO_ID}")
```
"""

In [4]:
# Read the corpus through the generic "text" builder, then shuffle once with
# the fixed seed so the row order is reproducible from a fresh kernel.
raw = load_dataset(
    "text",
    data_files=INPUT_FILE,
    split="train",
).shuffle(seed=SEED)

Generating train split: 0 examples [00:00, ? examples/s]

In [5]:
# Stage 1: carve the test set off the full corpus.
holdout_split = raw.train_test_split(test_size=TEST_SPLIT, seed=SEED, shuffle=True)
rest, test = holdout_split["train"], holdout_split["test"]

# Stage 2: validation is taken from what remains, so its fraction has to be
# rescaled relative to the remainder (train + val) rather than the whole corpus.
val_frac_of_rest = VAL_SPLIT / (TRAIN_SPLIT + VAL_SPLIT)
train_val_split = rest.train_test_split(test_size=val_frac_of_rest, seed=SEED, shuffle=True)
train, val = train_val_split["train"], train_val_split["test"]

ds = DatasetDict({"train": train, "validation": val, "test": test})

# Sanity check: the three splits must account for every row of the corpus.
split_sizes = {k: len(v) for k, v in ds.items()}
assert sum(split_sizes.values()) == len(raw), "splits do not add up to the corpus size"
print(split_sizes)

{'train': 4243904, 'validation': 88415, 'test': 88415}


In [6]:
# Create the dataset repo if it does not exist yet; exist_ok=True keeps
# re-runs of this cell from failing on an already-created repo.
create_repo(REPO_ID, repo_type="dataset", private=PRIVATE, exist_ok=True, token=HF_TOKEN)

# Upload the hand-written dataset card BEFORE pushing the data.
# push_to_hub writes its own README.md carrying generated YAML metadata
# (split/feature info); uploading our card afterwards, as this cell used to,
# replaces that file and drops the generated metadata.
# NOTE(review): this relies on push_to_hub merging its metadata into an
# existing card -- confirm against the installed `datasets` version.
HfApi(token=HF_TOKEN).upload_file(
    path_or_fileobj=README.encode("utf-8"),
    path_in_repo="README.md",
    repo_id=REPO_ID,
    repo_type="dataset",
)

# Push all three splits to the repo.
ds.push_to_hub(REPO_ID, token=HF_TOKEN)
print(f"Pushed to https://huggingface.co/datasets/{REPO_ID}")

Uploading the dataset shards:   0%|          | 0/6 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/708 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :   0%|          |  533kB /  147MB            

Creating parquet from Arrow format:   0%|          | 0/708 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :   0%|          |  533kB /  147MB            

Creating parquet from Arrow format:   0%|          | 0/708 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :   0%|          |  533kB /  147MB            

Creating parquet from Arrow format:   0%|          | 0/708 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :   0%|          |  533kB /  147MB            

Creating parquet from Arrow format:   0%|          | 0/708 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :   0%|          |  533kB /  147MB            

Creating parquet from Arrow format:   0%|          | 0/708 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :   0%|          |  533kB /  147MB            

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/89 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :   3%|2         |  535kB / 18.5MB            

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/89 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :   3%|2         |  535kB / 18.5MB            

Pushed to https://huggingface.co/datasets/mhurhangee/ep-patent-all-claims
