# Upload Corpus to the Hugging Face Hub

In [13]:
# ==== CONSTANTS ====
# Configuration for the whole notebook: input path, seed, split fractions,
# Hub destination, and the dataset card text.
#
# `os` is imported here because this cell calls os.getenv() and runs before
# the notebook's imports cell; without it a fresh kernel raises NameError.
import os

INPUT_FILE   = "../claim1.txt"  # one claim per line
SEED         = 42

# split fractions (must sum to 1.0; checked by the assert below)
TRAIN_SPLIT  = 0.90
VAL_SPLIT    = 0.05
TEST_SPLIT   = 0.05
assert abs(TRAIN_SPLIT + VAL_SPLIT + TEST_SPLIT - 1.0) < 1e-8

# Hugging Face Hub
REPO_ID      = "mhurhangee/patent-ind-claim-en"
PRIVATE      = True
HF_TOKEN     = os.getenv("HF_TOKEN")  # read from the environment; None if unset

# README dataset card content.
# Percentages use round() rather than int(): int() truncates, so a fraction
# such as 0.29 (0.29 * 100 == 28.999999999999996) would be shown as 28%.
README = f"""---
license: cc-by-4.0
language:
- en
tags:
- patents
- claims
- legal
- gpt-training
---

# Patent Independent Claims

One claim per line. Optional `<EOS>` and `<IDX>` placeholders.

## Splits
- Train: {round(TRAIN_SPLIT*100)}%
- Validation: {round(VAL_SPLIT*100)}%
- Test: {round(TEST_SPLIT*100)}%

## Usage
```python
from datasets import load_dataset
ds = load_dataset("{REPO_ID}")
```
"""

In [14]:
import os
from datasets import load_dataset, DatasetDict
from huggingface_hub import create_repo, HfApi

In [16]:
# Read the corpus with the "text" loader and shuffle it with the fixed seed
# so the row order is reproducible.
raw = load_dataset(
    "text",
    data_files=INPUT_FILE,
    split="train",
).shuffle(seed=SEED)

Generating train split: 0 examples [00:00, ? examples/s]

In [17]:
# Two-stage split: carve the test set off the full corpus first, then carve
# the validation set out of what remains.
test_vs_rest = raw.train_test_split(test_size=TEST_SPLIT, seed=SEED, shuffle=True)
rest, test = test_vs_rest["train"], test_vs_rest["test"]

# VAL_SPLIT is a fraction of the whole corpus; rescale it to a fraction of
# `rest` (the train + validation portion) for the second split.
val_frac_of_rest = VAL_SPLIT / (TRAIN_SPLIT + VAL_SPLIT)
train_vs_val = rest.train_test_split(test_size=val_frac_of_rest, seed=SEED, shuffle=True)
train, val = train_vs_val["train"], train_vs_val["test"]

# Invariants: every input row lands in exactly one split, and none is empty.
assert len(train) + len(val) + len(test) == len(raw), "rows lost or duplicated while splitting"
assert min(len(train), len(val), len(test)) > 0, "a split is empty"

ds = DatasetDict({"train": train, "validation": val, "test": test})
print({k: len(v) for k, v in ds.items()})

{'train': 502757, 'validation': 27931, 'test': 27931}


In [18]:
# Create the dataset repo if it does not exist yet (exist_ok makes re-runs safe).
create_repo(REPO_ID, repo_type="dataset", private=PRIVATE, exist_ok=True, token=HF_TOKEN)

# Upload the hand-written dataset card BEFORE pushing the data. Uploading it
# afterwards replaces the README.md that push_to_hub writes, discarding the
# auto-generated dataset_info/configs metadata.
# NOTE(review): relies on push_to_hub merging its metadata into an existing
# README.md rather than replacing it -- confirm with the installed `datasets` version.
HfApi(token=HF_TOKEN).upload_file(
    path_or_fileobj=README.encode("utf-8"),
    path_in_repo="README.md",
    repo_id=REPO_ID,
    repo_type="dataset",
)

# Push all three splits as parquet shards.
ds.push_to_hub(REPO_ID, token=HF_TOKEN)
print(f"Pushed to https://huggingface.co/datasets/{REPO_ID}")

Uploading the dataset shards:   0%|          | 0/2 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/252 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/252 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/28 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/28 [00:00<?, ?ba/s]

Pushed to https://huggingface.co/datasets/mhurhangee/patent-ind-claim-en
