In [2]:
import json, random
import os
from collections import Counter
from pathlib import Path

from datasets import Dataset, Image, ClassLabel
from tqdm.auto import tqdm

# Config
# Seed shared by every stochastic step (negative sampling, shuffling,
# train/test split).
SEED = 42424242
# Upper bound on the number of negative (non-SCT) tables kept in the dataset.
SAMPLE_SIZE_NEGATIVE = 10_000
# Root data directory. Set the SEC_TABLES_BASE_PATH environment variable to run
# the notebook on another machine; the default is the original location.
BASE_PATH = Path(os.environ.get("SEC_TABLES_BASE_PATH", "/home/pdipasquale/MIIA/stuff"))
OUTPUT_PATH = BASE_PATH / "output"  # scanned below: one sub-directory per document
ALL_TABLES_PATH = BASE_PATH / "all_tables.json"  # JSON list of table records
HF_REPO = "pierjoe/sec-table-classifier"  # Hugging Face dataset repo pushed to in Step 5

# Seed the global `random` generator so the notebook is repeatable on a fresh
# Restart & Run All.
random.seed(SEED)

In [3]:
# Step 1: Categorize documents
# Buckets filled by the scan below:
#   funds          - doc ids whose metadata has no SIC code ("NULL" or missing)
#   sct_docs       - docs with exactly one SCT table found
#                    (dicts holding doc_id, metadata and classification results)
#   multi_sct_docs - doc ids with more than one SCT table found
#   non_sct_docs   - doc ids with no SCT table found
sct_docs, non_sct_docs, funds, multi_sct_docs = [], [], [], []

# Sort the listing: iterdir() yields entries in arbitrary, filesystem-dependent
# order, and the order of sct_docs determines the order of the positive samples
# that go into the seeded shuffle in Step 4. Without sorting, the same seed can
# produce a different dataset on another machine.
doc_dirs = sorted(OUTPUT_PATH.iterdir())

for doc_dir in tqdm(doc_dirs, desc="Scanning docs"):
    if not doc_dir.is_dir():
        continue
    metadata_path = doc_dir / "metadata.json"
    if not metadata_path.exists():
        continue

    with open(metadata_path, encoding="utf-8") as f:
        meta = json.load(f)

    # Documents without a SIC code are set aside as funds and not classified.
    if meta.get("sic") in ("NULL", None):
        funds.append(doc_dir.name)
        continue

    classification_path = doc_dir / "classification_results.json"
    if classification_path.exists():
        with open(classification_path, encoding="utf-8") as f:
            classification = json.load(f)
        # A missing or null count is treated as zero tables.
        num_sct = classification.get("total_tables_found") or 0
        if num_sct == 1:
            sct_docs.append({"doc_id": doc_dir.name, "meta": meta, "classification": classification})
        elif num_sct > 1:
            multi_sct_docs.append(doc_dir.name)
        else:
            # Zero tables found is "no SCT", not "multiple SCT"; previously
            # these documents were counted in the multi bucket.
            non_sct_docs.append(doc_dir.name)
    elif (doc_dir / "no_sct_found.json").exists():
        non_sct_docs.append(doc_dir.name)
    # Documents with neither marker file are left uncategorized.

print(f"Funds: {len(funds)} | SCT (1 table): {len(sct_docs)} | SCT (multi): {len(multi_sct_docs)} | No SCT: {len(non_sct_docs)}")

Scanning docs: 100%|██████████| 9558/9558 [00:06<00:00, 1440.09it/s]

Funds: 1717 | SCT (1 table): 5009 | SCT (multi): 1123 | No SCT: 431





In [4]:
# Step 2: Build positive samples (SCT tables)
# One record per SCT table that has both an image file on disk and a non-empty
# table body; entries missing either are skipped.
positive_samples = []
for doc in tqdm(sct_docs, desc="Positive samples"):
    doc_id = doc["doc_id"]
    meta = doc["meta"]
    vlm_dir = OUTPUT_PATH / doc_id / doc_id / "vlm"
    for table_entry in doc["classification"].get("tables", []):
        table_data = table_entry["table"]
        img_rel_path = table_data.get("img_path") or ""
        table_body = table_data.get("table_body") or ""
        # An empty relative path would resolve to the vlm directory itself,
        # which exists but is not an image, so reject it explicitly.
        if not img_rel_path or not table_body:
            continue
        img_path = vlm_dir / img_rel_path
        # is_file() rather than exists(): only a regular file can be loaded as
        # an image when the column is cast in Step 4.
        if not img_path.is_file():
            continue
        positive_samples.append({
            "doc_id": doc_id, "image_path": str(img_path), "table_html": table_body,
            "label": 1, "year": meta.get("year"), "company": meta.get("company")
        })
print(f"Positive samples: {len(positive_samples)}")

Positive samples: 100%|██████████| 5009/5009 [00:09<00:00, 554.90it/s]

Positive samples: 4998





In [5]:
# Step 3: Load all tables and build negative samples
with open(ALL_TABLES_PATH, encoding="utf-8") as f:
    all_tables = json.load(f)

# Full image paths of the positive (SCT) tables, used to exclude them below.
# Positive and candidate paths are built the same way
# (OUTPUT_PATH / doc_id / doc_id / "vlm" / img_path), so comparing the full
# path is exact for any img_path depth. Matching on the last two path
# components only works when img_path has exactly two components.
sct_image_paths = {s["image_path"] for s in positive_samples}

# Metadata was already loaded in Step 1 and is stored on each sct_docs entry;
# reuse it instead of re-opening (and never closing) metadata.json per table.
meta_by_doc = {doc["doc_id"]: doc["meta"] for doc in sct_docs}
sct_doc_ids = set(meta_by_doc)

# Get negative samples - tables from SCT docs that are NOT the SCT table
all_negative_candidates = []
for table in tqdm(all_tables, desc="Negative candidates"):
    doc_id = table.get("source_doc")
    if doc_id not in sct_doc_ids:
        continue

    img_rel_path = table.get("img_path") or ""
    table_body = table.get("table_body") or ""
    # An empty relative path would resolve to the vlm directory itself.
    if not img_rel_path or not table_body:
        continue

    img_path = OUTPUT_PATH / doc_id / doc_id / "vlm" / img_rel_path
    if str(img_path) in sct_image_paths:
        continue
    # is_file() rather than exists(): a directory is not a loadable image.
    if not img_path.is_file():
        continue

    meta = meta_by_doc[doc_id]
    all_negative_candidates.append({
        "doc_id": doc_id, "image_path": str(img_path), "table_html": table_body,
        "label": 0, "year": meta.get("year"), "company": meta.get("company")
    })

# A generator local to this cell, seeded with SEED, makes the cell idempotent:
# re-running it draws the same sample instead of advancing the global RNG.
negative_rng = random.Random(SEED)
negative_samples = negative_rng.sample(all_negative_candidates, min(SAMPLE_SIZE_NEGATIVE, len(all_negative_candidates)))
print(f"Negative samples: {len(negative_samples)} (from {len(all_negative_candidates)} candidates)")

Negative candidates: 100%|██████████| 165797/165797 [01:07<00:00, 2466.87it/s]

Negative samples: 10000 (from 82583 candidates)





In [6]:
# Step 4: Create HuggingFace dataset
# Pool positives and negatives, then shuffle in place with the global RNG.
all_samples = [*positive_samples, *negative_samples]
random.shuffle(all_samples)
print(f"Total: {len(all_samples)} | Positive: {len(positive_samples)} | Negative: {len(negative_samples)}")

# Dataset column name -> key of the sample record that feeds it
# (insertion order is the column order of the resulting dataset).
column_sources = {
    "image": "image_path",
    "text": "table_html",
    "label": "label",
    "doc_id": "doc_id",
    "year": "year",
    "company": "company",
}
columns = {
    column: [sample[key] for sample in all_samples]
    for column, key in column_sources.items()
}

# Path strings become an Image feature; integer labels become a ClassLabel
# where 0 is "non_sct" and 1 is "sct".
dataset = (
    Dataset.from_dict(columns)
    .cast_column("image", Image())
    .cast_column("label", ClassLabel(names=["non_sct", "sct"]))
)
print(dataset)

Total: 14998 | Positive: 4998 | Negative: 10000


Casting the dataset: 100%|██████████| 14998/14998 [00:00<00:00, 750491.19 examples/s]

Dataset({
    features: ['image', 'text', 'label', 'doc_id', 'year', 'company'],
    num_rows: 14998
})





In [7]:
# Step 5: Split and push to HuggingFace
# 80/20 train/test split, stratified on the label column and seeded with SEED.
# NOTE(review): the split is per row, so tables from the same doc_id can land
# in both train and test - confirm this is acceptable for evaluation.
dataset_split = dataset.train_test_split(
    test_size=0.2,
    seed=SEED,
    stratify_by_column="label",
)
train_split, test_split = dataset_split["train"], dataset_split["test"]
print(f"Train: {len(train_split)} | Test: {len(test_split)}")

# Uploads every split to the Hub repo as a public dataset (private=False).
dataset_split.push_to_hub(HF_REPO, private=False)
print(f"✓ Pushed to: https://huggingface.co/datasets/{HF_REPO}")

Train: 11998 | Test: 3000


Map: 100%|██████████| 4000/4000 [00:02<00:00, 1515.91 examples/s]ards/s]
Creating parquet from Arrow format: 100%|██████████| 4/4 [00:00<00:00,  8.83ba/s]
Processing Files (1 / 1): 100%|██████████|  331MB /  331MB, 47.3MB/s  
New Data Upload: 100%|██████████|  326MB /  326MB, 46.6MB/s  
Map: 100%|██████████| 3999/3999 [00:02<00:00, 1680.12 examples/s]13.79s/ shards]
Creating parquet from Arrow format: 100%|██████████| 4/4 [00:00<00:00,  8.71ba/s]
Processing Files (1 / 1): 100%|██████████|  336MB /  336MB, 76.1MB/s  
New Data Upload: 100%|██████████|  331MB /  331MB, 75.3MB/s  
Map: 100%|██████████| 3999/3999 [00:02<00:00, 1891.43 examples/s]12.73s/ shards]
Creating parquet from Arrow format: 100%|██████████| 4/4 [00:00<00:00,  8.35ba/s]
Processing Files (1 / 1): 100%|██████████|  336MB /  336MB, 76.2MB/s  
New Data Upload: 100%|██████████|  332MB /  332MB, 75.5MB/s  
Uploading the dataset shards: 100%|██████████| 3/3 [00:45<00:00, 15.22s/ shards]
Map: 100%|██████████| 3000/3000 [00:01<

✓ Pushed to: https://huggingface.co/datasets/pierjoe/sec-table-classifier
