In [16]:
from pathlib import Path
import json
from PIL import Image
from datasets import Dataset, Features, Value, Image as HFImage, Sequence

# Root directory whose sub-directories (one per document) are scanned for
# metadata, classification and extraction result files.
OUTPUT_DIR = Path("output")

In [17]:
# Build one record per classified table for every document directory under
# OUTPUT_DIR that has metadata, classification results and extraction results.
records = []

# iterdir() yields entries in arbitrary order; sorting keeps the record order
# (and therefore the dataset row order) reproducible across runs.
for doc_dir in sorted(OUTPUT_DIR.iterdir()):
    if not doc_dir.is_dir():
        continue

    extraction_file = doc_dir / "extraction_results.json"
    classification_file = doc_dir / "classification_results.json"
    metadata_file = doc_dir / "metadata.json"

    # Load metadata; documents without it are skipped.
    if not metadata_file.exists():
        continue
    # Read as UTF-8 explicitly so parsing does not depend on the platform's
    # default locale encoding.
    meta = json.loads(metadata_file.read_text(encoding="utf-8"))

    # Skip filers without a SIC code (the original note says these are funds).
    if meta.get("sic") in (None, "NULL"):
        continue

    # Both result files are required to build records for this document.
    if not (extraction_file.exists() and classification_file.exists()):
        continue

    extraction = json.loads(extraction_file.read_text(encoding="utf-8"))
    classification = json.loads(classification_file.read_text(encoding="utf-8"))

    # Metadata fields shared by every table record of this document.
    base_record = {
        "cik": meta.get("cik"),
        "company": meta.get("company"),
        "year": meta.get("year"),
        "filing_date": meta.get("filing_date"),
        "sic": meta.get("sic"),
        "state_of_inc": meta.get("state_of_inc"),
        "filing_html_index": meta.get("filing_html_index"),
        "accession_number": meta.get("accession_number"),
    }

    # Loop-invariant lookups, hoisted out of the per-table loop.
    # `or []` / `or {}` also cover keys that are present but null in the JSON.
    images_dir = doc_dir / doc_dir.name / "vlm"
    extracted_tables = extraction.get("data") or []

    # One record per table found by the classifier.
    for i, table_info in enumerate(classification.get("tables") or []):
        record = base_record.copy()
        table = table_info.get("table") or {}

        # Image: keep the path only when it points at an actual file.
        # An empty/missing img_path would otherwise resolve to images_dir
        # itself, which exists as a directory but is not a readable image.
        img_path = table.get("img_path")
        full_img_path = images_dir / img_path if img_path else None
        if full_img_path is not None and full_img_path.is_file():
            record["table_image"] = str(full_img_path)
        else:
            record["table_image"] = None

        # HTML body of the table.
        record["table_body"] = table.get("table_body", "")

        # Executives: taken from the extraction entry at the same index as the
        # table; stored as a JSON string, "[]" when there is no such entry.
        # NOTE(review): assumes extraction["data"] is index-aligned with
        # classification["tables"] - confirm against the pipeline that writes them.
        if i < len(extracted_tables):
            execs = extracted_tables[i].get("executives", [])
        else:
            execs = []
        record["executives"] = json.dumps(execs)

        records.append(record)

print(f"Total records: {len(records)}")

Total records: 82


In [18]:
# Build the Hugging Face dataset from the collected records and cast the
# "table_image" column (file paths or None) to the Image feature in one step.
ds = Dataset.from_list(records).cast_column("table_image", HFImage())

# Sanity check: dataset summary first, then the first row.
print(ds)
print(ds[0])

Dataset({
    features: ['cik', 'company', 'year', 'filing_date', 'sic', 'state_of_inc', 'filing_html_index', 'accession_number', 'table_image', 'table_body', 'executives'],
    num_rows: 82
})
{'cik': '1475922', 'company': 'Primerica, Inc.', 'year': 2014, 'filing_date': '2014-03-31T00:00:00', 'sic': '6311', 'state_of_inc': 'DE', 'filing_html_index': 'https://www.sec.gov/Archives/edgar/data/1475922/0001193125-14-123135-index.html', 'accession_number': '0001193125-14-123135', 'table_image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=1583x120 at 0x7FFDD97BFDA0>, 'table_body': '<table><tr><td>D. Richard Williams</td><td>Chairman of the Board and Co-Chief Executive Officer</td><td>$3,513,892</td></tr><tr><td>John A. Addison, Jr.</td><td>Chairman of Primerica Distribution and Co-Chief Executive Officer</td><td>$3,498,268</td></tr><tr><td>Glenn J. Williams</td><td>President</td><td>$1,585,077</td></tr><tr><td>Gregory C. Pitts</td><td>Executive Vice President and Chief Operating O

In [19]:
# Write the dataset to the local directory hf/execcomp-ai-sample.
ds.save_to_disk(dataset_path="hf/execcomp-ai-sample")

Saving the dataset (1/1 shards): 100%|██████████| 82/82 [00:00<00:00, 1289.24 examples/s]


In [20]:
# Publish the dataset to the Hugging Face Hub.
from huggingface_hub import HfApi

# NOTE(review): `api` is not used by any of the visible cells.
api = HfApi()

# Authentication relies on the token stored by the CLI (`huggingface-cli login`);
# alternatively a token can be passed via token="hf_..." - avoid committing a
# real token in the notebook.
ds.push_to_hub(repo_id="pierjoe/execcomp-ai-sample")

Map: 100%|██████████| 82/82 [00:00<00:00, 2127.57 examples/s]? shards/s]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 91.84ba/s]
Processing Files (1 / 1): 100%|██████████| 8.04MB / 8.04MB, 4.46MB/s  
New Data Upload: 100%|██████████| 7.74MB / 7.74MB, 4.30MB/s  
Uploading the dataset shards: 100%|██████████| 1/1 [00:02<00:00,  2.60s/ shards]


CommitInfo(commit_url='https://huggingface.co/datasets/pierjoe/execcomp-ai-sample/commit/4643476a01a1f551659a4016e8e68a78727ecd36', commit_message='Upload dataset', commit_description='', oid='4643476a01a1f551659a4016e8e68a78727ecd36', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/pierjoe/execcomp-ai-sample', endpoint='https://huggingface.co', repo_type='dataset', repo_id='pierjoe/execcomp-ai-sample'), pr_revision=None, pr_num=None)