In [4]:
from pathlib import Path
import json
from PIL import Image
from datasets import Dataset, Features, Value, Image as HFImage, Sequence

# Root directory whose immediate sub-directories are scanned, one per document.
OUTPUT_DIR = Path("output")

In [6]:
# Build the flat list of dataset records from every document directory under
# OUTPUT_DIR. A document contributes:
#   * one record with empty table fields when it is classified as a fund, or
#   * one record per entry in classification_results.json["tables"] otherwise.
# Documents without metadata.json, or non-fund documents missing either
# results file, are skipped.


def _load_json(path):
    """Read ``path`` as UTF-8 encoded JSON and return the parsed object."""
    with open(path, encoding="utf-8") as f:
        return json.load(f)


def _resolve_table_image(doc_dir, img_path):
    """Return the table image location as a string, or None if it is not a file.

    Images are looked up under ``<doc_dir>/<doc_dir.name>/vlm``. A missing or
    empty ``img_path`` would resolve to that directory itself, which exists
    but is not an image, so it is rejected explicitly.
    """
    if not img_path:
        return None
    candidate = doc_dir / doc_dir.name / "vlm" / img_path
    return str(candidate) if candidate.is_file() else None


records = []

# iterdir() yields entries in arbitrary order; sorting keeps the row order
# stable between runs.
for doc_dir in sorted(OUTPUT_DIR.iterdir()):
    if not doc_dir.is_dir():
        continue

    # Metadata is mandatory: without it there is nothing to describe the filing.
    metadata_file = doc_dir / "metadata.json"
    if not metadata_file.exists():
        continue
    meta = _load_json(metadata_file)

    # A filing with no SIC code (absent, or the literal string "NULL") is
    # treated as a fund.
    sic = meta.get("sic")
    is_fund = sic is None or sic == "NULL"

    # Metadata columns shared by every record produced for this document.
    base_record = {
        "cik": meta.get("cik"),
        "company": meta.get("company"),
        "year": meta.get("year"),
        "filing_date": meta.get("filing_date"),
        "sic": sic,
        "is_fund": is_fund,
        "state_of_inc": meta.get("state_of_inc"),
        "filing_html_index": meta.get("filing_html_index"),
        "accession_number": meta.get("accession_number"),
    }

    # Funds get a single placeholder record with no executive-compensation table.
    if is_fund:
        records.append({
            **base_record,
            "table_image": None,
            "table_body": None,
            "executives": json.dumps([]),
        })
        continue

    # Non-fund documents need both result files; otherwise skip the document.
    extraction_file = doc_dir / "extraction_results.json"
    classification_file = doc_dir / "classification_results.json"
    if not (extraction_file.exists() and classification_file.exists()):
        continue

    extraction = _load_json(extraction_file)
    classification = _load_json(classification_file)

    # Extraction entries are matched to classified tables by position.
    extracted_tables = extraction.get("data") or []

    for i, table_info in enumerate(classification.get("tables") or []):
        table = table_info.get("table") or {}

        # Tables beyond the end of the extraction list get an empty executive list.
        if i < len(extracted_tables):
            execs = extracted_tables[i].get("executives", [])
        else:
            execs = []

        records.append({
            **base_record,
            "table_image": _resolve_table_image(doc_dir, table.get("img_path")),
            "table_body": table.get("table_body", ""),
            # Stored as a JSON string so the column has a uniform type.
            "executives": json.dumps(execs),
        })

print(f"Total records: {len(records)}")
print(f"Funds (no exec comp): {sum(1 for r in records if r['is_fund'])}")
print(f"With tables: {sum(1 for r in records if not r['is_fund'])}")

Total records: 48
Funds (no exec comp): 47
With tables: 1


In [7]:
# Build the Hugging Face dataset from the collected records and cast the
# "table_image" column to the Image feature in a single chained expression.
ds = Dataset.from_list(records).cast_column("table_image", HFImage())

# Show the dataset summary, then the first row as a sanity check.
print(ds)
print(ds[0])

Dataset({
    features: ['cik', 'company', 'year', 'filing_date', 'sic', 'is_fund', 'state_of_inc', 'filing_html_index', 'accession_number', 'table_image', 'table_body', 'executives'],
    num_rows: 48
})
{'cik': '1059386', 'company': 'VAN KAMPEN SENIOR INCOME TRUST', 'year': 2006, 'filing_date': '2006-05-19T00:00:00', 'sic': 'NULL', 'is_fund': True, 'state_of_inc': 'MA', 'filing_html_index': 'https://www.sec.gov/Archives/edgar/data/1059386/0000950137-06-006125-index.html', 'accession_number': '0000950137-06-006125', 'table_image': None, 'table_body': None, 'executives': '[]'}


In [None]:
# Write the dataset to the "execcomp-ai-sample" directory (relative to the working directory).
ds.save_to_disk("execcomp-ai-sample")

Saving the dataset (1/1 shards): 100%|██████████| 48/48 [00:00<00:00, 5324.97 examples/s]

Saved to hf_dataset/





In [None]:
# Push to HuggingFace (uncomment when ready)
# from huggingface_hub import login
# login()
# ds.push_to_hub("your-username/sec-exec-compensation")