In [None]:
import json
import re
import math
import csv
from statistics import mean, median, pstdev

# Word tokens: a run of ASCII letters, optionally followed by an apostrophe
# and more letters (e.g. "don't"), or a run of digits (e.g. "1807").
# NOTE(review): only A-Z/a-z count as letters, so accented characters and the
# typographic apostrophe end a token -- confirm this is acceptable for the data.
WORD_RE = re.compile(r"[A-Za-z]+(?:'[A-Za-z]+)?|\d+")

# Sentences: simple split on runs of . ! ? (not perfect, but useful as a proxy)
SENT_SPLIT_RE = re.compile(r"[.!?]+")

def count_words(text: str) -> int:
    """Return the number of word-like tokens found in *text*.

    A token is whatever the module-level ``WORD_RE`` pattern matches:
    a run of letters (with an optional apostrophe part, as in "don't")
    or a run of digits (as in "1807").

    A regex is used instead of ``str.split()`` because whitespace splitting
    miscounts around punctuation and hyphens.

    Falsy input (``None`` or the empty string) counts as 0 words.
    """
    if not text:
        return 0
    # Count matches lazily; no need to keep the matched strings around.
    return sum(1 for _ in WORD_RE.finditer(text))

def count_sentences(text: str) -> int:
    """Return a rough sentence count for *text*.

    The text is split on the module-level ``SENT_SPLIT_RE`` pattern (runs of
    ``.``, ``!`` or ``?``) and every piece that still contains something
    other than whitespace counts as one sentence. This is a cheap proxy that
    needs no external dependencies.

    Falsy input (``None`` or the empty string) counts as 0 sentences.
    """
    if not text:
        return 0
    return sum(1 for chunk in SENT_SPLIT_RE.split(text) if chunk.strip())

def percentile(sorted_values, p: float):
    """Return the *p*-th percentile of *sorted_values* by linear interpolation.

    Args:
        sorted_values: sequence of numbers, already sorted in ascending order.
        p: requested percentile, nominally in [0, 100]; values at or below 0
            yield the first element, values at or above 100 yield the last.

    Returns:
        ``None`` for an empty sequence. Otherwise the element at the computed
        rank when that rank is a whole number, or the weighted blend of the
        two neighbouring elements when it is fractional.

    Tail percentiles (p90/p95/p99) are often more informative than the mean.
    """
    if not sorted_values:
        return None

    last = len(sorted_values) - 1
    if p <= 0:
        return sorted_values[0]
    if p >= 100:
        return sorted_values[last]

    # Fractional position of the percentile within the index range 0..last.
    rank = last * (p / 100.0)
    lower = math.floor(rank)
    upper = math.ceil(rank)
    if lower == upper:
        # Rank landed exactly on an element: return it untouched.
        return sorted_values[lower]

    # Weight each neighbour by its closeness to the fractional rank.
    return sorted_values[lower] * (upper - rank) + sorted_values[upper] * (rank - lower)

def summarize_lengths(lengths):
    """Compute descriptive statistics for a collection of numeric lengths.

    Returns an empty dict when *lengths* is empty. Otherwise returns a dict
    with, in this order: ``count``, ``min``, ``max``, ``mean``, ``median``,
    ``std`` and the percentiles ``p10``, ``p25``, ``p75``, ``p90``, ``p95``,
    ``p99``.

    ``std`` is the population standard deviation (``pstdev``) because the
    values describe the whole dataset rather than a sample drawn from it.
    """
    if not lengths:
        return {}

    ordered = sorted(lengths)
    summary = {
        "count": len(ordered),
        "min": ordered[0],
        "max": ordered[-1],
        "mean": mean(ordered),
        "median": median(ordered),
        "std": pstdev(ordered),
    }
    # Percentile keys are appended in the same order they are reported.
    for key, pct in (
        ("p10", 10),
        ("p25", 25),
        ("p75", 75),
        ("p90", 90),
        ("p95", 95),
        ("p99", 99),
    ):
        summary[key] = percentile(ordered, pct)
    return summary

def load_json_any_shape(path: str):
    """Load a list of records from a JSON or JSON Lines file.

    Supported layouts:
      1. JSON array ``[ {...}, {...}, ... ]`` -> returned as is.
      2. A single JSON object -> wrapped in a one-element list.
      3. JSON Lines: one JSON value per non-blank line (many large
         datasets ship in this format).

    An empty or whitespace-only file yields ``[]``. The file is read once;
    a leading UTF-8 byte-order mark is tolerated.

    Args:
        path: location of the file to read (decoded as UTF-8).

    Returns:
        A list with the decoded items.

    Raises:
        OSError: if the file cannot be opened or read.
        json.JSONDecodeError: if the content is neither a single JSON
            document nor valid JSON Lines; the message names the file and
            the 1-based line number of the offending record.
    """
    # "utf-8-sig" decodes plain UTF-8 as well, but also drops a leading BOM,
    # which json.loads() would otherwise reject.
    with open(path, "r", encoding="utf-8-sig") as f:
        raw = f.read()

    content = raw.strip()
    if not content:
        return []

    # First attempt: the whole file is one JSON document.
    try:
        data = json.loads(content)
    except json.JSONDecodeError:
        pass
    else:
        if isinstance(data, list):
            return data
        # A single object is wrapped so callers always get a list.
        if isinstance(data, dict):
            return [data]
        # Any other top-level value falls through to the line parser below.

    # Fallback: JSON Lines. Reuse the text already in memory instead of
    # reopening the file. Split on "\n" only (text mode has already
    # normalised newlines); str.splitlines() would also break on characters
    # such as U+2028 that may legally appear inside JSON strings.
    items = []
    for lineno, line in enumerate(raw.split("\n"), start=1):
        line = line.strip()
        if not line:
            continue
        try:
            items.append(json.loads(line))
        except json.JSONDecodeError as err:
            # Same exception type as before, with the record's location added.
            raise json.JSONDecodeError(
                f"{err.msg} (JSON Lines record on line {lineno} of {path})",
                err.doc,
                err.pos,
            ) from err
    return items

def _print_stats(title: str, stats: dict) -> None:
    """Print one titled block of summary statistics (floats with 2 decimals)."""
    print(f"\n--- {title} ---")
    for k, v in stats.items():
        print(f"{k:>6}: {v:.2f}" if isinstance(v, float) else f"{k:>6}: {v}")


def main(json_path: str, save_csv_path: str | None = None, top_n: int = 5):
    """Print length statistics for the ``context`` field of a JSON collection.

    For every record loaded from *json_path* the number of words, characters
    and sentences of its ``context`` value is measured. Summary statistics
    for the three measures are printed, followed by the *top_n* shortest and
    longest records by word count (identified by ``para_id``).

    Args:
        json_path: JSON / JSON Lines file to analyse.
        save_csv_path: when truthy, the per-record measurements are also
            written to this CSV file (columns: para_id, words, chars,
            sentences).
        top_n: how many shortest / longest records to list. Zero or a
            negative value lists none.

    Returns:
        None. Results are printed to stdout.
    """
    data = load_json_any_shape(json_path)

    rows = []
    for obj in data:
        context = obj.get("context", "")
        rows.append({
            "para_id": obj.get("para_id", ""),
            "words": count_words(context),
            # A missing or empty context counts as zero characters.
            "chars": len(context) if context else 0,
            "sentences": count_sentences(context),
        })

    print("\n=== STATISTICHE SU context ===")
    print(f"File: {json_path}")
    for title, field in (
        ("Parole (words)", "words"),
        ("Caratteri (chars)", "chars"),
        ("Frasi (sentences)", "sentences"),
    ):
        _print_stats(title, summarize_lengths([r[field] for r in rows]))

    # Shortest / longest records by word count (handy for manual inspection).
    # Clamp the count: with top_n == 0 the slice [-0:] would be the whole
    # list, and a negative top_n would slice from the wrong end.
    shown = max(top_n, 0)
    rows_sorted = sorted(rows, key=lambda r: r["words"])
    print(f"\n--- Top {shown} più corti (per parole) ---")
    for r in rows_sorted[:shown]:
        print(f"{r['para_id']}: {r['words']} parole")

    print(f"\n--- Top {shown} più lunghi (per parole) ---")
    # Reverse first, then truncate: same order as the tail reversed, but an
    # empty result when shown == 0.
    for r in rows_sorted[::-1][:shown]:
        print(f"{r['para_id']}: {r['words']} parole")

    if save_csv_path:
        with open(save_csv_path, "w", newline="", encoding="utf-8") as f:
            writer = csv.DictWriter(f, fieldnames=["para_id", "words", "chars", "sentences"])
            writer.writeheader()
            writer.writerows(rows)
        print(f"\n[OK] Salvato CSV: {save_csv_path}")

In [None]:
# Input collection and output CSV locations (absolute, machine-specific paths).
json_path = "/Users/albi/GitHub/f1nder/data/document_collection.json"
csv_path = "/Users/albi/GitHub/f1nder/artifacts/exploration_analysis/out.csv"
# Number of shortest / longest paragraphs to list.
top_n = 12

# Print the statistics and write the per-paragraph CSV.
main(json_path, save_csv_path=csv_path, top_n=top_n)


=== STATISTICHE SU context ===
File: /Users/albi/GitHub/f1nder/data/document_collection.json

--- Parole (words) ---
 count: 131921
   min: 1
   max: 4077
  mean: 221.50
median: 227
   std: 71.16
   p10: 185
   p25: 210
   p75: 238
   p90: 246
   p95: 251
   p99: 281.00

--- Caratteri (chars) ---
 count: 131921
   min: 5
   max: 27001
  mean: 1262.00
median: 1289
   std: 394.48
   p10: 1045
   p25: 1188
   p75: 1364
   p90: 1427
   p95: 1466
   p99: 1573.00

--- Frasi (sentences) ---
 count: 131921
   min: 1
   max: 1121
  mean: 19.87
median: 16
   std: 15.46
   p10: 8
   p25: 11
   p75: 25
   p90: 36
   p95: 45
   p99: 69.00

--- Top 12 più corti (per parole) ---
Michigan_18370308_24: 1 parole
California_18581022_15: 2 parole
Kentucky_18480527_6: 3 parole
Wisconsin_18760817_18: 7 parole
Kentucky_18640630_16: 11 parole
Missouri_18401003_27: 14 parole
Delaware_18761114_19: 17 parole
New_Jersey_18550418_34: 22 parole
New_Jersey_18570117_41: 22 parole
Arkansas_18510129_29: 23 parole
Verm