In [13]:
# --- Runner: executa todos os objetivos da EDA (end-to-end) ---

from pathlib import Path
import sys, json, pandas as pd

# (1) Make the modules under src/eda importable: look for a "src" directory
# in the current working directory and in its parent, and append each one
# that exists to sys.path (skipping entries that are already present).
for _candidate in (Path.cwd() / "src", Path.cwd().parent / "src"):
    if not _candidate.exists():
        continue
    if str(_candidate) in sys.path:
        continue
    sys.path.append(str(_candidate))

from eda.io_tar import read_parquet_partitions_from_tar, normalize_mon_period_m
from eda.check_schema_vol import schema_report, leakage_report, volumetria_report
from eda.prevalence import prevalence_table
from eda.baseline import baseline_from_prevalence
from eda.artifacts import write_manifest

# (2) Configuration (adjust the ROOT / PATH_* locations if needed)
ROOT = Path("/mnt/wsl/PHYSICALDRIVE1p1/datasets/mini-DSB")  # <- adjust for your environment
PATH_TARGETS = ROOT.joinpath("targets.tar.gz")
PATH_SPLIT = ROOT.joinpath("client_split.tar.gz")

# When the working directory is named "notebooks", reports go one level up;
# otherwise they go to ./reports. The directory is created if missing.
if Path.cwd().name == "notebooks":
    REPORTS = Path("../reports")
else:
    REPORTS = Path("./reports")
REPORTS.mkdir(parents=True, exist_ok=True)

# Execution modes (fast vs. full)
FAST_MODE = False
LIMIT_FOLDS = None  # e.g. {0, 1} to sample folds
# Cap on the number of parts read per prefix; only applied in fast mode.
MAX_PARTS_PER_PREFIX = None
if FAST_MODE:
    MAX_PARTS_PER_PREFIX = 40

def _print_summary(prev, vol, leaks):
    """Print a short recap of the EDA run to stdout.

    Args:
        prev: Prevalence table, or None. When it is non-empty and its index
            contains the label "ALL", that row is rendered with ``display``.
        vol: Volumetry report. Only read when it is a dict; the keys
            'rows_targets_total' and 'n_clients_total' are looked up.
        leaks: Leakage table, or None. When non-empty, its row count is
            reported as an alert.

    Note:
        ``display`` is not imported here; it is expected to be available in
        the namespace, as it is in IPython/Jupyter sessions.
    """
    print("\n===== RESUMO =====")

    volumetry = vol if isinstance(vol, dict) else None
    if volumetry is not None:
        total_rows = volumetry.get("rows_targets_total")
        print(f"targets total rows: {total_rows}")
        client_count = volumetry.get("n_clients_total")
        # The client count line is optional: skip it when the key is absent.
        if client_count is not None:
            print(f"nº de clientes (client_split): {client_count}")

    has_global_row = prev is not None and not prev.empty and "ALL" in prev.index
    if has_global_row:
        print("\nPrevalência global (linha 'ALL'):")
        # Double brackets keep the result a one-row table rather than a Series.
        display(prev.loc[["ALL"]])

    has_leaks = leaks is not None and not leaks.empty
    if has_leaks:
        leak_count = len(leaks)
        print(f"\n[ALERTA] Clientes em >1 fold: {leak_count} (ver reports/fold_leakage.csv)")

    print("==================\n")

def run_all():
    """Run every EDA step end-to-end and write the resulting reports.

    The steps, in order:
      1. Read the ``targets`` and ``client_split`` parquet partitions from
         their tar archives and normalize the ``mon`` column of ``targets``.
      2. Produce schema reports, the fold-leakage report and the volumetry
         report (saved as ``volumetria.json``).
      3. Compute the prevalence table (saved as ``prevalencia.csv``).
      4. Derive the baseline table from the prevalence
         (saved as ``baseline_auprc.csv``).
      5. Write the manifest.
      6. Print a summary.

    Reads the module-level configuration: PATH_TARGETS, PATH_SPLIT, REPORTS,
    LIMIT_FOLDS and MAX_PARTS_PER_PREFIX.

    Returns:
        dict: keys "targets", "client_split", "prev", "vol" and "baseline".
        "baseline" is an empty DataFrame when the prevalence table is empty.
    """
    print("[INFO] Executando pipeline EDA...")

    # 1) Load data from the TARs (per the reader's contract, 'fold' is
    #    rebuilt from the partition path) and normalize 'mon'.
    allowed = {"fold": LIMIT_FOLDS} if LIMIT_FOLDS is not None else None
    t = read_parquet_partitions_from_tar(
        PATH_TARGETS, prefix="targets/", expected_keys=("fold",),
        allowed_partitions=allowed, max_parts=MAX_PARTS_PER_PREFIX
    )
    cs = read_parquet_partitions_from_tar(
        PATH_SPLIT, prefix="client_split/", expected_keys=("fold",),
        allowed_partitions=allowed, max_parts=MAX_PARTS_PER_PREFIX
    )
    t = normalize_mon_period_m(t, "mon")

    # 2) Schema & volumetry (+ leakage). The schema reports' return values
    #    are not used here; the calls are kept because they receive
    #    reports_dir (presumably they persist their output there).
    schema_report(t, "targets", reports_dir=REPORTS)
    schema_report(cs, "client_split", reports_dir=REPORTS)
    leaks = leakage_report(cs)
    # Guard against None so this check agrees with _print_summary, which
    # also treats a missing leakage table as "no leaks".
    if leaks is not None and len(leaks):
        leaks.to_csv(REPORTS / "fold_leakage.csv", index=False)
        print("[ALERTA] fold_leakage.csv salvo em reports/")
    vol = volumetria_report(t, cs)
    # ensure_ascii=False emits raw non-ASCII characters, so the file must be
    # opened as UTF-8 explicitly instead of relying on the platform default.
    with (REPORTS / "volumetria.json").open("w", encoding="utf-8") as f:
        json.dump(vol, f, indent=2, ensure_ascii=False)
    print("[OK] volumetria.json salvo.")

    # 3) Prevalence
    prev = prevalence_table(t)
    if not prev.empty:
        prev.to_csv(REPORTS / "prevalencia.csv")
        print("[OK] prevalencia.csv salvo.")
    else:
        print("[INFO] Prevalência não calculada (colunas target_* ausentes).")

    # 4) Baseline AUPRC (approximately the prevalence)
    baseline = baseline_from_prevalence(prev) if not prev.empty else pd.DataFrame()
    if not baseline.empty:
        baseline.to_csv(REPORTS / "baseline_auprc.csv")
        print("[OK] baseline_auprc.csv salvo (≈ prevalência).")

    # 5) Manifest
    write_manifest(PATH_TARGETS, PATH_SPLIT, REPORTS, t, cs)
    print("[OK] manifest.json salvo.")

    # 6) Summary
    _print_summary(prev, vol, leaks)

    print("[OK] EDA concluída.")
    return {"targets": t, "client_split": cs, "prev": prev, "vol": vol, "baseline": baseline}

# Run everything now (comment this line out if you prefer to run it manually later)
results = run_all()


[INFO] Executando pipeline EDA...
[OK] Lidos 435 arquivos parquet de 'targets/' em targets.tar.gz. Partições recuperadas: ['fold']
[OK] Lidos 1000 arquivos parquet de 'client_split/' em client_split.tar.gz. Partições recuperadas: ['fold']
[OK] volumetria.json salvo.
[OK] prevalencia.csv salvo.
[OK] baseline_auprc.csv salvo (≈ prevalência).
[OK] manifest.json salvo.

===== RESUMO =====
targets total rows: 1202688
nº de clientes (client_split): 100224

Prevalência global (linha 'ALL'):


Unnamed: 0,target_1,target_2,target_3,target_4
ALL,0.004458,0.00048,0.003767,0.002544



[OK] EDA concluída.
