In [None]:
import os
from pathlib import Path
import sys
from pathlib import Path

def get_root_path():
    """Return the parent of the current working directory as a ``Path``.

    The result is used as the project root, so paths built from it do not
    depend on where inside that root the code is run from (the working
    directory is expected to sit directly below the root).
    """
    working_dir = Path(os.getcwd())
    return working_dir.parent

# Project root that every other path in this file is built from.
PROJECT_DIR = get_root_path()

# Explicit check instead of ``assert`` so it still runs under ``python -O``.
if not PROJECT_DIR.exists():
    raise FileNotFoundError(f"Project directory does not exist: {PROJECT_DIR}")

# Put the project root first on the import path, without adding it twice
# when this code is executed more than once.
if str(PROJECT_DIR) not in sys.path:
    sys.path.insert(0, str(PROJECT_DIR))

print("Using project at:", PROJECT_DIR)

In [None]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Merge eval_queries.jsonl and rag_pipeline_results.jsonl into a single dataset.
- Keeps columns: id, category, query, answer, section_key from eval set
- Adds gen_answer from results (left join on id; gen_answer is left empty when the results file has no such column)
- Adds empty columns: Clareza, Precisão, Utilidade
- Saves the merged dataset as a timestamped CSV (no Parquet file is written)
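Usage:
    Run this code after PROJECT_DIR has been defined; no command-line
    arguments are parsed. Paths used when run as-is:
        input:  PROJECT_DIR/notebooks/eval_queries.jsonl
        input:  PROJECT_DIR/notebooks/eval_results/v2/rag_pipeline_results.jsonl
        output: PROJECT_DIR/notebooks/eval_with_results_<YYYYmmdd_HHMMSS>.csv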
"""
import argparse
import sys
from pathlib import Path
from datetime import datetime
import pandas as pd

# Columns that must be present in the evaluation set; main() validates them
# and carries exactly these columns into the merged output.
REQUIRED_EVAL_COLS = ["id", "category", "query", "answer", "section_key"]
# Headers of the blank scoring columns appended to the merged output
# (Portuguese for clarity, precision, usefulness). Each header embeds a line
# break followed by "1 - 5" -- presumably the rating scale shown to reviewers.
SCORE_COLS = ["Clareza\n1 - 5", "Precisão\n1 - 5", "Utilidade\n1 - 5"]

def read_jsonl(path: Path) -> pd.DataFrame:
    """Load a JSON Lines file (one JSON object per line) into a DataFrame.

    Args:
        path: Location of the ``.jsonl`` file. A plain string is accepted
            as well and converted to a ``Path``.

    Returns:
        A DataFrame with one row per line of the file.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
        ValueError: If pandas cannot parse the file. The original parser
            error is attached as ``__cause__``.
    """
    path = Path(path)
    if not path.exists():
        raise FileNotFoundError(f"File not found: {path}")
    try:
        return pd.read_json(path, lines=True)
    except ValueError as e:
        # Chain explicitly so the traceback marks the parser error as the
        # direct cause rather than an error raised while handling another.
        raise ValueError(f"Failed to parse JSONL at {path}: {e}") from e

def main(eval_path=None, results_path=None, out_path=None):
    """Merge the evaluation queries with the generated answers and save a CSV.

    The evaluation set is left-joined with the pipeline results on ``id``,
    so every evaluation row is kept. Blank scoring columns (``SCORE_COLS``)
    are appended, and the result is written as CSV.

    Args:
        eval_path: JSONL file with the evaluation queries. Defaults to
            ``PROJECT_DIR / "notebooks/eval_queries.jsonl"``.
        results_path: JSONL file with the pipeline results. Defaults to
            ``PROJECT_DIR / "notebooks/eval_results/v2/rag_pipeline_results.jsonl"``.
        out_path: Destination CSV. Defaults to a timestamped
            ``eval_with_results_<YYYYmmdd_HHMMSS>.csv`` under
            ``PROJECT_DIR / "notebooks"``.

    Raises:
        SystemExit: If the evaluation set lacks one of ``REQUIRED_EVAL_COLS``
            or the results lack the ``id`` merge key.
        FileNotFoundError: If an input file does not exist (from ``read_jsonl``).
        ValueError: If an input file cannot be parsed (from ``read_jsonl``).
    """
    # Defaults are resolved lazily so PROJECT_DIR is only needed when a path
    # is not supplied by the caller.
    if eval_path is None:
        eval_path = PROJECT_DIR / "notebooks/eval_queries.jsonl"
    if results_path is None:
        results_path = PROJECT_DIR / "notebooks/eval_results/v2/rag_pipeline_results.jsonl"

    eval_df = read_jsonl(Path(eval_path))
    res_df = read_jsonl(Path(results_path))

    # Validate eval columns
    missing_eval = [c for c in REQUIRED_EVAL_COLS if c not in eval_df.columns]
    if missing_eval:
        raise SystemExit(f"Missing required columns in eval_queries: {missing_eval}")

    # Validate the merge key on the results side too, so a malformed results
    # file fails with the same kind of message instead of a bare KeyError.
    merge_key = "id"
    if merge_key not in res_df.columns:
        raise SystemExit(f"Missing merge key {merge_key!r} in rag_pipeline_results")

    # A results file without generated answers still yields a usable sheet:
    # the gen_answer column is simply left empty.
    if "gen_answer" not in res_df.columns:
        res_df["gen_answer"] = None

    # Only the key and the generated answer are taken from the results.
    res_keep = res_df[[merge_key, "gen_answer"]]

    # Duplicate keys on the right side multiply rows in a left join; report
    # them rather than silently inflating the evaluation set.
    dup_count = int(res_keep[merge_key].duplicated().sum())
    if dup_count:
        print(f"⚠️ {dup_count:,} duplicate '{merge_key}' value(s) in results; "
              "matching eval rows will be repeated")

    # Left join keeps every evaluation row, matched or not.
    merged = eval_df[REQUIRED_EVAL_COLS].merge(res_keep, how="left", on=merge_key)

    # Add the blank scoring columns
    for col in SCORE_COLS:
        if col not in merged.columns:
            merged[col] = ""

    # Determine output path
    if out_path is None:
        ts = datetime.now().strftime("%Y%m%d_%H%M%S")
        out_path = PROJECT_DIR / f"notebooks/eval_with_results_{ts}.csv"
    out_path = Path(out_path)
    out_path.parent.mkdir(parents=True, exist_ok=True)

    # Save CSV
    merged.to_csv(out_path, index=False)

    # Summary
    print("✅ Merge completed")
    print(f" • rows: {len(merged):,}")
    print(f" • cols: {len(merged.columns):,} -> {list(merged.columns)}")
    print(f" • CSV: {out_path}")
    without_answer = int(merged["gen_answer"].isna().sum())
    if without_answer:
        print(f" • rows without gen_answer: {without_answer:,}")

# Run the merge only when this code executes as the main module (not when it
# is imported from elsewhere).
if __name__ == "__main__":
    main()
