In [None]:
"""
convert_json_to_csv.ipynb

This notebook loads the JSON-formatted outputs of LLM evaluations and merges
them with the original generated-answers CSV into a single, consolidated CSV
file for unified analysis.
"""

In [18]:
import pandas as pd
import json
import csv
import logging
from pathlib import Path
from typing import Dict, List, Any
import sys

In [19]:
# --- CONFIGURATION CELL ---
# Version tag of the model run to process. It is passed to main(), which
# inserts it into the input and output file names.
# Changing this value is all that is required to process a different run.
MODEL_VERSION: str = "v3"

In [20]:
# --- Make the project code importable ---
# The project code root is taken to be the parent of the current working
# directory (resolved to an absolute path).
project_code_root = Path.cwd().parent.resolve()

# Register the root on sys.path once, so re-running this cell does not add duplicates.
if sys.path.count(str(project_code_root)) == 0:
    sys.path.append(str(project_code_root))

# Imported only after sys.path has been extended above.
from core import config

In [21]:
def load_data_sources(original_csv_path: Path, json_paths: Dict[str, Path]) -> Dict[str, Any]:
    """
    Read the original CSV and every JSON evaluation file into DataFrames.

    Args:
        original_csv_path (Path): CSV file, read with pandas as UTF-8.
        json_paths (Dict[str, Path]): Maps a label to the JSON file that is
            loaded and stored under that label.

    Returns:
        A dictionary holding the CSV DataFrame under "original", plus one
        DataFrame per entry of ``json_paths`` stored under the same key.
    """
    logging.info(f"Loading original dataset from: {original_csv_path}")
    original_frame = pd.read_csv(original_csv_path, encoding='utf-8')
    loaded: Dict[str, Any] = {"original": original_frame}

    logging.info("Loading JSON evaluation files...")
    for label, json_path in json_paths.items():
        # Parse the JSON document first, then hand the parsed object to pandas.
        with open(json_path, 'r', encoding='utf-8') as handle:
            payload = json.load(handle)
        loaded[label] = pd.DataFrame(payload)

    logging.info("All data sources loaded successfully.")
    return loaded

In [22]:
def merge_to_single_dataframe(data_frames: Dict[str, pd.DataFrame]) -> pd.DataFrame:
    """
    Merges the original DataFrame with all evaluation DataFrames into one.

    Rows are paired by position: every frame's index is reset before the
    column-wise concatenation, so differing index labels cannot misalign the
    rows. The "score" and "reasoning" columns of each evaluation are prefixed
    with the evaluation key. Any other column whose name is already taken by
    an earlier frame is prefixed the same way, keeping column names unique.

    Args:
        data_frames (Dict[str, pd.DataFrame]): A dictionary of loaded DataFrames.
            Must contain an "original" entry; every other entry is treated as
            an evaluation result.

    Returns:
        A single pandas DataFrame containing all merged data.

    Raises:
        KeyError: If ``data_frames`` has no "original" entry.
    """
    # pd.concat(axis=1) aligns on index labels; resetting to a positional
    # index makes the merge genuinely side-by-side.
    original_df = data_frames["original"].reset_index(drop=True)
    expected_rows = len(original_df)

    # Create a list to hold all DataFrames that will be merged, starting with the original.
    dfs_to_merge: List[pd.DataFrame] = [original_df]
    # Column names already claimed, used to detect collisions across frames.
    used_columns = set(original_df.columns)

    logging.info("Preparing evaluation data for a unified merge...")
    for key, df in data_frames.items():
        # Skip the original DataFrame as it's already in the list.
        if key == "original":
            continue

        # A row-count mismatch means positional pairing cannot be complete;
        # surface it instead of silently padding with NaN.
        if len(df) != expected_rows:
            logging.warning(
                f"'{key}' has {len(df)} rows but the original dataset has "
                f"{expected_rows}; unmatched rows will be filled with NaN."
            )

        # Always prefix 'score'/'reasoning'; additionally prefix any column
        # that would collide with one that is already present.
        column_map = {
            column: f"{key}_{column}"
            for column in df.columns
            if column in ("score", "reasoning") or column in used_columns
        }
        renamed_df = df.rename(columns=column_map).reset_index(drop=True)
        used_columns.update(renamed_df.columns)
        dfs_to_merge.append(renamed_df)

    # Concatenate all DataFrames column-wise (side-by-side) into a single DataFrame.
    consolidated_df = pd.concat(dfs_to_merge, axis=1)
    logging.info("Successfully merged all data into a single DataFrame.")
    return consolidated_df


In [23]:
def save_dataframe_to_csv(df: pd.DataFrame, output_path: Path):
    """
    Write a DataFrame to disk as a fully-quoted, UTF-8 encoded CSV file.

    Missing parent directories of ``output_path`` are created first.

    Args:
        df (pd.DataFrame): The consolidated DataFrame to be saved.
        output_path (Path): Destination of the CSV file.
    """
    # Create the destination folder (and any missing ancestors) up front.
    destination_dir = output_path.parent
    destination_dir.mkdir(parents=True, exist_ok=True)

    logging.info(f"Writing consolidated data to '{output_path}'...")
    csv_options = {
        # Leave the row index out of the file.
        "index": False,
        # Quote every field so embedded commas in text stay inside one cell.
        "quoting": csv.QUOTE_ALL,
        "encoding": 'utf-8',
    }
    df.to_csv(output_path, **csv_options)
    logging.info("Consolidated result file has been saved successfully.")

In [24]:
def main(model_version: str):
    """
    Main function to orchestrate the data processing pipeline.

    Builds the input and output paths for ``model_version``, then runs the
    load, merge and save steps in sequence. Failures are logged rather than
    propagated to the caller.

    Args:
        model_version (str): Version tag inserted into every input and output
            file name (e.g. "v3").
    """
    try:
        # --- Configuration ---
        # All paths hang off the directory two levels above the current
        # working directory.
        base_dir = Path.cwd().parent.parent
        eval_data_dir = base_dir / "evaluation_data"
        output_dir = base_dir / "evaluation_results"

        # --- Path Definitions ---
        # The original_csv path is built using the model_version passed into the function.
        original_csv = eval_data_dir / f"rag_eval_generated_answers_{model_version}.csv"

        # A dictionary mapping evaluation types to their corresponding JSON result files.
        json_files = {
            "relevance": eval_data_dir / f"relevance_{model_version}.json",
            "faithfulness": eval_data_dir / f"faithfulness_{model_version}.json",
            "correctness": eval_data_dir / f"correctness_{model_version}.json"
        }
        # Define the final output file path using the model_version.
        output_file = output_dir / f"consolidated_evaluation_results_custom_dataset_{model_version}.csv"

        # --- Pipeline Execution ---
        # Execute the pipeline steps in sequence: load, merge, and save.
        loaded_data = load_data_sources(original_csv, json_files)
        merged_data = merge_to_single_dataframe(loaded_data)
        save_dataframe_to_csv(merged_data, output_file)

        logging.info("Script finished successfully!")

    # --- Error Handling ---
    # A missing input is an expected user error: a short message is enough.
    except FileNotFoundError as e:
        logging.error(f"Input file not found: {e}. Please verify file paths and names.")
    # Anything else is unexpected: keep the same message but attach the
    # traceback so the failure can actually be diagnosed.
    except Exception as e:
        logging.exception(f"An unexpected error occurred during execution: {e}")

In [27]:
if __name__ == "__main__":
    # The root logger defaults to WARNING, which would drop every
    # logging.info() progress message emitted by the pipeline. basicConfig
    # does nothing if the root logger already has handlers, so any existing
    # logging setup is left untouched.
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s - %(levelname)s - %(message)s",
    )
    # Pass the version from the configuration cell into the main function
    main(MODEL_VERSION)