In [None]:
!pip install sentence-transformers
!pip install datasets
!pip install huggingface_hub

In [None]:
# Standard library and third-party imports
import json
import os
import pandas as pd
import numpy as np
from huggingface_hub import hf_hub_download, HfFolder
from sentence_transformers import SentenceTransformer
from datasets import load_dataset

# --- Our project's modules ---
from config import CONFIG, setup_directories
from src.utils import setup_logger, load_json
from src.api_manager import GeminiAPIManager
from src.orchestration import run_experiments
from src.evaluation import analyze_experiment_logs
# --- NEW: Import for HF synchronization ---
from src.hf_sync import initialize_workspace, sync_workspace_to_hub

# --- Workspace bootstrap ---
# Create the directory structure (logs, results, etc.) before anything writes to it.
setup_directories()

# Notebook-wide logger, created with log_dir=CONFIG['LOGS_DIR'].
logger = setup_logger('main_orchestrator', log_dir=CONFIG['LOGS_DIR'])

# --- Optional Hugging Face Hub workspace initialization ---
# Only attempted when CONFIG requests online persistence. The token is read from
# CONFIG["HF_SYNC_TOKEN"] (set in config.py); when it is missing we only warn.
if CONFIG.get('PERSIST_RESULTS_ONLINE') and CONFIG.get("HF_SYNC_TOKEN"):
    logger.info("HF_SYNC_TOKEN found in config. Initializing workspace from Hugging Face Hub.")
    initialize_workspace(CONFIG)
elif CONFIG.get('PERSIST_RESULTS_ONLINE'):
    logger.warning("PERSIST_RESULTS_ONLINE is True, but no HF_SYNC_TOKEN was found in config.py.")
    logger.warning("Online persistence will fail. Please add your token to config.py to enable it.")

logger.info("Notebook execution started. Directories and logger are set up.")

In [None]:
# This cell loads every asset the experiments need. A failing step does not
# raise: it logs a critical message and leaves its variable empty/None, and the
# following steps (and later cells) are gated on those variables.

# Define every output of this cell up front so that later cells can test them
# (e.g. `if gemini_manager:`) even when an earlier step failed.
embedding_model = None
hard_questions_list = []
hard_questions_ground_truths = []
gemini_manager = None

# --- 1. Load Embedding Model ---
try:
    embedding_model = SentenceTransformer(CONFIG['EMBEDDING_MODEL_PATH'])
    logger.info(f"Successfully loaded SentenceTransformer model from: {CONFIG['EMBEDDING_MODEL_PATH']}")
except Exception as e:
    logger.critical(f"Failed to load embedding model. Aborting. Error: {e}", exc_info=True)
    embedding_model = None  # Ensure it's None if loading fails

# --- 2. Load "Hard Questions" (Target Queries) using Indices ---
# (This section previously appeared twice, which loaded the dataset twice.)
if embedding_model:
    try:
        # The main dataset is the source of both the questions and their solutions.
        main_ds = load_dataset('AI-MO/NuminaMath-CoT', split='train')

        # Indices selecting which rows of the dataset are the "hard" questions.
        # Assumes load_json returns a list of integer row indices (or a falsy
        # value when the file is missing/empty) — TODO confirm against src.utils.
        hard_question_indices = load_json(CONFIG['HARD_QUESTIONS_INDICES_PATH'])

        if hard_question_indices:
            # Select the rows once instead of indexing `main_ds[column][i]` per
            # index, which re-reads the whole column on every iteration.
            hard_subset = main_ds.select(hard_question_indices)
            # Assign both lists together so they can never get out of step.
            hard_questions_list, hard_questions_ground_truths = (
                list(hard_subset['problem']),
                list(hard_subset['solution']),
            )
            logger.info(f"Successfully loaded {len(hard_questions_list)} hard questions and their ground truths using indices.")
        else:
            logger.critical(f"Hard questions index file not found or empty at {CONFIG['HARD_QUESTIONS_INDICES_PATH']}. Aborting.")
    except Exception as e:
        logger.critical(f"An error occurred loading hard questions. Aborting. Error: {e}", exc_info=True)

# --- 3. Initialize Gemini API Manager ---
# NOTE(review): the original gate also required `exemplar_data`, a name that is
# not assigned in any cell visible in this notebook, so the gate raised
# NameError whenever the two conditions below were met. The manager's
# constructor does not use it, so it was dropped from the gate. If exemplar
# loading is reintroduced, add it back here.
if embedding_model and hard_questions_list:
    try:
        gemini_manager = GeminiAPIManager(
            api_keys=CONFIG['GEMINI_API_KEYS'],
            model_quotas=CONFIG['GEMINI_MODEL_QUOTAS'],
            global_delay_seconds=CONFIG['GLOBAL_API_CALL_DELAY_SECONDS']
        )
        logger.info("GeminiAPIManager initialized successfully.")
    except Exception as e:
        logger.critical(f"Failed to initialize GeminiAPIManager. Aborting. Error: {e}", exc_info=True)
        gemini_manager = None



In [None]:
# Control panel for the research runs. Each entry is a dictionary of settings
# that override the default CONFIG for one experiment.

experiment_configurations = [
    dict(
        experiment_name="RAG_Only_Pass@3",
        USE_RETRIEVAL=True,  # Explicitly set for clarity
        APPLY_STANDARDIZATION=False,
        APPLY_TRANSFORMATION=False,
        APPLY_MERGING=False,
        TOP_N_CANDIDATES_RETRIEVAL=1,
        N_PASS_ATTEMPTS=3,
        DEFAULT_PASS_N_SOLVER_TEMPERATURE=1.0,
    ),
    dict(
        experiment_name="RAG_plus_Standardize_Pass@3",
        APPLY_STANDARDIZATION=True,
        APPLY_TRANSFORMATION=False,
        APPLY_MERGING=False,
        TOP_N_CANDIDATES_RETRIEVAL=1,
        N_PASS_ATTEMPTS=3,
        DEFAULT_PASS_N_SOLVER_TEMPERATURE=1.0,
        DEFAULT_ADAPTATION_TEMPERATURE=0.0,  # Be explicit about adaptation temp
    ),
    dict(
        experiment_name="Full_Pipeline_K3_Pass@1",
        APPLY_STANDARDIZATION=True,
        APPLY_TRANSFORMATION=True,
        APPLY_MERGING=True,
        TOP_N_CANDIDATES_RETRIEVAL=3,  # Retrieve more to allow for merging
        TARGET_ADAPTED_SAMPLES_MERGING=1,
        N_PASS_ATTEMPTS=1,
        DEFAULT_PASS_N_SOLVER_TEMPERATURE=0.5,  # Lower temp for single pass
    ),
]

logger.info(f"Defined {len(experiment_configurations)} experiments to run.")

# Echo the experiment names so the planned runs are visible in the cell output.
print("Experiments to run:")
for exp in experiment_configurations:
    print("- " + str(exp['experiment_name']))

In [None]:
# This cell takes the logs generated by the experiments and runs the
# LLM-based evaluation to produce the final Pass@K summary.
# NOTE(review): `all_experiment_logs` is not assigned in any cell visible in
# this notebook; presumably it comes from a run_experiments(...) cell — confirm
# that cell exists and runs before this one.

if all_experiment_logs:
    logger.info("Starting analysis of experiment results.")

    # This single function call evaluates all experiments and returns a DataFrame
    summary_df = analyze_experiment_logs(
        all_experiments_logs=all_experiment_logs,
        ground_truths=hard_questions_ground_truths,
        gemini_manager=gemini_manager,
        config=CONFIG
    )

    # Save the final summary to a CSV file for easy access
    summary_path = os.path.join(CONFIG['RESULTS_DIR'], "final_experiment_summary.csv")
    summary_df.to_csv(summary_path, index=False)

    logger.info(f"Analysis complete. Summary saved to {summary_path}")

    # Display the final results in the notebook
    print("\n--- Experiment Summary ---")

    # Set display options for better viewing of the DataFrame.
    # These are global pandas options and stay in effect for later cells.
    pd.set_option('display.max_columns', None)
    pd.set_option('display.width', 1000)

    display(summary_df)

else:
    logger.warning("No experiment logs were generated or loaded. Skipping analysis.")

# --- Final synchronization to save all results ---
# Only sync when online persistence is enabled, mirroring the guard used for
# workspace initialization; otherwise the "saved online" message would be false.
if CONFIG.get('PERSIST_RESULTS_ONLINE'):
    logger.info("Performing final synchronization to Hugging Face Hub.")
    sync_workspace_to_hub(CONFIG)
    logger.info("Final sync complete. All results are saved online.")
else:
    logger.info("PERSIST_RESULTS_ONLINE is disabled. Skipping synchronization to Hugging Face Hub.")

In [None]:
# NOTE: This cell used to be a verbatim copy of the analysis cell directly above
# (plus a redundant re-import of sync_workspace_to_hub, which is already imported
# in the notebook's import cell). Running it repeated the LLM-based evaluation,
# rewrote the same "final_experiment_summary.csv" and synced a second time, so it
# has been reduced to this note. Use the analysis cell above; this cell can be
# deleted from the notebook.

In [None]:
# --- RETRY FAILED EVALUATIONS ---
# This cell attempts to retry the evaluations that failed due to API errors
# and then recalculates the summary to see the impact of the retries.

# Error-handler helper used only by this cell.
from src.error_handler import retry_failed_evaluations

if all_experiment_logs:
    logger.info("Starting the retry process for failed evaluations.")

    # This function will find API_ERRORs in the logs and retry them, updating the logs in place.
    retry_failed_evaluations(
        all_experiments_logs=all_experiment_logs,
        ground_truths=hard_questions_ground_truths,
        gemini_manager=gemini_manager,
        config=CONFIG
    )

    logger.info("Retry process complete. Recalculating the experiment summary.")

    # Run the analysis again on the updated logs to get the updated summary
    updated_summary_df = analyze_experiment_logs(
        all_experiments_logs=all_experiment_logs,
        ground_truths=hard_questions_ground_truths,
        gemini_manager=gemini_manager,
        config=CONFIG
    )

    # Save the new summary to a different CSV file so the pre-retry summary is kept
    updated_summary_path = os.path.join(CONFIG['RESULTS_DIR'], "final_experiment_summary_after_retry.csv")
    updated_summary_df.to_csv(updated_summary_path, index=False)

    logger.info(f"Analysis after retry complete. New summary saved to {updated_summary_path}")

    # Display the new summary
    print("\n--- Experiment Summary After Retries ---")
    display(updated_summary_df)

else:
    logger.warning("No experiment logs were loaded. Skipping retry process.")

# --- Final sync to save the updated logs and summary ---
# Only sync when online persistence is enabled, mirroring the guard used for
# workspace initialization at the top of the notebook.
if CONFIG.get('PERSIST_RESULTS_ONLINE'):
    logger.info("Performing final synchronization after retries.")
    sync_workspace_to_hub(CONFIG)
    logger.info("Final sync complete.")
else:
    logger.info("PERSIST_RESULTS_ONLINE is disabled. Skipping synchronization after retries.")

In [None]:
# --- API ERROR EXPLORATION REPORT ---
# Builds a report of the API errors recorded in the experiment logs, shows it in
# the notebook and stores it as a CSV under CONFIG['RESULTS_DIR'].

# Error-handler helper used only by this cell.
from src.error_handler import generate_error_report

if not all_experiment_logs:
    logger.warning("No experiment logs loaded, cannot generate an error report.")
else:
    logger.info("Generating a detailed report of API errors from the evaluation logs.")

    # Compile the error report from all experiment logs.
    error_df = generate_error_report(
        all_experiments_logs=all_experiment_logs,
        config=CONFIG
    )

    if error_df.empty:
        # Nothing to show or save.
        print("\n--- No API Errors Found ---")
        logger.info("The error report is empty because no API_ERRORs were logged.")
    else:
        print("\n--- API Error Exploration Report ---")

        # Widen the pandas display limits so the report is readable in the output.
        pd.set_option('display.max_columns', None)
        pd.set_option('display.width', 1000)
        pd.set_option('display.max_rows', 100)

        display(error_df)

        # Persist the report for further analysis outside the notebook.
        error_report_path = os.path.join(CONFIG['RESULTS_DIR'], "api_error_report.csv")
        error_df.to_csv(error_report_path, index=False)
        logger.info(f"API error report saved to {error_report_path}")

In [None]:
# --- API key check ---
# Runs check_api_keys on CONFIG, but only when the Gemini manager was created.
from src.utils import check_api_keys

if not gemini_manager:
    logger.warning("Skipping API key check because GeminiAPIManager failed to initialize.")
else:
    check_api_keys(CONFIG)