In [None]:
!pip install sentence-transformers
!pip install datasets
!pip install huggingface_hub
!pip install openai

In [None]:
# Standard library imports
import json
import os
import pandas as pd
import numpy as np
from huggingface_hub import hf_hub_download, HfFolder
from sentence_transformers import SentenceTransformer
from datasets import load_dataset

# --- Our project's modules ---
from config import CONFIG, setup_directories
from src.utils import setup_logger, load_json
# MODIFIED: Import both API manager classes
from src.api_manager import GeminiAPIManager, AvalAIAPIManager
from src.orchestration import run_experiments
from src.evaluation import analyze_experiment_logs
from src.hf_sync import initialize_workspace, sync_workspace_to_hub

# --- Initial Setup ---
# 1. Create the directory structure (logs, results, etc.)
setup_directories()

# 2. Initialize the main logger for the notebook
logger = setup_logger('main_orchestrator', log_dir=CONFIG['LOGS_DIR'])

# --- Hugging Face Hub Workspace Initialization ---
# The workspace is only initialized when online persistence is enabled AND a
# sync token is present in the config; otherwise a warning is logged.
if CONFIG.get('PERSIST_RESULTS_ONLINE'):
    if CONFIG.get("HF_SYNC_TOKEN"):
        logger.info("HF_SYNC_TOKEN found in config. Initializing workspace from Hugging Face Hub.")
        initialize_workspace(CONFIG)
    else:
        logger.warning("PERSIST_RESULTS_ONLINE is True, but no HF_SYNC_TOKEN was found in config.py.")

# --- API Manager Factory ---
# Instantiate the API manager selected by CONFIG["API_PROVIDER"] (default "gemini").
# `api_manager` stays None when the provider is unknown or construction fails;
# the experiment cell checks for None before running anything.
api_manager = None
provider = CONFIG.get("API_PROVIDER", "gemini").lower()
logger.info(f"Selected API Provider from config: '{provider}'")

if provider == "gemini":
    try:
        api_manager = GeminiAPIManager(
            api_keys=CONFIG['GEMINI_API_KEYS'],
            model_quotas=CONFIG['GEMINI_MODEL_QUOTAS'],
            global_delay_seconds=CONFIG['GLOBAL_API_CALL_DELAY_SECONDS'],
            config=CONFIG  # the manager also receives the full config dictionary
        )
        logger.info("GeminiAPIManager initialized successfully.")
    except Exception as e:
        # Missing config keys (KeyError) end up here as well; the failure is
        # logged and api_manager remains None instead of crashing the cell.
        logger.critical(f"Failed to initialize GeminiAPIManager. Aborting. Error: {e}", exc_info=True)

elif provider == "avalai":
    try:
        api_manager = AvalAIAPIManager(
            api_key=CONFIG['AVALAI_API_KEY'],
            base_url=CONFIG['AVALAI_BASE_URL'],
            model_quotas=CONFIG['AVALAI_MODEL_QUOTAS'],
            config=CONFIG  # the manager also receives the full config dictionary
        )
        logger.info("AvalAIAPIManager initialized successfully.")
    except Exception as e:
        logger.critical(f"Failed to initialize AvalAIAPIManager. Aborting. Error: {e}", exc_info=True)

else:
    logger.critical(f"Invalid API_PROVIDER configured: '{provider}'. Must be 'gemini' or 'avalai'. Aborting.")
    # In a real script, you might raise an error here.

# Report the real outcome: the success message must not be emitted when the
# API manager could not be created.
if api_manager is not None:
    logger.info("Notebook execution started. Directories, logger, and API Manager are set up.")
else:
    logger.error(
        "Notebook execution started, but the API Manager is NOT initialized. "
        "Fix the configuration and re-run this cell before running experiments."
    )

In [None]:
# This cell handles loading all necessary assets: the embedding model, the
# exemplar corpus with its pre-computed embeddings, and the "hard questions".
# Each stage only runs when the previous one succeeded; failures are logged
# and leave the corresponding variable empty/None rather than raising.

# --- 1. Load Embedding Model ---
embedding_model = None
try:
    embedding_model = SentenceTransformer(CONFIG['EMBEDDING_MODEL_PATH'])
    logger.info(f"Successfully loaded SentenceTransformer model from: {CONFIG['EMBEDDING_MODEL_PATH']}")
except Exception as e:
    logger.critical(f"Failed to load embedding model. Aborting. Error: {e}", exc_info=True)

# --- 2. Load Full Exemplar Corpus and Embeddings ---
exemplar_data = {}
# Compare against None explicitly: the truthiness of a model object may depend
# on its __len__, which is not the same as "the model was loaded".
if embedding_model is not None:
    try:
        logger.info(f"Loading exemplar corpus: {CONFIG['EXEMPLAR_CORPUS_NAME']}")
        exemplar_ds = load_dataset(CONFIG['EXEMPLAR_CORPUS_NAME'], split='train')

        logger.info(f"Loading pre-computed embeddings from: {CONFIG['EMBEDDED_EXEMPLAR_CORPUS_QUESTIONS_PATH']}")
        exemplar_embeddings = np.load(CONFIG['EMBEDDED_EXEMPLAR_CORPUS_QUESTIONS_PATH'])

        # NOTE(review): presumably row i of the embeddings corresponds to
        # question i of the corpus — confirm against the script that produced the file.
        exemplar_data = {
            'questions': exemplar_ds['problem'],
            'solutions': exemplar_ds['solution'],
            'embeddings': exemplar_embeddings
        }
        logger.info(f"Successfully loaded {len(exemplar_data['questions'])} exemplars and their embeddings.")
    except Exception as e:
        logger.critical(f"Failed to load exemplar data or embeddings. Aborting. Error: {e}", exc_info=True)

# --- 3. Load "Hard Questions" (Target Queries) using Indices ---
hard_questions_list = []
hard_questions_ground_truths = []
if exemplar_data:
    try:
        hard_question_indices = load_json(CONFIG['HARD_QUESTIONS_INDICES_PATH'])
        if hard_question_indices:
            # Build both lists first and publish them together, so a failure
            # half-way through can never leave questions without matching
            # ground truths.
            selected_questions = [exemplar_data['questions'][i] for i in hard_question_indices]
            selected_solutions = [exemplar_data['solutions'][i] for i in hard_question_indices]
            hard_questions_list = selected_questions
            hard_questions_ground_truths = selected_solutions
            logger.info(f"Successfully loaded {len(hard_questions_list)} hard questions and their ground truths using indices.")
        else:
            logger.critical(f"Hard questions index file not found or empty at {CONFIG['HARD_QUESTIONS_INDICES_PATH']}. Aborting.")
    except Exception as e:
        logger.critical(f"An error occurred loading hard questions. Aborting. Error: {e}", exc_info=True)

# --- 4. Final Check ---
# Objects are checked with `is not None`; containers are checked for being
# non-empty. Ground truths are included because the evaluation cells need them.
assets_ready = (
    api_manager is not None
    and embedding_model is not None
    and bool(exemplar_data)
    and bool(hard_questions_list)
    and bool(hard_questions_ground_truths)
)
if not assets_ready:
    logger.critical("One or more critical assets failed to load. Please check the logs above. Halting execution.")
else:
    logger.info("All assets loaded successfully. Ready to run experiments.")

In [None]:
# Control panel for the research runs. Every dictionary below describes one
# experiment as a set of overrides for the default CONFIG; keys that are not
# listed are presumably taken from CONFIG by the orchestration code.

# Retrieval only: a single retrieved candidate, none of the adaptation steps.
rag_only_pass_at_3 = {
    "experiment_name": "RAG_Only_Pass@3",
    "USE_RETRIEVAL": True,
    "APPLY_STANDARDIZATION": False,
    "APPLY_TRANSFORMATION": False,
    "APPLY_MERGING": False,
    "TOP_N_CANDIDATES_RETRIEVAL": 1,
    "N_PASS_ATTEMPTS": 3,
}

# Baseline: experiment without RAG.
no_rag_baseline_pass_at_3 = {
    "experiment_name": "No_RAG_Baseline_Pass@3",
    "USE_RETRIEVAL": False,
    "N_PASS_ATTEMPTS": 3,
}

# Full pipeline: retrieval of three candidates plus every adaptation step.
full_pipeline_k3_pass_at_1 = {
    "experiment_name": "Full_Pipeline_K3_Pass@1",
    "USE_RETRIEVAL": True,
    "APPLY_STANDARDIZATION": True,
    "APPLY_TRANSFORMATION": True,
    "APPLY_MERGING": True,
    "TOP_N_CANDIDATES_RETRIEVAL": 3,
    "TARGET_ADAPTED_SAMPLES_MERGING": 1,
    "N_PASS_ATTEMPTS": 1,
}

# The list order is the order in which the experiments are listed below.
experiment_configurations = [
    rag_only_pass_at_3,
    no_rag_baseline_pass_at_3,
    full_pipeline_k3_pass_at_1,
]

logger.info(f"Defined {len(experiment_configurations)} experiments to run.")

# Echo the experiment names so the reader can confirm the plan before running.
print("Experiments to run:")
for exp in experiment_configurations:
    print("-", exp["experiment_name"])

In [None]:
# This cell executes all defined experiments.
# It will save logs for each experiment, allowing you to resume if the run is interrupted.
#
# Experiments only start when every prerequisite from the setup and
# asset-loading cells is available. `all_experiment_logs` stays an empty dict
# otherwise, which makes the analysis cells below skip themselves.

all_experiment_logs = {}

# Look the assets up through globals() so that a cell that was never run is
# reported as "missing" instead of raising NameError.
_notebook_state = globals()
missing_assets = [
    asset_name
    for asset_name, is_ready in (
        ("api_manager", _notebook_state.get("api_manager") is not None),
        ("embedding_model", _notebook_state.get("embedding_model") is not None),
        ("exemplar_data", bool(_notebook_state.get("exemplar_data"))),
        ("hard_questions_list", bool(_notebook_state.get("hard_questions_list"))),
    )
    if not is_ready
]

if "api_manager" in missing_assets:
    logger.error("API Manager not initialized. Cannot run experiments.")
elif missing_assets:
    logger.error(
        f"Required assets are missing or empty: {', '.join(missing_assets)}. Cannot run experiments."
    )
else:
    logger.info("Starting the main experiment execution loop.")

    all_experiment_logs = run_experiments(
        experiment_configs=experiment_configurations,
        global_config=CONFIG,
        hard_questions=hard_questions_list,
        embedding_model=embedding_model,
        exemplar_data=exemplar_data,
        api_manager=api_manager  # Pass the generic API manager
    )

    logger.info("All experiments have been processed.")

    # --- Final synchronization to save all run logs ---
    # NOTE(review): the sync is called regardless of CONFIG['PERSIST_RESULTS_ONLINE'];
    # presumably sync_workspace_to_hub checks that itself — confirm.
    logger.info("Performing final synchronization of run logs to Hugging Face Hub.")
    sync_workspace_to_hub(CONFIG)
    logger.info("Final sync of run logs complete.")

In [None]:
# Evaluate the logs produced by the experiment run (LLM-based evaluation) and
# build the final Pass@K summary. The summary is written to a CSV file in
# CONFIG['RESULTS_DIR'], displayed, and the workspace is synced afterwards.

if not all_experiment_logs:
    # Nothing to evaluate: the experiment cell produced no logs.
    logger.warning("No experiment logs were generated or loaded. Skipping analysis.")
else:
    logger.info("Starting analysis of experiment results.")

    summary_df = analyze_experiment_logs(
        config=CONFIG,
        api_manager=api_manager,  # generic manager, whichever provider is configured
        ground_truths=hard_questions_ground_truths,
        all_experiments_logs=all_experiment_logs,
    )

    # Persist the summary before displaying it.
    summary_path = os.path.join(CONFIG['RESULTS_DIR'], "final_experiment_summary.csv")
    summary_df.to_csv(summary_path, index=False)
    logger.info(f"Analysis complete. Summary saved to {summary_path}")

    # These display options are set globally and stay in effect for later cells.
    pd.set_option('display.max_columns', None)
    pd.set_option('display.width', 1000)
    print("\n--- Experiment Summary ---")
    display(summary_df)

    # --- Final sync to save evaluation results ---
    logger.info("Performing final synchronization of evaluation results to Hugging Face Hub.")
    sync_workspace_to_hub(CONFIG)
    logger.info("Final sync complete. All results are saved online.")

In [None]:
# Retry the evaluations that failed due to API errors, then recompute the
# summary so the effect of the retries becomes visible. The updated summary is
# written to its own CSV file, leaving the first summary file untouched.

from src.error_handler import retry_failed_evaluations

if not all_experiment_logs:
    logger.warning("No experiment logs were loaded. Skipping retry process.")
else:
    # The retry pass and the re-analysis receive exactly the same arguments.
    evaluation_kwargs = dict(
        all_experiments_logs=all_experiment_logs,
        ground_truths=hard_questions_ground_truths,
        api_manager=api_manager,  # generic manager, whichever provider is configured
        config=CONFIG,
    )

    logger.info("Starting the retry process for failed evaluations.")
    # The return value is not used; presumably the logs are updated in place — confirm.
    retry_failed_evaluations(**evaluation_kwargs)
    logger.info("Retry process complete. Recalculating the experiment summary.")

    updated_summary_df = analyze_experiment_logs(**evaluation_kwargs)

    updated_summary_path = os.path.join(
        CONFIG['RESULTS_DIR'],
        "final_experiment_summary_after_retry.csv",
    )
    updated_summary_df.to_csv(updated_summary_path, index=False)
    logger.info(f"Analysis after retry complete. New summary saved to {updated_summary_path}")

    print("\n--- Experiment Summary After Retries ---")
    display(updated_summary_df)

    # --- Final sync after retries ---
    logger.info("Performing final synchronization after retries.")
    sync_workspace_to_hub(CONFIG)
    logger.info("Final sync complete.")

In [None]:
# Build a detailed report of the API errors recorded during evaluation, to
# help diagnose what went wrong. A non-empty report is displayed and written
# to a CSV file in CONFIG['RESULTS_DIR'].

from src.error_handler import generate_error_report

if not all_experiment_logs:
    logger.warning("No experiment logs loaded, cannot generate an error report.")
else:
    logger.info("Generating a detailed report of API errors from the evaluation logs.")

    error_df = generate_error_report(config=CONFIG, all_experiments_logs=all_experiment_logs)

    if error_df.empty:
        # Nothing to show or save.
        print("\n--- No API Errors Found ---")
        logger.info("The error report is empty because no API_ERRORs were logged.")
    else:
        print("\n--- API Error Exploration Report ---")
        # Show every column, and up to 100 rows, of the report.
        pd.set_option('display.max_columns', None)
        pd.set_option('display.width', 1000)
        pd.set_option('display.max_rows', 100)
        display(error_df)

        error_report_path = os.path.join(CONFIG['RESULTS_DIR'], "api_error_report.csv")
        error_df.to_csv(error_report_path, index=False)
        logger.info(f"API error report saved to {error_report_path}")