In [None]:
!pip install sentence-transformers
!pip install datasets
!pip install huggingface_hub

In [None]:
# Standard library and third-party imports
import json
import os
import pandas as pd
import numpy as np
from huggingface_hub import hf_hub_download, HfFolder
from sentence_transformers import SentenceTransformer
from datasets import load_dataset

# --- Our project's modules ---
from config import CONFIG, setup_directories
from src.utils import setup_logger, load_json
from src.api_manager import GeminiAPIManager
from src.orchestration import run_experiments
from src.evaluation import analyze_experiment_logs
# --- NEW: Import for HF synchronization ---
from src.hf_sync import initialize_workspace

# --- Initial Setup ---
# Step 1: create the directory structure (logs, results, etc.).
setup_directories()

# Step 2: build the notebook-wide logger; it logs under CONFIG['LOGS_DIR'].
logger = setup_logger('main_orchestrator', log_dir=CONFIG['LOGS_DIR'])

# --- Hugging Face Hub workspace initialization, driven by CONFIG ---
# Online persistence is opt-in (PERSIST_RESULTS_ONLINE) and needs the
# HF_SYNC_TOKEN entry of CONFIG, which is set in config.py.
if CONFIG.get('PERSIST_RESULTS_ONLINE'):
    if not CONFIG.get("HF_SYNC_TOKEN"):
        # Persistence was requested but cannot work without a token: warn only.
        for missing_token_message in (
            "PERSIST_RESULTS_ONLINE is True, but no HF_SYNC_TOKEN was found in config.py.",
            "Online persistence will fail. Please add your token to config.py to enable it.",
        ):
            logger.warning(missing_token_message)
    else:
        logger.info("HF_SYNC_TOKEN found in config. Initializing workspace from Hugging Face Hub.")
        initialize_workspace(CONFIG)

logger.info("Notebook execution started. Directories and logger are set up.")

In [None]:
# This cell loads every asset the experiments need: the embedding model, the
# target ("hard") questions with their ground truths, the exemplar corpus with
# its embeddings, and the Gemini API manager.
#
# Each step only runs if the previous ones succeeded. Failures are logged at
# CRITICAL level rather than raised, so every output name is reset up front:
# after this cell a failed asset is always None/empty, never undefined
# (NameError in later cells) or stale from a previous run of this cell.
embedding_model = None
hard_questions_list = []
hard_questions_ground_truths = []
exemplar_data = None
gemini_manager = None

# --- 1. Load Embedding Model ---
try:
    embedding_model = SentenceTransformer(CONFIG['EMBEDDING_MODEL_PATH'])
    logger.info(f"Successfully loaded SentenceTransformer model from: {CONFIG['EMBEDDING_MODEL_PATH']}")
except Exception as e:
    logger.critical(f"Failed to load embedding model. Aborting. Error: {e}", exc_info=True)
    embedding_model = None # Ensure it's None if loading fails

# --- 2. Load "Hard Questions" (Target Queries) ---
if embedding_model:
    try:
        # Load the main dataset to get ground truths
        main_ds = load_dataset('AI-MO/NuminaMath-CoT', split='train')
        all_solutions_from_ds = main_ds['solution']

        # The JSON maps a row index of the dataset above (stored as a string
        # key) to the question text.
        hard_questions_data = load_json(CONFIG['HARD_QUESTIONS_JSON_PATH'])
        if hard_questions_data:
            # Collect into temporaries and publish only once every entry has
            # resolved. Otherwise an error midway (bad key, index out of
            # range) would leave a partially filled, non-empty list behind and
            # the later steps would run on truncated data.
            loaded_questions = []
            loaded_ground_truths = []
            for index_str, question_text in hard_questions_data.items():
                original_idx = int(index_str)
                loaded_questions.append(question_text)
                loaded_ground_truths.append(all_solutions_from_ds[original_idx])
            hard_questions_list = loaded_questions
            hard_questions_ground_truths = loaded_ground_truths
            logger.info(f"Successfully loaded {len(hard_questions_list)} hard questions and their ground truths.")
        else:
            logger.critical(f"Hard questions file not found or empty at {CONFIG['HARD_QUESTIONS_JSON_PATH']}. Aborting.")
    except Exception as e:
        logger.critical(f"An error occurred loading hard questions. Aborting. Error: {e}", exc_info=True)

# --- 3. Load Exemplar Corpus and Embeddings ---
if embedding_model and hard_questions_list:
    try:
        corpus_ds = load_dataset(CONFIG['EXEMPLAR_CORPUS_NAME'], token=CONFIG.get('EXEMPLAR_CORPUS_HF_TOKEN'))
        exemplar_questions = corpus_ds['train']['problem']
        exemplar_solutions = corpus_ds['train']['solution']

        # Load embeddings: local cache first, otherwise download from the
        # Hugging Face Hub. (There is no "generate" fallback in this cell.)
        local_path = CONFIG['EMBEDDED_EXEMPLAR_CORPUS_QUESTIONS_PATH']
        if os.path.exists(local_path):
            embedded_exemplars = np.load(local_path)
            logger.info(f"Loaded exemplar embeddings from local path: {local_path}")
        else:
            logger.info("Local embeddings not found. Downloading from Hugging Face Hub.")
            hf_path = hf_hub_download(
                repo_id=CONFIG['EXEMPLAR_EMBEDDINGS_HF_REPO_ID'],
                filename=CONFIG['EXEMPLAR_EMBEDDINGS_HF_FILENAME'],
                repo_type="dataset"
            )
            embedded_exemplars = np.load(hf_path)
            # Caching is best-effort: the embeddings are already in memory, so
            # a failed write must not discard them.
            try:
                cache_dir = os.path.dirname(local_path)
                if cache_dir:
                    os.makedirs(cache_dir, exist_ok=True)
                # Saving through a file object writes to exactly `local_path`.
                # np.save(path) appends ".npy" when the name lacks it, which
                # would make the os.path.exists check above miss the cache.
                with open(local_path, 'wb') as cache_file:
                    np.save(cache_file, embedded_exemplars)
                logger.info(f"Downloaded and saved embeddings from {CONFIG['EXEMPLAR_EMBEDDINGS_HF_REPO_ID']}.")
            except OSError as cache_error:
                logger.warning(
                    f"Downloaded embeddings from {CONFIG['EXEMPLAR_EMBEDDINGS_HF_REPO_ID']} "
                    f"but could not cache them at {local_path}: {cache_error}"
                )

        exemplar_data = {
            "questions": exemplar_questions,
            "solutions": exemplar_solutions,
            "embeddings": embedded_exemplars
        }
        logger.info(f"Exemplar corpus loaded: {len(exemplar_questions)} questions with embeddings of shape {embedded_exemplars.shape}.")
    except Exception as e:
        logger.critical(f"Failed to load exemplar corpus or embeddings. Aborting. Error: {e}", exc_info=True)
        exemplar_data = None

# --- 4. Initialize Gemini API Manager ---
if embedding_model and hard_questions_list and exemplar_data:
    try:
        gemini_manager = GeminiAPIManager(
            api_keys=CONFIG['GEMINI_API_KEYS'],
            model_quotas=CONFIG['GEMINI_MODEL_QUOTAS'],
            global_delay_seconds=CONFIG['GLOBAL_API_CALL_DELAY_SECONDS']
        )
        logger.info("GeminiAPIManager initialized successfully.")
    except Exception as e:
        logger.critical(f"Failed to initialize GeminiAPIManager. Aborting. Error: {e}", exc_info=True)
        gemini_manager = None

In [None]:
# Control panel for the research runs. Each entry is a dictionary of settings
# that override the default CONFIG for one experiment.

experiment_configurations = [
    dict(
        experiment_name="RAG_Only_Pass@3",
        APPLY_STANDARDIZATION=False,
        APPLY_TRANSFORMATION=False,
        APPLY_MERGING=False,
        TOP_N_CANDIDATES_RETRIEVAL=1,
        N_PASS_ATTEMPTS=3,
        DEFAULT_PASS_N_SOLVER_TEMPERATURE=1.0,
    ),
    dict(
        experiment_name="RAG_plus_Standardize_Pass@3",
        APPLY_STANDARDIZATION=True,
        APPLY_TRANSFORMATION=False,
        APPLY_MERGING=False,
        TOP_N_CANDIDATES_RETRIEVAL=1,
        N_PASS_ATTEMPTS=3,
        DEFAULT_PASS_N_SOLVER_TEMPERATURE=1.0,
        # Adaptation temperature is stated explicitly for this run.
        DEFAULT_ADAPTATION_TEMPERATURE=0.0,
    ),
    dict(
        experiment_name="Full_Pipeline_K3_Pass@1",
        APPLY_STANDARDIZATION=True,
        APPLY_TRANSFORMATION=True,
        APPLY_MERGING=True,
        # More candidates are retrieved so that there is something to merge.
        TOP_N_CANDIDATES_RETRIEVAL=3,
        TARGET_ADAPTED_SAMPLES_MERGING=1,
        N_PASS_ATTEMPTS=1,
        # Lower solver temperature because only a single pass is made.
        DEFAULT_PASS_N_SOLVER_TEMPERATURE=0.5,
    ),
]

logger.info(f"Defined {len(experiment_configurations)} experiments to run.")

# One header line followed by one "- <name>" line per experiment.
print(
    "Experiments to run:",
    *(f"- {cfg['experiment_name']}" for cfg in experiment_configurations),
    sep="\n",
)

In [None]:
# This cell takes the logs generated by the experiments and runs the
# LLM-based evaluation to produce the final Pass@K summary, saves it as CSV,
# displays it, and finally pushes the workspace to the Hugging Face Hub when
# online persistence is configured.

# --- Import for final sync ---
from src.hf_sync import sync_workspace_to_hub

# NOTE(review): no cell visible in this notebook assigns `all_experiment_logs`
# (`run_experiments` is imported but never called) — confirm that the cell
# which runs the experiments was not lost. The name is looked up defensively
# so that a fresh "Restart & Run All" reaches the warning branch below instead
# of stopping with a NameError.
experiment_logs = globals().get('all_experiment_logs')

if experiment_logs:
    logger.info("Starting analysis of experiment results.")

    # This single function call evaluates all experiments and returns a DataFrame
    summary_df = analyze_experiment_logs(
        all_experiments_logs=experiment_logs,
        ground_truths=hard_questions_ground_truths,
        gemini_manager=gemini_manager,
        config=CONFIG
    )

    # Save the final summary to a CSV file for easy access
    summary_path = os.path.join(CONFIG['RESULTS_DIR'], "final_experiment_summary.csv")
    summary_df.to_csv(summary_path, index=False)

    logger.info(f"Analysis complete. Summary saved to {summary_path}")

    # Display the final results in the notebook
    print("\n--- Experiment Summary ---")

    # Widen the display only for this table instead of changing the pandas
    # options for every later cell.
    with pd.option_context('display.max_columns', None, 'display.width', 1000):
        display(summary_df)

else:
    logger.warning("No experiment logs were generated or loaded. Skipping analysis.")

# --- Final synchronization to save all results ---
# Same gate as the workspace initialization at the top of the notebook: only
# talk to the Hub when persistence is enabled and a token is configured, and
# only claim that results are online when the sync actually ran.
if CONFIG.get('PERSIST_RESULTS_ONLINE') and CONFIG.get('HF_SYNC_TOKEN'):
    logger.info("Performing final synchronization to Hugging Face Hub.")
    sync_workspace_to_hub(CONFIG)
    logger.info("Final sync complete. All results are saved online.")
else:
    logger.info("Online persistence is disabled or HF_SYNC_TOKEN is missing. Skipping final synchronization; results are only stored locally.")

In [None]:
# This cell shows the final Pass@K summary and synchronizes the workspace.
#
# NOTE(review): the original content of this cell was a verbatim copy of the
# analysis cell directly above it, so "Run All" repeated the LLM-based
# evaluation (extra API calls) and rewrote the same CSV. It now reuses
# `summary_df` when it already exists in this kernel session and only runs the
# evaluation when it does not. To force a fresh evaluation, re-run the
# analysis cell above or `del summary_df` first. Consider deleting this cell.

# --- Import for final sync ---
from src.hf_sync import sync_workspace_to_hub

# NOTE(review): no cell visible in this notebook assigns `all_experiment_logs`;
# it is looked up defensively so a fresh kernel reaches the warning branch
# instead of stopping with a NameError.
experiment_logs = globals().get('all_experiment_logs')
existing_summary = globals().get('summary_df')

if existing_summary is not None:
    logger.info("Experiment summary already computed in this session. Skipping duplicate analysis.")
elif experiment_logs:
    logger.info("Starting analysis of experiment results.")

    # This single function call evaluates all experiments and returns a DataFrame
    summary_df = analyze_experiment_logs(
        all_experiments_logs=experiment_logs,
        ground_truths=hard_questions_ground_truths,
        gemini_manager=gemini_manager,
        config=CONFIG
    )

    # Save the final summary to a CSV file for easy access
    summary_path = os.path.join(CONFIG['RESULTS_DIR'], "final_experiment_summary.csv")
    summary_df.to_csv(summary_path, index=False)

    logger.info(f"Analysis complete. Summary saved to {summary_path}")
    existing_summary = summary_df
else:
    logger.warning("No experiment logs were generated or loaded. Skipping analysis.")

if existing_summary is not None:
    # Display the final results in the notebook
    print("\n--- Experiment Summary ---")

    # Widen the display only for this table instead of changing the pandas
    # options for every later cell.
    with pd.option_context('display.max_columns', None, 'display.width', 1000):
        display(existing_summary)

# --- Final synchronization to save all results ---
# Same gate as the workspace initialization at the top of the notebook: only
# talk to the Hub when persistence is enabled and a token is configured.
if CONFIG.get('PERSIST_RESULTS_ONLINE') and CONFIG.get('HF_SYNC_TOKEN'):
    logger.info("Performing final synchronization to Hugging Face Hub.")
    sync_workspace_to_hub(CONFIG)
    logger.info("Final sync complete. All results are saved online.")
else:
    logger.info("Online persistence is disabled or HF_SYNC_TOKEN is missing. Skipping final synchronization; results are only stored locally.")