In [None]:
# Standard library imports
import json
import os
import random

# --- Our project's modules ---
from config import CONFIG, setup_directories
from src.utils import setup_logger, load_json, save_json
from src.api_manager import GeminiAPIManager
from src.orchestration import run_pipeline_for_single_query
from src.hf_sync import sync_workspace_to_hub
from datasets import load_dataset

# --- 1. Initial Setup ---
# Prepare the project's directory structure before anything writes to it.
setup_directories()

# Notebook-wide logger; every later cell reports through this object.
logger = setup_logger('hard_question_identifier', log_dir=CONFIG['LOGS_DIR'])

# --- 2. Load Configuration & Initialize API Manager ---
# Short alias for the settings that belong to this task only.
ID_CONFIG = CONFIG['HARD_QUESTION_IDENTIFICATION_CONFIG']

logger.info("--- Hard Question Identification Notebook Started ---")
config_dump = json.dumps(ID_CONFIG, indent=2)
logger.info(f"Loaded configuration: {config_dump}")

# Build the Gemini API manager. Any failure (including a missing CONFIG key)
# is logged and leaves `gemini_manager` as None; the later cells test that
# name before doing any work.
try:
    manager_kwargs = {
        'api_keys': CONFIG['GEMINI_API_KEYS'],
        'model_quotas': CONFIG['GEMINI_MODEL_QUOTAS'],
        'global_delay_seconds': CONFIG['GLOBAL_API_CALL_DELAY_SECONDS'],
    }
    gemini_manager = GeminiAPIManager(**manager_kwargs)
    logger.info("GeminiAPIManager initialized successfully.")
except Exception as init_error:
    logger.critical(f"Failed to initialize GeminiAPIManager. Aborting. Error: {init_error}", exc_info=True)
    gemini_manager = None

In [None]:
# --- 3. Load Dataset and Source Questions ---
# Names produced by this cell and read by the cells below:
#   target_indices    - dataset row indices selected for this run
#   all_questions     - the 'problem' column of the main dataset
#   all_ground_truths - the 'solution' column of the main dataset
# On any failure the error is logged and `target_indices` is left empty, which
# makes the pipeline cell skip its work.
target_indices = []
all_questions = []
all_ground_truths = []

if gemini_manager:
    try:
        # Load the main dataset which serves as the source for all questions
        main_ds = load_dataset(CONFIG['EXEMPLAR_CORPUS_NAME'], split='train')
        all_questions = main_ds['problem']
        all_ground_truths = main_ds['solution']
        num_questions = len(all_questions)
        logger.info(f"Successfully loaded the main dataset with {num_questions} questions.")

        # --- Flexible Question Sourcing Logic ---
        indices_file_path = ID_CONFIG.get("TARGET_INDICES_FILE_PATH")

        # Option A: an explicit list of indices was provided in the config.
        if indices_file_path and os.path.exists(indices_file_path):
            loaded_indices = load_json(indices_file_path)
            if not isinstance(loaded_indices, list):
                raise ValueError(
                    f"Expected a JSON list of indices in '{indices_file_path}', "
                    f"got {type(loaded_indices).__name__}."
                )
            logger.info(f"Loaded {len(loaded_indices)} target indices from the specified file: {indices_file_path}")

            # Validate before use: the processing loop indexes the dataset
            # columns directly, so a bad entry would abort the run there.
            # Negative values are rejected too; they would silently alias
            # another row. Duplicates are dropped (first occurrence wins) so
            # no question is sent to the pipeline twice.
            valid_indices = []
            seen_indices = set()
            rejected_entries = []
            num_duplicates = 0
            for idx in loaded_indices:
                is_usable = (
                    isinstance(idx, int)
                    and not isinstance(idx, bool)
                    and 0 <= idx < num_questions
                )
                if not is_usable:
                    rejected_entries.append(idx)
                elif idx in seen_indices:
                    num_duplicates += 1
                else:
                    seen_indices.add(idx)
                    valid_indices.append(idx)

            if rejected_entries:
                logger.warning(
                    f"Ignoring {len(rejected_entries)} invalid target indices "
                    f"(not an integer in [0, {num_questions})); first few: {rejected_entries[:10]}"
                )
            if num_duplicates:
                logger.warning(f"Ignoring {num_duplicates} duplicate target indices.")

            target_indices = valid_indices

        # Option B: no usable file, so draw a random sample of indices.
        else:
            num_samples = ID_CONFIG.get("NUM_RANDOM_SAMPLES", 100)
            if num_samples > num_questions:
                logger.warning(f"Number of samples ({num_samples}) is larger than the dataset size ({num_questions}). Using all questions.")
                target_indices = list(range(num_questions))
            else:
                # Optional RANDOM_SEED makes the sample repeatable, which the
                # resume logic needs: an unseeded rerun draws a different
                # sample and therefore processes additional questions. When
                # the key is absent the generator is seeded from system
                # entropy, i.e. the sample differs on every run as before.
                sample_seed = ID_CONFIG.get("RANDOM_SEED")
                if sample_seed is None:
                    logger.warning("No RANDOM_SEED configured; the random sample will differ between runs.")
                sampler = random.Random(sample_seed)
                target_indices = sampler.sample(range(num_questions), num_samples)
                logger.info(f"Generated a random sample of {len(target_indices)} indices.")

    except Exception as e:
        logger.critical(f"An error occurred during data loading or question sourcing. Aborting. Error: {e}", exc_info=True)
        target_indices = []

print(f"Prepared to process {len(target_indices)} questions.")

In [None]:
# --- 4. Run the Identification Pipeline ---
# Sends every selected question through the pipeline with retrieval switched
# off and collects one log entry per question in `run_logs`. The log list is
# written to disk after each question so an interrupted run can be resumed by
# simply re-executing this cell.
run_logs = []
log_file_path = os.path.join(CONFIG['RESULTS_DIR'], "hard_question_identification_log.json")

if gemini_manager and target_indices:
    # --- Setup for non-RAG execution ---
    # Work on a copy so the overrides below do not leak into CONFIG. The copy
    # is shallow, which is sufficient because only top-level keys are replaced.
    run_config = CONFIG.copy()
    run_config.update({
        "USE_RETRIEVAL": False, # Crucial: We are testing the model's standalone ability
        "ONLINE_EVALUATION_ENABLED": True, # Crucial: We need real-time results
        "STOP_ON_FIRST_SUCCESS": False # We need to see if it fails all attempts
    })

    # Check for existing logs to resume
    existing_logs = load_json(log_file_path)
    if existing_logs:
        run_logs = existing_logs
        completed_indices = {log.get('target_query_original_hard_list_idx') for log in run_logs}
        logger.info(f"Resuming run. Loaded {len(run_logs)} existing results. {len(completed_indices)} questions already processed.")
    else:
        completed_indices = set()

    # Drop already processed questions. dict.fromkeys() also removes repeated
    # entries from target_indices (keeping first-seen order) so that no
    # question is run, and billed, twice within the same pass.
    indices_to_process = [
        idx for idx in dict.fromkeys(target_indices)
        if idx not in completed_indices
    ]

    logger.info(f"Starting pipeline for {len(indices_to_process)} new questions.")

    from tqdm.notebook import tqdm

    # Questions whose processing raised. They are not added to run_logs, so a
    # later rerun of this cell will attempt them again.
    failed_indices = []

    for original_idx in tqdm(indices_to_process, desc="Identifying Hard Questions"):
        try:
            query_text = all_questions[original_idx]
            ground_truth_text = all_ground_truths[original_idx]

            # Run the pipeline for the single query
            single_run_log = run_pipeline_for_single_query(
                hard_list_idx=original_idx,
                target_query=query_text,
                ground_truth=ground_truth_text,
                config=run_config,
                embedding_model=None, # Not needed as retrieval is off
                exemplar_data={}, # Not needed as retrieval is off
                gemini_manager=gemini_manager
            )
        except Exception as e:
            # One bad question must not abort the batch: without this guard a
            # question that fails every time would also stop every resume
            # attempt at the same place.
            logger.error(f"Unexpected error while processing question index {original_idx}; skipping it. Error: {e}", exc_info=True)
            failed_indices.append(original_idx)
            continue

        run_logs.append(single_run_log)

        # Save progress after each question
        save_json(run_logs, log_file_path)

    if failed_indices:
        logger.warning(f"{len(failed_indices)} questions raised an error and were skipped: {failed_indices}")

    logger.info("Pipeline execution complete for all selected questions.")

else:
    logger.warning("Skipping pipeline execution because the API manager or target indices are not available.")

In [None]:
# --- 5. Analyze Results to Identify Hard Questions ---
# A question counts as "hard" when none of its online evaluation results is
# marked correct. The outcome is `hard_question_indices`, a duplicate-free list
# of question indices in the order they first appear in `run_logs`.
hard_question_indices = []

if run_logs:
    logger.info("Analyzing the results to identify hard questions...")

    # Keep only runs that can be judged. Runs flagged UN-EVALUABLE failed for
    # technical reasons (not model inability). Entries without a usable status
    # string are excluded as well instead of raising KeyError, because their
    # outcome cannot be trusted.
    evaluable_logs = []
    num_missing_status = 0
    for log in run_logs:
        status = log.get('pipeline_status')
        if not isinstance(status, str):
            num_missing_status += 1
            continue
        if status.startswith('UN-EVALUABLE'):
            continue
        evaluable_logs.append(log)

    num_unevaluable = len(run_logs) - len(evaluable_logs)
    if num_unevaluable > 0:
        logger.warning(f"{num_unevaluable} questions were excluded from analysis because they were un-evaluable.")
    if num_missing_status > 0:
        logger.warning(f"{num_missing_status} of the excluded log entries had no usable 'pipeline_status' value.")

    seen_hard_indices = set()
    num_missing_index = 0
    for log in evaluable_logs:
        # `or []` also covers an explicit null stored under the key.
        # NOTE(review): an evaluable entry with no evaluation results at all is
        # classified as hard here (any() of nothing is False) - confirm that
        # this is the intended meaning.
        evaluation_results = log.get("online_evaluation_results") or []
        was_solved = any(result.get("is_correct") for result in evaluation_results)
        if was_solved:
            continue

        # Never solved -> hard question. Read the index with .get(), matching
        # how the resume logic reads the same entries.
        question_idx = log.get("target_query_original_hard_list_idx")
        if question_idx is None:
            num_missing_index += 1
            continue
        if question_idx not in seen_hard_indices:
            seen_hard_indices.add(question_idx)
            hard_question_indices.append(question_idx)

    if num_missing_index > 0:
        logger.warning(f"{num_missing_index} unsolved log entries had no question index and were skipped.")

    logger.info(f"Analysis complete. Found {len(hard_question_indices)} hard questions out of {len(evaluable_logs)} evaluable questions.")
    print("\n--- Analysis Summary ---")
    print(f"Total Questions Processed: {len(run_logs)}")
    print(f"Evaluable Questions: {len(evaluable_logs)}")
    print(f"Identified Hard Questions: {len(hard_question_indices)}")

else:
    logger.warning("No run logs were found to analyze.")

In [None]:
# --- 6. Save the Final List and Synchronize ---
# Write the hard-question indices to the configured path; nothing is written
# when the list is empty.
if not hard_question_indices:
    logger.info("No hard questions were identified, so no output file was saved.")
else:
    output_path = ID_CONFIG['HARD_QUESTIONS_OUTPUT_PATH']
    num_hard = len(hard_question_indices)
    logger.info(f"Saving the list of {num_hard} hard question indices to: {output_path}")

    save_json(hard_question_indices, output_path)

    print(f"\nSuccessfully saved the hard question indices to '{output_path}'.")

# --- Final synchronization to save all results and logs ---
# Runs only when the PERSIST_RESULTS_ONLINE setting is present and truthy.
persist_online = CONFIG.get("PERSIST_RESULTS_ONLINE")
if persist_online:
    logger.info("Performing final synchronization to Hugging Face Hub...")
    sync_workspace_to_hub(CONFIG)
    logger.info("Final sync complete. All results from this run are saved online.")

logger.info("--- Hard Question Identification Notebook Finished ---")

In [None]:
# --- NEW: Check API Keys ---
# The key check is only run when the API manager was created successfully.
from src.utils import check_api_keys

if not gemini_manager:
    logger.warning("Skipping API key check because GeminiAPIManager failed to initialize.")
else:
    check_api_keys(CONFIG)