In [None]:
from huggingface_hub import login
import os

# Authenticate with the Hugging Face Hub using a token taken from the environment.
# Set HUGGINGFACE_HUB_TOKEN (or HF_TOKEN) before running; never paste a token into the notebook.
HF_TOKEN_ENV_VARS = ("HUGGINGFACE_HUB_TOKEN", "HF_TOKEN")


def resolve_hf_token(env=None):
    """Return the first usable Hugging Face token found in the environment.

    ``HUGGINGFACE_HUB_TOKEN`` (the variable this notebook has always read) is
    checked first, then ``HF_TOKEN``. Surrounding whitespace is stripped so a
    value exported with a trailing newline is still usable, and blank values
    are treated as "not set".

    Args:
        env: Mapping to read the variables from. Defaults to ``os.environ``.

    Returns:
        The token string, or ``None`` when no candidate variable holds a
        non-blank value.
    """
    source = os.environ if env is None else env
    for var_name in HF_TOKEN_ENV_VARS:
        candidate = (source.get(var_name) or "").strip()
        if candidate:
            return candidate
    return None


token = resolve_hf_token()
if token:
    login(token=token)
else:
    print("Warning: HUGGINGFACE_HUB_TOKEN environment variable not set. Some models may not be accessible.")

In [2]:
# ⚠️ IMPORTANT: Clear any cached variables to ensure fresh execution.
# `%reset -f` clears the kernel's interactive namespace without asking for confirmation,
# so names created by earlier cells are gone afterwards; everything this cell needs
# (including `os`) is therefore imported again right below.
%reset -f

from llm_eval import evaluate_model
from llm_eval.main import generate_output_path, process_tasks
import os
from llm_eval.reporting.report_generator import get_reports_dir

# Run configuration: edit these values to change what gets evaluated.
tasks = ["leaderboard"]
model_name = "Qwen/Qwen3-8B"
model_type = "hf"  # "hf" or "vllm"
quantize = True  # set to False to disable quantization
quantization_method = "4bit"

# 🔍 Echo the configuration back so a stale kernel value is easy to spot (debugging aid).
print(
    "🔍 Variable Verification:",
    f"  Model Name: {model_name}",
    f"  Model Type: {model_type}",
    f"  Tasks: {tasks}",
    f"  Quantize: {quantize}",
    "=" * 50,
    sep="\n",
)

# Turn the requested task names into the task list used for naming the output.
# (The run log shows the 'LEADERBOARD' group being expanded into individual tasks at
# this point — presumably by process_tasks; confirm in llm_eval.main.)
processed_tasks = process_tasks(tasks)

# Build the results path from the run configuration, the same way the CLI does.
output_path = generate_output_path(
    tasks=processed_tasks,
    model_type=model_type,
    model_name=model_name,
    quantization_method=quantization_method,
    quantize=quantize,
)

print("Auto-generated output path:", output_path)

# Run the enhanced evaluation (captures model text outputs as well as scores).
# The second return value rebinds `output_path`, replacing the auto-generated path.
results, output_path = evaluate_model(
    model_type=model_type,
    model_name=model_name,
    tasks=tasks,
    # Samples evaluated per task (the run log prints "Using 1 samples per task").
    # This is NOT the few-shot count; few-shot settings come from
    # preserve_default_fewshot below.
    num_samples=1,
    device="cuda",
    quantize=quantize,
    quantization_method=quantization_method,
    output_path=output_path,  # Pass in the auto-generated path
    preserve_default_fewshot=True,  # Keep each benchmark task's default few-shot setting
    capture_text_outputs=True,  # Capture model text outputs for comparison with targets
    seed=42,  # Reproducible sample selection
)

if output_path:
    # Only announce a results location when there is one; previously this line
    # printed "Results saved to: None" right before the "no output path" message.
    print(f"Results saved to: {output_path}")

    # The report file name is derived from the results file name without its extension.
    basename = os.path.splitext(os.path.basename(output_path))[0]

    reports_dir = get_reports_dir()
    # NOTE(review): a later cell in this notebook looks for
    # f"{basename}_professional_report.md" instead — confirm which suffix the
    # report generator actually writes and make the two cells agree.
    report_path = os.path.join(reports_dir, f"{basename}_report.md")

    if os.path.exists(report_path):
        print(f"Report generated at: {report_path}")
    else:
        print("Report was not generated. Check if there were any errors during evaluation.")
else:
    print("No output path available. Cannot check for report.")

🔍 Variable Verification:
  Model Name: Qwen/Qwen3-8B
  Model Type: hf
  Tasks: ['leaderboard']
  Quantize: True
Expanding task group 'LEADERBOARD' to 39 individual tasks
Auto-generated output path: /workspace/llm-evaluation/evaluations/Qwen3-8B_2025-10-09_150315/results.json
Using enhanced evaluation to capture model text outputs...
Enhanced evaluation - capturing model text outputs
Evaluating model type: hf
Model: Qwen/Qwen3-8B
Tasks: leaderboard
Device: cuda, Few-shot examples: 0
Batch size: 1
Using 1 samples per task
Using quantization method: 4bit
Using default few-shot settings for each task:
  - BBH tasks: 3-shot
  - GPQA tasks: 0-shot
  - MMLU-Pro tasks: 5-shot
  - MUSR tasks: 0-shot
  - IFEval tasks: 0-shot
  - Math-lvl-5 tasks: 4-shot
Setting random seed to 42 for reproducible sample selection
Starting enhanced evaluation on 1 tasks: leaderboard


`torch_dtype` is deprecated! Use `dtype` instead!
The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

precalculus/train-00000-of-00001.parquet:   0%|          | 0.00/354k [00:00<?, ?B/s]

precalculus/test-00000-of-00001.parquet:   0%|          | 0.00/242k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/746 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/546 [00:00<?, ? examples/s]

Filter:   0%|          | 0/546 [00:00<?, ? examples/s]

Map:   0%|          | 0/135 [00:00<?, ? examples/s]

prealgebra/train-00000-of-00001.parquet:   0%|          | 0.00/384k [00:00<?, ?B/s]

prealgebra/test-00000-of-00001.parquet:   0%|          | 0.00/268k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1205 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/871 [00:00<?, ? examples/s]

Filter:   0%|          | 0/871 [00:00<?, ? examples/s]

Map:   0%|          | 0/193 [00:00<?, ? examples/s]

number_theory/train-00000-of-00001.parqu(…):   0%|          | 0.00/309k [00:00<?, ?B/s]

number_theory/test-00000-of-00001.parque(…):   0%|          | 0.00/182k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/869 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/540 [00:00<?, ? examples/s]

Filter:   0%|          | 0/540 [00:00<?, ? examples/s]

Map:   0%|          | 0/154 [00:00<?, ? examples/s]

intermediate_algebra/train-00000-of-0000(…):   0%|          | 0.00/575k [00:00<?, ?B/s]

intermediate_algebra/test-00000-of-00001(…):   0%|          | 0.00/395k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1295 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/903 [00:00<?, ? examples/s]

Filter:   0%|          | 0/903 [00:00<?, ? examples/s]

Map:   0%|          | 0/280 [00:00<?, ? examples/s]

geometry/train-00000-of-00001.parquet:   0%|          | 0.00/549k [00:00<?, ?B/s]

geometry/test-00000-of-00001.parquet:   0%|          | 0.00/264k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/870 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/479 [00:00<?, ? examples/s]

Filter:   0%|          | 0/479 [00:00<?, ? examples/s]

Map:   0%|          | 0/132 [00:00<?, ? examples/s]

counting_and_probability/train-00000-of-(…):   0%|          | 0.00/329k [00:00<?, ?B/s]

counting_and_probability/test-00000-of-0(…):   0%|          | 0.00/175k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/771 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/474 [00:00<?, ? examples/s]

Filter:   0%|          | 0/474 [00:00<?, ? examples/s]

Map:   0%|          | 0/123 [00:00<?, ? examples/s]

algebra/train-00000-of-00001.parquet:   0%|          | 0.00/505k [00:00<?, ?B/s]

algebra/test-00000-of-00001.parquet:   0%|          | 0.00/353k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1744 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1187 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1187 [00:00<?, ? examples/s]

Map:   0%|          | 0/307 [00:00<?, ? examples/s]

gpqa_main.csv:   0%|          | 0.00/3.21M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/448 [00:00<?, ? examples/s]

Map:   0%|          | 0/448 [00:00<?, ? examples/s]

gpqa_extended.csv:   0%|          | 0.00/4.09M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/546 [00:00<?, ? examples/s]

Map:   0%|          | 0/546 [00:00<?, ? examples/s]

gpqa_diamond.csv:   0%|          | 0.00/1.37M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/198 [00:00<?, ? examples/s]

Map:   0%|          | 0/198 [00:00<?, ? examples/s]

README.md: 0.00B [00:00, ?B/s]

data/test-00000-of-00001.parquet:   0%|          | 0.00/4.15M [00:00<?, ?B/s]

data/validation-00000-of-00001.parquet:   0%|          | 0.00/45.3k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/12032 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/70 [00:00<?, ? examples/s]

100%|██████████| 1/1 [00:00<00:00, 9098.27it/s]
100%|██████████| 1/1 [00:00<00:00, 502.79it/s]
100%|██████████| 1/1 [00:00<00:00, 586.86it/s]
100%|██████████| 1/1 [00:00<00:00, 612.31it/s]
100%|██████████| 1/1 [00:00<00:00, 393.09it/s]
100%|██████████| 1/1 [00:00<00:00, 405.56it/s]
100%|██████████| 1/1 [00:00<00:00, 646.97it/s]
100%|██████████| 1/1 [00:00<00:00, 465.62it/s]
100%|██████████| 1/1 [00:00<00:00, 530.12it/s]
100%|██████████| 1/1 [00:00<00:00, 593.51it/s]
100%|██████████| 1/1 [00:00<00:00, 606.11it/s]
100%|██████████| 1/1 [00:00<00:00, 596.54it/s]
100%|██████████| 1/1 [00:00<00:00, 617.35it/s]
100%|██████████| 1/1 [00:00<00:00, 484.83it/s]
100%|██████████| 1/1 [00:00<00:00, 625.64it/s]
100%|██████████| 1/1 [00:00<00:00, 622.67it/s]
100%|██████████| 1/1 [00:00<00:00, 414.87it/s]
100%|██████████| 1/1 [00:00<00:00, 560.74it/s]
100%|██████████| 1/1 [00:00<00:00, 545.07it/s]
100%|██████████| 1/1 [00:00<00:00, 620.18it/s]
100%|██████████| 1/1 [00:00<00:00, 613.56it/s]
100%|███████

Processing results and extracting model text outputs...
Found samples section, processing text outputs...
Processing text outputs for task: leaderboard_mmlu_pro (1 samples)
Processing text outputs for task: leaderboard_bbh_boolean_expressions (1 samples)
Processing text outputs for task: leaderboard_bbh_causal_judgement (1 samples)
Processing text outputs for task: leaderboard_bbh_date_understanding (1 samples)
Processing text outputs for task: leaderboard_bbh_disambiguation_qa (1 samples)
Processing text outputs for task: leaderboard_bbh_formal_fallacies (1 samples)
Processing text outputs for task: leaderboard_bbh_geometric_shapes (1 samples)
Processing text outputs for task: leaderboard_bbh_hyperbaton (1 samples)
Processing text outputs for task: leaderboard_bbh_logical_deduction_five_objects (1 samples)
Processing text outputs for task: leaderboard_bbh_logical_deduction_seven_objects (1 samples)
Processing text outputs for task: leaderboard_bbh_logical_deduction_three_objects (1 sa

In [1]:
# Option 2: Specifying the task group
# Runs the evaluation through the `llm_eval_cli.py` script instead of the Python API.
# The leading `!` hands the command to a shell, so the run happens in a separate
# process and does not set `output_path` or any other variable in this kernel.
# NOTE(review): `--num_samples 10` is presumably the per-task sample count, matching
# `num_samples` in the Python API call — confirm against the CLI's help text.
!python llm_eval_cli.py \
  --model hf \
  --model_name mistralai/Ministral-8B-Instruct-2410 \
  --tasks LEADERBOARD \
  --device cuda \
  --num_samples 10 \
  --quantize


Expanding task group 'LEADERBOARD' to 39 individual tasks
Running leaderboard tasks with their default few-shot settings
Auto-generated output path: /workspace/llm-evaluation/evaluations/Ministral-8B-Instruct-2410_2025-10-09_151651/results.json
Using enhanced evaluation to capture model text outputs...
Enhanced evaluation - capturing model text outputs
Evaluating model type: hf
Model: mistralai/Ministral-8B-Instruct-2410
Tasks: leaderboard_bbh_boolean_expressions, leaderboard_bbh_causal_judgement, leaderboard_bbh_date_understanding, leaderboard_bbh_disambiguation_qa, leaderboard_bbh_formal_fallacies, leaderboard_bbh_geometric_shapes, leaderboard_bbh_hyperbaton, leaderboard_bbh_logical_deduction_five_objects, leaderboard_bbh_logical_deduction_seven_objects, leaderboard_bbh_logical_deduction_three_objects, leaderboard_bbh_movie_recommendation, leaderboard_bbh_navigate, leaderboard_bbh_object_counting, leaderboard_bbh_penguins_in_a_table, leaderboard_bbh_reasoning_about_colored_objects, l

In [2]:
# ✅ Demonstrate how to access captured model text outputs
import json
import os  # imported here too so this cell does not depend on another cell's import

# Keys under sample["doc"] that may hold the question text, in lookup order
# (different tasks use different field names).
QUESTION_FIELDS = ("input", "question", "narrative")


def _has_text_output(sample):
    """Return True when `sample` is a dict carrying a "model_output" entry."""
    return isinstance(sample, dict) and "model_output" in sample


def _print_sample(position, sample, max_question_chars=150):
    """Print target, model output, correctness, question and raw responses for one sample."""
    print(f"\n📝 Sample {position}:")
    print(f"  🎯 Target Answer: {sample.get('target', 'N/A')}")
    print(f"  🤖 Model Output: {sample.get('model_output', 'N/A')}")
    print(f"  ✅ Correct: {sample.get('output_matches_target', 'N/A')}")

    # Show the question/prompt if available.
    doc = sample.get("doc")
    if isinstance(doc, dict):
        question_field = next((name for name in QUESTION_FIELDS if name in doc), None)
        if question_field is not None:
            # str() keeps non-string questions (numbers, lists) from breaking truncation.
            question = str(doc[question_field])
            if len(question) > max_question_chars:
                question = question[:max_question_chars] + "..."
            print(f"  ❓ Question: {question}")

    # Show response details for debugging.
    resps = sample.get("resps")
    if resps:
        print(f"  📊 Response scores: {resps}")


def summarize_text_outputs(results_data, per_task=3, max_shown=6):
    """Print a preview of captured model text outputs and report how many exist.

    Reads the top-level "samples" mapping (task name -> list of sample dicts).
    For each non-empty task, up to `per_task` leading samples are printed;
    task iteration stops once at least `max_shown` samples have been printed.
    The closing summary reports the number of samples with a "model_output"
    across ALL tasks, not just the ones displayed.

    Args:
        results_data: Parsed results JSON.
        per_task: Maximum number of leading samples inspected per task.
        max_shown: Stop visiting further tasks once this many samples were shown.

    Returns:
        Tuple ``(shown, total_with_text)``: samples printed, and samples with a
        "model_output" entry over the whole "samples" section.
    """
    if not isinstance(results_data, dict) or "samples" not in results_data:
        print("❌ No 'samples' section found in results.")
        return 0, 0

    print("Found samples section!")
    samples_by_task = results_data["samples"]

    total_with_text = sum(
        1
        for task_samples in samples_by_task.values()
        if isinstance(task_samples, list)
        for sample in task_samples
        if _has_text_output(sample)
    )

    shown = 0
    for task_name, task_samples in samples_by_task.items():
        if not isinstance(task_samples, list) or not task_samples:
            continue
        print(f"\n=== Task: {task_name} ({len(task_samples)} samples) ===")

        for position, sample in enumerate(task_samples[:per_task], start=1):
            if _has_text_output(sample):
                shown += 1
                _print_sample(position, sample)

        if shown >= max_shown:  # Limit to avoid too much output
            print(f"\n... (showing first {shown} samples with text outputs)")
            break

    if total_with_text == 0:
        print("❌ No samples with text outputs found.")
    else:
        print(f"\n🎉 SUCCESS! Total samples with text outputs: {total_with_text}")
        print("\n✅ Model text outputs are now captured in the results file!")
        print("✅ Each sample includes:")
        print("   - model_output: The actual decoded text response from the model")
        print("   - output_matches_target: Boolean indicating if the response matches the target")
        print("   - target: The expected correct answer")
        print("   - doc: The original question/prompt")
        print("\n📊 These can be used for detailed analysis and comparison with target answers.")

    return shown, total_with_text


# `output_path` is created by the evaluation cell. Look it up defensively so that
# running this cell on a fresh kernel (or after only the CLI cell) prints a clear
# message instead of raising NameError.
results_path = globals().get("output_path")

if results_path and os.path.exists(results_path):
    print("🔍 Examining captured model text outputs...")
    print(results_path)

    with open(results_path, "r", encoding="utf-8") as results_file:
        results_data = json.load(results_file)

    sample_count, total_with_text = summarize_text_outputs(results_data)
else:
    print("❌ Results file not found - cannot examine text outputs.")

# Check if the report includes model outputs
if results_path:
    # The report file name is derived from the results file name without its extension.
    basename = os.path.splitext(os.path.basename(results_path))[0]

    reports_dir = get_reports_dir()
    # NOTE(review): the evaluation cell looks for f"{basename}_report.md" instead —
    # confirm which suffix the report generator actually writes.
    report_path = os.path.join(reports_dir, f"{basename}_professional_report.md")

    if os.path.exists(report_path):
        print(f"\n📄 Professional report generated at: {report_path}")

        # Assumes the report is UTF-8 encoded — TODO confirm against the report generator.
        with open(report_path, "r", encoding="utf-8") as report_file:
            report_content = report_file.read()

        # Check if the report contains model output information.
        if "Model Output Analysis" in report_content:
            print("✅ Report includes enhanced Model Output Analysis section!")
        if "model_output" in report_content.lower():
            print("✅ Report includes captured model text outputs!")
        if "Text Output Summary" in report_content:
            print("✅ Report includes text output statistics and visualizations!")
    else:
        print(f"❌ Report not found at: {report_path}")
else:
    print("❌ No output path available. Cannot check for report.")


🔍 Examining captured model text outputs...
/workspace/llm-evaluation/results/results_Ministral-8B-Instruct-2410_hf_leaderboard_bbh_boolean_expressions_l..._20250710_122909.json
Found samples section!

=== Task: leaderboard_mmlu_pro (50 samples) ===

📝 Sample 1:
  🎯 Target Answer: I
  🤖 Model Output: A
  ✅ Correct: False
  ❓ Question: Typical advertising regulatory bodies suggest, for example that adverts must not: encourage _________, cause unnecessary ________ or _____, and must n...
  📊 Response scores: [[[-2.703125, False]], [[-3.953125, False]], [[-3.703125, False]], [[-0.8359375, True]], [[-3.203125, False]], [[-2.453125, False]], [[-2.703125, False]], [[-2.578125, False]], [[-1.7109375, False]]]

📝 Sample 2:
  🎯 Target Answer: F
  🤖 Model Output: A
  ✅ Correct: False
  ❓ Question: Managers are entrusted to run the company in the best interest of ________. Specifically, they have a duty to act for the benefit of the company, as w...
  📊 Response scores: [[[-3.34375, False]], [[-0.