# HumanEval: Code Generation Strategies

This notebook compares different sampling strategies on HumanEval, focusing on **pass@k** metrics.

**Key aspects:**
- Code generation is different from math: correctness is binary and checked by execution (a sample either passes the tests or it does not)
- pass@k is the standard metric: probability of at least one correct solution in k samples
- Temperature and diversity matter for exploration

**Strategies compared:**
- Greedy (pass@1 baseline)
- Temperature sampling (pass@1, pass@5, pass@10)
- Various temperatures (0.2 to 1.0) and top_p values (0.9, 0.95)
- Nucleus sampling configurations
- Diverse sampling that mixes several temperatures

In [None]:
import sys
sys.path.insert(0, '..')

import json
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from pathlib import Path
from datetime import datetime

from src.models import VLLMModel
from src.datasets import HumanEvalDataset
from src.samplers import GreedySampler, StandardSampler, NucleusSampler, DiverseSampler
from src.evaluators import PassAtKEvaluator, AccuracyEvaluator
from src.runners import run_evaluation
from src.utils import save_results

# Global plot styling applied to every figure in this notebook.
# NOTE: the 'seaborn-v0_8-*' style names exist from matplotlib 3.6 onwards; older
# releases call this style 'seaborn-whitegrid'.
plt.style.use('seaborn-v0_8-whitegrid')
# Default figure size (width, height); cells that pass figsize= explicitly override it.
plt.rcParams['figure.figsize'] = (12, 6)

## 1. Setup

In [None]:
# Model identifier handed to VLLMModel (looks like a Hugging Face repo id - confirm).
MODEL_NAME = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
# Cap on the number of problems evaluated; used as limit= for get_problems and
# max_problems= for run_evaluation.
MAX_PROBLEMS = 50  # HumanEval has 164 problems
# Destination for saved runs, figures and the summary CSV (relative to the notebook's cwd).
OUTPUT_DIR = "../results/humaneval_comparison"

# Create the output directory (and any missing parents); a no-op if it already exists.
Path(OUTPUT_DIR).mkdir(parents=True, exist_ok=True)

In [None]:
# Load the model once; the experiment loop later passes this same instance to every run.
print("Loading model...")
model = VLLMModel(
    model_name=MODEL_NAME,
    # NOTE(review): presumably the fraction of GPU memory the engine may claim - confirm in src.models.
    gpu_memory_utilization=0.9,
)
print(f"Model loaded: {model.name}")

In [None]:
# Build the HumanEval dataset wrapper.
# NOTE(review): timeout=5.0 is presumably the execution time limit (seconds) applied when
# running generated code against the tests - confirm in src.datasets.
dataset = HumanEvalDataset(timeout=5.0)
print(f"Dataset: {dataset.name}, {len(dataset)} problems")

# Select at most MAX_PROBLEMS problems for this notebook run.
problems = dataset.get_problems(limit=MAX_PROBLEMS)
print(f"Using {len(problems)} problems")

# Preview the first prompt. Guard against an empty selection (e.g. MAX_PROBLEMS = 0)
# so the setup cell reports the problem instead of failing with an IndexError.
if len(problems) > 0:
    print("\nExample problem:")
    print(problems[0].prompt)
else:
    print("\nNo problems loaded - check MAX_PROBLEMS and the dataset source.")

## 2. Define Experiments

For code generation, we focus on pass@k with different sampling configurations.

In [None]:
# Every sampled configuration draws N_SAMPLES completions per problem so that
# pass@1, pass@5 and pass@10 can all be computed from the same run.
N_SAMPLES = 20


def _pass_at_k_experiment(sampler, description):
    """Bundle a sampler with a fresh pass@{1,5,10} evaluator and N_SAMPLES draws.

    Args:
        sampler: An already-constructed sampler instance.
        description: Human-readable label shown in the comparison table.

    Returns:
        dict with the keys "sampler", "evaluator", "n_samples" and "description".
    """
    return {
        "sampler": sampler,
        "evaluator": PassAtKEvaluator(dataset, k_values=[1, 5, 10]),
        "n_samples": N_SAMPLES,
        "description": description,
    }


experiments = {
    # Baseline: a single greedy completion, scored with plain accuracy.
    "greedy": {
        "sampler": GreedySampler(max_tokens=512),
        "evaluator": AccuracyEvaluator(dataset),
        "n_samples": 1,
        "description": "Greedy (pass@1 only)",
    },

    # Temperature sweep at fixed top_p=0.95 (0.2 = low diversity ... 1.0 = high diversity).
    "temp_0.2": _pass_at_k_experiment(
        StandardSampler(temperature=0.2, top_p=0.95, max_tokens=512),
        "Temperature 0.2",
    ),
    "temp_0.4": _pass_at_k_experiment(
        StandardSampler(temperature=0.4, top_p=0.95, max_tokens=512),
        "Temperature 0.4",
    ),
    "temp_0.6": _pass_at_k_experiment(
        StandardSampler(temperature=0.6, top_p=0.95, max_tokens=512),
        "Temperature 0.6",
    ),
    # NOTE(review): the Codex paper reports 0.8 as the best temperature for large k
    # (pass@100) and 0.2 for pass@1 - confirm that "Codex default" is the intended label.
    "temp_0.8": _pass_at_k_experiment(
        StandardSampler(temperature=0.8, top_p=0.95, max_tokens=512),
        "Temperature 0.8 (Codex default)",
    ),
    "temp_1.0": _pass_at_k_experiment(
        StandardSampler(temperature=1.0, top_p=0.95, max_tokens=512),
        "Temperature 1.0",
    ),

    # Nucleus sampling at temperature 0.8 with two top_p cut-offs.
    "nucleus_0.9": _pass_at_k_experiment(
        NucleusSampler(temperature=0.8, top_p=0.9, max_tokens=512),
        "Nucleus p=0.9, temp=0.8",
    ),
    "nucleus_0.95": _pass_at_k_experiment(
        NucleusSampler(temperature=0.8, top_p=0.95, max_tokens=512),
        "Nucleus p=0.95, temp=0.8",
    ),

    # One sampler configured with a list of temperatures.
    "diverse": _pass_at_k_experiment(
        DiverseSampler(
            temperatures=[0.2, 0.4, 0.6, 0.8, 1.0],
            top_p=0.95,
            max_tokens=512,
        ),
        "Diverse temperatures",
    ),
}

print(f"Defined {len(experiments)} experiments")

## 3. Run Experiments

In [None]:
# Output of every experiment, keyed by experiment name. Re-running this cell starts
# from an empty dict, so stale results never leak into the comparison below.
all_results = {}
banner = "=" * 60

for exp_name, exp_config in experiments.items():
    n_samples = exp_config["n_samples"]

    print(f"\n{banner}")
    print(f"Running: {exp_name}")
    print(banner)

    # Single-sample runs use a batch of 8 problems; multi-sample runs use 2.
    batch_size = 8 if n_samples <= 1 else 2

    # Only the evaluation itself is timed; saving results is excluded.
    start_time = datetime.now()
    results, metrics, responses, scores = run_evaluation(
        model=model,
        sampler=exp_config["sampler"],
        dataset=dataset,
        evaluator=exp_config["evaluator"],
        batch_size=batch_size,
        n_samples=n_samples,
        max_problems=MAX_PROBLEMS,
        verbose=True,
    )
    elapsed = (datetime.now() - start_time).total_seconds()

    # Persist this run under OUTPUT_DIR before moving on to the next experiment.
    run_dir = save_results(
        output_dir=OUTPUT_DIR,
        run_name=exp_name,
        results=results,
        metrics=metrics,
        config={"experiment": exp_name, "n_samples": n_samples},
        responses=responses,
        scores=scores,
    )

    # Keep what the analysis cells below need in memory.
    all_results[exp_name] = {
        "metrics": metrics,
        "results": results,
        "elapsed": elapsed,
        "n_samples": n_samples,
    }

    print(f"\nCompleted in {elapsed:.1f}s")
    print(f"Metrics: {metrics.metrics}")

## 4. Compare pass@k Results

In [None]:
# Build one table row per experiment: run metadata followed by pass@1 / pass@5 / pass@10.
comparison_data = []
for exp_name, data in all_results.items():
    metrics = data["metrics"]
    metric_values = metrics.metrics

    if "pass@1" in metric_values:
        pass_1 = metric_values["pass@1"]
        pass_5 = metric_values.get("pass@5", np.nan)
        pass_10 = metric_values.get("pass@10", np.nan)
    else:
        # Runs without a pass@1 entry (greedy) report plain accuracy, shown as pass@1.
        pass_1, pass_5, pass_10 = metrics.accuracy, np.nan, np.nan

    comparison_data.append({
        "Experiment": exp_name,
        "Description": experiments[exp_name]["description"],
        "N Samples": data["n_samples"],
        "Time (s)": data["elapsed"],
        "pass@1": pass_1,
        "pass@5": pass_5,
        "pass@10": pass_10,
    })

# Best pass@1 first; the bare last expression renders the table in the notebook.
df = pd.DataFrame(comparison_data).sort_values("pass@1", ascending=False)
df

In [None]:
# Grouped bar chart: pass@1 / pass@5 / pass@10 side by side for each sampled experiment.
fig, ax = plt.subplots(figsize=(12, 6))

# Keep only experiments that drew more than one sample per problem.
df_multi = df[df["N Samples"] > 1].copy()

x = np.arange(len(df_multi))
width = 0.25

# (horizontal offset, column, colour) for each of the three bars in a group.
bar_specs = [
    (-width, "pass@1", '#3498db'),
    (0, "pass@5", '#2ecc71'),
    (width, "pass@10", '#e74c3c'),
]
bar_groups = [
    ax.bar(x + offset, df_multi[column], width, label=column, color=color)
    for offset, column, color in bar_specs
]

ax.set(
    xlabel='Sampling Strategy',
    ylabel='pass@k',
    title='HumanEval: pass@k by Sampling Strategy',
)
ax.set_xticks(x)
ax.set_xticklabels(df_multi["Experiment"], rotation=45, ha='right')
ax.legend()
ax.set_ylim(0, 1)

# Write each bar's value just above it; bars with a NaN height get no label.
for bar in (b for group in bar_groups for b in group):
    height = bar.get_height()
    if np.isnan(height):
        continue
    ax.annotate(
        f'{height:.2f}',
        xy=(bar.get_x() + bar.get_width() / 2, height),
        xytext=(0, 3),
        textcoords="offset points",
        ha='center', va='bottom', fontsize=8,
    )

plt.tight_layout()
plt.savefig(f"{OUTPUT_DIR}/pass_at_k_comparison.png", dpi=150, bbox_inches='tight')
plt.show()

In [None]:
# Temperature vs pass@k curve
temp_experiments = ["temp_0.2", "temp_0.4", "temp_0.6", "temp_0.8", "temp_1.0"]
temps = [0.2, 0.4, 0.6, 0.8, 1.0]

# Keep each temperature paired with its experiment while filtering to the runs that
# actually exist. Slicing `temps` by the number of available runs would silently shift
# every point onto the wrong temperature whenever a run other than the last is missing.
available_runs = [
    (temp, exp) for temp, exp in zip(temps, temp_experiments) if exp in all_results
]
plot_temps = [temp for temp, exp in available_runs]

# A metric that a run did not report is plotted as 0.
pass_at_1 = [all_results[exp]["metrics"].metrics.get("pass@1", 0) for temp, exp in available_runs]
pass_at_5 = [all_results[exp]["metrics"].metrics.get("pass@5", 0) for temp, exp in available_runs]
pass_at_10 = [all_results[exp]["metrics"].metrics.get("pass@10", 0) for temp, exp in available_runs]

fig, ax = plt.subplots(figsize=(10, 6))

ax.plot(plot_temps, pass_at_1, 'o-', label='pass@1', markersize=10, linewidth=2)
ax.plot(plot_temps, pass_at_5, 's-', label='pass@5', markersize=10, linewidth=2)
ax.plot(plot_temps, pass_at_10, '^-', label='pass@10', markersize=10, linewidth=2)

ax.set_xlabel('Temperature', fontsize=12)
ax.set_ylabel('pass@k', fontsize=12)
ax.set_title('Temperature vs pass@k', fontsize=14)
ax.legend(fontsize=11)
ax.grid(True, alpha=0.3)
ax.set_ylim(0, 1)

plt.tight_layout()
plt.savefig(f"{OUTPUT_DIR}/temperature_curve.png", dpi=150, bbox_inches='tight')
plt.show()

# Find optimal temperature (index into the filtered list, so it names the right run)
if pass_at_1:
    best_temp_idx = np.argmax(pass_at_1)
    print(f"\nOptimal temperature for pass@1: {plot_temps[best_temp_idx]} (pass@1 = {pass_at_1[best_temp_idx]:.4f})")

## 5. Analyze Sample Diversity (Correct Samples per Problem)

In [None]:
# Count how many of the sampled solutions are flagged correct for each problem
def count_unique_solutions(results):
    """Return the number of correct samples for each problem.

    Despite the name, no de-duplication is performed: each entry is simply the sum of
    ``metadata["correct_flags"]`` for one result.

    Args:
        results: Iterable of objects exposing a ``metadata`` attribute.

    Returns:
        list: One count per result that carries ``"correct_flags"``. Results whose
        ``metadata`` is empty/None or lacks that key are skipped, so the list can be
        shorter than ``results``.
    """
    return [
        sum(item.metadata["correct_flags"])
        for item in results
        if item.metadata and "correct_flags" in item.metadata
    ]

# Print correct-sample statistics for a low, a medium and a high temperature run.
for exp_name in ("temp_0.2", "temp_0.6", "temp_1.0"):
    if exp_name in all_results:
        results = all_results[exp_name]["results"]
        unique_counts = count_unique_solutions(results)

        # Nothing to report when no result carried per-sample correctness flags.
        if not unique_counts:
            continue

        n_unsolved = unique_counts.count(0)
        n_solved = sum(c >= 1 for c in unique_counts)

        print(f"\n{exp_name}:")
        print(f"  Mean correct samples per problem: {np.mean(unique_counts):.2f}")
        print(f"  Max correct samples: {max(unique_counts)}")
        print(f"  Problems with 0 correct: {n_unsolved}")
        print(f"  Problems with 1+ correct: {n_solved}")

In [None]:
# Histogram of correct samples per problem, one panel per temperature.
hist_experiments = ["temp_0.2", "temp_0.6", "temp_1.0"]
fig, axes = plt.subplots(1, len(hist_experiments), figsize=(15, 4))

for ax, exp_name in zip(axes, hist_experiments):
    # A panel stays empty when its experiment was not run or has no correctness flags.
    if exp_name in all_results:
        results = all_results[exp_name]["results"]
        unique_counts = count_unique_solutions(results)

        if unique_counts:
            # One left-aligned integer bin for every possible count 0..N_SAMPLES.
            ax.hist(unique_counts, bins=range(N_SAMPLES + 2), align='left', rwidth=0.8)
            ax.set(
                xlabel='Number of Correct Samples',
                ylabel='Number of Problems',
                title=f'{exp_name}\nMean: {np.mean(unique_counts):.2f}',
            )

plt.tight_layout()
plt.savefig(f"{OUTPUT_DIR}/sample_diversity.png", dpi=150, bbox_inches='tight')
plt.show()

## 6. Summary

In [None]:
rule = "=" * 80
print("\n" + rule)
print("SUMMARY: HumanEval Code Generation")
print(rule)

# Text version of the comparison table: metrics to 4 decimals, "-" where a metric is missing.
summary_df = df[["Experiment", "pass@1", "pass@5", "pass@10", "N Samples"]].copy()
for col in ["pass@1", "pass@5", "pass@10"]:
    summary_df[col] = summary_df[col].apply(
        lambda value: "-" if np.isnan(value) else f"{value:.4f}"
    )

print(summary_df.to_string(index=False))

# Headline numbers
print("\n\nKey Findings:")
if "greedy" in all_results:
    greedy_pass1 = all_results["greedy"]["metrics"].accuracy
else:
    greedy_pass1 = "N/A"
print(f"  1. Greedy pass@1: {greedy_pass1}")

# Best multi-sample experiment for pass@1 and, when available, pass@10.
df_multi = df[df["N Samples"] > 1]
if not df_multi.empty:
    best_pass1 = df_multi.loc[df_multi["pass@1"].idxmax()]
    print(f"  2. Best pass@1: {best_pass1['Experiment']} ({best_pass1['pass@1']:.4f})")

    has_pass10 = "pass@10" in df_multi.columns and df_multi["pass@10"].notna().any()
    if has_pass10:
        best_pass10 = df_multi.loc[df_multi["pass@10"].idxmax()]
        print(f"  3. Best pass@10: {best_pass10['Experiment']} ({best_pass10['pass@10']:.4f})")

# Persist the full (unformatted) comparison table next to the figures.
df.to_csv(f"{OUTPUT_DIR}/summary.csv", index=False)
print(f"\nResults saved to {OUTPUT_DIR}/")