In [16]:
# === IMPORTS ===
import os
import sys
import json
import numpy as np
from typing import List, Dict
from pathlib import Path
from tqdm import tqdm
from collections import defaultdict

sys.path.insert(0, '/home/shougan/projects/aip-fredashi/shougan/balance-budget')
BASE_DIR = Path('/home/shougan/projects/aip-fredashi/shougan/balance-budget')

from tuning.config import OUTPUTS_DIR
from tuning.inference.ifeval_inference import run_inference_ifeval
from instruction_following_eval import evaluation_lib, instructions_registry

In [2]:
# === UTILITY FUNCTIONS ===

def pass_at_k(n: int, c: int, k: int) -> float:
    """Estimate pass@k: probability that at least one of k samples is correct.

    Evaluates ``1 - C(n - c, k) / C(n, k)`` as a running product, so no large
    binomial coefficients are ever formed.

    Args:
        n: Total number of samples available for the prompt.
        c: Number of those samples that are correct.
        k: Number of samples drawn; must not exceed ``n``.

    Returns:
        The pass@k estimate as a plain ``float`` in ``[0, 1]``.

    Raises:
        ValueError: If ``c`` is outside ``[0, n]`` or if ``k > n``. Without this
            check the ``n - c < k`` shortcut below would report 1.0 for any
            ``k > n``, even when no sample was correct.
    """
    if not 0 <= c <= n:
        raise ValueError(f"c must satisfy 0 <= c <= n, got n={n}, c={c}")
    if k > n:
        raise ValueError(f"pass@{k} is undefined with only n={n} samples")
    # Fewer than k incorrect samples: every size-k draw contains a correct one.
    if n - c < k:
        return 1.0
    return float(1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1)))

def compute_pass_at_k_scores(results_per_prompt: List[List[bool]], k_values: List[int]) -> Dict[int, float]:
    """Average the per-prompt pass@k estimate for every requested k.

    A prompt only contributes to a given k when it has at least k samples.
    Any k that received no contribution is left out of the returned mapping.
    """
    per_k = {}
    for k in k_values:
        per_k[k] = []

    for outcomes in results_per_prompt:
        total = len(outcomes)
        correct = sum(outcomes)
        for k in k_values:
            if k > total:
                # Not enough samples for this k; skip the prompt for it.
                continue
            per_k[k].append(pass_at_k(total, correct, k))

    averaged = {}
    for k, estimates in per_k.items():
        if estimates:
            averaged[k] = np.mean(estimates)
    return averaged

def save_responses(results: List[Dict], model_name: str):
    """Write results as JSON Lines to OUTPUTS/<model_name>/responses_multi_sample.jsonl.

    The target directory is created if it does not exist, and an existing file
    is overwritten. Each element of ``results`` becomes one line.
    """
    path = OUTPUTS / model_name
    path.mkdir(parents=True, exist_ok=True)
    # ensure_ascii=False emits raw non-ASCII characters, so the file encoding
    # is pinned to UTF-8 instead of depending on the platform locale.
    with open(path / "responses_multi_sample.jsonl", "w", encoding="utf-8") as f:
        for r in results:
            f.write(json.dumps(r, ensure_ascii=False) + "\n")

def load_responses(model_name: str) -> List[Dict]:
    """Read back the JSON Lines file written by ``save_responses``.

    Returns one parsed object per non-blank line, in file order.
    """
    # Same explicit encoding as the writer; blank lines are skipped so a
    # trailing empty line does not break json.loads.
    with open(OUTPUTS / model_name / "responses_multi_sample.jsonl", encoding="utf-8") as f:
        return [json.loads(line) for line in f if line.strip()]

In [3]:
# === CONFIGURATION ===
# Where multi-sample responses are stored and where the IFEval prompts live.
OUTPUTS = Path(OUTPUTS_DIR).joinpath("pass@k_responses")
IFEVAL_INPUT_PATH = BASE_DIR / "instruction_following_eval" / "data" / "input_data.jsonl"

# Model to sample from and the sampling settings passed to inference.
model_name = "llama3-1B"
n_samples = 4
temperature = 0.7
num_examples = 20  # Set to None for full dataset

# Values of k for which pass@k is reported.
k_values = [1, 2]

# True: run inference and save the responses. False: load saved responses.
run_inference_flag = True

In [4]:
# === STEP 1: LOAD IFEVAL INPUTS ===
# Index every IFEval example by its prompt text so responses can be matched
# back to their instructions later on.
inputs_map = dict(
    (example.prompt, example)
    for example in evaluation_lib.read_prompt_list(str(IFEVAL_INPUT_PATH))
)
print(f"Loaded {len(inputs_map)} IFEval prompts")

Loaded 541 IFEval prompts


In [5]:
# Sanity check: show the first IFEval example, its prompt, and the entry the
# prompt resolves to in inputs_map. The input file is parsed once here rather
# than once per lookup.
first_example = evaluation_lib.read_prompt_list(str(IFEVAL_INPUT_PATH))[0]
print(first_example)
test_prompt = first_example.prompt
print("\n", test_prompt)
print("\n", inputs_map[test_prompt])

InputExample(key=1000, instruction_id_list=['punctuation:no_comma', 'detectable_format:number_highlighted_sections', 'length_constraints:number_words'], prompt='Write a 300+ word summary of the wikipedia page "https://en.wikipedia.org/wiki/Raymond_III,_Count_of_Tripoli". Do not use any commas and highlight at least 3 sections that has titles in markdown format, for example *highlighted section part 1*, *highlighted section part 2*, *highlighted section part 3*.', kwargs=[{}, {'num_highlights': 3}, {'relation': 'at least', 'num_words': 300}])

 Write a 300+ word summary of the wikipedia page "https://en.wikipedia.org/wiki/Raymond_III,_Count_of_Tripoli". Do not use any commas and highlight at least 3 sections that has titles in markdown format, for example *highlighted section part 1*, *highlighted section part 2*, *highlighted section part 3*.

 InputExample(key=1000, instruction_id_list=['punctuation:no_comma', 'detectable_format:number_highlighted_sections', 'length_constraints:number

In [15]:
# === STEP 2: RUN INFERENCE OR LOAD CACHED RESPONSES ===
print(f"Model: {model_name}")

if run_inference_flag:
    print(f"Running inference with n_samples={n_samples}, temperature={temperature}")
    raw_results = run_inference_ifeval(
        model_name=model_name,
        n_samples=n_samples,
        temperature=temperature,
        save_results=False,
        num_examples=num_examples
    )
    # Group responses by prompt for pass@k evaluation.
    # Each raw_results entry is read as {"prompt": ..., "response": ...}, i.e.
    # one record per sampled response; records sharing a prompt are collected
    # into a single list.
    grouped = defaultdict(list)
    for r in raw_results:
        grouped[r["prompt"]].append(r["response"])

    model_results = [{"prompt": p, "responses": resps} for p, resps in grouped.items()]

    # Check the sample count actually obtained per prompt instead of assuming
    # it equals n_samples; pass@k is only meaningful for k <= samples available.
    sample_counts = {len(resps) for resps in grouped.values()}
    if sample_counts == {n_samples}:
        print(f"Generated {len(model_results)} prompts with {n_samples} samples each")
    else:
        print(
            f"WARNING: expected {n_samples} samples per prompt but found counts "
            f"{sorted(sample_counts)} across {len(model_results)} prompts"
        )
    save_responses(model_results, model_name)
    print("Responses saved.")
else:
    print("Loading cached responses...")
    model_results = load_responses(model_name)
    print(f"Loaded {len(model_results)} results")

Model: llama3-1B
Running inference with n_samples=4, temperature=0.7
Loading model from /home/shougan/projects/aip-fredashi/shougan/balance-budget/tuning/models/llama3-1B
Using GPU memory utilization: 0.8
INFO 02-04 22:41:52 [config.py:841] This model supports multiple tasks: {'classify', 'embed', 'reward', 'generate'}. Defaulting to 'generate'.
INFO 02-04 22:41:52 [config.py:3368] Downcasting torch.float32 to torch.bfloat16.


INFO 02-04 22:41:52 [config.py:1472] Using max model len 131072
INFO 02-04 22:41:52 [config.py:2285] Chunked prefill is enabled with max_num_batched_tokens=16384.
INFO 02-04 22:42:00 [__init__.py:244] Automatically detected platform cuda.
INFO 02-04 22:42:02 [core.py:526] Waiting for init message from front-end.
INFO 02-04 22:42:02 [core.py:69] Initializing a V1 LLM engine (v0.9.2) with config: model='/home/shougan/projects/aip-fredashi/shougan/balance-budget/tuning/models/llama3-1B', speculative_config=None, tokenizer='/home/shougan/projects/aip-fredashi/shougan/balance-budget/tuning/models/llama3-1B', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config={}, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=131072, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto,  device_config=

Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]
Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00,  1.25it/s]
Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00,  1.25it/s]



INFO 02-04 22:42:04 [default_loader.py:272] Loading weights took 0.85 seconds
INFO 02-04 22:42:04 [gpu_model_runner.py:1801] Model loading took 2.3205 GiB and 1.042927 seconds
INFO 02-04 22:42:08 [backends.py:508] Using cache directory: /home/shougan/.cache/vllm/torch_compile_cache/ab3adac7a0/rank_0_0/backbone for vLLM's torch.compile
INFO 02-04 22:42:08 [backends.py:519] Dynamo bytecode transform time: 3.91 s
INFO 02-04 22:42:11 [backends.py:155] Directly load the compiled graph(s) for shape None from the cache, took 2.289 s
INFO 02-04 22:42:11 [monitor.py:34] torch.compile takes 3.91 s in total
INFO 02-04 22:42:12 [gpu_worker.py:232] Available KV cache memory: 56.11 GiB
INFO 02-04 22:42:12 [kv_cache_utils.py:716] GPU KV cache size: 1,838,496 tokens
INFO 02-04 22:42:12 [kv_cache_utils.py:720] Maximum concurrency for 131,072 tokens per request: 14.03x


Capturing CUDA graph shapes: 100%|██████████| 67/67 [00:16<00:00,  4.15it/s]


INFO 02-04 22:42:29 [gpu_model_runner.py:2326] Graph capturing finished in 16 secs, took 0.43 GiB
INFO 02-04 22:42:29 [core.py:172] init engine (profile, create kv cache, warmup model) took 24.79 seconds


Adding requests:   0%|          | 0/20 [00:00<?, ?it/s]

Processed prompts:   0%|                                                                                      …

Generated 20 responses using vllm




Generated 20 prompts with 4 samples each
Responses saved.


In [7]:
# === STEP 4: EVALUATE SINGLE RESPONSE USING PRE-BUILT FUNCTIONS ===
def evaluate_single_response(inp: evaluation_lib.InputExample, response: str, strict: bool = True) -> bool:
    """Check one response against the instructions attached to ``inp``.

    Uses ``evaluation_lib.test_instruction_following_strict`` when ``strict``
    is true and ``test_instruction_following_loose`` otherwise, and returns
    the ``follow_all_instructions`` field of the result.
    """
    checker = (
        evaluation_lib.test_instruction_following_strict
        if strict
        else evaluation_lib.test_instruction_following_loose
    )
    # The checkers look the response up by prompt, so pass a one-entry mapping.
    outcome = checker(inp, {inp.prompt: response})
    return outcome.follow_all_instructions

In [8]:
# Smoke test: evaluate the first sampled response of the first prompt with
# the default (strict) setting.
first_item = model_results[0]
test_response = first_item["responses"][0]
test_eval_input = inputs_map[first_item["prompt"]]
evaluate_single_response(test_eval_input, test_response)

False

In [9]:
# === STEP 5: COMPUTE PASS@K SCORES ===

# NOTE(review): this replaces the model_name chosen in the configuration cell
# and reloads that model's saved responses, so the scores below describe this
# model rather than the one sampled in Step 2 — confirm this is intended.
model_name = "llama3-8B_sft-tuluif-10000"
model_results = load_responses(model_name)
final_results_strict = defaultdict(list)
final_results_loose = defaultdict(list)
# === PIPELINE ===
# results_per_prompt maps each prompt to its per-sample verdicts and its
# pass@k values; final_results_* collect the per-prompt pass@k under "k<k>".
results_per_prompt = {}
for item in tqdm(model_results, desc="Evaluating responses"):
    prompt = item["prompt"]
    responses = item["responses"]
    eval_input = inputs_map[prompt]
    strict_results = [evaluate_single_response(eval_input, r, strict=True) for r in responses]
    loose_results = [evaluate_single_response(eval_input, r, strict=False) for r in responses]

    # Compute each pass@k once per mode and reuse it everywhere below.
    strict_pass = {k: pass_at_k(len(strict_results), sum(strict_results), k) for k in k_values}
    loose_pass = {k: pass_at_k(len(loose_results), sum(loose_results), k) for k in k_values}

    results_per_prompt[prompt] = {
        "strict": strict_results,
        "loose": loose_results,
        **{f"strict_k{k}": strict_pass[k] for k in k_values},
        **{f"loose_k{k}": loose_pass[k] for k in k_values},
        "strict_k": [strict_pass[k] for k in k_values],
        "loose_k": [loose_pass[k] for k in k_values],
    }
    for i in k_values:
        final_results_strict[f"k{i}"].append(strict_pass[i])
        final_results_loose[f"k{i}"].append(loose_pass[i])

Evaluating responses: 100%|█| 541/541 [00:53<00:00, 10.18it/s


In [None]:

# Mean strict pass@k over all prompts, one value per entry of k_values.
print(list(np.mean(final_results_strict[f"k{k}"]) for k in k_values))


[0.5973313308687616, 0.7070350321274537, 0.7868362086127725, 0.8413829714691288, 0.8791300770301045, 0.9065080487227927, 0.9260628465804066]


In [28]:
# Average loose pass@k over all evaluated prompts for the largest configured k.
# The key and the denominator are derived from the data instead of being
# hard-coded (previously 'loose_k64' and 541), so the cell still works when
# k_values or the number of evaluated prompts changes.
assert results_per_prompt, "results_per_prompt is empty; run the Step 5 cell first"
report_k = max(k_values)
my_sum = sum(result[f"loose_k{report_k}"] for result in results_per_prompt.values())
print(my_sum / len(results_per_prompt))

0.9574861367837338


In [None]:
# === FINAL RESULTS ===
# strict_scores / loose_scores were printed without ever being assigned.
# Build them here from the per-sample verdicts stored in results_per_prompt.
# compute_pass_at_k_scores averages over prompts and only counts a prompt for
# a given k when it has at least k samples.
strict_scores = {
    k: float(score)
    for k, score in compute_pass_at_k_scores(
        [res["strict"] for res in results_per_prompt.values()], k_values
    ).items()
}
loose_scores = {
    k: float(score)
    for k, score in compute_pass_at_k_scores(
        [res["loose"] for res in results_per_prompt.values()], k_values
    ).items()
}

print(f"\n{'='*50}")
print(f"RESULTS: {model_name}")
print(f"{'='*50}")
print(f"Strict pass@k: {strict_scores}")
print(f"Loose pass@k:  {loose_scores}")