In [2]:
import json, torch
import numpy as np
import torch.nn.functional as F
from scipy.stats import pearsonr
from transformers import AutoTokenizer, AutoModel
import os
import matplotlib.pyplot as plt
PRM_MODEL="Qwen/Qwen2.5-Math-PRM-7B"


tokenizer=AutoTokenizer.from_pretrained(PRM_MODEL,trust_remote_code=True)
model=AutoModel.from_pretrained(
    PRM_MODEL,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    trust_remote_code=True
).eval()

STEP_TOKEN = tokenizer.encode("<extra_0>")[0]

def avg_step_length(steps):
    return np.mean([len(s.split()) for s in steps]) if len(steps)>0 else 0


# ---------------- Step Scoring ----------------
def prm_step_scores(logits, input_ids):
    probs = F.softmax(logits, dim=-1)
    idx   = (input_ids==STEP_TOKEN).nonzero(as_tuple=True)[1]
    return [ probs[0, idx[i]:idx[i+1], 1].mean().item()
             for i in range(len(idx)-1) ]


def eval_cot_prm(system, query, steps):
    text = "<extra_0>".join(steps) + "<extra_0>"

    msgs = [
        {"role":"system", "content":system},
        {"role":"user",   "content":query},
        {"role":"assistant", "content":text}
    ]

    conv = tokenizer.apply_chat_template(msgs, tokenize=False, add_generation_prompt=False)
    ids  = tokenizer.encode(conv, return_tensors="pt").to(model.device)

    with torch.no_grad():
        out = model(input_ids=ids, use_cache=False)

    return prm_step_scores(out[0], ids)

def analyze_step_errors(scores, thr=0.5):
    """
    Returns:
        earliest_err  : index of first low-confidence step  (1-based)
        prefix_len    : how many steps are consecutively correct from the start
    """
    n = len(scores)

    # ---------- earliest error step (first score<thr) ----------
    err_idx = next((i+1 for i,s in enumerate(scores) if s < thr), None)

    # ---------- prefix correctness ----------
    prefix_len = 0
    for s in scores:
        if s > thr: prefix_len += 1
        else: break

    return err_idx, prefix_len
    

def run_dataset(jsonl, thr=0.5, label="exact_match"):
    data=[json.loads(l) for l in open(jsonl)]

    F_full=[];F_no_last=[];F_hard=[];F_hard_no_last=[];
    EARLY=[]; PREFIX=[]; AVG_LEN=[]; STEPS=[];
    Y=[]

    for i,d in enumerate(data):
        if d.get("filter") == "strict-match": continue

        cot = d["resps"][0][0].strip()
        steps = [s.strip() for s in cot.split("\n") if s.strip()] \
                if "\n" in cot else \
                [s.strip() for s in cot.split(".") if s.strip()]

        if len(steps)==0: continue  # skip broken samples
        scores = eval_cot_prm("", d["arguments"]["gen_args_0"]["arg_0"], steps)

        Fi_full = sum(scores)/len(scores)
        Fi_no_last = sum(scores[:-1])/len(scores[:-1]) if len(scores)>1 else Fi_full
        Fi_hard = sum(s>thr for s in scores)/len(scores)
        Fi_hard_no_last = sum(s>thr for s in scores[:-1])/(len(scores)-1) if len(scores)>1 else Fi_hard

        earliest_err, prefix_len = analyze_step_errors(scores, thr)
        avg_len = avg_step_length(steps)   # ‚≠ê new

        yi = int(d.get(label,0))

        print(f"[{i}] steps={len(scores)} | avg_step_len={avg_len:.1f} | scores={scores} | len(steps)={len(steps)}" )
        print(f"  Full={Fi_full:.3f} NoLast={Fi_no_last:.3f} | Hard={Fi_hard:.3f} HardNL={Fi_hard_no_last:.3f}")
        print(f"  üî∏EarliestError={earliest_err} üî∏PrefixOK={prefix_len} üî∏AvgLen={avg_len} | Y={yi}\n")

        F_full.append(Fi_full);F_no_last.append(Fi_no_last)
        F_hard.append(Fi_hard);F_hard_no_last.append(Fi_hard_no_last)
        EARLY.append(earliest_err);PREFIX.append(prefix_len);AVG_LEN.append(avg_len)
        STEPS.append(len(steps))
        Y.append(yi)

    # safe corr
    def c(a): return pearsonr(a,Y)[0] if len(a)>1 else 0

    print("\n‚îÅ‚îÅ‚îÅ‚îÅ Aggregate ‚îÅ‚îÅ‚îÅ‚îÅ")
    print(f"Step length mean = {np.mean(AVG_LEN):.2f}")
    print("‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ\n")

    return F_full, F_no_last, F_hard, F_hard_no_last, EARLY, PREFIX, AVG_LEN, STEPS, Y





A new version of the following files was downloaded from https://huggingface.co/Qwen/Qwen2.5-Math-PRM-7B:
- configuration_qwen2_rm.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/Qwen/Qwen2.5-Math-PRM-7B:
- modeling_qwen2_rm.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
Loading checkpoint shards: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4/4 [00:04<00:00,  1.15s/it]
Some weights of the model checkpoint at Qwen/Qwen2.5-Math-PRM-7B were not used when initializing Qwen2ForProcessRewardModel: ['lm_head.weight']
- This IS expected if you are initializing Qwen2ForProcessRewardModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassifica

In [None]:
import glob, json, os, numpy as np
# ===================== CONFIG =====================
BASE_DIR = "/home/youyang7/projects/lm-evaluation-harness/lm_eval/models/eval_grid_11_18/triviaqa_cot_gsm8k_cot_zeroshot"

L_values   = ["L8", "L16", "L24"]
lam_values = ["BASELINE", "lam0p5","lam1p0","lam1p5","lam2p0"]

SAVE_PATH = "prm_results_live_dump.json"   # ‰Ω†ÂèØ‰ª•Êç¢Ë∑ØÂæÑ
# ==================================================

results = {}
for L in L_values:
    results.setdefault(L, {})

    for lam in lam_values:
        folder = f"{BASE_DIR}/Llama-3.1-8B-Instruct_{L}_{lam}/meta-llama__Llama-3.1-8B-Instruct/"
        pattern = os.path.join(folder, "samples_gsm8k_cot_zeroshot_*.jsonl")

        files = sorted(glob.glob(pattern))  # <-- Only matching correct jsonl names
        
        if len(files)==0:
            print(f"‚ö† No jsonl found for {L}-{lam}")
            continue
        
        # ‚≠ê Pick newest file
        jsonl = files[-1]
        print(f"üöÄ Running {L}-{lam}  |  using latest JSONL:\n ‚Üí {jsonl}\n")

        F_full, F_no_last, F_hard, F_hard_no_last, EARLY, PREFIX, AVG_LEN, STEPS, Y = run_dataset(jsonl)
        EARLY_clean = [e if e is not None else (max([x for x in EARLY if x is not None])+1) 
               for e in EARLY]
        results[L][lam] = {
            "corr_full"       : float(pearsonr(F_full, Y)[0]),
            "corr_hard"       : float(pearsonr(F_hard, Y)[0]),
            "corr_avg_prefix" : float(pearsonr(PREFIX, Y)[0]),
            "corr_avg_steps"  : float(pearsonr(STEPS, Y)[0]),
            "corr_avg_first_error"      : float(pearsonr(EARLY_clean, Y)[0]),
            "avg_prefix"      : float(np.mean(PREFIX)),
            "avg_first_error" : float(np.mean([e for e in EARLY if e is not None])),
            "avg_steps"       : float(np.mean(STEPS)),
            "file_used"       : jsonl,  # ‚≠ê Save for traceability
            "Y"               : Y
        }

        # ÂÆûÊó∂ÂÜôÁõòÔºàÈÅøÂÖç‰∏≠Êñ≠ÊçüÂ§±ÁªìÊûúÔºâ
        with open(SAVE_PATH, "w") as f: json.dump(results, f, indent=4)
        print(f"üíæ Saved to ‚Üí {SAVE_PATH}\n")

# ===================== Plotting =====================
import matplotlib.pyplot as plt

# ==== Create output dir ====
fig_dir = "figures"
os.makedirs(fig_dir, exist_ok=True)
print(f"üìÅ Figures will be saved to: {fig_dir}/")

# ==== x-axis tick labels ====
lam_plot_labels = {
    "BASELINE":"0.0","lam0p5":"0.5","lam1p0":"1.0","lam1p5":"1.5","lam2p0":"2.0"
}
x = [lam_plot_labels[l] for l in lam_values]


def plot_metric(metric_key, title, ylabel, fname):
    plt.figure(figsize=(7,5))

    for L in results.keys():
        y = [results[L][lam][metric_key] if lam in results[L] else None
             for lam in lam_values]
        plt.plot(x, y, marker="o", label=L)

    plt.title(title, fontsize=13)
    plt.xlabel("Lambda", fontsize=12)
    plt.ylabel(ylabel, fontsize=12)
    plt.legend(title="Training Layers (L)")
    plt.grid(alpha=0.35)

    save_path = f"{fig_dir}/{fname}.png"
    plt.savefig(save_path, dpi=300, bbox_inches="tight")
    plt.close()
    print(f"üìå Saved: {save_path}")


# ==== Generate All Figures ====

plot_metric("corr_full",       "Correlation (Full Step Mean)",      "Pearson r", "corr_full")
plot_metric("corr_hard",       "Correlation (Hard Step Score)",     "Pearson r", "corr_hard")
    
plot_metric("corr_avg_prefix", "Correlation (Avg Prefix Correctness Length)", "Pearson r", "corr_avg_prefix")
plot_metric("corr_avg_first_error", "Correlation (Avg First Error Position)", "Pearson r", "corr_avg_first_error")
plot_metric("corr_avg_steps", "Correlation (Mean Total Steps per Solution)", "Pearson r", "corr_avg_steps")

plot_metric("avg_prefix",      "Avg Prefix Correctness Length",     "Prefix Length", "avg_prefix")
plot_metric("avg_first_error", "Avg First Error Position",          "First Error Step", "avg_first_error")
plot_metric("avg_steps",       "Mean Total Steps per Solution",     "Step Count", "avg_step_count")

print("\nüéâ All plots completed & saved!")
# Ê†πÊçÆËøô‰∏™resultsËÆ°ÁÆó

üöÄ Running L8-BASELINE  |  using latest JSONL:
 ‚Üí /home/youyang7/projects/lm-evaluation-harness/lm_eval/models/eval_grid_11_18/triviaqa_cot_gsm8k_cot_zeroshot/Mistral-7B-Instruct-v0.3_L8_BASELINE/mistralai__Mistral-7B-Instruct-v0.3/samples_gsm8k_cot_zeroshot_2025-11-17T18-05-58.820230.jsonl

--- Sample 66 with ["1. Janet's ducks lay 16 eggs per day.", '2. She eats 3 eggs for breakfast, so she has 16 - 3 = 13 eggs left.', '3. She uses 4 eggs to bake muffins, leaving her with 13 - 4 = 9 eggs.', "4. She sells the remaining eggs at the farmers' market for $2 per egg.", "5. So, she makes 9 * $2 = $18 per day at the farmers' market.", "Final answer: Janet makes $18 per day at the farmers' market."] steps ---
[66] steps=5 | avg_step_len=13.7 | scores=[0.66015625, 0.68359375, 0.68359375, 0.72265625, 0.73828125] | len(steps)=6
  Full=0.698 NoLast=0.688 | Hard=1.000 HardNL=1.000
  üî∏EarliestError=None üî∏PrefixOK=5 üî∏AvgLen=13.666666666666666 | Y=1

--- Sample 67 with ['The robe takes 2

KeyError: 'corr_no_last'

<Figure size 700x500 with 0 Axes>

In [22]:


# ==== Generate All Figures ====

plot_metric("corr_full",       "Correlation (Full Step Mean)",      "Pearson r", "corr_full")
plot_metric("corr_hard",       "Correlation (Hard Step Score)",     "Pearson r", "corr_hard")
    
plot_metric("corr_avg_prefix", "Correlation (Avg Prefix Correctness Length)", "Pearson r", "corr_avg_prefix")
plot_metric("corr_avg_first_error", "Correlation (Avg First Error Position)", "Pearson r", "corr_avg_first_error")
plot_metric("corr_avg_steps", "Correlation (Mean Total Steps per Solution)", "Pearson r", "corr_avg_steps")

plot_metric("avg_prefix",      "Avg Prefix Correctness Length",     "Prefix Length", "avg_prefix")
plot_metric("avg_first_error", "Avg First Error Position",          "First Error Step", "avg_first_error")
plot_metric("avg_steps",       "Mean Total Steps per Solution",     "Step Count", "avg_step_count")

print("\nüéâ All plots completed & saved!")
# Ê†πÊçÆËøô‰∏™resultsËÆ°ÁÆó

üìå Saved: figures/corr_full.png
üìå Saved: figures/corr_hard.png
üìå Saved: figures/corr_avg_prefix.png
üìå Saved: figures/corr_avg_first_error.png
üìå Saved: figures/corr_avg_steps.png
üìå Saved: figures/avg_prefix.png
üìå Saved: figures/avg_first_error.png
üìå Saved: figures/avg_step_count.png

üéâ All plots completed & saved!


In [24]:
import glob, json, os, numpy as np
# ===================== CONFIG =====================
BASE_DIR = "/home/youyang7/projects/lm-evaluation-harness/lm_eval/models/eval_grid_11_18/triviaqa_cot_gsm8k_cot_zeroshot"

L_values   = ["L8", "L16", "L24"]
lam_values = ["BASELINE", "lam0p5","lam1p0","lam1p5","lam2p0"]

SAVE_PATH = "prm_results_live_dump.json"   # ‰Ω†ÂèØ‰ª•Êç¢Ë∑ØÂæÑ
# ==================================================

results = {}
for L in L_values:
    results.setdefault(L, {})

    for lam in lam_values:
        # Qwen2.5-7B-Instruct_L8_lam2p0
        folder = f"{BASE_DIR}/Qwen2.5-7B-Instruct_{L}_{lam}/Qwen__Qwen2.5-7B-Instruct"
        pattern = os.path.join(folder, "samples_gsm8k_cot_zeroshot_*.jsonl")

        files = sorted(glob.glob(pattern))  # <-- Only matching correct jsonl names
        
        if len(files)==0:
            print(f"‚ö† No jsonl found for {L}-{lam}")
            continue
        
        # ‚≠ê Pick newest file
        jsonl = files[-1]
        print(f"üöÄ Running {L}-{lam}  |  using latest JSONL:\n ‚Üí {jsonl}\n")

        F_full, F_no_last, F_hard, F_hard_no_last, EARLY, PREFIX, AVG_LEN, STEPS, Y = run_dataset(jsonl)
        EARLY_clean = [e if e is not None else (max([x for x in EARLY if x is not None])+1) 
               for e in EARLY]
        results[L][lam] = {
            "corr_full"       : float(pearsonr(F_full, Y)[0]),
            "corr_hard"       : float(pearsonr(F_hard, Y)[0]),
            "corr_avg_prefix" : float(pearsonr(PREFIX, Y)[0]),
            "corr_avg_steps"  : float(pearsonr(STEPS, Y)[0]),
            "corr_avg_first_error"      : float(pearsonr(EARLY_clean, Y)[0]),
            "avg_prefix"      : float(np.mean(PREFIX)),
            "avg_first_error" : float(np.mean([e for e in EARLY if e is not None])),
            "avg_steps"       : float(np.mean(STEPS)),
            "file_used"       : jsonl,  # ‚≠ê Save for traceability
            "Y"               : Y
        }

        # ÂÆûÊó∂ÂÜôÁõòÔºàÈÅøÂÖç‰∏≠Êñ≠ÊçüÂ§±ÁªìÊûúÔºâ
        with open(SAVE_PATH, "w") as f: json.dump(results, f, indent=4)
        print(f"üíæ Saved to ‚Üí {SAVE_PATH}\n")

# ===================== Plotting =====================
import matplotlib.pyplot as plt

# ==== Create output dir ====
fig_dir = "figures"
os.makedirs(fig_dir, exist_ok=True)
print(f"üìÅ Figures will be saved to: {fig_dir}/")

# ==== x-axis tick labels ====
lam_plot_labels = {
    "BASELINE":"0.0","lam0p5":"0.5","lam1p0":"1.0","lam1p5":"1.5","lam2p0":"2.0"
}
x = [lam_plot_labels[l] for l in lam_values]


def plot_metric(metric_key, title, ylabel, fname):
    plt.figure(figsize=(7,5))

    for L in results.keys():
        y = [results[L][lam][metric_key] if lam in results[L] else None
             for lam in lam_values]
        plt.plot(x, y, marker="o", label=L)

    plt.title(title, fontsize=13)
    plt.xlabel("Lambda", fontsize=12)
    plt.ylabel(ylabel, fontsize=12)
    plt.legend(title="Training Layers (L)")
    plt.grid(alpha=0.35)

    save_path = f"{fig_dir}/{fname}.png"
    plt.savefig(save_path, dpi=300, bbox_inches="tight")
    plt.close()
    print(f"üìå Saved: {save_path}")


# ==== Generate All Figures ====

plot_metric("corr_full",       "Correlation (Full Step Mean)",      "Pearson r", "corr_full")
plot_metric("corr_hard",       "Correlation (Hard Step Score)",     "Pearson r", "corr_hard")
    
plot_metric("corr_avg_prefix", "Correlation (Avg Prefix Correctness Length)", "Pearson r", "corr_avg_prefix")
plot_metric("corr_avg_first_error", "Correlation (Avg First Error Position)", "Pearson r", "corr_avg_first_error")
plot_metric("corr_avg_steps", "Correlation (Mean Total Steps per Solution)", "Pearson r", "corr_avg_steps")

plot_metric("avg_prefix",      "Avg Prefix Correctness Length",     "Prefix Length", "avg_prefix")
plot_metric("avg_first_error", "Avg First Error Position",          "First Error Step", "avg_first_error")
plot_metric("avg_steps",       "Mean Total Steps per Solution",     "Step Count", "avg_step_count")

print("\nüéâ All plots completed & saved!")
# Ê†πÊçÆËøô‰∏™resultsËÆ°ÁÆó

‚ö† No jsonl found for L8-BASELINE
üöÄ Running L8-lam0p5  |  using latest JSONL:
 ‚Üí /home/youyang7/projects/lm-evaluation-harness/lm_eval/models/eval_grid_11_18/triviaqa_cot_gsm8k_cot_zeroshot/Qwen2.5-7B-Instruct_L8_lam0p5/Qwen__Qwen2.5-7B-Instruct/samples_gsm8k_cot_zeroshot_2025-11-17T21-05-55.543576.jsonl

--- Sample 66 with ["Sure, let's break down the problem step by step:", "1. **Calculate the total number of eggs laid by Janet's ducks per day:**", "- Janet's ducks lay 16 eggs per day.", '2. **Determine how many eggs Janet uses for herself:**', '- Janet eats 3 eggs for breakfast every morning.', '- She also uses 4 eggs to bake muffins for her friends.', '- Total eggs used: \\(3 + 4 = 7\\) eggs.', '3. **Calculate the number of eggs remaining after Janet uses some for herself:**', '- Remaining eggs: \\(16 - 7 = 9\\) eggs.', '4. **Calculate the revenue from selling the remaining eggs:**', "- Janet sells the remaining 9 eggs at the farmers' market.", '- Each egg is sold for $2.', '