# MBPP Evaluation Analysis

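This notebook summarizes results from the baseline and LoRA r=8 runs using saved metrics, without loading any models. It compares pass rates (pass@1) and syntax rates and shows representative examples. If the saved baseline results are unavailable, the baseline metrics are recomputed by executing the saved generations against the MBPP tests, so run the notebook only in a trusted environment.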



In [1]:
import json, os, ast
from typing import Dict, List, Tuple

# Paths (relative, so they resolve against the kernel's current working directory)
# Baseline run results (JSON); this notebook reads its "summary" and "results" keys.
BASELINE_RESULTS = "artifacts/metrics/baseline_mbpp_results.json"  # may be missing
# Baseline generations (JSON Lines); records are read for "task_id" and "generated".
BASELINE_GENERATIONS = "artifacts/metrics/baseline_generations.jsonl"
# MBPP test split (JSON Lines); records are read for "task_id" and "tests".
MBPP_TEST = "data/processed/mbpp_test.jsonl"
# LoRA r=8 run results (JSON); this notebook reads its "summary" and "results" keys.
R8_RESULTS = "artifacts/metrics/mistral7b-code-r8-mbpp_results.json"


def load_jsonl(path: str) -> List[Dict]:
    """Read a JSON Lines file and return its parsed records in file order.

    Every line is stripped of surrounding whitespace; lines that are empty
    after stripping are ignored and each remaining line is decoded with
    ``json.loads``. The file is read as UTF-8.
    """
    with open(path, "r", encoding="utf-8") as handle:
        stripped_lines = (raw.strip() for raw in handle)
        return [json.loads(text) for text in stripped_lines if text]


def safe_syntax_ok(code: str) -> bool:
    """Tell whether ``code`` is accepted by ``ast.parse``.

    Returns True when parsing completes and False when ``ast.parse`` raises
    any ``Exception``; the code is only parsed, never executed.
    """
    parses_cleanly = True
    try:
        ast.parse(code)
    except Exception:
        parses_cleanly = False
    return parses_cleanly


def run_tests_on_code(code: str, tests: List[str]) -> Tuple[bool, List[str]]:
    """Execute ``code`` and then every test statement in one shared namespace.

    Args:
        code: Source text to execute first; the names it defines become
            visible to the tests.
        tests: Statements (for example ``assert`` lines) executed one by one
            in the same namespace after ``code`` has run.

    Returns:
        ``(passed, errors)``. ``passed`` is True only when ``code`` executed
        and no test raised. ``errors`` holds one message per failure; when
        ``code`` itself fails, the tests are not run and ``errors`` holds a
        single "Execution error" message.

    Notes:
        ``SystemExit`` is handled like any other failure so that a snippet
        calling ``sys.exit()`` / ``exit()`` is recorded as failing instead of
        aborting the caller's evaluation loop. ``KeyboardInterrupt`` is
        deliberately left uncaught so a user can still interrupt a run.
        Nothing here bounds run time or isolates side effects.
    """
    # WARNING: executes code/tests; use only in trusted environments
    namespace: Dict = {}
    try:
        exec(code, namespace, namespace)  # noqa: S102
    except (Exception, SystemExit) as e:
        return False, [f"Execution error: {type(e).__name__}: {e}"]
    errors: List[str] = []
    for t in tests:
        try:
            exec(t, namespace, namespace)  # noqa: S102
        except (Exception, SystemExit) as e:
            # Keep going: every failing test contributes its own message.
            errors.append(f"Test failed: {t} -> {type(e).__name__}: {e}")
    return (not errors), errors



In [2]:
# Load r=8 results
# `r8` stays None when the results file does not exist; later cells check for that.
r8 = None
if os.path.exists(R8_RESULTS):
    # Context manager so the file handle is closed as soon as parsing finishes.
    with open(R8_RESULTS, "r", encoding="utf-8") as fh:
        r8 = json.load(fh)
if r8:
    print("LoRA r=8 summary:", r8["summary"])  # contains total, syntax_rate, pass_rate
else:
    print("LoRA r=8 results not found at:", R8_RESULTS)



LoRA r=8 summary: {'total': 500, 'syntax_ok': 496, 'syntax_rate': 0.992, 'pass': 22, 'pass_rate': 0.044, 'model': 'mistralai/Mistral-7B-Instruct-v0.2', 'lora_dir': 'artifacts/checkpoints/mistral7b-code-r8'}


In [3]:
# Load or compute baseline metrics
# Both names are reset on every run so a value left over from an earlier
# execution of this cell is never mistaken for the current result.
baseline = None
baseline_summary = None
if os.path.exists(BASELINE_RESULTS):
    try:
        # Context manager closes the handle even when JSON decoding fails.
        with open(BASELINE_RESULTS, "r", encoding="utf-8") as fh:
            baseline = json.load(fh)
        baseline_summary = baseline["summary"]
        print("Baseline summary (from results):", baseline_summary)
    except Exception as e:
        # Best effort: report the problem and fall through to the recomputation below.
        print("Could not read baseline results:", e)

# Recompute whenever no summary could be loaded (file absent, unreadable, or
# lacking a "summary" entry) and the inputs needed for recomputation exist.
if baseline_summary is None and os.path.exists(BASELINE_GENERATIONS) and os.path.exists(MBPP_TEST):
    print("Baseline summary not found; computing from generations + MBPP tests (this may take a while)...")
    gens = load_jsonl(BASELINE_GENERATIONS)
    # task_id -> list of test statements (empty list when a record has none)
    tests = {ex.get("task_id"): (ex.get("tests") or []) for ex in load_jsonl(MBPP_TEST)}
    total = len(gens)
    num_syntax_ok = 0
    num_pass = 0
    for idx, g in enumerate(gens, start=1):
        # `or ""` also covers records whose "generated" value is an explicit null.
        code = (g.get("generated") or "").strip()
        ok = safe_syntax_ok(code)
        if ok:
            num_syntax_ok += 1
        task_id = g.get("task_id")
        tlist = tests.get(task_id, [])
        # A generation with no known tests counts as not passed.
        passed = bool(tlist) and run_tests_on_code(code, tlist)[0]
        if passed:
            num_pass += 1
        if idx % 50 == 0 or idx == total:
            print(f"[baseline] processed {idx}/{total}")
    baseline_summary = {
        "total": total,
        "syntax_ok": num_syntax_ok,
        # max(1, total) avoids division by zero for an empty generations file
        "syntax_rate": num_syntax_ok / max(1, total),
        "pass": num_pass,
        "pass_rate": num_pass / max(1, total),
        "model": "mistralai/Mistral-7B-Instruct-v0.2",
        "lora_dir": None,
    }
    print("Baseline summary (computed):", baseline_summary)
elif baseline_summary is None:
    print("Baseline generations or MBPP tests not found; skipping baseline computation.")



Baseline summary (from results): {'total': 500, 'syntax_ok': 486, 'syntax_rate': 0.972, 'pass': 11, 'pass_rate': 0.022, 'model': 'mistralai/Mistral-7B-Instruct-v0.2', 'lora_dir': None}


In [4]:
# Comparison
# Tolerate a results payload that lacks a "summary" entry instead of raising KeyError.
r8_summary = r8.get("summary") if isinstance(r8, dict) else None
if isinstance(baseline, dict) and "summary" in baseline:
    baseline_summary = baseline["summary"]
else:
    # Fall back to a summary computed earlier in the session, if one exists;
    # .get() already yields None when the name was never defined.
    baseline_summary = globals().get("baseline_summary")

if r8_summary and baseline_summary:
    # Same line format for both runs, rounded to three decimals.
    for label, summary in (("Baseline", baseline_summary), ("LoRA r=8", r8_summary)):
        print(f"{label} pass_rate:", round(summary["pass_rate"], 3), "syntax_rate:", round(summary["syntax_rate"], 3))
else:
    print("Not enough data to compare both baselines and r=8.")



Baseline pass_rate: 0.022 syntax_rate: 0.972
LoRA r=8 pass_rate: 0.044 syntax_rate: 0.992


In [5]:
# Sample successes and failures from r=8
# Prints up to three passed and three failed records from the r=8 results.
if r8 and "results" in r8:
    results = r8["results"]
    passed = [r for r in results if r.get("passed")]
    failed = [r for r in results if not r.get("passed")]
    print("Examples — Passed:")
    for ex in passed[:3]:
        # `or ""` keeps the slice safe when "instruction" is missing or null.
        print("- task", ex.get("task_id"), "|", (ex.get("instruction") or "")[:80])
        # Only the first line of the generation, capped at 120 characters.
        print((ex.get("generated") or "").split("\n")[0][:120], "...\n")
    print("Examples — Failed:")
    for ex in failed[:3]:
        print("- task", ex.get("task_id"), "|", (ex.get("instruction") or "")[:80])
        errs = ex.get("errors") or []
        print("Error:", errs[0] if errs else "(no details)")
else:
    print("r=8 results not found or missing 'results' field.")



Examples — Passed:
- task 54 | Write a function to sort the given array by using counting sort.
def counting_sort(arr): ...

- task 71 | Write a function to sort a list of elements using comb sort.
def comb_sort(arr): ...

- task 80 | Write a function to find the nth tetrahedral number.
def tetrahedral_number(n): ...

Examples — Failed:
- task 11 | Write a python function to remove first and last occurrence of a given character
Error: Test failed: assert remove_Occ("hello","l") == "heo" -> NameError: name 'remove_Occ' is not defined
- task 12 | Write a function to sort a given matrix in ascending order according to the sum 
Error: Test failed: assert sort_matrix([[1, 2, 3], [2, 4, 5], [1, 1, 1]])==[[1, 1, 1], [1, 2, 3], [2, 4, 5]] -> AssertionError: 
- task 13 | Write a function to count the most common words in a dictionary.
Error: Test failed: assert count_common(['red','green','black','pink','black','white','black','eyes','white','black','orange','pink','pink','red','red','white','o

In [7]:
# Print test cases where the trained model (r=8) passed and the baseline failed

if r8 and "results" in r8 and baseline and "results" in baseline:
    # Index baseline results by task_id for fast lookup
    baseline_results_by_task = {r.get("task_id"): r for r in baseline["results"]}
    r8_results = r8["results"]

    # Pair every r=8 record with the baseline record for the same task and keep
    # the pairs where r=8 passed and the baseline did not. Tasks without a
    # baseline record are skipped.
    cases_passed_r8_failed_baseline = []
    for r8_ex in r8_results:
        task_id = r8_ex.get("task_id")
        baseline_ex = baseline_results_by_task.get(task_id)
        if not baseline_ex:
            continue
        if r8_ex.get("passed") and not baseline_ex.get("passed"):
            cases_passed_r8_failed_baseline.append((r8_ex, baseline_ex))

    max_examples = 5  # print at most this many examples
    print(f"\nTest cases where LoRA r=8 PASSED but baseline FAILED ({len(cases_passed_r8_failed_baseline)} cases):")
    for i, (r8_ex, baseline_ex) in enumerate(cases_passed_r8_failed_baseline[:max_examples]):
        print(f"--- Example {i+1} ---")
        print("Task ID:", r8_ex.get("task_id"))
        # `or ""` also covers an explicit null instruction, which a .get default would not.
        instr = r8_ex.get("instruction") or ""
        print("Instruction:", instr[:200] + ("..." if len(instr) > 200 else ""))
        # Generations are capped at 400 characters to keep the output readable.
        print("\nLoRA r=8 generated:\n", (r8_ex.get("generated") or "")[:400], "\n")
        baseline_gen = baseline_ex.get("generated") or ""
        print("Baseline generated:\n", baseline_gen[:400], "\n")
        baseline_errs = baseline_ex.get("errors") or []
        err_msg = baseline_errs[0] if baseline_errs else "(no error details)"
        print("Baseline error:", err_msg)
        print("-" * 80)
    if not cases_passed_r8_failed_baseline:
        print("No cases found where r=8 passed and baseline failed.")
else:
    print("Cannot compute delta cases: missing results in r=8 or baseline.")



Test cases where LoRA r=8 PASSED but baseline FAILED (16 cases):
--- Example 1 ---
Task ID: 54
Instruction: Write a function to sort the given array by using counting sort.

LoRA r=8 generated:
 def counting_sort(arr):
    min_val = min(arr)
    max_val = max(arr)
    count_arr = [0] * (max_val - min_val + 1)

    for i in arr:
        count_arr[i - min_val] += 1

    for i in range(1, len(count_arr)):
        count_arr[i] += count_arr[i - 1]

    sorted_arr = [0] * len(arr)
    for i in arr:
        sorted_arr[count_arr[i - min_val] - 1] = i
        count_arr[i - min_val] -= 1

    retur 

Baseline generated:
 def counting_sort(arr):
    n = len(arr)
    output = [0] * n

    # Initialize count array
    count = [0] * (n+1)

    # Find the count of each element in the input array
    for i in arr:
        count[i] += 1

    # Change count array to cumulative count array
    for i in range(1, n+1):
        count[i] += count[i-1]

    # Find the index of each element in the output arra