In [1]:
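# Dependencies (install once, then restart the kernel).
# pytest-json-report provides the `--json-report` option used by run_tests;
# without it run_tests falls back to parsing pytest's stdout.
# %pip install pytest pytest-json-report tabulate hypothesis pandas matplotlib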

import subprocess
import json
import os
import re
import time
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Load the bug metadata. The file is read as JSON records that may span
# several lines: stripped lines are accumulated until one ends with "}",
# at which point the accumulated text is parsed as a single JSON value.
metadata = []
with open("bug_portfolio/metadata.jsonl", "r", encoding="utf-8") as f:
    buffer = ""
    for line in f:
        line = line.strip()
        if not line:
            continue
        buffer += line
        if line.endswith("}"):
            try:
                metadata.append(json.loads(buffer))
            except json.JSONDecodeError as exc:
                # Keep going (best effort), but make the dropped record visible
                # instead of silently shrinking the function list.
                print(f"WARNING: skipped unparseable metadata record ({exc}): {buffer[:80]!r}")
            buffer = ""

# Text left over here never reached a line ending in "}" and was not parsed.
if buffer:
    print(f"WARNING: ignored trailing incomplete metadata record: {buffer[:80]!r}")

# Mapping from function name to bug description, plus the ordered list of names
bug_map = {item["name"]: item.get("bug_description", "No description provided") for item in metadata}
functions = [item["name"] for item in metadata]

print(functions)
print(len(functions))

['set_Right_most_Unset_Bit', 'find_Max', 'get_max_gold', 'sumofFactors', 'first_Digit', 'find_max_val', 'bitonic_subsequence', 'binomial_Coeff', 'max_chain_length', 'sum_Of_Primes', 'max_run_uppercase', 'sort_by_dnf', 'pass_validity', 'check_Type_Of_Triangle', 'count_Pairs', 'generate_matrix', 'rgb_to_hsv', 'max_sub_array_sum', 'get_sum', 'count_duplic', 'is_subset', 'find_first_occurrence', 'longest_increasing_subsequence', 'sum_of_odd_Factors', 'find_longest_conseq_subseq', 'get_Number', 'Sum', 'get_median', 'largest_subset', 'armstrong_number']
30


In [3]:
def parse_counts_from_output(out):
    """Return (passed, failed) parsed from pytest stdout.

    pytest prints its summary (e.g. ``1 failed, 12 passed in 0.12s``) at the
    end of the output, so the LAST ``<N> passed`` / ``<N> failed`` occurrence
    is used. Earlier occurrences may be text printed by the tests themselves.

    Args:
        out: Captured pytest stdout. ``None`` or an empty string is treated
            as "no counts available".

    Returns:
        Tuple ``(passed, failed)`` of ints; a count that does not appear in
        the output is reported as 0.
    """
    text = out or ""

    def _last_count(label):
        # `\s+` directly before the label keeps "2 xpassed"/"2 xfailed" from matching.
        hits = re.findall(rf"(\d+)\s+{label}\b", text)
        return int(hits[-1]) if hits else 0

    return _last_count("passed"), _last_count("failed")


def run_tests(file):
    """
    Run pytest on a single file.

    A first attempt is made with the ``--json-report`` options so the
    pass/fail counts can be read from a JSON report. If pytest exits with
    code 4 (its usage-error code, e.g. when the ``--json-report`` option is
    not recognised) or no report file was produced, pytest is re-run without
    the JSON options and the counts are parsed from stdout instead.

    The temporary ``report_<test-file-stem>.json`` file is written to the
    current working directory and is removed before returning on every path.

    Args:
        file: Path of the test file handed to pytest.

    Returns:
        exitcode, passed, failed, time_taken_seconds, stdout, stderr
        (all taken from the pytest invocation whose results are reported;
        the time covers only that invocation).
    """
    base = os.path.splitext(os.path.basename(file))[0]
    json_file = f"report_{base}.json"

    def _remove_report():
        # Best-effort cleanup: the report may legitimately not exist.
        try:
            os.remove(json_file)
        except OSError:
            pass

    def _timed_pytest(cmd):
        # Run one pytest invocation and measure its wall-clock duration.
        start = time.perf_counter()
        completed = subprocess.run(cmd, capture_output=True, text=True)
        return completed, time.perf_counter() - start

    # A report left behind by an earlier interrupted run must not be mistaken
    # for the result of this run.
    _remove_report()

    # ---- attempt 1: with JSON ----
    proc, time_taken = _timed_pytest(
        ["pytest", "-q", file, "--json-report", f"--json-report-file={json_file}"]
    )

    # If pytest did not generate JSON or reported a usage error -> fallback
    if proc.returncode == 4 or not os.path.exists(json_file):
        _remove_report()
        # ---- fallback: run WITHOUT json-report ----
        proc, time_taken = _timed_pytest(["pytest", "-q", file])
        passed, failed = parse_counts_from_output(proc.stdout)
        return proc.returncode, passed, failed, time_taken, proc.stdout, proc.stderr

    # ---- JSON successfully created: extract summary ----
    try:
        with open(json_file, "r", encoding="utf-8") as f:
            data = json.load(f)
        summary = data.get("summary", {})
        passed = summary.get("passed", 0)
        failed = summary.get("failed", 0)
    except (OSError, ValueError, AttributeError):
        # Unreadable, malformed or unexpectedly shaped report:
        # fall back to parsing stdout of the same run.
        passed, failed = parse_counts_from_output(proc.stdout)
    finally:
        _remove_report()

    return proc.returncode, passed, failed, time_taken, proc.stdout, proc.stderr


In [None]:
# pytest exit codes: 0 = every test passed, 1 = tests ran and at least one failed.
# Any other code (2 interrupted / collection error, 3 internal error, 4 usage
# error such as a missing test file, 5 no tests collected) means the suite did
# not properly run against the function, so it is NOT counted as a detected bug.
PYTEST_OK = 0
PYTEST_TESTS_FAILED = 1

llm_found = {}
human_found = {}

# (label used in the printed report, test-file path template, per-function result dict)
suites = [
    ("LLM", "llm_tests/generated_tests/test_{}.py", llm_found),
    ("Human", "human_tests/test_{}.py", human_found),
]

for func in functions:
    print(f"\n=== Testing `{func}` ===")
    print(f"Bug Description: {bug_map[func]}")

    for label, path_template, found in suites:
        test_file = path_template.format(func)
        exitcode, passed, failed, elapsed, _stdout, _stderr = run_tests(test_file)

        found[func] = (exitcode == PYTEST_TESTS_FAILED)
        status = "found" if found[func] else "not found"
        print(f"{label} tests for `{func}`: Bug {status} (exit code {exitcode})")
        print(f"{label} Summary: {passed} passed, {failed} failed (time: {elapsed:.2f}s)")

        if exitcode not in (PYTEST_OK, PYTEST_TESTS_FAILED):
            # Surface broken runs instead of letting them pass as a detection.
            print(
                f"WARNING: pytest did not run `{test_file}` normally "
                f"(exit code {exitcode}); result for `{func}` is inconclusive"
            )



=== Testing `set_Right_most_Unset_Bit` ===
Bug Description: When n = 0, the function returns 0 instead of 1
LLM tests for `set_Right_most_Unset_Bit`: Bug found (exit code 1)
LLM Summary: 12 passed, 1 failed (time: 2.15s)
Human tests for `set_Right_most_Unset_Bit`: Bug found (exit code 1)
Human Summary: 3 passed, 2 failed (time: 2.86s)

=== Testing `find_Max` ===
Bug Description: the condition if (arr[low] >= arr[mid]) should be if (arr[low] > arr[mid]), as using >= can cause incorrect recursion and miss the maximum element in certain cases.
LLM tests for `find_Max`: Bug not found (exit code 0)
LLM Summary: 10 passed, 0 failed (time: 1.93s)
Human tests for `find_Max`: Bug found (exit code 1)
Human Summary: 2 passed, 3 failed (time: 3.08s)

=== Testing `get_max_gold` ===
Bug Description: The final answer calculation loop doesn't check for the last row's first element. ie loop is running till m-2
LLM tests for `get_max_gold`: Bug found (exit code 1)
LLM Summary: 7 passed, 2 failed (time:

In [None]:
# One record per function: was the bug caught by the LLM suite / the human suite?
# A function missing from either dict is reported as "No".
results = [
    {
        "Function": name,
        "LLM Found": "Yes" if llm_found.get(name, False) else "No",
        "Human Found": "Yes" if human_found.get(name, False) else "No",
    }
    for name in functions
]

df = pd.DataFrame(results)

# ---- Manual overrides go here (uncomment and edit as needed) ----
# The 'Function' value must match a name in `functions` exactly; a name that
# is not in the table matches no row, so the assignment changes nothing.
# df.loc[df['Function'] == 'get_max_gold', 'LLM Found'] = 'No'
# df.loc[df['Function'] == 'set_Right_most_Unset_Bit', 'Human Found'] = 'Yes'
# -----------------------------------------------------------------

print("### Individual Test Results\n")
print(df.to_markdown(index=False))

In [None]:
# Boolean masks over the per-function results table. "Yes" and "No" are tested
# separately so any other (manually entered) value falls into no bucket.
llm_yes = df["LLM Found"] == "Yes"
llm_no = df["LLM Found"] == "No"
human_yes = df["Human Found"] == "Yes"
human_no = df["Human Found"] == "No"

only_llm = (llm_yes & human_no).sum()
only_human = (llm_no & human_yes).sum()
both = (llm_yes & human_yes).sum()
neither = (llm_no & human_no).sum()

total = len(df)

# Overall detection: bugs caught by a suite alone or together with the other one
llm_total_found = only_llm + both
human_total_found = only_human + both

llm_overall_percentage = round((llm_total_found / total) * 100, 2)
human_overall_percentage = round((human_total_found / total) * 100, 2)

# Scorecard rows as (metric label, count) pairs
metric_rows = [
    ("Bugs found only by LLM tests", only_llm),
    ("Bugs found only by Human properties", only_human),
    ("Bugs found by both", both),
    ("Bugs found by neither", neither),
]
scorecard = pd.DataFrame({
    "Metric": [label for label, _ in metric_rows],
    "Count": [count for _, count in metric_rows],
})

# Share of all functions in each bucket, rendered as text with a "%" suffix
share = (scorecard["Count"] / total * 100).round(2)
scorecard["Percentage"] = share.astype(str) + "%"

print("\n\n### Final Scorecard\n")
print(scorecard.to_markdown(index=False))

print("\n### Overall Detection Rates\n")
print(f"LLM Overall Detection: {llm_total_found}/{total} = {llm_overall_percentage}%")
print(f"Human Overall Detection: {human_total_found}/{total} = {human_overall_percentage}%")

# Persist both tables as CSV next to the notebook
for frame, csv_path in ((df, "results_summary.csv"), (scorecard, "final_scorecard.csv")):
    frame.to_csv(csv_path, index=False)


In [None]:
# Bar chart of the scorecard: one bar per detection bucket.
fig, ax = plt.subplots(figsize=(7, 5))
ax.bar(scorecard["Metric"], scorecard["Count"])
ax.set_title("Bug Detection Summary (LLM vs Human)")
ax.set_ylabel("Number of Bugs Found")
# The metric labels are long: tilt them and anchor on their right end.
plt.setp(ax.get_xticklabels(), rotation=20, ha='right')
fig.tight_layout()
plt.show()