In [5]:
# Setup: Install testing libraries (pytest and hypothesis).
!pip install pytest hypothesis

import os
import pytest

# Change working directory to the project root if needed
# os.chdir('path/to/project_directory')

# Test-file locations, keyed by the name of the function under test.
# Two parallel mappings share the same keys in the same order: one for the
# LLM-generated suites and one for the human-written (Hypothesis) suites.
llm_tests = dict([
    ("decimal_to_binary", "llm_tests/generated_tests/test_decimal_to_binary_llm.py"),
    ("get_max_gold", "llm_tests/generated_tests/test_get_max_gold_llm.py"),
    ("set_Right_most_Unset_Bit", "llm_tests/generated_tests/test_set_Right_most_Unset_Bit_llm.py"),
])
human_tests = dict([
    ("decimal_to_binary", "human_tests/test_decimal_to_binary_props.py"),
    ("get_max_gold", "human_tests/test_get_max_gold_props.py"),
    ("set_Right_most_Unset_Bit", "human_tests/test_set_Right_most_Unset_Bit_props.py"),
])




In [6]:
# Flags shared by every run: quiet output, hide warnings, suppress tracebacks.
PYTEST_ARGS = ["-q", "--disable-warnings", "--tb=no"]


def run_suite(test_path):
    """Run one pytest file in-process and report whether any test failed.

    Only ``pytest.ExitCode.TESTS_FAILED`` counts as "the suite found a bug".
    Other non-zero exit codes (interrupted run, internal error, usage error
    such as a missing file, or no tests collected) mean the suite did not
    actually run to completion, so they are reported with a warning instead
    of being silently scored as a detection.

    Args:
        test_path: Path of the pytest file to execute.

    Returns:
        True if pytest collected and ran tests and at least one failed,
        False otherwise.
    """
    # NOTE: pytest.main() imports the test modules into this kernel; because
    # of Python's import cache, edits to those files are not picked up by
    # later calls in the same process. Restart the kernel after editing tests.
    exit_code = pytest.main([*PYTEST_ARGS, test_path])
    if exit_code not in (pytest.ExitCode.OK, pytest.ExitCode.TESTS_FAILED):
        print(
            f"WARNING: pytest exited with code {int(exit_code)} for {test_path}; "
            "the suite did not run cleanly, so it is not counted as finding a bug."
        )
    return exit_code == pytest.ExitCode.TESTS_FAILED


# For every function under test, run the LLM-generated suite and then the
# Hypothesis-based suite, recording whether each one produced a failing test.
results = {}
for func in llm_tests:
    found_llm = run_suite(llm_tests[func])
    found_human = run_suite(human_tests[func])
    results[func] = {"LLM_found": found_llm, "Human_found": found_human}

# Print summary of results for each function
for func, res in results.items():
    print(f"Function `{func}`: LLM found bug? {res['LLM_found']}, Human found bug? {res['Human_found']}")


[32m.[0m[31mF[0m[31mF[0m[31mF[0m[31mF[0m[31mF[0m[31mF[0m[31mF[0m[31mF[0m[31mF[0m[31mF[0m[31mF[0m[31mF[0m[31mF[0m[31m                                                                                                   [100%][0m
[31mFAILED[0m llm_tests/generated_tests/test_decimal_to_binary_llm.py::[1mtest_decimal_to_binary[1-1][0m - assert 10 == 1
[31mFAILED[0m llm_tests/generated_tests/test_decimal_to_binary_llm.py::[1mtest_decimal_to_binary[2-10][0m - assert 100 == 10
[31mFAILED[0m llm_tests/generated_tests/test_decimal_to_binary_llm.py::[1mtest_decimal_to_binary[3-11][0m - assert 110 == 11
[31mFAILED[0m llm_tests/generated_tests/test_decimal_to_binary_llm.py::[1mtest_decimal_to_binary[4-100][0m - assert 1000 == 100
[31mFAILED[0m llm_tests/generated_tests/test_decimal_to_binary_llm.py::[1mtest_decimal_to_binary[5-101][0m - assert 1010 == 101
[31mFAILED[0m llm_tests/generated_tests/test_decimal_to_binary_llm.py::[1mtest_decimal_to_bina

In [7]:
# Tally how many functions fall into each detection category.
# Keys are (LLM found the bug, Human found the bug) truth pairs.
bucket_counts = {
    (True, True): 0,
    (True, False): 0,
    (False, True): 0,
    (False, False): 0,
}
for outcome in results.values():
    detected_by = (bool(outcome["LLM_found"]), bool(outcome["Human_found"]))
    bucket_counts[detected_by] += 1

# Expose the four totals under the names the following cells read.
both = bucket_counts[(True, True)]
llm_only = bucket_counts[(True, False)]
human_only = bucket_counts[(False, True)]
neither = bucket_counts[(False, False)]

# Print the final scorecard
print("\nFinal Scorecard:")
print(f"- Bugs found *only* by LLM tests: {llm_only}")
print(f"- Bugs found *only* by Human tests: {human_only}")
print(f"- Bugs found by *both* methods: {both}")
print(f"- Bugs found by *neither* method: {neither}")



Final Scorecard:
- Bugs found *only* by LLM tests: 0
- Bugs found *only* by Human tests: 0
- Bugs found by *both* methods: 3
- Bugs found by *neither* method: 0


In [8]:
# Optional: render the same four totals as a small markdown-formatted table.
# The rows are collected first and written with a single print; the leading
# "\n" on the header row leaves a blank line before the table.
table_rows = [
    "\n| Strategy | Bugs Found |",
    "|----------|------------|",
    f"| LLM only | {llm_only} |",
    f"| Human only | {human_only} |",
    f"| Both | {both} |",
    f"| Neither | {neither} |",
]
print("\n".join(table_rows))



| Strategy | Bugs Found |
|----------|------------|
| LLM only | 0 |
| Human only | 0 |
| Both | 3 |
| Neither | 0 |
