In [10]:
import subprocess, sys
from os import listdir
from os.path import isfile, join
import re
import pandas as pd

In [11]:
def run_tests(test_files, model, docstr_type):
    """Run pytest once per generated test module, writing one CSV result file each.

    For every entry in ``test_files`` the module
    ``functional_correctness/generated_classes/{model}/{docstr_type}/test_{entry}``
    is executed in a child Python process, and pytest is asked (via ``--csv``,
    presumably provided by the pytest-csv plugin -- confirm it is installed) to
    write ``test_results_{stem}.csv`` into the same directory.

    Args:
        test_files: Iterable of snippet file names *without* the ``test_``
            prefix, e.g. ``"snippet_94.py"``.
        model: Name of the model sub-directory.
        docstr_type: Name of the docstring-variant sub-directory.

    Returns:
        None. Progress (and any abnormal pytest exit) is reported on stdout.
    """
    results_dir = f"functional_correctness/generated_classes/{model}/{docstr_type}"
    for test_file in test_files:
        print(f"Running tests in {test_file}")
        # Strip only a trailing ".py"; splitting on the first ".py" occurrence
        # would truncate names such as "snippet.pyx_demo.py" and make their
        # CSV files collide.
        stem = test_file[:-len('.py')] if test_file.endswith('.py') else test_file
        completed = subprocess.run([sys.executable, '-m', 'pytest',
                                    "--continue-on-collection-errors",
                                    "-q",
                                    "--csv",
                                    f"{results_dir}/test_results_{stem}.csv",
                                    f"{results_dir}/test_{test_file}"],
                                   capture_output=True, text=True)
        # Exit codes 0 (all passed) and 1 (some tests failed) are normal outcomes.
        # Anything else (interrupted, internal error, usage error, nothing
        # collected) usually means the CSV is missing or incomplete, and since
        # the output is captured it would otherwise go unnoticed.
        if completed.returncode not in (0, 1):
            print(f"WARNING: pytest exited with code {completed.returncode} for {test_file}; "
                  "its CSV results may be missing or incomplete")
        print(f"Finished tests in {test_file}")
    print("All tests completed.")

In [12]:
def list_files(folder_path, all_files=True, extension=None):
    """Return the names of the regular files located directly in ``folder_path``.

    Args:
        folder_path: Directory to scan (sub-directories are not descended into
            and are never included in the result).
        all_files: When true, every regular file is returned and ``extension``
            is ignored. When false, only files whose name ends with
            ``.{extension}`` are returned.
        extension: File extension without the leading dot; required when
            ``all_files`` is false.

    Returns:
        A list of bare file names, in the order ``os.listdir`` yields them.

    Raises:
        ValueError: If ``all_files`` is false and ``extension`` is None.
    """
    # Reject the inconsistent argument combination before touching the disk.
    if not all_files and extension is None:
        raise ValueError("Extension cannot be None if only a fixed type of files are to be listed.")

    regular_files = [entry for entry in listdir(folder_path) if isfile(join(folder_path, entry))]
    if all_files:
        return regular_files

    wanted_suffix = f".{extension}"
    return [entry for entry in regular_files if entry.endswith(wanted_suffix)]

In [13]:
def process_functional_correctness_data(model, docstr_type):
    """Run all generated tests for one model/docstring variant and merge the results.

    Steps:
      1. Find every ``test_*.py`` file in
         ``functional_correctness/generated_classes/{model}/{docstr_type}``.
      2. Run them through ``run_tests`` (one CSV result file per test module).
      3. Concatenate every ``*.csv`` file found in that directory and write the
         result to ``functional_correctness/generated_classes/{model}/{docstr_type}_test_report.csv``.

    Note that step 3 picks up *every* CSV in the directory, including result
    files left over from earlier runs.

    Args:
        model: Name of the model sub-directory.
        docstr_type: Name of the docstring-variant sub-directory.

    Returns:
        None. The combined report is written to disk and its path printed.
    """
    results_dir = f'functional_correctness/generated_classes/{model}/{docstr_type}'
    combined_report_path = f'{results_dir}_test_report.csv'

    # run_tests() prepends "test_" itself, so pass it the bare snippet names.
    python_files = list_files(folder_path=results_dir, all_files=False, extension='py')
    snippets = [re.sub(r'^test_', '', name) for name in python_files if name.startswith('test_')]
    run_tests(snippets, model, docstr_type)

    # Combine all CSV files into a single DataFrame. Sorting makes the row order
    # of the combined report reproducible (directory listing order is arbitrary).
    reports = sorted(list_files(folder_path=results_dir, all_files=False, extension='csv'))
    report_frames = [pd.read_csv(f'{results_dir}/{report}') for report in reports]

    if report_frames:
        # One concat over the full list instead of growing a frame inside the loop.
        all_reports_df = pd.concat(report_frames, axis=0)
    else:
        # pd.concat rejects an empty list; keep writing an empty report as before.
        print(f"WARNING: no CSV test results found in {results_dir}")
        all_reports_df = pd.DataFrame()

    all_reports_df.to_csv(combined_report_path, index=False)
    print(f"Combined report saved to {combined_report_path}")
    

In [14]:
# Evaluation grid: every model is paired with every docstring variant.
models = ['GPT-o4', 'Claude4-sonnet', 'Qwen3-coder']
docstr_types = ['no_docstr', 'full_docstr', 'partial_docstr']

# Walk the grid model-major, so all variants of one model are processed together.
for model, docstr_type in [(m, d) for m in models for d in docstr_types]:
    process_functional_correctness_data(model, docstr_type)
    print(f"Generated report for {model} with {docstr_type}")
    print("=" * 50)
    print()
print("All reports generated.")

Running tests in snippet_94.py
Finished tests in snippet_94.py
Running tests in snippet_694.py
Finished tests in snippet_694.py
Running tests in snippet_482.py
Finished tests in snippet_482.py
Running tests in snippet_713.py
Finished tests in snippet_713.py
Running tests in snippet_653.py
Finished tests in snippet_653.py
Running tests in snippet_521.py
Finished tests in snippet_521.py
Running tests in snippet_492.py
Finished tests in snippet_492.py
Running tests in snippet_612.py
Finished tests in snippet_612.py
Running tests in snippet_752.py
Finished tests in snippet_752.py
Running tests in snippet_684.py
Finished tests in snippet_684.py
Running tests in snippet_404.py
Finished tests in snippet_404.py
Running tests in snippet_677.py
Finished tests in snippet_677.py
Running tests in snippet_535.py
Finished tests in snippet_535.py
Running tests in snippet_90.py
Finished tests in snippet_90.py
Running tests in snippet_690.py
Finished tests in snippet_690.py
Running tests in snippet_158.

In [15]:
def generate_functional_correctness_report(model, docstr_type):
    """Print pass / fail / xfail totals for one combined test report.

    Reads ``functional_correctness/generated_classes/{model}/{docstr_type}_test_report.csv``
    and counts the rows whose ``status`` column equals ``passed``, ``failed``
    or ``xfailed``. Rows with any other status contribute to the total only.

    Args:
        model: Name of the model sub-directory.
        docstr_type: Docstring variant whose combined report should be summarised.

    Returns:
        None. The summary is printed to stdout.
    """
    df = pd.read_csv(f'functional_correctness/generated_classes/{model}/{docstr_type}_test_report.csv')
    total_tests = len(df)
    # Single pass over the column instead of one boolean-mask filter per status.
    status_counts = df['status'].value_counts()

    def _percentage(count):
        # A report with a header but no rows must not raise ZeroDivisionError.
        return (count / total_tests) * 100 if total_tests else 0.0

    print(f"Functional Correctness Report for {model} with {docstr_type}\n")
    print("="*50 + "\n")
    print(f"Total Tests: {total_tests}\n")
    for label, status in (("Passed", "passed"), ("Failed", "failed"), ("XFailed", "xfailed")):
        count = int(status_counts.get(status, 0))
        print(f"{label} Tests: {count} ({_percentage(count):.2f}%)\n")
    print()

In [16]:
# Print the summary for every model / docstring-variant pair, model-major.
for model, docstr_type in [(m, d) for m in models for d in docstr_types]:
    generate_functional_correctness_report(model, docstr_type)

Functional Correctness Report for GPT-o4 with no_docstr


Total Tests: 399

Passed Tests: 62 (15.54%)

Failed Tests: 132 (33.08%)

XFailed Tests: 205 (51.38%)


Functional Correctness Report for GPT-o4 with full_docstr


Total Tests: 445

Passed Tests: 44 (9.89%)

Failed Tests: 142 (31.91%)

XFailed Tests: 259 (58.20%)


Functional Correctness Report for GPT-o4 with partial_docstr


Total Tests: 405

Passed Tests: 59 (14.57%)

Failed Tests: 129 (31.85%)

XFailed Tests: 217 (53.58%)


Functional Correctness Report for Claude4-sonnet with no_docstr


Total Tests: 396

Passed Tests: 84 (21.21%)

Failed Tests: 127 (32.07%)

XFailed Tests: 185 (46.72%)


Functional Correctness Report for Claude4-sonnet with full_docstr


Total Tests: 455

Passed Tests: 57 (12.53%)

Failed Tests: 153 (33.63%)

XFailed Tests: 245 (53.85%)


Functional Correctness Report for Claude4-sonnet with partial_docstr


Total Tests: 412

Passed Tests: 84 (20.39%)

Failed Tests: 128 (31.07%)

XFailed Tests: 200 (48.54%)