In [5]:
import re

def extract_main_class(code: str, few_shot: bool) -> str:
    # Determine which instance to extract (2nd if few-shot, 1st otherwise)
    n = 2 if few_shot else 1

    # Find all indices where "public class Main" appears
    starts = [m.start() for m in re.finditer(r'public class Main', code)]
    
    if len(starts) < n:
        return ""  # Not enough occurrences

    start_index = starts[n - 1]  # Get the nth occurrence (1-based index)

    # Brace counting to find the matching closing }
    brace_count = 0
    in_class = False
    end_index = None

    for i in range(start_index, len(code)):
        char = code[i]
        if char == '{':
            brace_count += 1
            if not in_class:
                in_class = True
        elif char == '}':
            brace_count -= 1
            if in_class and brace_count == 0:
                end_index = i
                break

    # Return only the full class block, starting from the nth "public class Main"
    if end_index is not None:
        return code[start_index:end_index + 1].strip()
    else:
        return code[start_index:].strip()  # Fallback if closing brace is missing

In [6]:
files = []

import os

for file in os.listdir("/media/mujtaba/DATA/nick/UnitTestExamples/data/results"):
    if "CodeLlama" in file or "starcoder" in file or "codet5p" in file:
        files.append('/' + file)

files = ["/media/mujtaba/DATA/nick/UnitTestExamples/data/results" + file + '/responses_reformatted.csv' for file in files]
print(files)

import pandas as pd



for file in files:
    df = pd.read_csv(file, quoting=1)
    if "few_shot" in file:
        few_shot = True
    else:
        few_shot = False

    for i in range(len(df)):
        if i==0:
            continue
        #print(df.iloc[i]['code'])
        untrimmed_response = df.iloc[i]['code']
        trimmed_response = extract_main_class(untrimmed_response, few_shot=few_shot)
        # trimmed.append(trimmed_response)   
        os.makedirs(f"/media/mujtaba/DATA/nick/UnitTestExamples/data/results/{file.split('/')[-2]}/java_files", exist_ok = True)
        output_path = os.path.join(f"/media/mujtaba/DATA/nick/UnitTestExamples/data/results/{file.split('/')[-2]}/java_files/", f"{i-1}.java")
        with open(output_path, "w", encoding="utf-8") as f:
            f.write(trimmed_response)

['/media/mujtaba/DATA/nick/UnitTestExamples/data/results/codet5p-2b__few_shot_first_prompts/responses_reformatted.csv', '/media/mujtaba/DATA/nick/UnitTestExamples/data/results/codet5p-2b__zero_shot_first_prompts/responses_reformatted.csv', '/media/mujtaba/DATA/nick/UnitTestExamples/data/results/CodeLlama-7b-Instruct-hf__few_shot_first_prompts/responses_reformatted.csv', '/media/mujtaba/DATA/nick/UnitTestExamples/data/results/starcoder2-3b__few_shot_first_prompts/responses_reformatted.csv', '/media/mujtaba/DATA/nick/UnitTestExamples/data/results/CodeLlama-7b-Instruct-hf__zero_shot_first_prompts/responses_reformatted.csv', '/media/mujtaba/DATA/nick/UnitTestExamples/data/results/starcoder2-3b__zero_shot_first_prompts/responses_reformatted.csv']


## Separating Generated Tests From Source Method

In [7]:
import os
import re

def extract_main_class_only(code):
    """
    Extracts everything from '//Unit Test Cases' or 'public class Main' onward.
    Does not require balanced braces, to preserve incomplete generations.
    """
    # Try to find from the //Unit Test Cases comment
    match = re.search(r'//Unit Test Cases\s*(public class Main\s*{.*)', code, re.DOTALL)
    if match:
        return match.group(1)
    
    # If no comment marker, fall back to just 'public class Main'
    match = re.search(r'(public class Main\s*{.*)', code, re.DOTALL)
    if match:
        return match.group(1)
    
    print("Warning: Could not find 'public class Main' in input.")
    return ""

def extract_all_mains(source_dir):
    output_dir = os.path.join(source_dir, "generations")
    os.makedirs(output_dir, exist_ok=True)

    for i in range(0, 164):  # 0.java through 163.java
        file_path = os.path.join(source_dir, f"{i}.java")
        output_path = os.path.join(output_dir, f"{i}.java")

        if not os.path.exists(file_path):
            print(f"File not found: {file_path}")
            continue

        with open(file_path, "r", encoding="utf-8") as f:
            code = f.read()

        main_code = extract_main_class_only(code)

        if main_code.strip():
            with open(output_path, "w", encoding="utf-8") as out:
                out.write(main_code)
            print(f"Saved: {output_path}")
        else:
            print(f"Skipping (empty or missing Main): {file_path}")


import os
directories = []

for directory in os.listdir("/media/mujtaba/DATA/nick/UnitTestExamples/data/results/"):
    if "CodeLlama" in directory or "starcoder" in directory or "codet5p" in directory:
        if "few_shot" in directory:
            print(directory)
            directories.append("/" + directory + "/java_files/trimmed/")
        else:
            directories.append("/" + directory + "/java_files/")

directories = ["/media/mujtaba/DATA/nick/UnitTestExamples/data/results" + directory for directory in directories]

for directory in directories:
    extract_all_mains(directory)

codet5p-2b__few_shot_first_prompts
CodeLlama-7b-Instruct-hf__few_shot_first_prompts
starcoder2-3b__few_shot_first_prompts
File not found: /media/mujtaba/DATA/nick/UnitTestExamples/data/results/codet5p-2b__few_shot_first_prompts/java_files/trimmed/0.java
File not found: /media/mujtaba/DATA/nick/UnitTestExamples/data/results/codet5p-2b__few_shot_first_prompts/java_files/trimmed/1.java
File not found: /media/mujtaba/DATA/nick/UnitTestExamples/data/results/codet5p-2b__few_shot_first_prompts/java_files/trimmed/2.java
File not found: /media/mujtaba/DATA/nick/UnitTestExamples/data/results/codet5p-2b__few_shot_first_prompts/java_files/trimmed/3.java
File not found: /media/mujtaba/DATA/nick/UnitTestExamples/data/results/codet5p-2b__few_shot_first_prompts/java_files/trimmed/4.java
File not found: /media/mujtaba/DATA/nick/UnitTestExamples/data/results/codet5p-2b__few_shot_first_prompts/java_files/trimmed/5.java
File not found: /media/mujtaba/DATA/nick/UnitTestExamples/data/results/codet5p-2b__few