## Generating ASTs and Metrics

In [19]:
import os
import subprocess
import csv

def run_gumtree_diff(gumtree_jar, file1, file2):
    """
    Run GumTree's ``textdiff`` on two files and return its stdout.

    Args:
        gumtree_jar: Path to the GumTree jar handed to ``java -jar``.
        file1: Path of the first file passed to ``textdiff``.
        file2: Path of the second file passed to ``textdiff``.

    Returns:
        The captured stdout text when the process exits with status 0.
        If the process cannot be launched, or exits with a non-zero status,
        a string starting with ``"ERROR: "`` that describes the failure is
        returned instead; no exception is raised for those cases.
    """
    command = ["java", "-jar", gumtree_jar, "textdiff", file1, file2]
    try:
        result = subprocess.run(
            command,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True
        )
    except Exception as e:
        # Deliberate best-effort: launch problems (e.g. `java` missing) are
        # reported as text so one bad invocation does not abort a batch run.
        return f"ERROR: {str(e)}"

    if result.returncode != 0:
        # A failed run used to be indistinguishable from a successful one:
        # the exit status was ignored and stderr was thrown away.
        detail = (result.stderr or "").strip() or "no stderr output"
        return f"ERROR: java exited with status {result.returncode}: {detail}"

    return result.stdout

def parse_gumtree_output(output):
    """
    Extract basic metrics from GumTree's textual (non-JSON) output.

    Every line is classified by the first keyword it contains, tested in the
    order "Insert", "Delete", "Update", "Move", "Similarity". The match is a
    case-sensitive substring test, so a keyword anywhere in the line counts.

    NOTE(review): the later ``parse_gumtree_output`` in this notebook matches
    lowercase ``insert-`` / ``delete-`` / ``update-`` / ``move-`` line
    prefixes instead -- confirm which format the GumTree build in use emits.

    Args:
        output: Text produced by the diff tool.

    Returns:
        dict with integer counts under "insert", "delete", "update" and
        "move", their sum under "total_changes", and "similarity" set to the
        float parsed from the last whitespace-separated token of the most
        recent parseable "Similarity" line (``None`` if there was none).
    """
    metrics = {
        "insert": 0,
        "delete": 0,
        "update": 0,
        "move": 0,
        "similarity": None,
        "total_changes": 0
    }

    # (keyword searched for in the line, metrics key to increment)
    action_keywords = (
        ("Insert", "insert"),
        ("Delete", "delete"),
        ("Update", "update"),
        ("Move", "move"),
    )

    for line in output.splitlines():
        for keyword, key in action_keywords:
            if keyword in line:
                metrics[key] += 1
                break
        else:
            # Reached only when no action keyword matched this line.
            if "Similarity" in line:
                try:
                    metrics["similarity"] = float(line.split()[-1])
                except ValueError:
                    # Last token is not numeric: keep the previous value.
                    # (Was a bare `except:`, which also hid KeyboardInterrupt.)
                    pass

    metrics["total_changes"] = metrics["insert"] + metrics["delete"] + metrics["update"] + metrics["move"]
    return metrics

def compare_ast_directories(gumtree_jar, gen_dir, target_dir, output_dir, output_csv="ast_metrics.csv", num_files=164):
    """
    Diff numbered Java files from two directories and write the metrics to CSV.

    For each ``i`` in ``range(num_files)`` the file ``{i}.java`` is looked up
    in both directories. Pairs with a missing file are skipped; the others
    are diffed with ``run_gumtree_diff`` and summarised with
    ``parse_gumtree_output``.

    Args:
        gumtree_jar: Path to the GumTree jar, forwarded to ``run_gumtree_diff``.
        gen_dir: Directory holding the generated ``{i}.java`` files.
        target_dir: Directory holding the reference ``{i}.java`` files.
        output_dir: Directory for the CSV; created if it does not exist.
            A trailing path separator is optional.
        output_csv: File name of the CSV inside ``output_dir``.
        num_files: Number of consecutive indices, starting at 0, to compare.

    Returns:
        None. Progress is printed and the CSV is written as a side effect.
    """
    rows = []

    for i in range(num_files):
        file_name = f"{i}.java"
        gen_file = os.path.join(gen_dir, file_name)
        target_file = os.path.join(target_dir, file_name)

        if not os.path.exists(gen_file) or not os.path.exists(target_file):
            print(f"Skipping missing file: {file_name}")
            continue

        print(f"Comparing: {file_name}")
        output = run_gumtree_diff(gumtree_jar, gen_file, target_file)

        # Only the "ERROR:" prefix marks a failed run. A substring test would
        # also drop valid diffs whose text happens to contain "ERROR"
        # (for example an identifier in the compared sources).
        if output.startswith("ERROR:") or output.strip() == "":
            print(f"Error in file: {file_name}")
            continue

        metrics = parse_gumtree_output(output)
        metrics["file"] = file_name
        rows.append(metrics)

    # Write metrics to a CSV file inside output_dir.
    os.makedirs(output_dir, exist_ok=True)
    # os.path.join works with or without a trailing separator on output_dir;
    # plain concatenation wrote next to the directory when it was missing.
    csv_path = os.path.join(output_dir, output_csv)
    with open(csv_path, "w", newline='', encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=[
            "file", "similarity", "insert", "delete", "update", "move", "total_changes"
        ])
        writer.writeheader()
        writer.writerows(rows)

    print(f"\n✅ Saved comparison results to: {output_dir}")


In [23]:
import os
import subprocess
import csv

def run_gumtree_diff(gumtree_jar, file1, file2):
    """
    Invoke GumTree's ``textdiff`` through ``java -jar`` and return its stdout.

    stderr is captured but not returned, and the exit status is not checked.
    Any exception raised while launching the process propagates to the caller.

    Args:
        gumtree_jar: Path to the GumTree jar.
        file1: Path of the first file passed to ``textdiff``.
        file2: Path of the second file passed to ``textdiff``.

    Returns:
        The process's stdout decoded as text.
    """
    command = ["java", "-jar", gumtree_jar, "textdiff", file1, file2]
    completed = subprocess.run(
        command,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
    )
    return completed.stdout

def parse_gumtree_output(output):
    """
    Count edit actions in diff text and derive a fallback similarity score.

    A line counts as an action when it starts with one of the prefixes
    ``insert-``, ``delete-``, ``update-`` or ``move-``; leading whitespace
    is not stripped, so indented lines never match.

    Args:
        output: Text produced by the diff tool.

    Returns:
        dict with the four per-action counts, their sum under
        "total_changes", and "similarity" equal to ``1.0`` when nothing
        changed, otherwise ``round(1 / (1 + total_changes), 3)``.
    """
    prefix_to_key = {
        "insert-": "insert",
        "delete-": "delete",
        "update-": "update",
        "move-": "move",
    }
    counts = dict.fromkeys(prefix_to_key.values(), 0)

    for raw_line in output.splitlines():
        for prefix, key in prefix_to_key.items():
            if raw_line.startswith(prefix):
                counts[key] += 1
                break

    total = sum(counts.values())

    # Fallback similarity score: inversely related to change count
    score = 1.0 if total == 0 else round(1 / (1 + total), 3)

    return {
        "insert": counts["insert"],
        "delete": counts["delete"],
        "update": counts["update"],
        "move": counts["move"],
        "similarity": score,
        "total_changes": total,
    }

def compare_ast_directories(gumtree_jar, gen_dir, target_dir, output_dir, output_csv="ast_metrics.csv", num_files=164):
    """
    Diff numbered Java files from two directories and write the metrics to CSV.

    For each ``i`` in ``range(num_files)`` the file ``{i}.java`` is looked up
    in both directories. Pairs with a missing file, and pairs whose diff
    output is empty, are skipped with a printed notice. The remaining pairs
    are summarised with ``parse_gumtree_output`` and written as one CSV row.

    Args:
        gumtree_jar: Path to the GumTree jar, forwarded to ``run_gumtree_diff``.
        gen_dir: Directory holding the generated ``{i}.java`` files.
        target_dir: Directory holding the reference ``{i}.java`` files.
        output_dir: Directory for the CSV; created if it does not exist.
            A trailing path separator is optional.
        output_csv: File name of the CSV inside ``output_dir``.
        num_files: Number of consecutive indices, starting at 0, to compare.

    Returns:
        None. Progress is printed and the CSV is written as a side effect.
    """
    rows = []

    for i in range(num_files):
        file_name = f"{i}.java"
        gen_file = os.path.join(gen_dir, file_name)
        target_file = os.path.join(target_dir, file_name)

        if not os.path.exists(gen_file) or not os.path.exists(target_file):
            print(f"Skipping {file_name} (missing file)")
            continue

        print(f"Comparing {file_name}")
        output = run_gumtree_diff(gumtree_jar, gen_file, target_file)

        if not output.strip():
            print(f"⚠️ Empty diff output for {file_name}")
            continue

        metrics = parse_gumtree_output(output)
        metrics["file"] = file_name
        rows.append(metrics)

    # Save results to CSV
    os.makedirs(output_dir, exist_ok=True)
    # os.path.join works with or without a trailing separator on output_dir;
    # plain concatenation wrote next to the directory when it was missing.
    csv_path = os.path.join(output_dir, output_csv)
    with open(csv_path, "w", newline='', encoding="utf-8") as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=[
            "file", "similarity", "insert", "delete", "update", "move", "total_changes"
        ])
        writer.writeheader()
        writer.writerows(rows)

    print(f"\n✅ AST comparison complete. Results saved to {csv_path}")


In [24]:
import os
# Locations used by this run. Kept in one place so the machine-specific
# absolute paths only need to be edited here.
RESULTS_DIR = "/media/mujtaba/DATA/nick/UnitTestExamples/data/results"
GUMTREE_JAR = os.path.join(RESULTS_DIR, "gumtree.jar")
TARGET_DIR = "/media/mujtaba/DATA/nick/UnitTestExamples/UnitTestGenEvaluation/notebooks/organizing_data/targets/Java"
METRICS_DIR = os.path.join(RESULTS_DIR, "metrics")

# Result folders to evaluate: those whose name mentions one of the two models.
# Sorted because os.listdir returns entries in arbitrary order, which made the
# processing order differ between runs.
model_runs = sorted(
    name for name in os.listdir(RESULTS_DIR)
    if "CodeLlama" in name or "starcoder" in name
)

# Directory holding the generated Java files for each run. Few-shot runs keep
# them one level deeper, under "trimmed".
directories = []
for ident in model_runs:
    if "few_shot" in ident:
        directories.append(os.path.join(RESULTS_DIR, ident, "java_files", "trimmed", "generations"))
    else:
        directories.append(os.path.join(RESULTS_DIR, ident, "java_files", "generations"))

# The run name is carried alongside its path instead of being recovered by
# splitting the path at a fixed depth.
for ident, directory in zip(model_runs, directories):
    print(ident)
    compare_ast_directories(
        gumtree_jar=GUMTREE_JAR,
        gen_dir=directory,
        target_dir=TARGET_DIR,
        # The trailing "" keeps the final path separator, which matters if
        # compare_ast_directories builds the CSV path by string concatenation.
        output_dir=os.path.join(METRICS_DIR, ident, ""),
    )


CodeLlama-7b-Instruct-hf__few_shot_first_prompts
Comparing 0.java
Comparing 1.java
Comparing 2.java
Comparing 3.java
Comparing 4.java
Comparing 5.java
Comparing 6.java
Comparing 7.java
Comparing 8.java
Comparing 9.java
Comparing 10.java
Comparing 11.java
Comparing 12.java
Comparing 13.java
Comparing 14.java
Comparing 15.java
Comparing 16.java
Comparing 17.java
Comparing 18.java
Comparing 19.java
Comparing 20.java
Comparing 21.java
Comparing 22.java
Comparing 23.java
Comparing 24.java
Comparing 25.java
Comparing 26.java
Comparing 27.java
Comparing 28.java
Comparing 29.java
Comparing 30.java
Comparing 31.java
Comparing 32.java
Comparing 33.java
Comparing 34.java
Comparing 35.java
Comparing 36.java
Comparing 37.java
Comparing 38.java
Comparing 39.java
Comparing 40.java
Comparing 41.java
Comparing 42.java
Comparing 43.java
Comparing 44.java
Comparing 45.java
Comparing 46.java
Comparing 47.java
Comparing 48.java
Comparing 49.java
Comparing 50.java
Comparing 51.java
Comparing 52.java
Compari