# Rename and copy folders

In [5]:
import os

# Define the folder mappings: existing folder name -> new folder name.
# Only the uncommented entries are active; the others are kept for reference.
folder_mappings = {
    'processed_by_depgraph_withoutline': 'depgraph',
    # 'processed_by_sbfl_withoutline': 'ochiai',
    # 'processed_by_random_withoutline': 'execution'
}

# Define the base directory where 'RankedData' is located
# (relative path, so it is resolved against the current working directory)
base_directory = '../data/RankedData'

# Function to rename folders
def rename_folders_in_directory(base_directory, folder_mappings):
    """Rename every sub-folder below ``base_directory`` whose name is a key of ``folder_mappings``.

    Args:
        base_directory: Root directory that is walked recursively (top-down).
        folder_mappings: Dict mapping an existing folder name to its new name.

    A folder is left untouched (and a message is printed) when an entry with
    the target name already exists next to it.
    """
    # Top-down walk: os.walk uses the `dirs` list to decide what to visit next
    for root, dirs, files in os.walk(base_directory):
        for position, dir_name in enumerate(dirs):
            # Check if the current folder needs to be renamed
            if dir_name not in folder_mappings:
                continue

            old_path = os.path.join(root, dir_name)
            new_name = folder_mappings[dir_name]
            new_path = os.path.join(root, new_name)

            # Never rename onto an existing entry
            if os.path.exists(new_path):
                print(f"Skipping '{old_path}': '{new_path}' already exists")
                continue

            # Rename the folder
            print(f"Renaming '{old_path}' to '{new_path}'")
            os.rename(old_path, new_path)

            # Keep the walk in sync: without this os.walk would try to enter the
            # old (now missing) path and never look inside the renamed folder
            dirs[position] = new_name

# Run the renaming function
rename_folders_in_directory(base_directory, folder_mappings)


In [8]:
import os
import shutil

# Projects whose depgraph results are copied into RankedData
# projects = ["Cli", "Math", "Csv", "Codec", "Gson", "JacksonCore", "JacksonXml", "Mockito", "Compress", "Jsoup", "Lang"]
projects = ["Time"]

# Where the processed data lives and where the ranked data is collected.
# NOTE(review): both are absolute, machine-specific paths.
source_base_path = "/Users/user/Desktop/llmfl/llama-index-test/data"
destination_base_path = "/Users/user/Desktop/llmfl/llm-order/data/RankedData"

for project in projects:
    source_folder = os.path.join(source_base_path, project, "processed_by_depgraph_withoutline")
    destination_folder = os.path.join(destination_base_path, project, "depgraph")

    # Nothing to copy for this project
    if not os.path.exists(source_folder):
        print(f"Source folder {source_folder} does not exist.")
        continue

    # The project folder must exist before the copy; copytree creates "depgraph" itself
    os.makedirs(os.path.join(destination_base_path, project), exist_ok=True)

    # Copy under the new name; a failure is reported and the loop moves on
    try:
        shutil.copytree(source_folder, destination_folder)
    except Exception as e:
        print(f"Error copying {source_folder} to {destination_folder}: {e}")
    else:
        print(f"Copied and renamed {source_folder} to {destination_folder}")


Copied and renamed /Users/user/Desktop/llmfl/llama-index-test/data/Time/processed_by_depgraph_withoutline to /Users/user/Desktop/llmfl/llm-order/data/RankedData/Time/depgraph


# Put the ground-truth methods in first place in the ranking

In [3]:
import json
import os


def re_rank_methods(data):
    """Renumber ``method_id`` of every covered method to its 0-based list position.

    The dict is updated in place and returned for convenience.
    """
    position = 0
    for entry in data['covered_methods']:
        entry['method_id'] = position
        position += 1
    return data


# Function to map method IDs to their signatures from the JSON test files
def map_method_ids_to_signatures(processed_dir, bug_id, test_id):
    """Build a ``method_id -> method_signature`` map from one test JSON file.

    Args:
        processed_dir: Directory that contains one sub-folder per bug.
        bug_id: Name of the bug sub-folder.
        test_id: Either a bare test id (``0`` -> ``test_0.json``) or the file
            name itself (``"test_0.json"``), which is what ``os.listdir`` yields.

    Returns:
        Dict mapping each ``method_id`` to its ``method_signature``; empty when
        the file does not exist.
    """
    test_name = str(test_id)
    # A file name must not be wrapped again: "test_test_0.json.json" never exists
    if not test_name.endswith(".json"):
        test_name = f"test_{test_name}.json"
    test_file_path = os.path.join(processed_dir, f"{bug_id}", test_name)

    if not os.path.exists(test_file_path):
        return {}

    with open(test_file_path, 'r') as json_file:
        data = json.load(json_file)
    return {
        method['method_id']: method['method_signature']
        for method in data.get('covered_methods', [])
    }


# Function to load the ground truth methods from the txt file for a specific bug
def load_ground_truth_methods(ground_truth_dir, bug_id):
    """Load the ground-truth method signatures of one bug.

    Reads ``<ground_truth_dir>/<bug_id>.txt``; every non-blank line is one
    signature (surrounding whitespace is stripped).

    Returns:
        Set of signatures for quick membership tests; empty when the file does
        not exist.
    """
    ground_truth_file = os.path.join(ground_truth_dir, f"{bug_id}.txt")
    if not os.path.exists(ground_truth_file):
        return set()

    with open(ground_truth_file, 'r') as file:
        stripped_lines = (line.strip() for line in file)
        # Blank lines (e.g. a trailing empty line) are not signatures
        return {signature for signature in stripped_lines if signature}


# Function to create a perfect ranking of covered methods, with ground truth methods at the top
def generate_perfect_ranking(ranked_data_dir, ground_truth_dir, perfect_ranking_dir, project_name):
    """Write a "perfect" ranking for every test file of every bug.

    For each ``<ranked_data_dir>/<bug_id>/<test file>.json`` the covered
    methods are reordered so that the ground-truth methods of that bug come
    first (relative order otherwise preserved), ``method_id`` is renumbered to
    the new position, and the result is written to
    ``<perfect_ranking_dir>/<bug_id>/<test file>.json``.

    Args:
        ranked_data_dir: Directory with one sub-folder per bug.
        ground_truth_dir: Directory with one ``<bug_id>.txt`` per bug.
        perfect_ranking_dir: Output directory (created when missing).
        project_name: Not used by the function; kept so existing calls still work.
    """
    os.makedirs(perfect_ranking_dir, exist_ok=True)

    for bug_id in os.listdir(ranked_data_dir):  # Loop over bugs in the project
        bug_source_dir = os.path.join(ranked_data_dir, bug_id)
        # Stray files (e.g. .DS_Store) are not bug folders
        if not os.path.isdir(bug_source_dir):
            continue

        bug_ranking_dir = os.path.join(perfect_ranking_dir, bug_id)
        os.makedirs(bug_ranking_dir, exist_ok=True)

        ground_truth_methods = load_ground_truth_methods(ground_truth_dir, bug_id)

        for test_id in os.listdir(bug_source_dir):
            ranked_file_path = os.path.join(bug_source_dir, test_id)
            # Only JSON ranking files can be processed
            if not (test_id.endswith('.json') and os.path.isfile(ranked_file_path)):
                continue

            with open(ranked_file_path, 'r') as json_file:
                data = json.load(json_file)
            covered_methods = data.get('covered_methods', [])

            # Separate ground truth methods and non-ground truth methods
            ground_truth_covered = [
                method for method in covered_methods
                if method['method_signature'] in ground_truth_methods
            ]
            non_ground_truth_covered = [
                method for method in covered_methods
                if method['method_signature'] not in ground_truth_methods
            ]

            # Ground truth methods at the top, followed by other methods;
            # re_rank_methods then assigns the ids from the final order
            data['covered_methods'] = ground_truth_covered + non_ground_truth_covered
            data = re_rank_methods(data)

            # Save the perfect ranking to the output file
            perfect_ranking_file_path = os.path.join(bug_ranking_dir, test_id)
            with open(perfect_ranking_file_path, 'w') as json_file:
                json.dump(data, json_file, indent=4)


# List of projects and techniques
# projects = ["Cli", "Math", "Csv", "Codec", "Gson", "JacksonCore", "JacksonXml", "Mockito", "Compress", "Jsoup"]
projects = ["Time"]
techniques = ["callgraph"]

# Generate a perfect ranking for every (project, technique) combination.
# NOTE(review): the output folder is always 'perfect_callgraph', independent of
# `technique`; with more than one technique the later run would overwrite the earlier one.
for project_name in projects:
    for technique in techniques:
        ranked_data_dir = f'../data/RankedData/{project_name}/{technique}'
        ground_truth_dir = f'../data/BuggyMethods/{project_name}'
        perfect_ranking_dir = f'../data/RankedData/{project_name}/perfect_callgraph'
        generate_perfect_ranking(ranked_data_dir, ground_truth_dir, perfect_ranking_dir, project_name)


# Randomize the ranking

In [10]:
import json
import os
import random


def re_rank_methods(data):
    """Give every covered method a ``method_id`` equal to its current list index.

    Mutates ``data`` and returns the same object.
    """
    methods = data['covered_methods']
    for rank in range(len(methods)):
        methods[rank]['method_id'] = rank
    return data


# Function to map method IDs to their signatures from the JSON test files
def map_method_ids_to_signatures(processed_dir, bug_id, test_id):
    """Return a dict ``method_id -> method_signature`` read from one test JSON file.

    Args:
        processed_dir: Directory holding one sub-folder per bug.
        bug_id: Name of the bug sub-folder.
        test_id: A bare test id (``0`` is looked up as ``test_0.json``) or a
            ready-made file name such as ``"test_0.json"``.

    Returns:
        The mapping, or an empty dict when the file is missing.
    """
    file_name = str(test_id)
    # File names (as produced by os.listdir) are used as-is; only bare ids are
    # expanded, otherwise the lookup would target "test_test_0.json.json"
    if not file_name.endswith(".json"):
        file_name = f"test_{file_name}.json"
    test_file_path = os.path.join(processed_dir, f"{bug_id}", file_name)

    method_signatures_map = {}
    if os.path.exists(test_file_path):
        with open(test_file_path, 'r') as json_file:
            data = json.load(json_file)
        for method in data.get('covered_methods', []):
            method_signatures_map[method['method_id']] = method['method_signature']
    return method_signatures_map


# Function to create a random ranking of covered methods
def generate_random_ranking(ranked_data_dir, random_ranking_dir, project_name, seed=None):
    """Write a randomly shuffled ranking for every test file of every bug.

    For each ``<ranked_data_dir>/<bug_id>/<test file>.json`` the covered
    methods are shuffled, ``method_id`` is renumbered to the new position, and
    the result is written to ``<random_ranking_dir>/<bug_id>/<test file>.json``.

    Args:
        ranked_data_dir: Directory with one sub-folder per bug.
        random_ranking_dir: Output directory (created when missing).
        project_name: Not used by the function; kept so existing calls still work.
        seed: Optional seed. When given, a private generator seeded with it is
            used so the same input tree produces the same rankings on every
            run; when ``None`` the module-level ``random`` state is used.
    """
    shuffle = random.shuffle if seed is None else random.Random(seed).shuffle

    os.makedirs(random_ranking_dir, exist_ok=True)

    # Sorted so that a seeded run visits the files in a fixed order
    for bug_id in sorted(os.listdir(ranked_data_dir)):  # Loop over bugs in the project
        bug_source_dir = os.path.join(ranked_data_dir, bug_id)
        # Stray files (e.g. .DS_Store) are not bug folders
        if not os.path.isdir(bug_source_dir):
            continue

        bug_ranking_dir = os.path.join(random_ranking_dir, bug_id)
        os.makedirs(bug_ranking_dir, exist_ok=True)

        for test_id in sorted(os.listdir(bug_source_dir)):
            ranked_file_path = os.path.join(bug_source_dir, test_id)
            # Only JSON ranking files can be processed
            if not (test_id.endswith('.json') and os.path.isfile(ranked_file_path)):
                continue

            with open(ranked_file_path, 'r') as json_file:
                data = json.load(json_file)
            covered_methods = data.get('covered_methods', [])

            # Randomize the order of covered methods
            shuffle(covered_methods)
            data['covered_methods'] = covered_methods

            # Reassign method IDs in sequence after randomizing
            data = re_rank_methods(data)

            # Save the random ranking to the output file
            random_ranking_file_path = os.path.join(bug_ranking_dir, test_id)
            with open(random_ranking_file_path, 'w') as json_file:
                json.dump(data, json_file, indent=4)


# List of projects and techniques
# projects = ["Cli", "Math", "Csv", "Codec", "Gson", "JacksonCore", "JacksonXml", "Mockito", "Compress", "Jsoup", "Lang"]
projects = ["Time"]
techniques = ["execution"]

# Shuffle the rankings of every (project, technique) combination.
# NOTE(review): the output folder is always 'random', independent of `technique`;
# with more than one technique the later run would overwrite the earlier one.
for project_name in projects:
    for technique in techniques:
        ranked_data_dir = f'../data/RankedData/{project_name}/{technique}'
        random_ranking_dir = f'../data/RankedData/{project_name}/random'  # Save random rankings here
        generate_random_ranking(ranked_data_dir, random_ranking_dir, project_name)


# Find the Kendall Tau Distance

In [9]:
from scipy.stats import kendalltau

# Two hand-written rankings of the same three methods (list order = rank)
json1 = {
    "covered_methods": [
        {
            "method_signature": "org.apache.commons.lang3.math.NumberUtils:createInteger(Ljava/lang/String;)Ljava/lang/Integer;",
            "method_body": "public static Integer createInteger(final String str) {...}",
            "method_id": 0
        },
        {
            "method_signature": "org.apache.commons.lang3.math.NumberUtils:createNumber(Ljava/lang/String;)Ljava/lang/Number;",
            "method_body": "public static Number createNumber(final String str) throws NumberFormatException {...}",
            "method_id": 1
        },
        {
            "method_signature": "org.apache.commons.lang3.StringUtils:isBlank(Ljava/lang/CharSequence;)Z",
            "method_body": "public static boolean isBlank(final CharSequence cs) {...}",
            "method_id": 2
        }
    ]
}

json2 = {
    "covered_methods": [
        {
            "method_signature": "org.apache.commons.lang3.math.NumberUtils:createNumber(Ljava/lang/String;)Ljava/lang/Number;",
            "method_body": "public static Number createNumber(final String str) throws NumberFormatException {...}",
            "method_id": 0
        },
        {
            "method_signature": "org.apache.commons.lang3.StringUtils:isBlank(Ljava/lang/CharSequence;)Z",
            "method_body": "public static boolean isBlank(final CharSequence cs) {...}",
            "method_id": 2
        },
        {
            "method_signature": "org.apache.commons.lang3.math.NumberUtils:createInteger(Ljava/lang/String;)Ljava/lang/Integer;",
            "method_body": "public static Integer createInteger(final String str) {...}",
            "method_id": 1
        }
    ]
}

# Signatures in list order, one list per ranking
signatures1 = [entry['method_signature'] for entry in json1['covered_methods']]
signatures2 = [entry['method_signature'] for entry in json2['covered_methods']]

# Walk the methods in the order of the first ranking and look up the position
# each one has in the first and in the second ranking
ranking1 = list(map(signatures1.index, signatures1))
ranking2 = list(map(signatures2.index, signatures1))

# kendalltau returns the tau statistic together with a p-value
tau, p_value = kendalltau(ranking1, ranking2)

print(f"Kendall Tau Distance: {tau}")


Kendall Tau Distance: -0.33333333333333337


# Find Kendall Tau Distance for all the ranked methods

In [6]:
import os
import json
from scipy.stats import kendalltau

# Root folder of the ranking data; files are read from
# <base_path>/<project>/<technique>/<bug version>/<test file>
base_path = "../data/RankedData"

# Projects to process
projects = ["Cli", "Math", "Csv", "Codec", "Gson", "JacksonCore", "JacksonXml", "Mockito", "Compress", "Jsoup", "Lang"]

# List of ranking techniques (each one is a sub-folder of a project folder)
techniques = ['ochiai', 'depgraph', 'execution', 'perfect', 'random']

# Dictionary to hold the results for all projects
all_project_results = {}

# Dictionary to keep track of errors (mismatched sizes)
error_log = {}

def load_ranking_data(project, version, technique, test_file):
    """Return the method signatures of one ranking file, in file order.

    The file is ``<base_path>/<project>/<technique>/<version>/<test_file>``,
    where ``base_path`` is the module-level setting.
    """
    file_path = os.path.join(base_path, project, technique, version, test_file)
    with open(file_path, 'r') as handle:
        payload = json.load(handle)

    # Keep the order of appearance: the position is the rank
    signatures = []
    for entry in payload['covered_methods']:
        signatures.append(entry['method_signature'])
    return signatures

def calculate_kendall_tau(project, technique1, technique2, version, test_file):
    """Compare the rankings two techniques produced for one test file.

    Returns:
        The tau value computed by ``scipy.stats.kendalltau``, or an explanatory
        string when there are too few methods to compare.

    Raises:
        ValueError: when the two rankings do not line up to the same size.
    """
    ranking1 = load_ranking_data(project, version, technique1, test_file)
    ranking2 = load_ranking_data(project, version, technique2, test_file)

    # A single method (or none) cannot be ranked against anything
    if min(len(ranking1), len(ranking2)) < 2:
        return "Insufficient data (less than 2 methods)"

    # Position of every signature in the first ranking ...
    rank_map1 = dict(zip(ranking1, range(len(ranking1))))
    # ... and those positions read off in the order of the second ranking
    rank_map2 = []
    for signature in ranking2:
        if signature in rank_map1:
            rank_map2.append(rank_map1[signature])

    # Both sides must describe the same number of methods
    if len(rank_map1) != len(rank_map2):
        raise ValueError(f"Size mismatch between rankings: {len(rank_map1)} and {len(rank_map2)}")

    if len(rank_map1) <= 1 or len(rank_map2) <= 1:
        return "Insufficient data (no valid pairs)"

    tau, p_value = kendalltau(list(rank_map1.values()), rank_map2)
    return tau


# Loop through all projects
for project in projects:
    project_results = {}

    # Bug versions are read from the first technique's folder (they are assumed
    # to be common across all techniques). Stray files such as .DS_Store are
    # skipped because only folders are bug versions.
    reference_dir = os.path.join(base_path, project, techniques[0])
    bug_versions = [
        entry for entry in os.listdir(reference_dir)
        if os.path.isdir(os.path.join(reference_dir, entry))
    ]

    # Loop through all bug versions and calculate Kendall Tau Distance for all technique pairs
    for version in bug_versions:
        version_path = os.path.join(reference_dir, version)

        # Get all test files for this version (test_0.json, test_1.json, etc.)
        test_files = [f for f in os.listdir(version_path) if f.endswith('.json')]

        version_key = f"{version}"
        project_results[version_key] = {}  # Create an entry for this bug version

        # Loop through each test file (e.g., test_0.json, test_1.json)
        for test_file in test_files:
            test_key = f"{test_file}"
            project_results[version_key][test_key] = {}  # Create an entry for this test file

            # Compare all technique pairs for the current test file
            for i in range(len(techniques)):
                for j in range(i + 1, len(techniques)):
                    technique1 = techniques[i]
                    technique2 = techniques[j]

                    # Calculate Kendall Tau Distance for the current test file
                    try:
                        tau = calculate_kendall_tau(project, technique1, technique2, version, test_file)
                        comparison_key = f"{technique1}_vs_{technique2}"
                        project_results[version_key][test_key][comparison_key] = tau
                    except (ValueError, OSError) as e:
                        # ValueError: size mismatch (or malformed JSON);
                        # OSError: a technique has no such file. Either way the
                        # pair is logged and the run continues.
                        print(f"Error in {project}, bug {version}, file {test_file}: {e}")

                        # Log the error details in the error_log dictionary
                        file_errors = (
                            error_log
                            .setdefault(project, {})
                            .setdefault(version, {})
                            .setdefault(test_file, [])
                        )
                        file_errors.append(f"{technique1}_vs_{technique2}: {str(e)}")

    # Store the project results in the main dictionary
    all_project_results[project] = project_results

# Save the combined results for all projects to a JSON file
output_path = "../data/KendallTau/combined_kendall_tau_results.json"
# The output folder may not exist yet on a fresh checkout
os.makedirs(os.path.dirname(output_path), exist_ok=True)
with open(output_path, 'w') as outfile:
    json.dump(all_project_results, outfile, indent=4)

# Save the error log to a separate JSON file
error_log_path = "../data/KendallTau/kendall_tau_errors.json"
with open(error_log_path, 'w') as errorfile:
    json.dump(error_log, errorfile, indent=4)

print(f"Results saved to {output_path}")
print(f"Error log saved to {error_log_path}")


Error in JacksonXml, bug 5, file test_0.json: Size mismatch between rankings: 98 and 97
Error in JacksonXml, bug 5, file test_0.json: Size mismatch between rankings: 98 and 97
Error in JacksonXml, bug 5, file test_0.json: Size mismatch between rankings: 98 and 97
Error in JacksonXml, bug 5, file test_0.json: Size mismatch between rankings: 98 and 97
Results saved to ../data/KendallTau/combined_kendall_tau_results.json
Error log saved to ../data/KendallTau/kendall_tau_errors.json


# Calculate More Metrics from the overall Kendall Tau distance

In [7]:
import json
import numpy as np

# Load the combined results from the JSON file
# (nested as project -> bug version -> test file -> technique pair -> value)
with open('../data/KendallTau/combined_kendall_tau_results.json', 'r') as f:
    all_project_results = json.load(f)

# Dictionary to hold the summary results, keyed by project name
project_summary = {}

def calculate_project_summary(project, project_data):
    """Summarise the tau values of one project.

    Args:
        project: Project name (not used in the computation).
        project_data: Nested dict ``version -> test file -> technique pair -> value``.
            Values that are not numbers (e.g. explanatory strings) and NaN
            values are ignored.

    Returns:
        Dict with
        ``overall_average_tau`` / ``overall_std_tau``: mean and (population)
        standard deviation over all valid values, NaN when there are none;
        ``pair_averages``: mean per technique pair that has at least one valid value;
        ``total_comparisons``: number of valid values.
    """
    # Collect the valid tau values per technique pair
    tau_values = {}
    for version_data in project_data.values():
        for test_data in version_data.values():
            for pair, tau in test_data.items():
                if not isinstance(tau, (int, float)):
                    continue  # e.g. "Insufficient data ..." markers
                if np.isnan(tau):
                    continue  # a single NaN would turn every average into NaN
                tau_values.setdefault(pair, []).append(tau)

    pair_averages = {pair: float(np.mean(values)) for pair, values in tau_values.items()}
    overall_tau_values = [tau for values in tau_values.values() for tau in values]

    if overall_tau_values:
        overall_avg = float(np.mean(overall_tau_values))
        overall_std = float(np.std(overall_tau_values))
    else:
        # Nothing to average: report NaN without triggering numpy's empty-slice warning
        overall_avg = float('nan')
        overall_std = float('nan')

    return {
        "overall_average_tau": overall_avg,
        "overall_std_tau": overall_std,
        "pair_averages": pair_averages,
        "total_comparisons": len(overall_tau_values)
    }

# Loop through each project in the results and compute its summary
for project, project_data in all_project_results.items():
    project_summary[project] = calculate_project_summary(project, project_data)

# Save the summary to a JSON file
# NOTE(review): the target folder is not created here; it must already exist
summary_output_path = "../data/KendallTau/kendall_tau_summary.json"
with open(summary_output_path, 'w') as outfile:
    json.dump(project_summary, outfile, indent=4)

print(f"Summary saved to {summary_output_path}")


Summary saved to ../data/KendallTau/kendall_tau_summary.json


# Rename the folders

In [1]:
import os

def rename_callgraph_folders(base_path):
    """
    Rename the 'callgraph' folder of every project to 'callgraph_dfs'.

    Only direct sub-directories of ``base_path`` are treated as projects; a
    message is printed for each project, whether or not it was renamed.

    Args:
        base_path (str): The path to the root folder containing project directories.
    """
    candidate_paths = (os.path.join(base_path, entry) for entry in os.listdir(base_path))

    # Plain files directly under the base path are not projects
    for project_path in filter(os.path.isdir, candidate_paths):
        source = os.path.join(project_path, "callgraph")
        target = os.path.join(project_path, "callgraph_dfs")

        if not os.path.exists(source):
            print(f"No 'callgraph' folder found in {project_path}")
            continue

        os.rename(source, target)
        print(f"Renamed: {source} -> {target}")

# Entry point of this cell: rename the 'callgraph' folders below RankedData
if __name__ == "__main__":
    # Define the base path (relative, resolved against the current working directory)
    base_path = "../data/RankedData"

    # Run the renaming script
    rename_callgraph_folders(base_path)


Renamed: ../data/RankedData/Compress/callgraph -> ../data/RankedData/Compress/callgraph_dfs
Renamed: ../data/RankedData/JacksonCore/callgraph -> ../data/RankedData/JacksonCore/callgraph_dfs
Renamed: ../data/RankedData/Gson/callgraph -> ../data/RankedData/Gson/callgraph_dfs
Renamed: ../data/RankedData/JacksonXml/callgraph -> ../data/RankedData/JacksonXml/callgraph_dfs
Renamed: ../data/RankedData/Codec/callgraph -> ../data/RankedData/Codec/callgraph_dfs
Renamed: ../data/RankedData/Cli/callgraph -> ../data/RankedData/Cli/callgraph_dfs
Renamed: ../data/RankedData/Math/callgraph -> ../data/RankedData/Math/callgraph_dfs
Renamed: ../data/RankedData/Time/callgraph -> ../data/RankedData/Time/callgraph_dfs
Renamed: ../data/RankedData/Lang/callgraph -> ../data/RankedData/Lang/callgraph_dfs
Renamed: ../data/RankedData/Jsoup/callgraph -> ../data/RankedData/Jsoup/callgraph_dfs
Renamed: ../data/RankedData/Csv/callgraph -> ../data/RankedData/Csv/callgraph_dfs
Renamed: ../data/RankedData/Mockito/callgr

# Find Kendall Tau distance between two rankings

In [18]:
import json
import os
from scipy.stats import kendalltau
from glob import glob

def load_signatures(file_path):
    """Read a ranking JSON file and return its method signatures in file order."""
    with open(file_path, 'r') as handle:
        payload = json.load(handle)

    signatures = []
    for entry in payload["covered_methods"]:
        signatures.append(entry["method_signature"])
    return signatures

def calculate_kendall_tau(perfect_signatures, other_signatures):
    """Compute Kendall's tau between two rankings given as lists of method signatures.

    Args:
        perfect_signatures: Signatures in the order of the reference ranking.
        other_signatures: Signatures in the order of the ranking to compare;
            signatures unknown to the reference ranking are ignored.

    Returns:
        The tau value from ``scipy.stats.kendalltau``; ``1.0`` when there is a
        single method; NaN when the rankings do not line up to the same length
        or when there is nothing to compare.
    """
    # Position of each signature in the perfect ranking, built once. The first
    # occurrence wins, which is the position list.index() would report.
    position = {}
    for idx, sig in enumerate(perfect_signatures):
        position.setdefault(sig, idx)

    perfect_order = [position[sig] for sig in perfect_signatures]
    other_order = [position[sig] for sig in other_signatures if sig in position]

    # Check if lengths match
    if len(perfect_order) != len(other_order):
        return float('nan')

    # No methods at all: tau is undefined
    if not perfect_order:
        return float('nan')

    # Handle the case of a single covered method
    if len(perfect_order) == 1:
        return 1.0  # Perfect agreement by default

    # Calculate Kendall Tau
    tau, _ = kendalltau(perfect_order, other_order)
    return tau

def compare_with_perfect(perfect_dir, comparison_dirs):
    """Compute tau against the perfect ranking for every available test file.

    Walks ``<perfect_dir>/<version>/test_*.json`` and, for each comparison
    directory, looks for the file with the same version and name.

    Returns:
        Dict mapping the base name of each comparison directory to the list of
        tau values of the files found there.
    """
    results = {}

    # One folder per version, e.g. `perfect_callgraph/1`, `perfect_callgraph/2`
    for version_dir in glob(os.path.join(perfect_dir, "*")):
        version = os.path.basename(version_dir)

        for perfect_file in glob(os.path.join(version_dir, "test_*.json")):
            perfect_filename = os.path.basename(perfect_file)
            perfect_signatures = load_signatures(perfect_file)

            for comparison_dir in comparison_dirs:
                taus = results.setdefault(os.path.basename(comparison_dir), [])

                # Same version and file name inside the comparison directory
                comparison_file = os.path.join(comparison_dir, version, perfect_filename)
                if not os.path.exists(comparison_file):
                    continue

                other_signatures = load_signatures(comparison_file)
                taus.append(calculate_kendall_tau(perfect_signatures, other_signatures))

    return results

def calculate_average_kendall_tau(results):
    """Average the tau values of each technique.

    Args:
        results: Dict mapping a technique name to a list of tau values.
            ``None`` and NaN entries are treated as "no result" and skipped.

    Returns:
        Dict mapping each technique to the mean of its valid values, or NaN
        when it has none.
    """
    import math

    averages = {}
    for technique, taus in results.items():
        # NaN marks rankings that could not be compared; a single one would
        # otherwise turn the whole average into NaN
        valid_taus = [tau for tau in taus if tau is not None and not math.isnan(tau)]
        if valid_taus:
            averages[technique] = sum(valid_taus) / len(valid_taus)  # Compute average
        else:
            averages[technique] = float('nan')  # Handle case with no valid results
    return averages

# Define paths and projects
projects = ["Cli", "Math", "Csv", "Codec", "Compress", "Gson", "JacksonCore", "JacksonXml", "Mockito", "Jsoup", "Lang", "Time"]
base_path = "../data/RankedData"

# Process each project and print results
print("\nOverall Kendall Tau distances by project:")
for project in projects:
    # Reference ranking and the two rankings compared against it
    perfect_dir = os.path.join(base_path, project, "perfect_callgraph")
    comparison_dirs = [
        os.path.join(base_path, project, "callgraph_bfs"),
        os.path.join(base_path, project, "callgraph_dfs")
    ]

    # Projects without a perfect ranking are reported and skipped
    if os.path.exists(perfect_dir):
        distances = compare_with_perfect(perfect_dir, comparison_dirs)
        averages = calculate_average_kendall_tau(distances)

        # Print the results for the current project
        print(f"\nProject: {project}")
        for technique, avg_tau in averages.items():
            print(f"  Perfect vs {technique}: {avg_tau}")
    else:
        print(f"\nProject: {project} - Perfect directory not found.")



Overall Kendall Tau distances by project:

Project: Cli
  Perfect vs callgraph_bfs: 0.6704479015828727
  Perfect vs callgraph_dfs: 0.934192255470955

Project: Math
  Perfect vs callgraph_bfs: 0.8168682422221175
  Perfect vs callgraph_dfs: 0.9619466402996407

Project: Csv
  Perfect vs callgraph_bfs: 0.8389668386472428
  Perfect vs callgraph_dfs: 0.9825531198753418

Project: Codec
  Perfect vs callgraph_bfs: 0.7062768308489742
  Perfect vs callgraph_dfs: 0.8461477696256832

Project: Compress
  Perfect vs callgraph_bfs: 0.8663448839935057
  Perfect vs callgraph_dfs: 0.9836811651895877

Project: Gson
  Perfect vs callgraph_bfs: 0.8491498040527607
  Perfect vs callgraph_dfs: 0.9469443643012886

Project: JacksonCore
  Perfect vs callgraph_bfs: 0.8467787286965645
  Perfect vs callgraph_dfs: 0.9409041183606812

Project: JacksonXml
  Perfect vs callgraph_bfs: 0.9379582875532803
  Perfect vs callgraph_dfs: 0.9961593457543384

Project: Mockito
  Perfect vs callgraph_bfs: 0.9839779198698787
  Per