# Evaluate localization strategies

This notebook does a comparative evaluation of different localization strategies.
- Defines a base interface for localization
- Implements a few localization strategies
- Defines an evaluator that runs a test suite on those localization strategies
- Evaluator dumps the results in a pandas dataframe
- Uses Milvus as the vector database
- Uses an embeddings model served through Ollama (`mxbai-embed-large`)
- Uses langchain's abstractions for processing

In [1]:
import os
import tempfile
import pandas as pd
import shutil
import yaml

from typing import Dict, List, Tuple, Iterator
from abc import ABC, abstractmethod
from langchain_core.documents import Document
from langchain_milvus import Milvus

## Base interface for localization strategies

In [2]:
class Strategy(ABC):
    """Common interface that every localization strategy has to implement."""

    @abstractmethod
    def localize(self, issue: Dict[str, str], top_n: int) -> List[Tuple[str, str]]:
        """
        Map an issue onto the packages and files that are relevant to it.

        Args:
            issue (Dict[str, str]): Issue details; must provide at least:
                - `title` (str): The issue's title.
                - `description` (str): The issue's detailed description.
            top_n (int): Upper bound on the number of localization results returned.

        Returns:
            List[Tuple[str, str]]: Localization results, one `(package, file)`
                tuple of strings per relevant location.
        """
        ...

## Semantic vector search strategy

This implements a simple semantic vector search strategy. It uses Milvus as the vector database and whichever embeddings model is passed in (this notebook uses Ollama's `mxbai-embed-large`). The implementation may be used as-is for multiple strategies by feeding in different types of sources, e.g.:
- **Code file embeddings**: Providing a `source_dir` pointing to code files will directly embed code
- **Code semantics embeddings**: Providing a `source_dir` pointing to semantic descriptions of code files will embed code semantics

In [3]:
class SemanticVectorSearchStrategy(Strategy):
    """Localization strategy based on similarity search over a Milvus vector store.

    Every non-empty text file found under `source_dir` is embedded as one document,
    tagged with the package (folder path) and file name it came from. Localizing an
    issue is a similarity search of the issue text against those documents.
    """

    def __init__(self, source_dir: str, root_package_name: str, embeddings, strategy_name: str):
        """
        Args:
            source_dir (str): Folder that is walked recursively to collect documents.
            root_package_name (str): Package name given to files directly under
                `source_dir`, prefix of the package name of nested files, and name
                of the Milvus collection.
            embeddings: Embeddings object handed to `Milvus.from_documents`.
            strategy_name (str): Human-readable label of this strategy; the
                evaluator uses it to name the result columns.
        """
        self.strategy_name = strategy_name
        self.vector_store = self.create_vector_store(source_dir, root_package_name, embeddings)

    def create_vector_store(self, folder_path: str, root_package_name: str, embeddings) -> Milvus:
        """Creates a Milvus vector store from the files in the specified folder."""
        documents = self.create_documents(folder_path, root_package_name)
        # The temporary file only reserves a unique `.db` path: it is closed right
        # away and, because of `delete=False`, left on disk for Milvus to use.
        # NOTE(review): presumably Milvus treats this path as a local database file,
        # and nothing in this class removes it afterwards — confirm.
        with tempfile.NamedTemporaryFile(suffix='.db', delete=False) as tmp_file:
            uri = tmp_file.name
        return Milvus.from_documents(
            documents,
            embeddings,
            collection_name=root_package_name,
            connection_args={"uri": uri},
        )

    def create_documents(self, folder_path: str, root_package_name: str) -> List[Document]:
        """Create a list of Document instances from the files in the specified folder.

        Files that are empty (or whitespace only) and files that cannot be decoded
        as UTF-8 text (e.g. compiled bytecode) are skipped.

        Returns:
            List[Document]: One document per remaining file, with the file's text as
                `page_content` and `file` / `package` entries in its metadata.
        """
        documents = []
        for root, _, files in os.walk(folder_path):
            # The package depends only on the folder, so compute it once per folder.
            relative_path = os.path.relpath(root, folder_path)
            package = (f"{root_package_name}/{relative_path.replace(os.sep, '/')}"
                       if relative_path != "." else root_package_name)
            for file in files:
                file_path = os.path.join(root, file)
                try:
                    # Explicit encoding: the default depends on the platform locale.
                    with open(file_path, "r", encoding="utf-8") as f:
                        page_content = f.read()
                except UnicodeDecodeError:
                    # Not a text file; there is nothing meaningful to embed.
                    continue
                if not page_content.strip():
                    continue
                # The name is everything before the FIRST dot, so `foo.py` and
                # `foo.py.md` both yield `foo`.
                filename = file.split('.')[0]
                documents.append(Document(
                    page_content=page_content,
                    metadata={"file": filename, "package": package}
                ))
        return documents

    def localize(self, issue: Dict[str, str], top_n: int) -> List[Tuple[str, str]]:
        """Return `(package, file)` for the `top_n` documents most similar to the issue.

        The query is the issue's `title` and `description` joined as
        `"<title>: <description>"`; both keys must be present in `issue`.
        """
        query_string = f"{issue['title']}: {issue['description']}"
        results = self.vector_store.similarity_search(query_string, k=top_n)
        return [(res.metadata["package"], res.metadata["file"]) for res in results]

## Dataset

In [4]:
class Issue:
    """One test issue of the dataset together with its expected localization results."""

    def __init__(self, id: str, title: str, content: str, expected_results: List[str]):
        self.id, self.title = id, title
        self.content, self.expected_results = content, expected_results

    def to_dict(self) -> Dict[str, str]:
        """Expose the issue as a mapping with `title` and `description` keys."""
        as_mapping = dict(title=self.title, description=self.content)
        return as_mapping

class Dataset:
    """Test suite of issues described by a YAML file.

    The YAML document must contain a top-level `test_cases` entry. Each test case
    provides `id`, `title`, `expected_results` and `filepath`, where `filepath`
    points to the file holding the issue text and is resolved relative to the
    folder containing the YAML file.
    """

    def __init__(self, yaml_path: str):
        """Load the test cases listed in the YAML file at `yaml_path`.

        Raises:
            KeyError: If the YAML document has no `test_cases` entry.
        """
        self.yaml_dir = os.path.dirname(yaml_path)  # Get the directory containing the YAML file
        # Explicit encoding: the default depends on the platform locale.
        with open(yaml_path, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)
        self.test_cases = data["test_cases"]

    def __iter__(self) -> Iterator["Issue"]:
        """Allows iteration over Issue instances created from test cases.

        The issue text is read from disk lazily, one file per yielded issue.
        """
        for case in self.test_cases:
            # Construct the full path to the markdown file
            full_path = os.path.join(self.yaml_dir, case["filepath"])
            # Load the content from the markdown file
            with open(full_path, 'r', encoding='utf-8') as f:
                content = f.read()
            # Create an Issue instance for each test case
            yield Issue(
                id=case["id"],
                title=case["title"],
                content=content,
                expected_results=case["expected_results"]
            )

    def __len__(self) -> int:
        """Returns the number of test cases in the dataset."""
        return len(self.test_cases)

# Load the evaluation test suite; the relative path is resolved against the
# current working directory of the notebook.
dataset = Dataset("test/dataset.yaml")

## Evaluator

In [5]:
class LocalizationEvaluator:
    """Runs every strategy against every issue of a dataset and tabulates the scores."""

    def __init__(self, dataset: "Dataset", strategies_to_evaluate: List["Strategy"]):
        """
        Args:
            dataset: Iterable of issues; each issue must expose `title`, `content`
                and `expected_results`.
            strategies_to_evaluate: Strategies to compare; each must expose
                `strategy_name` and `localize(issue, top_n=...)`.
        """
        self.dataset = dataset
        self.strategies = strategies_to_evaluate

    def calculate_score(self, expected_results: List[str], actual_results: List[str]) -> float:
        """Calculates the score with distance-based penalties for expected results outside the top-k.

        `k` is the number of expected results. Starting from a perfect 1.0:
        - an expected item found within the first `k` actual results costs nothing;
        - an expected item found later costs `(1 / k) * 0.2` for every position it
          lies beyond the top-k (first position outside counts as 1);
        - an expected item that is missing altogether costs the full `1 / k`.

        Returns:
            float: The score, never lower than 0.0; 1.0 when nothing is expected.
        """
        top_k = len(expected_results)
        if top_k == 0:
            return 1.0  # Nothing expected, nothing to penalize

        full_penalty = 1 / top_k
        score = 1.0  # Start with a perfect score of 1
        for expected in expected_results:
            try:
                index = actual_results.index(expected)
            except ValueError:
                # Full penalty if expected item is missing altogether
                score -= full_penalty
                continue
            if index >= top_k:
                # Distance-based partial penalty if it's outside top-k but present in results
                distance_factor = index - top_k + 1
                score -= full_penalty * distance_factor * 0.2

        return max(score, 0.0)  # Ensure score doesn't go below 0

    def evaluate(self, top_n: int = 5) -> pd.DataFrame:
        """Evaluates each strategy on all test issues and returns a DataFrame with results and scores.

        Args:
            top_n (int): Number of localization results requested from each strategy.

        Returns:
            pd.DataFrame: One row per issue with its title, its expected results and,
                per strategy, a `"<score> <file names>"` string; followed by a final
                `Score` row holding each strategy's average score as a percentage
                (`N/A` when the dataset yields no issues).
        """
        result_columns = [f"Results ({strategy.strategy_name})" for strategy in self.strategies]
        columns = ["Issue Title", "Expected Results"] + result_columns

        # Dictionary to store total scores per strategy
        total_scores = {strategy.strategy_name: 0.0 for strategy in self.strategies}

        # Rows are collected first and the DataFrame is built once at the end,
        # rather than re-allocating the frame with `pd.concat` for every issue.
        rows = []
        issue_count = 0
        for issue in self.dataset:
            issue_count += 1
            issue_data = {"title": issue.title, "description": issue.content}  # Prepare data for localization
            row_data = {
                "Issue Title": issue.title,
                "Expected Results": issue.expected_results
            }

            for strategy, column in zip(self.strategies, result_columns):
                # Only the file part of each (package, file) result is scored
                actual_results = [res[1] for res in strategy.localize(issue_data, top_n=top_n)]
                score = self.calculate_score(issue.expected_results, actual_results)
                total_scores[strategy.strategy_name] += score  # Accumulate score for total
                row_data[column] = f"{score:.2f} {actual_results}"

            rows.append(row_data)

        # Final row: average score per strategy, as a percentage
        total_row = {"Issue Title": "Score", "Expected Results": ""}
        for strategy, column in zip(self.strategies, result_columns):
            if issue_count:
                average = total_scores[strategy.strategy_name] / issue_count
                total_row[column] = f"{average*100:.2f}%"
            else:
                # No issues evaluated: there is no average to report
                total_row[column] = "N/A"
        rows.append(total_row)

        return pd.DataFrame(rows, columns=columns)

**Test setup**

In [6]:
# Root folder holding the repository checkout and its metadata. Defaults to the
# path used when this notebook was authored; set the PROJECTS_STORE environment
# variable to run the notebook against a different location.
projects_store = os.environ.get("PROJECTS_STORE", "/Users/pdhoolia/projects-store")
repo_full_name = "pdhoolia/se-agent"
src_dir = "se_agent"

# <store>/<repo>/repo/<src_dir>: the source code files
code_dir = os.path.join(projects_store, repo_full_name, "repo", src_dir)
# <store>/<repo>/metadata/package_details: the semantic summaries of the code
code_semantics_dir = os.path.join(projects_store, repo_full_name, "metadata", "package_details")

**Create combined semantic summary + code files**

In [7]:
# Create a temporary directory for the combined documents
# (it is removed explicitly by the last cell of the notebook)
combined_docs_dir = tempfile.mkdtemp()

# Iterate over the semantic summaries and combine with corresponding code files
for root, _, files in os.walk(code_semantics_dir):
    # Location of this folder relative to the summaries root; the code file is
    # looked up at the same relative location under `code_dir`.
    relative_path = os.path.relpath(root, code_semantics_dir)
    for file in files:
        if not file.endswith(".md"):
            continue
        # Everything before the FIRST dot, so `foo.md` and `foo.py.md` both map to `foo`
        filename_without_extn = file.split('.')[0]
        summary_file_path = os.path.join(root, file)
        # Get corresponding code file path
        code_file_path = os.path.join(code_dir, relative_path, f"{filename_without_extn}.py")

        # Only proceed if the code file exists
        if not os.path.exists(code_file_path):
            continue

        # Read content from both summary and code files.
        # Explicit encoding: the default depends on the platform locale.
        with open(summary_file_path, "r", encoding="utf-8") as summary_file:
            semantic_summary_content = summary_file.read()
        with open(code_file_path, "r", encoding="utf-8") as code_file:
            code_content = code_file.read()

        # Combine the contents
        combined_content = f"# Semantic summary\n\n{semantic_summary_content}\n\n# Code\n\n```python\n{code_content}\n```"

        # Define path for the combined document in the temporary folder
        combined_file_dir = os.path.join(combined_docs_dir, relative_path)
        os.makedirs(combined_file_dir, exist_ok=True)
        combined_file_path = os.path.join(combined_file_dir, f"{filename_without_extn}.md")

        # Save the combined content
        with open(combined_file_path, "w", encoding="utf-8") as combined_file:
            combined_file.write(combined_content)

**Embeddings**

In [8]:
from langchain_ollama import OllamaEmbeddings

# Embeddings model shared by every strategy created below: `mxbai-embed-large`,
# accessed through langchain's Ollama integration.
# NOTE(review): presumably requires a reachable Ollama server with this model
# already pulled — confirm before running.
embeddings = OllamaEmbeddings(model="mxbai-embed-large")

**Strategies**

In [9]:
# One strategy per kind of source material. All three share the embeddings model
# and the root package name; they differ only in the folder they index.
code_file_embeddings = SemanticVectorSearchStrategy(
    source_dir=code_dir,
    root_package_name=src_dir,
    embeddings=embeddings,
    strategy_name="Code File Embeddings",
)
code_semantics_embeddings = SemanticVectorSearchStrategy(
    source_dir=code_semantics_dir,
    root_package_name=src_dir,
    embeddings=embeddings,
    strategy_name="Code Semantics Embeddings",
)
combined_embeddings = SemanticVectorSearchStrategy(
    source_dir=combined_docs_dir,
    root_package_name=src_dir,
    embeddings=embeddings,
    strategy_name="Combined Embeddings",
)

strategies_to_evaluate = [
    code_file_embeddings,
    code_semantics_embeddings,
    combined_embeddings,
]

**Evaluate**

In [10]:
# Run every strategy against every issue of the dataset and keep the results table
evaluator = LocalizationEvaluator(dataset, strategies_to_evaluate)
evaluation_results = evaluator.evaluate()

**Display results**

In [11]:
# Create a copy of the DataFrame for display purposes
display_df = evaluation_results.copy()

# Set the index to start from 1
display_df.index = display_df.index + 1

# Apply left alignment to all columns, including headers
df_style = display_df.style \
    .set_table_attributes("style='width:100%'") \
    .set_properties(**{'text-align': 'left'}) \
    .set_table_styles([{
        'selector': 'th',
        'props': [('text-align', 'left')]
    }])

df_style

Unnamed: 0,Issue Title,Expected Results,Results (Code File Embeddings),Results (Code Semantics Embeddings),Results (Combined Embeddings)
1,Project level override for github token,"['project', 'project_info', 'onboard_agent']","0.33 ['project_info', 'lambda_function', 'project_manager', 'flask_server', 'issue_analyzer']","0.87 ['onboard_agent', 'project', 'listener_core', 'lambda_function', 'project_info']","0.33 ['project_info', 'issue_analyzer', '__init__', 'markdown', 'retry_with_backoff']"
2,Retry LLM call on Rate Limit Error,"['retry_with_backoff', 'api']","0.50 ['retry_with_backoff', 'file_analyzer', 'lambda_function', 'issue_analyzer', 'flask_server']","1.00 ['retry_with_backoff', 'api', 'change_suggester', 'model_configuration_manager', 'localizer']","0.50 ['retry_with_backoff', 'issue_analyzer', '__init__', 'file_analyzer', 'project_manager']"
3,Handle issue comments as well,"['listener_core', 'issue_analyzer', 'localizer', 'change_suggester', 'project']","0.60 ['project', 'localizer', 'change_suggester', 'model_configuration_manager', 'onboard_agent']","0.20 ['issue_analyzer', 'markdown', 'lambda_function', 'flask_server', 'onboard_agent']","0.20 ['onboard_agent', 'localizer', 'file_analyzer', 'flask_server', 'project_manager']"
4,Update semantic understanding on code push to the main branch,"['listener_core', 'project', 'file_analyzer', 'package_summary']","0.25 ['file_analyzer', 'issue_analyzer', 'lambda_function', 'project_manager', 'flask_server']","0.70 ['project', 'file_analyzer', 'localizer', 'change_suggester', 'package_summary']","0.20 ['issue_analyzer', 'project_info', '__init__', 'markdown', 'project']"
5,API based onboarding for a new project,"['listener_core', 'flask_server', 'lambda_function', 'project_manager', 'project']","0.60 ['lambda_function', 'project_manager', 'flask_server', 'issue_analyzer', 'project_info']","0.80 ['onboard_agent', 'lambda_function', 'flask_server', 'listener_core', 'project']","0.20 ['project_info', 'issue_analyzer', '__init__', 'flask_server', 'retry_with_backoff']"
6,Move lambda function within the se_agent package structure,['lambda_function'],"1.00 ['lambda_function', 'issue_analyzer', 'project_manager', 'file_analyzer', 'flask_server']","1.00 ['lambda_function', 'localizer', 'change_suggester', 'api', 'project']","0.00 ['__init__', 'issue_analyzer', 'project_info', 'retry_with_backoff', 'folder_count']"
7,Use structured output for semantic summary generation,"['localizer', 'file_analyzer', 'package_summary', 'project']","0.25 ['file_analyzer', 'issue_analyzer', '__init__', 'markdown', 'lambda_function']","0.95 ['file_analyzer', 'package_summary', 'project', 'change_suggester', 'localizer']","0.00 ['issue_analyzer', 'project_info', 'markdown', '__init__', 'file_count']"
8,Tool based (no LLM) code structure name generation,"['package_summary', 'project']","0.00 ['file_analyzer', 'project_info', 'project_manager', 'lambda_function', 'issue_analyzer']","0.50 ['package_summary', 'file_analyzer', 'change_suggester', 'localizer', 'project_info']","0.00 ['project_info', 'issue_analyzer', 'markdown', '__init__', 'file_count']"
9,Retrieval based localization,"['localizer', 'project', 'api', 'model_configuration_manager']","0.00 ['file_analyzer', 'lambda_function', 'project_manager', 'issue_analyzer', 'project_info']","0.70 ['localizer', 'change_suggester', 'file_analyzer', 'api', 'project']","0.00 ['issue_analyzer', 'project_info', 'markdown', 'file_count', '__init__']"
10,Checkpoint,['project'],"0.00 ['lambda_function', 'issue_analyzer', 'project_info', 'file_analyzer', 'flask_server']","1.00 ['project', 'issue_analyzer', 'model_configuration_manager', 'listener_core', 'lambda_function']","0.00 ['issue_analyzer', 'file_analyzer', 'onboard_agent', 'project_info', 'project_manager']"


**Clean up the temporary directory for combined files**

In [12]:
# Remove the temporary folder holding the combined summary + code documents.
# NOTE(review): the temporary `.db` files that `create_vector_store` creates with
# `delete=False` are not removed here — confirm whether they should be cleaned up too.
shutil.rmtree(combined_docs_dir)