# Finding similar issues by semantics

In [41]:
#!pip install sentence-transformers scikit-learn pandas

In [39]:
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

## Loading GitHub issues for the transformer library

In [40]:
# Never truncate cell text, so long issue titles are shown in full.
pd.options.display.max_colwidth = None

# Read the exported GitHub issues into a DataFrame.
df = pd.read_csv(
    "/content/gitissues.csv",  # update this path
)

# Optional sanity check of the raw data:
# df[['title', 'comments']].sample(10)

## Embedding the issues

In [2]:
def embed_issues(df, column_to_embed='title', model_name='all-MiniLM-L6-v2'):
    """
    Turn one text column of the issues table into sentence embeddings.

    Parameters:
    - df: DataFrame containing the GitHub issues.
    - column_to_embed: Name of the column whose values are encoded
      (for example 'title' or 'body').
    - model_name: Name passed to SentenceTransformer to pick the model.

    Returns:
    - embeddings: whatever `encode(..., convert_to_numpy=True)` produces for
      the column values, in row order.
    - model: the SentenceTransformer instance that produced them, so the
      caller can encode queries with the same model.
    """
    encoder = SentenceTransformer(model_name)

    # Missing values are replaced by empty strings so that every row is
    # still passed to the encoder and row alignment with `df` is preserved.
    column_values = df[column_to_embed].fillna('')
    sentences = column_values.tolist()

    vectors = encoder.encode(sentences, convert_to_numpy=True)
    return vectors, encoder

In [None]:
# Configuration: the text column of the issues table that gets embedded.
column = "title"

# Step 1: Embed dataset. The model is kept alongside the vectors so the
# query can later be encoded with the same model.
issue_embeddings, embed_model = embed_issues(
    df,
    column_to_embed=column,
)

## Finding similar issues

In [28]:
def search_similar_issues(df, embeddings, model, problem_description, k=5, column_to_embed='title'):
    """
    Finds the top-k issues most similar to a free-text problem description.

    Parameters:
    - df: DataFrame with GitHub issues. Must contain a 'title' column and be
      row-aligned with `embeddings` (row i of df <-> row i of embeddings).
    - embeddings: Precomputed issue embeddings, one row per row of df.
    - model: Object exposing `encode(list_of_str, convert_to_numpy=True)`;
      should be the same model that produced `embeddings`.
    - problem_description: Query string to compare against dataset.
    - k: Maximum number of similar issues to return. 0 yields an empty
      result; values larger than the dataset return every row.
    - column_to_embed: Column that was embedded. When it names a column of
      df other than 'title', that column is shown next to the title so the
      matched text is visible.

    Returns:
    - DataFrame with at most k rows ordered by descending similarity, with
      columns 'title', optionally `column_to_embed`, and 'similarity'.
      Rows with equal scores keep their original df order.

    Raises:
    - ValueError: if k is negative, or if df and embeddings do not have the
      same number of rows.
    """
    if k < 0:
        raise ValueError(f"k must be non-negative, got {k}")

    query_embedding = model.encode([problem_description], convert_to_numpy=True)
    sim_scores = cosine_similarity(query_embedding, embeddings)[0]

    # Positional lookup below is only meaningful when scores and rows line up;
    # a frame filtered after embedding would otherwise return wrong issues.
    if len(sim_scores) != len(df):
        raise ValueError(
            f"df has {len(df)} rows but embeddings has {len(sim_scores)}; "
            "re-embed the DataFrame before searching"
        )

    # Sort descending and slice from the front: `[:k]` is empty for k == 0,
    # whereas the tail slice `[-k:]` would select the entire array.
    # A stable sort keeps tied scores in their original row order.
    top_k_idx = np.argsort(-sim_scores, kind='stable')[:k]

    result_df = df.iloc[top_k_idx].copy()
    result_df['similarity'] = sim_scores[top_k_idx]

    display_cols = ['title']
    if column_to_embed not in ('title', 'similarity') and column_to_embed in result_df.columns:
        display_cols.append(column_to_embed)
    display_cols.append('similarity')
    return result_df[display_cols]

## Search similar problems

In [38]:
# How many matches to show, and the problem we are looking for.
k = 10
query = "caching configuration not working"

# Step 2: Search similar issues, reusing the embeddings and model from step 1.
similar = search_similar_issues(
    df=df,
    embeddings=issue_embeddings,
    model=embed_model,
    problem_description=query,
    k=k,
    column_to_embed=column,
)

similar

Unnamed: 0,title,similarity
1424,Possible caching bug,0.672567
1757,Caching doesn't work for map (non-deterministic),0.620773
150,Missing cache file,0.613573
151,Missing cache file,0.613573
349,Cached dataset not loaded,0.57833
346,Cached dataset not loaded,0.57833
348,Cached dataset not loaded,0.57833
347,Cached dataset not loaded,0.57833
1507,Caching processed dataset at wrong folder,0.567681
1509,Caching processed dataset at wrong folder,0.567681
