<a href="https://colab.research.google.com/github/narayan-bhattarai/AI-Classification/blob/main/Domain_Specific_Semantic_Search.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
import zipfile

zip_path = "/content/drive/MyDrive/stackoverflow.zip"  # <-- Change this
extract_path = "/content/stackoverflow_data"

with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_path)

print("Extraction completed!")

Extraction completed!


In [6]:
import pandas as pd

# ====================================================
# STEP 1: Load datasets from extracted ZIP folder
# ====================================================
questions = pd.read_csv('/content/stackoverflow_data/Questions.csv',
                        encoding='latin1', on_bad_lines='skip')

answers = pd.read_csv('/content/stackoverflow_data/Answers.csv',
                      encoding='latin1', on_bad_lines='skip')

tags = pd.read_csv('/content/stackoverflow_data/Tags.csv',
                   encoding='latin1', on_bad_lines='skip')

print("Questions:", questions.shape)
print("Answers:", answers.shape)
print("Tags:", tags.shape)


# ====================================================
# STEP 2: Rename ID columns
# ====================================================
questions = questions.rename(columns={"Id": "QuestionId"})
answers = answers.rename(columns={"Id": "AnswerId"})


# ====================================================
# STEP 3: Merge Questions â†” Answers
# ====================================================
merged = questions.merge(
    answers,
    left_on='QuestionId',
    right_on='ParentId',
    how='left',
    suffixes=('_question', '_answer')
)

print("Merged shape:", merged.shape)


# ====================================================
# STEP 4: Rename columns correctly
# ====================================================
merged = merged.rename(columns={
    "Score_answer": "AnswerScore",
    "Body_question": "Body_question",
    "Body_answer": "Body_answer"
})

# Convert AnswerScore to numeric
merged["AnswerScore"] = pd.to_numeric(merged["AnswerScore"], errors='coerce')


# ====================================================
# STEP 5: Drop rows with no answer
# ====================================================
merged = merged.dropna(subset=["Body_answer"])


# ====================================================
# STEP 6: Sort answers per Question by score (DESC)
# ====================================================
merged = merged.sort_values(["QuestionId", "AnswerScore"], ascending=[True, False])


# ====================================================
# STEP 7: Keep only the best answer for each question
# ====================================================
best_answers = merged.drop_duplicates(subset=["QuestionId"], keep="first")

print("Best answers shape:", best_answers.shape)


# ====================================================
# STEP 8: Create the final cleaned dataset
# ====================================================
final_df = best_answers[[
    "QuestionId",
    "Title",
    "Body_question",
    "Body_answer",
    "AnswerScore"
]]

print("Final DF shape:", final_df.shape)
final_df.head()


Questions: (1264216, 7)
Answers: (2014516, 6)
Tags: (3750994, 2)
Merged shape: (2176164, 13)
Best answers shape: (1102568, 13)
Final DF shape: (1102568, 5)


Unnamed: 0,QuestionId,Title,Body_question,Body_answer,AnswerScore
0,80,SQLStatement.execute() - multiple queries in o...,<p>I've written a database generation script i...,<p>I wound up using this. It is a kind of a ha...,12.0
5,90,Good branching and merging tutorials for Torto...,<p>Are there any really good tutorials explain...,<p>My easy click-by-click instructions (<stron...,19.0
6,120,ASP.NET Site Maps,<p>Has anyone got experience creating <strong>...,<p>The Jeff Prosise version from MSDN magazine...,9.0
9,180,Function for creating color wheels,<p>This is something I've pseudo-solved many t...,"<p>My first thought on this is ""how generate N...",21.0
17,260,Adding scripting functionality to .NET applica...,<p>I have a little game written in C#. It uses...,"<p><a href=""http://www.codeproject.com/Article...",28.0


In [7]:
import re
import nltk
from nltk.corpus import stopwords

# Download stopwords if needed
nltk.download("stopwords")

# Precompiled regex patterns
html_pattern = re.compile(r'<.*?>')
non_alphanum = re.compile(r'[^a-z0-9 ]')

stop_words = set(stopwords.words("english"))
stopword_pattern = re.compile(r'\b(?:' + '|'.join(stop_words) + r')\b')

def fast_clean(text):
    text = str(text).lower()
    text = html_pattern.sub(" ", text)           # remove HTML
    text = non_alphanum.sub(" ", text)           # remove punctuation
    text = stopword_pattern.sub(" ", text)       # fast stopword removal
    text = re.sub(r'\s+', ' ', text).strip()     # collapse spaces
    return text

print("Starting FAST cleaning...")

final_df.loc[:, "clean_title"] = final_df["Title"].astype(str).apply(fast_clean)
final_df.loc[:, "clean_question"] = final_df["Body_question"].astype(str).apply(fast_clean)
final_df.loc[:, "clean_answer"] = final_df["Body_answer"].astype(str).apply(fast_clean)

final_df.loc[:, "document"] = final_df["clean_title"] + " " + final_df["clean_question"]

print("FAST CLEANING DONE!")
final_df.head()


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Starting FAST cleaning...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df.loc[:, "clean_title"] = final_df["Title"].astype(str).apply(fast_clean)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df.loc[:, "clean_question"] = final_df["Body_question"].astype(str).apply(fast_clean)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df.loc[:, "clean_answer"] 

FAST CLEANING DONE!


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df.loc[:, "document"] = final_df["clean_title"] + " " + final_df["clean_question"]


Unnamed: 0,QuestionId,Title,Body_question,Body_answer,AnswerScore,clean_title,clean_question,clean_answer,document
0,80,SQLStatement.execute() - multiple queries in o...,<p>I've written a database generation script i...,<p>I wound up using this. It is a kind of a ha...,12.0,sqlstatement execute multiple queries one stat...,written database generation script sql want ex...,wound using kind hack actually works pretty we...,sqlstatement execute multiple queries one stat...
5,90,Good branching and merging tutorials for Torto...,<p>Are there any really good tutorials explain...,<p>My easy click-by-click instructions (<stron...,19.0,good branching merging tutorials tortoisesvn,really good tutorials explaining branching mer...,easy click click instructions specific tortois...,good branching merging tutorials tortoisesvn r...
6,120,ASP.NET Site Maps,<p>Has anyone got experience creating <strong>...,<p>The Jeff Prosise version from MSDN magazine...,9.0,asp net site maps,anyone got experience creating sql based asp n...,jeff prosise version msdn magazine works prett...,asp net site maps anyone got experience creati...
9,180,Function for creating color wheels,<p>This is something I've pseudo-solved many t...,"<p>My first thought on this is ""how generate N...",21.0,function creating color wheels,something pseudo solved many times never quite...,first thought generate n vectors space maximiz...,function creating color wheels something pseud...
17,260,Adding scripting functionality to .NET applica...,<p>I have a little game written in C#. It uses...,"<p><a href=""http://www.codeproject.com/Article...",28.0,adding scripting functionality net applications,little game written c uses database back end t...,oleg shilo c script solution code project real...,adding scripting functionality net application...


In [3]:
# ==============================================
# BM25 BASELINE IMPLEMENTATION
# ==============================================

!pip install rank_bm25

from rank_bm25 import BM25Okapi
import numpy as np

# Convert final_df to python lists
documents = final_df["document"].tolist()
questions = final_df["clean_title"].tolist()   # will use later
answers = final_df["clean_answer"].tolist()    # useful for evaluation

# Tokenize all documents
tokenized_docs = [doc.split() for doc in documents]

print("Building BM25 index...")
bm25 = BM25Okapi(tokenized_docs)
print("BM25 index ready!")


# ----------------------------------------------
# FUNCTION: Search using BM25
# ----------------------------------------------
def bm25_search(query, k=5):
    query_tokens = query.lower().split()
    scores = bm25.get_scores(query_tokens)
    top_k_idx = np.argsort(scores)[::-1][:k]

    # return cleaned fields instead of raw HTML
    return final_df.iloc[top_k_idx][
        ["QuestionId", "clean_title", "clean_question", "clean_answer", "AnswerScore"]
    ]

# ----------------------------------------------
# TEST BM25 WITH SAMPLE QUERY
# ----------------------------------------------
query = "how to fix null pointer exception"
print("\nSample Query:", query)

results = bm25_search(query, k=5)
results




NameError: name 'final_df' is not defined

In [None]:
!pip install sentence-transformers faiss-cpu

In [None]:
from sentence_transformers import SentenceTransformer
import numpy as np
import faiss

print("Loading embedding model...")
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
print("Model loaded!")


In [None]:
docs = final_df["document"].tolist()

print("Encoding documents...")
embeddings = model.encode(
    docs,
    batch_size=64,
    show_progress_bar=True,
    convert_to_numpy=True
).astype('float32')

print("Embeddings shape:", embeddings.shape)

In [None]:
dimension = embeddings.shape[1]

# Cosine similarity FAISS index
index = faiss.IndexFlatIP(dimension)
index.add(embeddings)

print("FAISS index built with ", index.ntotal, " vectors.")


In [None]:
def dense_search(query, k=5):
    query_vec = model.encode([query], convert_to_numpy=True).astype('float32')
    scores, idx = index.search(query_vec, k)

    return final_df.iloc[idx[0]][[
        "QuestionId",
        "clean_title",
        "clean_question",
        "clean_answer",
        "AnswerScore"
    ]]


In [None]:
query = "how to fix null pointer exception"
dense_results = dense_search(query, k=5)
dense_results


In [None]:
def bm25_top_ids(query, k=5):
    query_tokens = query.lower().split()
    scores = bm25.get_scores(query_tokens)
    top_k_idx = np.argsort(scores)[::-1][:k]
    return final_df.iloc[top_k_idx]["QuestionId"].tolist()

def dense_top_ids(query, k=5):
    qvec = model.encode([query], convert_to_numpy=True).astype('float32')
    qvec = qvec / np.linalg.norm(qvec)
    scores, idx = index.search(qvec, k)
    return final_df.iloc[idx[0]]["QuestionId"].tolist()


In [None]:
import numpy as np

# -------- Precision@k ----------
def precision_at_k(retrieved_ids, true_id, k):
    retrieved = retrieved_ids[:k]
    return 1.0 if true_id in retrieved else 0.0

# -------- Recall@k ----------
def recall_at_k(retrieved_ids, true_id, k):
    return 1.0 if true_id in retrieved_ids[:k] else 0.0

# -------- nDCG@k ----------
def ndcg_at_k(retrieved_ids, true_id, k):
    retrieved = retrieved_ids[:k]
    if true_id in retrieved:
        rank = retrieved.index(true_id) + 1  # rank starts at 1
        return 1 / np.log2(rank + 1)
    else:
        return 0.0

# -------- Average Precision ----------
def average_precision(retrieved_ids, true_id, k):
    """AP = Precision at rank of the relevant item, else 0."""
    retrieved = retrieved_ids[:k]
    if true_id not in retrieved:
        return 0.0
    rank = retrieved.index(true_id) + 1
    return 1.0 / rank  # AP for single relevant doc


In [None]:
import random

def evaluate_model(k=5, samples=200):
    bm25_prec, bm25_rec, bm25_ndcg, bm25_map = [], [], [], []
    dense_prec, dense_rec, dense_ndcg, dense_map = [], [], [], []

    query_indices = random.sample(range(len(final_df)), samples)

    for idx in query_indices:
        query_text = final_df.iloc[idx]["clean_title"]
        true_id = final_df.iloc[idx]["QuestionId"]

        # BM25 results
        bm25_ids = bm25_top_ids(query_text, k)
        bm25_prec.append(precision_at_k(bm25_ids, true_id, k))
        bm25_rec.append(recall_at_k(bm25_ids, true_id, k))
        bm25_ndcg.append(ndcg_at_k(bm25_ids, true_id, k))
        bm25_map.append(average_precision(bm25_ids, true_id, k))

        # Dense results
        dense_ids = dense_top_ids(query_text, k)
        dense_prec.append(precision_at_k(dense_ids, true_id, k))
        dense_rec.append(recall_at_k(dense_ids, true_id, k))
        dense_ndcg.append(ndcg_at_k(dense_ids, true_id, k))
        dense_map.append(average_precision(dense_ids, true_id, k))

    return {
        "BM25": {
            "Precision@k": np.mean(bm25_prec),
            "Recall@k": np.mean(bm25_rec),
            "nDCG@k": np.mean(bm25_ndcg),
            "MAP": np.mean(bm25_map),
        },
        "Dense": {
            "Precision@k": np.mean(dense_prec),
            "Recall@k": np.mean(dense_rec),
            "nDCG@k": np.mean(dense_ndcg),
            "MAP": np.mean(dense_map),
        }
    }


In [None]:
results = evaluate_model(k=5, samples=200)
results
