<a href="https://colab.research.google.com/github/moses-crasto/Mechanics-of-Search/blob/main/MoS.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Evaluate**

In [None]:
!git clone https://github.com/usnistgov/trec_eval.git
!make -C trec_eval

fatal: destination path 'trec_eval' already exists and is not an empty directory.
make: Entering directory '/content/trec_eval'
make: 'trec_eval' is up to date.
make: Leaving directory '/content/trec_eval'


In [None]:
import subprocess

# Define paths to relevance judgment file and search engine output files
path_to_cranfield_qrel = "/content/cranqrel.trec.txt"
path_to_engines = {
    "VSM": "/content/vsm_output.txt",
    "BM25": "/content/bm25_output.txt",
    "QL": "/content/query_likelihood_output.txt"
    }

# Define evaluation measures (trec_eval measure names, each one passed with -m)
evaluation_measures = ["map", "P.5", "ndcg"]

# Expand the list into ["-m", "map", "-m", "P.5", ...] so the command line is
# driven by evaluation_measures instead of a second hard-coded copy.
measure_flags = [flag for measure in evaluation_measures for flag in ("-m", measure)]

# Evaluate each search engine.
# evaluation_results maps engine name -> {measure name: value} so the numbers
# are available to later cells, not only printed.
evaluation_results = {}
for engine_name, output_file in path_to_engines.items():
    print(f"Evaluation results for {engine_name}:")
    completed = subprocess.run(
        ["./trec_eval/trec_eval", *measure_flags, path_to_cranfield_qrel, output_file],
        capture_output=True,
        text=True,
    )
    print(completed.stdout, end="")
    if completed.stderr:
        # Surface trec_eval diagnostics (e.g. a missing run file) instead of hiding them.
        print(completed.stderr, end="")

    engine_scores = {}
    for line in completed.stdout.splitlines():
        fields = line.split()
        # Result lines have three whitespace-separated columns:
        # measure name, scope ("all") and value.
        if len(fields) != 3:
            continue
        measure_name, _scope, value = fields
        try:
            engine_scores[measure_name] = float(value)
        except ValueError:
            # Keep non-numeric values as text rather than dropping them.
            engine_scores[measure_name] = value
    evaluation_results[engine_name] = engine_scores

Evaluation results for VSM:
map                   	all	0.2856
P_5                   	all	0.3102
ndcg                  	all	0.4920
Evaluation results for BM25:
map                   	all	0.2726
P_5                   	all	0.2862
ndcg                  	all	0.5322
Evaluation results for QL:
map                   	all	0.1906
P_5                   	all	0.2089
ndcg                  	all	0.3794


# **VSM** O/P

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import xml.etree.ElementTree as ET
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import nltk

# Download NLTK resources.
# 'punkt_tab' is the tokenizer data word_tokenize loads on recent NLTK releases;
# 'punkt' is kept so the cell still works on older installations.
for resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(resource)

# Define a function to parse the XML file
def parse_xml_file(xml_file):
    """Read the document collection from an XML file.

    Every ``<doc>`` child of the root becomes a dict with the keys ``title``,
    ``author``, ``bib`` and ``text``. Each value is the text of the first
    matching child element, stripped and lower-cased; a missing element or
    one without text yields ``''``.

    Args:
        xml_file: Source passed straight to ``ET.parse``.

    Returns:
        list: One dict per ``<doc>`` element, in file order.
    """
    field_names = ('title', 'author', 'bib', 'text')
    root = ET.parse(xml_file).getroot()
    docs = []
    for doc in root.findall('doc'):
        record = {}
        for name in field_names:
            # findtext gives None for an absent child and '' for a child with
            # no text; both collapse to the empty string here.
            raw = doc.findtext(name) or ''
            record[name] = raw.strip().lower()
        docs.append(record)
    return docs

# Load the document collection: parse the XML into one dict per <doc> element
# and build a DataFrame with one column per extracted field.
xml_file = '/content/cran.all.1400.xml'
doc_elements = parse_xml_file(xml_file)
df = pd.DataFrame(doc_elements)  # row order follows the <doc> order in the file

# Parse XML file to extract queries
def parse_query_xml(xml_file):
    """Read the queries from an XML file.

    Every ``<top>`` child of the root is one query. Queries are keyed by their
    1-based position in the file (not by any id stored in the XML), and the
    value is the stripped text of the ``<title>`` child.

    A ``<top>`` with a missing or empty ``<title>`` maps to ``''`` instead of
    raising ``AttributeError``. The entry is kept rather than skipped so the
    positional ids of the following queries do not shift.

    Args:
        xml_file: Source passed straight to ``ET.parse``.

    Returns:
        dict: ``{position (int, starting at 1): query text (str)}``.
    """
    root = ET.parse(xml_file).getroot()
    queries = {}
    for query_id, query in enumerate(root.findall('top'), start=1):
        # findtext gives None for an absent <title> and '' for an empty one.
        queries[query_id] = (query.findtext('title') or '').strip()
    return queries

# Load every query from the query file, keyed by its 1-based position in the file.
# NOTE: this is a relative path, unlike the absolute /content/... path used for the documents.
queries = parse_query_xml('cran.qry.xml')

# Shared preprocessing resources: a Porter stemmer and the NLTK English
# stop-word list held as a set; preprocess_text reads both.
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))

def preprocess_text(title, author, bib, text):
    """Turn the four fields of a document (or query) into one normalized string.

    The fields are joined with single spaces, lower-cased and tokenized with
    ``word_tokenize``. Tokens found in the module-level ``stop_words`` set are
    dropped and the rest are stemmed with the module-level ``stemmer``.

    Args:
        title, author, bib, text: Field strings; pass ``''`` for unused fields.

    Returns:
        str: The surviving stemmed tokens joined by single spaces.
    """
    combined = ' '.join((title, author, bib, text)).lower()
    kept = []
    for token in word_tokenize(combined):
        if token in stop_words:
            continue
        kept.append(stemmer.stem(token))
    return ' '.join(kept)

# Build one normalized string per document from its four fields
df['preprocessed_text'] = df.apply(
    lambda row: preprocess_text(row['title'], row['author'], row['bib'], row['text']),
    axis=1,
)

# Normalize the queries the same way; only the first field carries text
preprocessed_queries = {}
for query_id, query_text in queries.items():
    preprocessed_queries[query_id] = preprocess_text(query_text, '', '', '')

# TF-IDF over unigrams and bigrams, capped at 5000 features; terms in more
# than 70% or fewer than 0.1% of the documents are ignored
tfidf_vectorizer = TfidfVectorizer(max_df=0.7, min_df=0.001, max_features=5000, ngram_range=(1, 2))

# Learn the vocabulary and weights from the documents and vectorize them
tfidf_matrix = tfidf_vectorizer.fit_transform(df['preprocessed_text'])

output_file = "vsm_output.txt"
with open(output_file, 'w') as f:
    for query_id, query_text in preprocessed_queries.items():
        # Project the query into the document vector space
        query_vector = tfidf_vectorizer.transform([query_text])

        # One row of similarities: this query against every document
        cosine_similarities = cosine_similarity(query_vector, tfidf_matrix)
        scores = cosine_similarities[0]

        # Document positions ordered from most to least similar, cut to 100
        top_indices = scores.argsort()[::-1][:100]

        # Emit one run line per hit; docno is the 1-based row position
        # (assumes docno starts from 1 and follows file order)
        f.writelines(
            f"{query_id} 0 {idx + 1} {rank} {scores[idx]} vsm\n"
            for rank, idx in enumerate(top_indices, start=1)
        )


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# **BM25** O/P

In [None]:
import math
from collections import Counter
import pandas as pd
import xml.etree.ElementTree as ET
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import nltk

# Download NLTK resources.
# 'punkt_tab' is the tokenizer data word_tokenize loads on recent NLTK releases;
# 'punkt' is kept so the cell still works on older installations.
for resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(resource)

# Parse the document collection XML
xml_file = '/content/cran.all.1400.xml'
tree = ET.parse(xml_file)
root = tree.getroot()

# Turn every <doc> into a list of stemmed, stop-word-free tokens
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()
preprocessed_documents = []
all_doc_tokens = []  # each document contributes every distinct term once
for doc in root.findall('doc'):
    doc_tokens = []
    # Fields are processed separately and concatenated in this fixed order
    for field in ('title', 'author', 'bib', 'text'):
        # A missing element or one without text counts as an empty string
        field_text = (doc.findtext(field) or '').strip().lower()
        for token in word_tokenize(field_text):
            if token not in stop_words:
                doc_tokens.append(stemmer.stem(token))

    preprocessed_documents.append(doc_tokens)
    # set() de-duplicates, so the Counter below yields document frequencies
    all_doc_tokens.extend(set(doc_tokens))

# Document frequency per term, then the BM25 IDF derived from it
term_counts = Counter(all_doc_tokens)
document_count = len(preprocessed_documents)
idf_values = {}
for term, doc_freq in term_counts.items():
    idf_values[term] = math.log((document_count - doc_freq + 0.5) / (doc_freq + 0.5) + 1)

# Define function to preprocess query
def preprocess_query(query):
    """Normalize a raw query string into a list of stemmed tokens.

    The query is lower-cased and tokenized with ``word_tokenize``; tokens in
    the module-level ``stop_words`` set are discarded and the remainder is
    stemmed with the module-level ``stemmer``.

    Args:
        query (str): Raw query text.

    Returns:
        list: Stemmed tokens in their original order.
    """
    return [
        stemmer.stem(token)
        for token in word_tokenize(query.lower())
        if token not in stop_words
    ]

# Function to calculate BM25 score for a document
def calculate_bm25_score(document, preprocessed_query, document_length, avg_document_length, k1, b, idf=None):
    """Score one document against one query with BM25.

    For every query token that occurs in the document the contribution is
    ``idf[term] * tf * (k1 + 1) / (tf + k1 * (1 - b + b * dl / avgdl))``.
    A token repeated in the query contributes once per repetition.

    Args:
        document: Iterable of the document's preprocessed tokens.
        preprocessed_query: Iterable of the query's preprocessed tokens.
        document_length: Length of this document in tokens.
        avg_document_length: Mean document length over the collection.
        k1: Term-frequency saturation parameter.
        b: Length-normalization parameter.
        idf: Optional mapping term -> IDF weight. Defaults to the module-level
            ``idf_values`` table, which is what existing callers rely on.

    Returns:
        float: The BM25 score; ``0.0`` when no query token is in the document.

    Raises:
        KeyError: If a matching term has no entry in the IDF mapping.
    """
    idf_table = idf_values if idf is None else idf
    doc_term_freq = Counter(document)
    # The length normalization does not depend on the term, so compute it once.
    length_norm = k1 * (1 - b + b * (document_length / avg_document_length))

    score = 0.0
    for term in preprocessed_query:
        # Membership is tested on the Counter (hash lookup) rather than on the
        # token list, which was a linear scan for every query term.
        if term not in doc_term_freq:
            continue
        term_frequency = doc_term_freq[term]
        numerator = term_frequency * (k1 + 1)
        denominator = term_frequency + length_norm
        score += idf_table[term] * (numerator / denominator)
    return score

# Read queries from cran.qry.xml
query_file = '/content/cran.qry.xml'
queries_df = pd.read_xml(query_file)
queries_df.index += 1  # Increment index by 1 to start from 1
queries = list(zip(queries_df.index, queries_df['title']))

# BM25 parameters
avg_document_length = sum(len(doc) for doc in preprocessed_documents) / document_count
k1 = 2.5
b = 0.75

# Number of results written per query. The VSM and query-likelihood runs both
# keep the top 100, so this run is cut to the same depth; otherwise the three
# engines are compared on rankings of different lengths.
top_k = 100

# Write results to file, one line per hit:
# query id, a literal 0, document id, rank, score, run tag
with open("bm25_output.txt", "w") as output_file:
    for query_id, query_text in queries:
        preprocessed_query = preprocess_query(query_text)
        scores = []
        # Document ids are the 1-based positions of the documents in the collection
        for i, document in enumerate(preprocessed_documents, start=1):
            score = calculate_bm25_score(document, preprocessed_query, len(document), avg_document_length, k1, b)
            scores.append((i, score))
        # Stable sort: documents with equal scores keep their collection order
        ranked_documents = sorted(scores, key=lambda x: x[1], reverse=True)
        for rank, (doc_id, score) in enumerate(ranked_documents[:top_k], start=1):
            output_file.write(f"{query_id} 0 {doc_id} {rank} {score} bm25\n")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# **QL** O/P

In [None]:
import math
from collections import Counter
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import nltk

# Download NLTK resources.
# 'punkt_tab' is the tokenizer data word_tokenize loads on recent NLTK releases;
# 'punkt' is kept so the cell still works on older installations.
for resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(resource)

# Load XML data into a DataFrame
df_docs = pd.read_xml('cran.all.1400.xml')

# Preprocess text: tokenize, stem, remove stop words
def preprocess_text(title, author, bib, text):
    """Combine four fields into one normalized, space-separated string.

    Each field is converted with ``str`` unless ``pd.notnull`` reports it as
    missing, in which case it becomes ``''``. The fields are joined with
    single spaces, lower-cased and tokenized with ``word_tokenize``; English
    stop words are dropped and the remaining tokens are Porter-stemmed.
    The stemmer and the stop-word set are created on every call.

    Args:
        title, author, bib, text: Field values; may be missing (NaN/None).

    Returns:
        str: The surviving stemmed tokens joined by single spaces.
    """
    fields = [
        str(value) if pd.notnull(value) else ''
        for value in (title, author, bib, text)
    ]
    english_stop_words = set(stopwords.words('english'))
    porter = PorterStemmer()

    kept = []
    for token in word_tokenize(' '.join(fields).lower()):
        if token in english_stop_words:
            continue
        kept.append(porter.stem(token))
    return ' '.join(kept)

# Preprocess documents
preprocessed_documents = [preprocess_text(row['title'], row['author'], row['bib'], row['text']) for _, row in df_docs.iterrows()]

# Preprocess queries.
# Query ids are the 1-based position of each query in the file, matching the
# numbering the VSM and BM25 runs write. The raw DataFrame index starts at 0,
# which labelled every query one id too low in the run file.
df_queries = pd.read_xml('cran.qry.xml')
preprocessed_queries = {
    query_id: preprocess_text('', '', '', query_text)
    for query_id, query_text in enumerate(df_queries['title'], start=1)
}

# Build a unigram language model per document: each distinct term mapped to
# its relative frequency within that document
doc_language_models = []
for doc_text in preprocessed_documents:
    doc_terms = doc_text.split()
    n_terms = len(doc_terms)  # total number of term occurrences in the document
    doc_language_models.append(
        {term: count / n_terms for term, count in Counter(doc_terms).items()}
    )

# Advanced Smoothing Techniques
def calculate_smoothed_likelihood_score(query_language_model, doc_language_models, smoothing_param):
    """Score every document language model against one query language model.

    For each query term the document probability is mixed with the query's
    own probability for that term:
    ``(1 - smoothing_param) * p_doc + smoothing_param * p_query``.
    The score is the sum over query terms of ``p_query * log(mixed + 1e-10)``.
    Note that the mixing partner is the query model itself, not a
    collection-wide model.

    Args:
        query_language_model (dict): term -> probability in the query.
        doc_language_models (iterable of dict): term -> probability, one
            mapping per document.
        smoothing_param (float): Weight given to the query probability.

    Returns:
        list: One float per document, in the order the documents were given.
    """
    log_floor = 1e-10  # keeps the logarithm argument away from zero
    doc_weight = 1 - smoothing_param

    def score_document(doc_model):
        total = 0.0
        for term in query_language_model:
            p_query = query_language_model[term]
            # Terms absent from the document contribute probability 0
            mixed = doc_weight * doc_model.get(term, 0) + smoothing_param * p_query
            total += p_query * math.log(mixed + log_floor)
        return total

    return [score_document(doc_model) for doc_model in doc_language_models]

# Rank the documents for every query and write the top 100 as run lines:
# query id, a literal 0, document id, rank, score (4 decimals), run tag
output_file = "query_likelihood_output.txt"
smoothing_param = 0.1  # Experiment with different values
with open(output_file, 'w') as f:
    for query_id, query_text in preprocessed_queries.items():
        # Query language model: relative frequency of each query term
        query_terms = query_text.split()
        n_query_terms = len(query_terms)
        query_language_model = {
            term: count / n_query_terms
            for term, count in Counter(query_terms).items()
        }

        query_likelihood_scores = calculate_smoothed_likelihood_score(
            query_language_model, doc_language_models, smoothing_param
        )

        # Document ids are 1-based positions; highest score first
        results = sorted(
            enumerate(query_likelihood_scores, start=1),
            key=lambda pair: pair[1],
            reverse=True,
        )
        f.writelines(
            f"{query_id} 0 {doc_id} {rank} {score:.4f} QL\n"
            for rank, (doc_id, score) in enumerate(results[:100], start=1)
        )


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
