<a href="https://colab.research.google.com/github/moses-crasto/Mechanics-of-Search/blob/main/MoS.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Evaluate**

In [None]:
# Fetch the NIST trec_eval sources and compile them in place with make;
# the evaluation cell below invokes the resulting ./trec_eval/trec_eval binary.
!git clone https://github.com/usnistgov/trec_eval.git
!make -C trec_eval

Cloning into 'trec_eval'...
remote: Enumerating objects: 1142, done.[K
remote: Counting objects: 100% (327/327), done.[K
remote: Compressing objects: 100% (97/97), done.[K
remote: Total 1142 (delta 260), reused 273 (delta 224), pack-reused 815[K
Receiving objects: 100% (1142/1142), 763.21 KiB | 20.08 MiB/s, done.
Resolving deltas: 100% (765/765), done.
make: Entering directory '/content/trec_eval'
gcc -g -I.  -Wall -Wno-macro-redefined -DVERSIONID=\"10.0-rc2\"  -o trec_eval trec_eval.c formats.c meas_init.c meas_acc.c meas_avg.c meas_print_single.c meas_print_final.c gain_init.c get_qrels.c get_trec_results.c get_prefs.c get_qrels_prefs.c get_qrels_jg.c form_res_rels.c form_res_rels_jg.c form_prefs_counts.c utility_pool.c get_zscores.c convert_zscores.c measures.c  m_map.c m_P.c m_num_q.c m_num_ret.c m_num_rel.c m_num_rel_ret.c m_gm_map.c m_Rprec.c m_recip_rank.c m_bpref.c m_iprec_at_recall.c m_recall.c m_Rprec_mult.c m_utility.c m_11pt_avg.c m_ndcg.c m_ndcg_cut.c m_Rndcg.c m_ndcg_

In [None]:
import subprocess

# Define paths to relevance judgment file and search engine output files
# NOTE(review): the run files listed here are written by the VSM / BM25 / QL
# cells further down the notebook, so on a fresh kernel those cells have to
# be executed before this one.
path_to_cranfield_qrel = "/content/cranqrel.trec.txt"
path_to_engines = {
    "VSM": "/content/vsm_output.txt",
    "BM25": "/content/bm25_output.txt",
    "QL": "/content/query_likelihood_output.txt"
    }

# Define evaluation measures (single source of truth for the trec_eval call)
evaluation_measures = ["map", "P.5", "ndcg"]

# trec_eval expects one "-m <measure>" pair per requested measure.
measure_flags = [flag for measure in evaluation_measures for flag in ("-m", measure)]

# Evaluate each search engine and keep the parsed scores per engine in
# evaluation_results as {engine_name: {measure_name: value}}.
evaluation_results = {}
for engine_name, output_file in path_to_engines.items():
    print(f"Evaluation results for {engine_name}:")
    completed = subprocess.run(
        ["./trec_eval/trec_eval", *measure_flags, path_to_cranfield_qrel, output_file],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        universal_newlines=True,
    )
    print(completed.stdout, end="")
    if completed.stderr:
        # Surface trec_eval diagnostics (missing file, malformed run, ...).
        print(completed.stderr, end="")

    engine_scores = {}
    for result_line in completed.stdout.splitlines():
        fields = result_line.split()
        # Summary lines have three fields: measure name, "all", value.
        if len(fields) != 3:
            continue
        try:
            engine_scores[fields[0]] = float(fields[2])
        except ValueError:
            # Keep non-numeric values verbatim rather than dropping them.
            engine_scores[fields[0]] = fields[2]
    evaluation_results[engine_name] = engine_scores

Evaluation results for VSM:
map                   	all	0.0091
P_5                   	all	0.0145
ndcg                  	all	0.0395
Evaluation results for BM25:
map                   	all	0.0142
P_5                   	all	0.0062
ndcg                  	all	0.2176
Evaluation results for QL:
map                   	all	0.0074
P_5                   	all	0.0132
ndcg                  	all	0.0377


# **VSM** O/P

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import xml.etree.ElementTree as ET
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# Fetch the NLTK data packages this cell needs: the 'punkt' tokenizer models
# and the 'stopwords' corpus.
for nltk_resource in ('punkt', 'stopwords'):
    nltk.download(nltk_resource)

# Define a function to parse the XML file
def parse_xml_file(xml_file):
    """
    Split a file into its <doc> ... </doc> blocks.

    A block starts at a line whose stripped content is exactly '<doc>' and
    ends at the next line whose stripped content is exactly '</doc>'; both
    marker lines are kept in the result. Lines outside any block are
    ignored, and so is a '</doc>' that has no matching opening line.

    Args:
    xml_file (str): Path to the XML file.

    Returns:
    list: One string per document, made of the raw lines of that block
    (line endings included).
    """
    docs = []
    current_lines = None  # None means "not inside a <doc> block"
    with open(xml_file, 'r') as file:
        for line in file:
            marker = line.strip()
            if marker == '<doc>':
                # An opening tag always starts a fresh buffer.
                current_lines = [line]
            elif current_lines is None:
                # Outside a block: skip headers, inter-document text and
                # unmatched closing tags.
                continue
            elif marker == '</doc>':
                current_lines.append(line)
                docs.append(''.join(current_lines))
                # Clear the buffer so nothing after the closing tag can be
                # attached to the document that was just emitted.
                current_lines = None
            else:
                current_lines.append(line)
    return docs

# Read the raw <doc> blocks from the corpus file and hold them in a
# single-column DataFrame ('doc'), one row per document.
xml_file = '/content/cran.all.1400.xml'
doc_elements = parse_xml_file(xml_file)
df = pd.DataFrame(data=doc_elements, columns=['doc'])

# Parse XML file to extract queries
def parse_query_xml(xml_file):
    """
    Read the queries stored as <top> elements directly under the XML root.

    For each <top>, the key is the last whitespace-separated token of the
    <num> text and the value is the stripped <title> text.

    Args:
    xml_file (str): Path to the XML file containing queries.

    Returns:
    dict: Mapping of query ID (str) to query text (str).
    """
    document_root = ET.parse(xml_file).getroot()
    return {
        topic.find('num').text.strip().split()[-1]: topic.find('title').text.strip()
        for topic in document_root.findall('top')
    }

# Shared preprocessing tools: NLTK's English stop-word list (as a set for
# fast membership tests) and a Porter stemmer.
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

# Load every query from the query file as an {id: text} mapping
queries = parse_query_xml('cran.qry.xml')

def preprocess_text(text):
    """
    Normalise a piece of text for indexing or querying.

    The text is lower-cased and tokenized; tokens found in the module-level
    stop_words set are dropped and the remaining ones are stemmed with the
    module-level stemmer.

    Args:
    text (str): Input text.

    Returns:
    str: The surviving stems joined by single spaces.
    """
    kept_stems = []
    for word in word_tokenize(text.lower()):
        if word in stop_words:
            continue
        kept_stems.append(stemmer.stem(word))
    return ' '.join(kept_stems)

# Run every raw document through the same normalisation used for queries.
preprocessed_documents = list(map(preprocess_text, df['doc']))

# TF-IDF over unigrams and bigrams, vocabulary capped at 2000 features and
# pruned by document-frequency bounds.
tfidf_vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    max_features=2000,
    max_df=0.7,
    min_df=0.008,
)

# Learn the vocabulary from the corpus and vectorise it in one pass.
tfidf_matrix = tfidf_vectorizer.fit_transform(preprocessed_documents)

# Write the top 100 documents per query, one line per (query, document):
# "<query_id> 0 <docno> <rank> <similarity> vsm".
output_file = "vsm_output.txt"
with open(output_file, 'w') as f:
    for query_id, query_text in queries.items():
        query_vector = tfidf_vectorizer.transform([preprocess_text(query_text)])
        similarity_row = cosine_similarity(query_vector, tfidf_matrix)[0]
        # NOTE(review): docno is the DataFrame row label + 1, which presumes
        # the corpus file lists documents in docno order starting at 1 - confirm.
        scored_docs = [
            (df.index[position] + 1, similarity_row[position])
            for position in range(len(df))
        ]
        ranked_docs = sorted(scored_docs, key=lambda pair: pair[1], reverse=True)
        for rank, (docno, similarity) in enumerate(ranked_docs[:100], start=1):
            f.write(f"{query_id} 0 {docno} {rank} {similarity} vsm\n")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


# **BM25** O/P

In [None]:
import math
from collections import Counter
import pandas as pd
import xml.etree.ElementTree as ET
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import nltk

# Make sure the NLTK data packages used in this cell are available:
# the 'punkt' tokenizer models and the 'stopwords' corpus.
for nltk_resource in ('punkt', 'stopwords'):
    nltk.download(nltk_resource)

# Define a function to parse the XML file
def parse_xml_file(xml_file):
    """
    Collect the <doc> ... </doc> blocks of a file as raw strings.

    A line whose stripped content is '<doc>' starts a new buffer; a line
    whose stripped content is '</doc>' is appended to the buffer, the buffer
    is stored, and it is then cleared. Other lines are added only while the
    buffer is non-empty.

    Args:
    xml_file (str): Path to the XML file.

    Returns:
    list: One string per stored block, line endings included.
    """
    collected = []
    buffer = ""
    with open(xml_file, 'r') as handle:
        for raw_line in handle:
            tag = raw_line.strip()
            if tag == '<doc>':
                buffer = raw_line
                continue
            if tag == '</doc>':
                collected.append(buffer + raw_line)
                buffer = ""
                continue
            if buffer:
                buffer += raw_line
    return collected

# Text-normalisation tools shared by queries and documents. They are built
# once here instead of on every preprocess_query call.
_BM25_STOP_WORDS = set(stopwords.words('english'))
_BM25_STEMMER = PorterStemmer()

# Define function to preprocess query
def preprocess_query(query):
    """
    Turn raw text into the list of terms used for BM25 matching.

    The text is lower-cased and tokenized, English stop words are removed
    and the remaining tokens are Porter-stemmed. The same function is
    applied to document text below, because BM25 matches terms by exact
    string equality and both sides must therefore share one pipeline.

    Args:
    query (str): Raw query (or document) text.

    Returns:
    list: Stemmed, stop-word-free tokens in their original order.
    """
    query_tokens = word_tokenize(query.lower())
    return [
        _BM25_STEMMER.stem(word)
        for word in query_tokens
        if word not in _BM25_STOP_WORDS
    ]

# Load XML data into a DataFrame
xml_file = '/content/cran.all.1400.xml'
doc_elements = parse_xml_file(xml_file)
df = pd.DataFrame(doc_elements, columns=['doc'])

# Preprocess documents: one term list per row of df, in the same order.
documents = []
for doc in df['doc']:
    root = ET.fromstring(doc)
    text_element = root.find('text')
    if text_element is not None and text_element.text is not None:
        documents.append(preprocess_query(text_element.text))
    else:
        # Keep an empty placeholder for documents without text. Document ids
        # are later derived from the position in this list, so dropping an
        # entry would shift the id of every document after it.
        documents.append([])

# Function to calculate BM25 score for a document
def calculate_bm25_score(document, preprocessed_query, document_count, avg_document_length, term_counts, k1, b):
    """
    Compute the BM25 score of one document for one query.

    The idf component uses the non-negative form
    log(1 + (N - n + 0.5) / (n + 0.5)), so very common terms contribute a
    small positive weight instead of a negative one. The per-term count n is
    clamped to document_count so that a caller supplying counts larger than
    the number of documents cannot push the logarithm out of its domain.

    Args:
    document (list): Terms of the document.
    preprocessed_query (list): Terms of the query; a term repeated in the
        query contributes once per occurrence.
    document_count (int): Number of documents in the collection (N).
    avg_document_length (float): Average document length in terms.
    term_counts (Mapping): Per-term document count (n), indexed by term.
    k1 (float): Term-frequency saturation parameter.
    b (float): Length-normalisation parameter.

    Returns:
    float: The BM25 score; 0.0 when no query term occurs in the document.
    """
    # Count each term once up front rather than rescanning the document list
    # for every query term.
    term_frequencies = Counter(document)
    document_length = len(document)
    score = 0.0
    for term in preprocessed_query:
        term_frequency = term_frequencies.get(term, 0)
        if term_frequency == 0:
            continue
        document_with_term_count = min(term_counts[term], document_count)
        idf = math.log(
            1.0 + (document_count - document_with_term_count + 0.5) / (document_with_term_count + 0.5)
        )
        numerator = term_frequency * (k1 + 1)
        denominator = term_frequency + k1 * (1 - b + b * (document_length / avg_document_length))
        score += idf * (numerator / denominator)
    return score

# Read queries from cran.qry.xml
def read_queries(query_file):
    """
    Read the queries stored as <top> ... </top> blocks in a file.

    A block starts at a line whose stripped content is exactly '<top>' and
    ends at the next line whose stripped content is exactly '</top>'.
    Queries are numbered sequentially (1, 2, 3, ...) in file order; the
    number written inside the block is not used as the id.

    Args:
    query_file (str): Path to the query file.

    Returns:
    list: (query_id, query_text) tuples in file order.
    """
    queries = []
    query_id = 0
    block_lines = None  # inner lines of the <top> being read; None outside one
    with open(query_file, 'r') as file:
        for line in file:
            marker = line.strip()
            if marker == '<top>':
                query_id += 1
                block_lines = []
            elif marker == '</top>':
                if block_lines is not None:
                    queries.append((query_id, _extract_query_text(block_lines)))
                    block_lines = None
            elif block_lines is not None:
                block_lines.append(line)
    return queries


def _extract_query_text(block_lines):
    """Return the whitespace-normalised query text of one <top> block.

    The <title> text is used when the block parses as XML and has a title;
    otherwise all text in the block (or, if parsing fails, the raw lines) is
    used so the query is never silently emptied.
    """
    inner = ''.join(block_lines)
    try:
        element = ET.fromstring('<top>' + inner + '</top>')
    except ET.ParseError:
        return ' '.join(inner.split())
    title = element.find('title')
    source = title if title is not None else element
    return ' '.join(''.join(source.itertext()).split())

# Main function
def main():
    """
    Rank every document for every query with BM25 and write the run file.

    Uses the module-level `documents` list (one term list per document),
    reads the queries from '/content/cran.qry.xml' and writes one line per
    (query, document) pair to 'bm25_output.txt' in the form
    "<query_id> 0 <doc_id> <rank> <score> bm25". doc_id is the 1-based
    position of the document in `documents`.
    """
    # BM25 parameters
    document_count = len(documents)
    # Guard the average so an empty corpus does not divide by zero.
    avg_document_length = (
        sum(len(doc) for doc in documents) / document_count if document_count else 0.0
    )
    # BM25's idf needs the number of DOCUMENTS containing each term, so each
    # document contributes at most once per term (hence the set).
    term_counts = Counter()
    for document in documents:
        term_counts.update(set(document))
    k1 = 1.8
    b = 0.8

    # Read queries
    query_file = '/content/cran.qry.xml'
    queries = read_queries(query_file)

    # Write results to file
    with open("bm25_output.txt", "w") as output_file:
        for query_id, query_text in queries:
            preprocessed_query = preprocess_query(query_text)
            scores = []
            for i, document in enumerate(documents, start=1):
                score = calculate_bm25_score(document, preprocessed_query, document_count, avg_document_length, term_counts, k1, b)
                scores.append((i, score))
            # sorted() is stable, so equally scored documents stay in id order.
            ranked_documents = sorted(scores, key=lambda x: x[1], reverse=True)
            for rank, (doc_id, score) in enumerate(ranked_documents, start=1):
                output_file.write(f"{query_id} 0 {doc_id} {rank} {score} bm25\n")

if __name__ == "__main__":
    main()


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# **QL** O/P

In [None]:
import math
from collections import Counter
import pandas as pd
import xml.etree.ElementTree as ET
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# Download NLTK resources
# nltk is imported here because this cell's import list does not include it;
# without this the calls below only work if an earlier cell imported nltk.
import nltk

nltk.download('punkt')
nltk.download('stopwords')

# Parse XML file to extract document text
def parse_xml_file(xml_file):
    """
    Split a file into its <doc> ... </doc> blocks.

    A block starts at a line whose stripped content is exactly '<doc>' and
    ends at the next line whose stripped content is exactly '</doc>'; both
    marker lines are kept in the result. Lines outside any block are
    ignored, and so is a '</doc>' that has no matching opening line.

    Args:
    xml_file (str): Path to the XML file.

    Returns:
    list: One string per document, made of the raw lines of that block
    (line endings included).
    """
    documents = []
    current_lines = None  # None means "not inside a <doc> block"
    with open(xml_file, 'r') as file:
        for line in file:
            marker = line.strip()
            if marker == '<doc>':
                # An opening tag always starts a fresh buffer.
                current_lines = [line]
            elif current_lines is None:
                # Outside a block: skip headers, inter-document text and
                # unmatched closing tags.
                continue
            elif marker == '</doc>':
                current_lines.append(line)
                documents.append(''.join(current_lines))
                # Clear the buffer so nothing after the closing tag can be
                # attached to the document that was just emitted.
                current_lines = None
            else:
                current_lines.append(line)
    return documents

# Preprocess text: tokenize, stem, remove stop words
def preprocess_text(text):
    """
    Normalise text: lower-case, tokenize, drop English stop words, stem.

    A Porter stemmer and the English stop-word set are created on each call.

    Args:
    text (str): Input text.

    Returns:
    str: The stemmed, stop-word-free tokens joined by single spaces.
    """
    porter = PorterStemmer()
    english_stop_words = set(stopwords.words('english'))
    surviving_tokens = filter(
        lambda token: token not in english_stop_words,
        word_tokenize(text.lower()),
    )
    return ' '.join(map(porter.stem, surviving_tokens))

# Read the raw <doc> blocks from the corpus file and normalise each one;
# preprocessed_documents keeps the same order as documents.
xml_file = 'cran.all.1400.xml'
documents = parse_xml_file(xml_file)
preprocessed_documents = list(map(preprocess_text, documents))

# Parse XML file to extract queries
def parse_query_xml(xml_file):
    """
    Build a query-id -> query-text mapping from an XML query file.

    Every <top> element directly under the root yields one entry: the id is
    the last whitespace-separated token of its <num> text, the text is its
    stripped <title> text.

    Args:
    xml_file (str): Path to the XML file containing queries.

    Returns:
    dict: Dictionary containing query IDs and corresponding texts.
    """
    queries = {}
    for topic in ET.parse(xml_file).getroot().findall('top'):
        number_tokens = topic.find('num').text.strip().split()
        query_number = number_tokens[-1]
        queries[query_number] = topic.find('title').text.strip()
    return queries

# Load every query from the query file as an {id: text} mapping
queries = parse_query_xml('cran.qry.xml')

# Normalise each query with the same pipeline used for the documents
preprocessed_queries = {}
for query_id, query_text in queries.items():
    preprocessed_queries[query_id] = preprocess_text(query_text)

# Maximum-likelihood language model per document: each term's count divided
# by the total number of terms in that document.
doc_language_models = []
for doc in preprocessed_documents:
    term_freqs = Counter(doc.split())
    total_terms = sum(term_freqs.values())
    doc_language_models.append(
        {term: freq / total_terms for term, freq in term_freqs.items()}
    )

# Advanced Smoothing Techniques
def calculate_smoothed_likelihood_score(query_language_model, doc_language_model, smoothing_param, collection_language_model=None):
    """
    Score a document against a query with linearly interpolated smoothing.

    For every query term the document probability is mixed with a background
    probability: (1 - smoothing_param) * p(term | document) +
    smoothing_param * p_background(term). The log of that value (plus 1e-10
    to avoid log(0)) is weighted by the term's query probability and summed.

    Args:
    query_language_model (dict): term -> probability within the query.
    doc_language_model (dict): term -> probability within the document.
    smoothing_param (float): Weight given to the background model.
    collection_language_model (dict, optional): term -> probability within
        the whole collection, used as the background model. When omitted the
        query's own term probability is used as the background, which is the
        behaviour this function had before the parameter existed.

    Returns:
    float: The (log-domain) score; higher means a better match.
    """
    likelihood_score = 0.0
    for term, query_term_prob in query_language_model.items():
        doc_term_prob = doc_language_model.get(term, 0)  # 0 when the document lacks the term
        if collection_language_model is None:
            background_prob = query_term_prob
        else:
            background_prob = collection_language_model.get(term, 0)
        smoothed_term_prob = (1 - smoothing_param) * doc_term_prob + smoothing_param * background_prob
        likelihood_score += query_term_prob * math.log(smoothed_term_prob + 1e-10)  # epsilon guards log(0)
    return likelihood_score

# Collection (background) language model: term frequency over all
# preprocessed documents divided by the total number of terms.
collection_term_freqs = Counter()
for doc in preprocessed_documents:
    collection_term_freqs.update(doc.split())
collection_total_terms = sum(collection_term_freqs.values())
collection_language_model = {
    term: freq / collection_total_terms for term, freq in collection_term_freqs.items()
}

# Weight of the background model in the interpolation (same for all pairs)
smoothing_param = 0.1

# Generate output file: top 100 documents per query, one line each in the
# form "<query_id> 0 <doc_id> <rank> <score> QL". doc_id is the 1-based
# position of the document in doc_language_models.
output_file = "query_likelihood_output.txt"
with open(output_file, 'w') as f:
    for query_id, query_text in preprocessed_queries.items():
        query_term_freqs = Counter(query_text.split())
        query_total_terms = sum(query_term_freqs.values())
        query_language_model = {term: freq / query_total_terms for term, freq in query_term_freqs.items()}
        query_likelihood_scores = []
        for doc_language_model in doc_language_models:
            likelihood_score = calculate_smoothed_likelihood_score(
                query_language_model,
                doc_language_model,
                smoothing_param,
                collection_language_model,
            )
            query_likelihood_scores.append(likelihood_score)
        results = sorted(enumerate(query_likelihood_scores, start=1), key=lambda x: x[1], reverse=True)
        for rank, (doc_id, score) in enumerate(results[:100], start=1):
            f.write(f"{query_id} 0 {doc_id} {rank} {score:.4f} QL\n")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
