# Import Libraries

In [8]:
import xml.etree.ElementTree as ET
import math
from collections import defaultdict

# Loading the Dataset and Queries

In [9]:
# Load Cranfield Dataset
def load_cranfield_dataset(filename):
    """Load the document collection from an XML-like file.

    The file is read as UTF-8 (a leading BOM is dropped by ``utf-8-sig``).
    If the content does not begin with an XML declaration it is treated as
    a bare sequence of elements and wrapped in a synthetic ``<root>`` so it
    can be parsed as one document.

    Args:
        filename: Path of the collection file.

    Returns:
        dict mapping each document id (stripped ``<docno>`` text) to the
        result of ``preprocess`` applied to the stripped ``<text>`` content.
        A document whose ``<text>`` is missing or empty is preprocessed as
        the empty string. Documents whose ``<docno>`` is missing, empty or
        whitespace-only are skipped.

    Raises:
        OSError: if the file cannot be opened.
        xml.etree.ElementTree.ParseError: if the content is not well-formed.
    """
    with open(filename, 'r', encoding='utf-8-sig') as file:
        content = file.read().strip()
    if not content.startswith('<?xml'):
        content = '<?xml version="1.0" encoding="utf-8"?>\n<root>\n' + content + '\n</root>'
    root = ET.fromstring(content)

    documents = {}
    for doc in root.findall('doc'):
        # findtext() yields None for a missing child and '' for an empty one;
        # both collapse to '' here so an empty <docno/> no longer crashes.
        doc_id = (doc.findtext('docno') or '').strip()
        if not doc_id:
            continue
        text = (doc.findtext('text') or '').strip()
        documents[doc_id] = preprocess(text)
    return documents

# Load Queries
def load_queries(filename):
    """Read the query file and return the preprocessed queries.

    The file is read as UTF-8 (BOM tolerated). Content that does not start
    with an XML declaration is wrapped in a synthetic ``<root>`` element
    before parsing. Each ``<top>`` child contributes one query, taken from
    its ``<title>`` element (empty string when the title is missing or has
    no text).

    Args:
        filename: Path of the query file.

    Returns:
        dict mapping the 1-based position of each ``<top>`` element, as a
        string, to ``preprocess`` applied to the stripped title text.
    """
    with open(filename, 'r', encoding='utf-8-sig') as handle:
        raw = handle.read().strip()

    has_declaration = raw.startswith('<?xml')
    if not has_declaration:
        raw = '<?xml version="1.0" encoding="utf-8"?>\n<root>\n' + raw + '\n</root>'
    tree_root = ET.fromstring(raw)

    queries = {}
    position = 0
    for topic in tree_root.findall('top'):
        position += 1
        title = topic.find('title')
        if title is not None and title.text:
            query_text = title.text.strip()
        else:
            query_text = ''
        # Keys are sequential positions, not any id stored in the file.
        queries[str(position)] = preprocess(query_text)
    return queries

# Preprocessing and Inverted Indexing

In [10]:
# Preprocessing Function
def preprocess(text):
    """Lower-case ``text`` and split it into alphanumeric tokens.

    Every character that is not alphanumeric (per ``str.isalnum``) acts as
    a separator, so punctuation attached to a word is removed rather than
    causing the whole word to be discarded: ``"flow,"`` yields ``"flow"``
    and ``"boundary-layer"`` yields ``"boundary"`` and ``"layer"``.
    Tokens that were already purely alphanumeric are returned unchanged
    and in their original order.

    Args:
        text: Raw text to tokenise.

    Returns:
        list of lower-cased alphanumeric tokens (empty for empty input).
    """
    lowered = text.lower()
    # Turn every non-alphanumeric character into a space, then let
    # str.split() collapse runs of whitespace.
    cleaned = ''.join(ch if ch.isalnum() else ' ' for ch in lowered)
    return cleaned.split()

# Build Inverted Index
def build_inverted_index(documents):
    """Build term postings and per-document lengths.

    Args:
        documents: Mapping of document id to its sequence of tokens.

    Returns:
        A pair ``(inverted_index, doc_lengths)``:
        ``inverted_index`` is a ``defaultdict(dict)`` mapping each term to
        ``{doc_id: occurrences of the term in that document}``;
        ``doc_lengths`` is a ``defaultdict(int)`` mapping each document id
        to its token count.
    """
    postings = defaultdict(dict)
    lengths = defaultdict(int)
    for doc_id, tokens in documents.items():
        lengths[doc_id] = len(tokens)

        # Count occurrences of each term within this document.
        counts = {}
        for tok in tokens:
            counts[tok] = counts.get(tok, 0) + 1

        for tok, occurrences in counts.items():
            postings[tok][doc_id] = occurrences
    return postings, lengths

# VSM

In [11]:
# Vector Space Model (TF-IDF)
def compute_tfidf_scores(query, inverted_index, doc_lengths, total_docs):
    """Rank documents for a query by summed TF-IDF weight.

    For every query term present in the index, each document in its
    postings receives ``tf * idf`` where ``tf`` is the term count divided
    by the document length and ``idf`` is ``log(total_docs / (1 + df))``,
    clamped at zero. Without the clamp, a term occurring in every document
    gets a negative IDF, which ranks documents with a *higher* frequency
    of a matching term *lower*.

    Args:
        query: Iterable of query terms; a repeated term contributes once
            per occurrence.
        inverted_index: Mapping term -> {doc_id: term count}.
        doc_lengths: Mapping doc_id -> number of tokens in the document.
        total_docs: Number of documents in the collection.

    Returns:
        list of ``(doc_id, score)`` pairs sorted by descending score.
        Only documents containing at least one query term appear.
    """
    scores = defaultdict(float)
    for term in query:
        if term not in inverted_index:
            continue
        postings = inverted_index[term]
        doc_freq = len(postings)
        # Clamp: a term found in all documents carries no evidence, it
        # must not subtract from the score.
        idf = max(0.0, math.log(total_docs / (1 + doc_freq)))
        for doc_id, term_freq in postings.items():
            tf = term_freq / doc_lengths[doc_id]
            scores[doc_id] += tf * idf
    return sorted(scores.items(), key=lambda x: x[1], reverse=True)

# BM25

In [12]:
# BM25 Model
def compute_bm25_scores(query, inverted_index, doc_lengths, total_docs, k1=1.5, b=0.75):
    """Rank documents for a query with the BM25 scoring function.

    The IDF used is ``log((N - df + 0.5) / (df + 0.5) + 1)``. The term
    frequency component is ``tf * (k1 + 1) / (tf + k1 * (1 - b + b *
    doc_len / avg_doc_len))``, where ``avg_doc_len`` is the sum of all
    document lengths divided by ``total_docs``.

    Args:
        query: Iterable of query terms; a repeated term contributes once
            per occurrence.
        inverted_index: Mapping term -> {doc_id: term count}.
        doc_lengths: Mapping doc_id -> number of tokens in the document.
        total_docs: Number of documents in the collection.
        k1: Term-frequency saturation parameter.
        b: Length-normalisation parameter.

    Returns:
        list of ``(doc_id, score)`` pairs sorted by descending score.
        Only documents containing at least one query term appear. An empty
        collection (``total_docs <= 0``) yields an empty list.
    """
    # An empty collection has nothing to rank; returning early also avoids
    # dividing by zero when computing the average document length.
    if total_docs <= 0:
        return []

    avg_doc_len = sum(doc_lengths.values()) / total_docs
    scores = defaultdict(float)
    for term in query:
        if term not in inverted_index:
            continue
        postings = inverted_index[term]
        doc_freq = len(postings)
        idf = math.log((total_docs - doc_freq + 0.5) / (doc_freq + 0.5) + 1)
        for doc_id, term_freq in postings.items():
            tf = term_freq
            doc_len = doc_lengths[doc_id]
            score = idf * ((tf * (k1 + 1)) / (tf + k1 * (1 - b + b * (doc_len / avg_doc_len))))
            scores[doc_id] += score
    return sorted(scores.items(), key=lambda x: x[1], reverse=True)

# LM with smoothing

In [13]:
# Language Model with Smoothing
def compute_lm_scores(query, documents, lambda_=0.1):
    """Rank all documents with a smoothed unigram query-likelihood model.

    For each query term the document probability is add-one smoothed,
    ``(tf + 1) / (doc_len + vocabulary_size)``, and interpolated with the
    collection probability ``cf / total_terms`` (``1e-10`` for terms absent
    from the collection):
    ``lambda_ * p_doc + (1 - lambda_) * p_collection``.
    A document's score is the sum of the logs of these values.

    The vocabulary size is computed once before scoring. Collection
    counts are kept in a plain dict so that looking up an unseen query
    term cannot add a key and change the vocabulary size between
    documents, which would score identical documents differently.

    Args:
        query: Iterable of query terms; a repeated term contributes once
            per occurrence.
        documents: Mapping of document id to its sequence of tokens.
        lambda_: Weight of the document model in the interpolation.

    Returns:
        list of ``(doc_id, score)`` pairs for every document, sorted by
        descending score. With an empty query every score is ``0``.
    """
    query_terms = list(query)
    query_vocab = set(query_terms)

    # Collection-wide statistics.
    collection_freqs = {}
    total_terms = 0
    for tokens in documents.values():
        total_terms += len(tokens)
        for token in tokens:
            collection_freqs[token] = collection_freqs.get(token, 0) + 1
    vocab_size = len(collection_freqs)

    # Collection probability of each distinct query term; it does not
    # depend on the document, so compute it once.
    background = {}
    for token in query_vocab:
        cf = collection_freqs.get(token, 0)
        background[token] = cf / total_terms if cf > 0 else 1e-10

    doc_scores = {}
    for doc_id, tokens in documents.items():
        # Only query terms are ever looked up, so only those are counted.
        term_freqs = {}
        for token in tokens:
            if token in query_vocab:
                term_freqs[token] = term_freqs.get(token, 0) + 1

        # Zero only when the whole collection has no tokens at all.
        denominator = len(tokens) + vocab_size
        score = 0
        for token in query_terms:
            if denominator:
                p_td = (term_freqs.get(token, 0) + 1) / denominator
            else:
                p_td = 0.0
            p_tc = background[token]
            score += math.log((lambda_ * p_td) + ((1 - lambda_) * p_tc))
        doc_scores[doc_id] = score
    return sorted(doc_scores.items(), key=lambda x: x[1], reverse=True)

# Writing Results

In [14]:
# Write Results
def write_results(output_file, model_name, results):
    """Write ranked results to ``output_file``, one line per retrieved document.

    Each line has the form
    ``<query_id> 0 <doc_id> <rank> <score> <model_name>``. Ranks start at 1
    and at most the first 100 entries of each ranking are written. The
    file is overwritten if it already exists.

    Args:
        output_file: Path of the file to create.
        model_name: Tag written as the last column of every line.
        results: Mapping query id -> list of ``(doc_id, score)`` pairs,
            already in ranked order.
    """
    with open(output_file, 'w') as out:
        for query_id, ranked_docs in results.items():
            top_hits = ranked_docs[:100]
            out.writelines(
                f'{query_id} 0 {doc_id} {rank} {score} {model_name}\n'
                for rank, (doc_id, score) in enumerate(top_hits, start=1)
            )

# Main Execution

In [15]:
# Main Execution
# Load the tokenised document collection and query set.
documents = load_cranfield_dataset('cran.all.1400.xml')
queries = load_queries('cran.qry.xml')

# Index the collection once; the TF-IDF and BM25 rankers both use it.
total_docs = len(documents)
inverted_index, doc_lengths = build_inverted_index(documents)

# Rank every query with each model (results are keyed by query id) and
# write one result file per model.
tfidf_results = {
    q_id: compute_tfidf_scores(q, inverted_index, doc_lengths, total_docs)
    for q_id, q in queries.items()
}
write_results('tfidf_results.txt', 'TFIDF', tfidf_results)

bm25_results = {
    q_id: compute_bm25_scores(q, inverted_index, doc_lengths, total_docs)
    for q_id, q in queries.items()
}
write_results('bm25_results.txt', 'BM25', bm25_results)

lm_results = {
    q_id: compute_lm_scores(q, documents)
    for q_id, q in queries.items()
}
write_results('lm_results.txt', 'LM', lm_results)

print('Retrieval and ranking complete. Use trec_eval for evaluation.')

Retrieval and ranking complete. Use trec_eval for evaluation.
