In [13]:
import re
from collections import defaultdict
from math import log

In [14]:
def preprocess(text):
    text = text.lower()
    return text.split()
def load_documents():
    files = [
        "bm25_vs_jm.txt",
        "language_model_jm.txt",
        "okapi_bm25.txt"
    ]

    documents = {}
    for f in files:
        with open(f, "r", encoding="utf-8") as file:
            documents[f] = preprocess(file.read())
    return documents

In [15]:
def compute_stats(documents):
    tf = defaultdict(lambda: defaultdict(int))
    df = defaultdict(int)
    doc_len = {}

    for doc, tokens in documents.items():
        doc_len[doc] = len(tokens)
        unique_terms = set(tokens)

        for term in tokens:
            tf[term][doc] += 1

        for term in unique_terms:
            df[term] += 1

    avgdl = sum(doc_len.values()) / len(doc_len)
    return tf, df, doc_len, avgdl


In [16]:
def bm25_score(query, documents, tf, df, doc_len, avgdl, k1=1.5, b=0.75):
    scores = defaultdict(float)
    N = len(documents)

    for term in preprocess(query):
        if term not in df:
            continue

        idf = log((N - df[term] + 0.5) / (df[term] + 0.5))

        for doc in documents:
            f = tf[term][doc]
            denom = f + k1 * (1 - b + b * (doc_len[doc] / avgdl))
            scores[doc] += idf * (f * (k1 + 1)) / denom

    return scores

In [17]:
def build_collection_model(documents):
    coll_tf = defaultdict(int)
    total_terms = 0

    for tokens in documents.values():
        for term in tokens:
            coll_tf[term] += 1
            total_terms += 1

    return {term: coll_tf[term] / total_terms for term in coll_tf}

In [18]:
def jm_score(query, documents, tf, doc_len, collection_model, lamb=0.2):
    scores = defaultdict(float)

    for term in preprocess(query):
        for doc in documents:
            p_doc = tf[term][doc] / doc_len[doc] if term in tf else 0
            p_coll = collection_model.get(term, 0)

            prob = (1 - lamb) * p_doc + lamb * p_coll

            if prob > 0:
                scores[doc] += log(prob)

    return scores


In [19]:
def rank(scores, top_k=3):
    return sorted(scores.items(), key=lambda x: x[1], reverse=True)[:top_k]


In [20]:
documents = load_documents()
tf, df, doc_len, avgdl = compute_stats(documents)
collection_model = build_collection_model(documents)

queries = [
    "bm25 scoring function",
    "jelinek mercer smoothing",
    "document ranking method"
]

for q in queries:
    print(f"\n=== Query: {q} ===")

    bm25 = bm25_score(q, documents, tf, df, doc_len, avgdl)
    jm = jm_score(q, documents, tf, doc_len, collection_model)

    print("\nBM25 Results:")
    for doc, score in rank(bm25):
        print(f"{doc}: {score}")

    print("\nJM Results:")
    for doc, score in rank(jm):
        print(f"{doc}: {score}")



=== Query: bm25 scoring function ===

BM25 Results:
language_model_jm.txt: 0.0
bm25_vs_jm.txt: -0.9945277630842297
okapi_bm25.txt: -1.3939479256595166

JM Results:
bm25_vs_jm.txt: -13.855858696845942
okapi_bm25.txt: -16.44609865549144
language_model_jm.txt: -20.12231273619019

=== Query: jelinek mercer smoothing ===

BM25 Results:
okapi_bm25.txt: 0.0
bm25_vs_jm.txt: -0.6894578971074721
language_model_jm.txt: -0.8758760695268186

JM Results:
language_model_jm.txt: -3.8524040596624904
bm25_vs_jm.txt: -4.439786438058282
okapi_bm25.txt: -6.061456918928017

=== Query: document ranking method ===

BM25 Results:
bm25_vs_jm.txt: -1.7987404739166764
language_model_jm.txt: -2.883576697563832
okapi_bm25.txt: -4.534512972578895

JM Results:
language_model_jm.txt: -13.977622928959965
okapi_bm25.txt: -15.368410366528149
bm25_vs_jm.txt: -19.015189757854166
