In [2]:
#importing necessary libraries (Information Retrieval)
import math
import string
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [7]:
# taking a sample for document from the novel, here I will take three different sample as a documents
documents = {1:"The narrative is notable for its lengthy and intricate involuntary memory episodes, the most famous being the madeleine episode. It explores the themes of time, space and memory, but also raises questions about the nature of art and literature, and the complex relationships between love, sexuality, and possession" ,
2: "Set in Dublin, the novel follows a day in the life of Leopold Bloom, an advertising salesman, as he navigates the city. The narrative, heavily influenced by Homer's Odyssey, explores themes of identity, heroism, and the complexities of everyday life. It is renowned for its stream-of-consciousness style and complex structure, making it a challenging but rewarding read" ,
3: "Set in the summer of 1922, the novel follows the life of a young and mysterious millionaire, his extravagant lifestyle in Long Island, and his obsessive love for a beautiful former debutante. As the story unfolds, the millionaire's dark secrets and the corrupt reality of the American dream during the Jazz Age are revealed. The narrative is a critique of the hedonistic excess and moral decay of the era, ultimately leading to tragic consequences."    
}

In [10]:
query = "involuntary follows life young bloom in complex"

In [12]:
stop_words = set(stopwords.words('english'))

In [13]:
# Calculating the length of the document
avg_doc_length = sum(len(doc.split()) for doc in documents. values())/len(documents)

In [18]:
# function for text processing i.e, punctuation, cases etc.
def preprocess_text(text):
    text = text.lower()
    text = text. translate(str.maketrans('','',string.punctuation))
    text = ' '.join(word for word in text.split() if word not in stop_words)
    return text

In [19]:
# Preprocessing the documents and the query
preprocessed_docs = {doc_id: preprocess_text(doc) for doc_id, doc in documents.items()}
preprocessed_query = preprocess_text(query)


In [42]:
# Implementing the Term frequency inverse documents frequency(TF-IDF)
def tf_idf(doc, query):
    term_freq_doc = {term: doc.split().count(term) for term in doc.split()}

    idf = {
        term: math.log(
            len(preprocessed_docs) / 
            (1 + sum(1 for doc_text in preprocessed_docs.values() if term in doc_text.split()))
        )
        for term in term_freq_doc.keys()
    }

    tf_idf_scores = {term: tf * idf[term] for term, tf in term_freq_doc.items()}
    return tf_idf_scores


In [44]:
# Implementing okapi BM25 
def okapi_bm25(doc, query, k1=1.5, b=0.75):
    term_frequency_doc = {term: doc.split().count(term) for term in doc.split()}
    document_length = len(doc.split())
    scores = {}
    for term in query.split():
        if term in term_frequency_doc:
            tf = term_frequency_doc[term]
            df = sum(1 for doc_text in preprocessed_docs.values() if term in doc_text.split())
            idf = math.log((len(preprocessed_docs) - df + 0.5) / (df + 0.5) + 1)
            bm25 = idf * (tf * (k1 + 1)) / (tf + k1 * (1 - b + b * (document_length / avg_doc_length)))
            scores[term] = bm25
    return scores

In [45]:
# Calculating the scores for all the documents based on the TF_IDF and Okapi bm25 methods
tfidf_scores = {}
bm25_scores = {}
for doc_id, doc_text in preprocessed_docs.items():
    tfidf_scores[doc_id]= tf_idf(doc_text, preprocessed_query)
    bm25_scores[doc_id] = okapi_bm25(doc_text, preprocessed_query)

In [47]:
# printing the score for the evaluation
print ("Tf-IDF document scores:")
for doc_id, scores in tfidf_scores.items():
    print(f"Document {doc_id}, score:{sum(scores.values())}")
print ("\nOkapi BM25 document scores:")
for doc_id, scores in bm25_scores.items():
    print(f"Document {doc_id}, score:{sum(scores.values())}")

Tf-IDF document scores:
Document 1, score:8.22708519781967
Document 2, score:9.443480522144164
Document 3, score:13.903596711333973

Okapi BM25 document scores:
Document 1, score:1.9445831967359468
Document 2, score:3.1884731732670026
Document 3, score:2.235563950319066


In [None]:
# the finding tells that as tf-idf document 3 contain more query words but as per bm25 document 2 has balanced and morr