In [1]:
import math
from nltk.stem import WordNetLemmatizer 
lemmatizer = WordNetLemmatizer()
from nltk.corpus import stopwords
  
lemmatizer = WordNetLemmatizer()

In [2]:
class BM25:
    
    tf = {}
    df = {}
    f = {}
    
    def __init__(self, k, b, corpus=None):
        self.k = k
        self.b = b
        self.corpus = None
    
    def fit(self, corpus):
        
        f = {}
        tf = {}
        idf = {}
        df = {}
        
        self.corpus = corpus
        
        corpus_size = len(corpus)
        
        tot_doc_size = 0
        
        doc_id = 0
        
        for document in corpus:
            tot_doc_size += len(document)
            
            f[doc_id] = {}
            
            for term in document:
                
                if term not in f[doc_id]:
                    f[doc_id][term] = 1
                else:
                    f[doc_id][term] += 1
                
                if term not in tf:
                    tf[term] = 1
                else:
                    tf[term] += 1
            
            doc_id += 1
        
        doc_id = 0
        
        for document in f:
            for term in f[doc_id]:
                
                if term not in df:
                    df[term] = 1
                else:
                    df[term] += 1
            
            doc_id += 1
        
        for term in tf:
            idf[term] = math.log(1 + (corpus_size - df[term] + .5) / (df[term] + .5) )
        
        self.f = f
        self.tf = tf
        self.idf = idf
        
        self.avg_doc_size = tot_doc_size / corpus_size
        self.corpus_size = corpus_size
    
    def search(self, query):
        
        score = {}
        
        doc_id = 0
        
        for document in self.corpus:
            score[doc_id] = 0
            for q in query:
                
                # Higher the inverse document frequency of the term q, higher is the importance
                # Higher the count of term q in this document, higher the importance
                # Larger the document is from the average document size, it is less likely to be significant
                curr_score = self.idf.get(q, 0) * ( self.f[doc_id].get(q, 0) * (self.k + 1) ) / ( self.f[doc_id].get(q, 0) + self.k * ( 1 - self.b + (self.b * (len(self.corpus[doc_id]) / self.avg_doc_size) ) ) )
                
                score[doc_id] += curr_score
            
            doc_id += 1
        
        return score

In [16]:
corpus = [
    'Topic Modeling aims to find the topics (or clusters) inside a corpus of texts (like mails or news articles), without knowing those topics at first. ',
    'Topic Modeling is a kind of a probabilistic generative model that has been used widely in the field of computer science with a specific focus on text mining and information retrieval in recent years. The aim of topic modeling is to discover the themes that run through a corpus by analyzing the words of the original texts. We call these themes \“topics.\” ',
    'In machine learning and natural language processing, a topic model is a type of statistical model for discovering the abstract "topics" that occur in a collection of documents. Topic modeling is a frequently used text-mining tool for discovery of hidden semantic structures in a text body. Intuitively, given that a document is about a particular topic, one would expect particular words to appear in the document more or less frequently: "dog" and "bone" will appear more often in documents about dogs, "cat" and "meow" will appear in documents about cats, and "the" and "is" will appear approximately equally in both. A document typically concerns multiple topics in different proportions; thus, in a document that is 10% about cats and 90% about dogs, there would probably be about 9 times more dog words than cat words. The "topics" produced by topic modeling techniques are clusters of similar words. A topic model captures this intuition in a mathematical framework, which allows examining a set of documents and discovering, based on the statistics of the words in each, what the topics might be and what each document\'s balance of topics is.',
    'Latent because the topics are “hidden”. We have a bunch of texts and we want the algorithm to put them into clusters that will make sense to us.',
    'Approaches for temporal information include Block and Newman\'s determination of the temporal dynamics of topics in the Pennsylvania Gazette during 1728–1800.'
]

stop_words = stopwords.words('english')

from nltk.tokenize import word_tokenize

# remove all tokens that are not alphabetic
preprocessed = [[word.lower() for word in word_tokenize(document) if word.isalpha()] for document in corpus]

# remove stop words
preprocessed = [[lemmatizer.lemmatize(word) for word in document if word not in stop_words] for document in preprocessed]

preprocessed

[['topic',
  'modeling',
  'aim',
  'find',
  'topic',
  'cluster',
  'inside',
  'corpus',
  'text',
  'like',
  'mail',
  'news',
  'article',
  'without',
  'knowing',
  'topic',
  'first'],
 ['topic',
  'modeling',
  'kind',
  'probabilistic',
  'generative',
  'model',
  'used',
  'widely',
  'field',
  'computer',
  'science',
  'specific',
  'focus',
  'text',
  'mining',
  'information',
  'retrieval',
  'recent',
  'year',
  'aim',
  'topic',
  'modeling',
  'discover',
  'theme',
  'run',
  'corpus',
  'analyzing',
  'word',
  'original',
  'text',
  'call',
  'theme'],
 ['machine',
  'learning',
  'natural',
  'language',
  'processing',
  'topic',
  'model',
  'type',
  'statistical',
  'model',
  'discovering',
  'abstract',
  'topic',
  'occur',
  'collection',
  'document',
  'topic',
  'modeling',
  'frequently',
  'used',
  'tool',
  'discovery',
  'hidden',
  'semantic',
  'structure',
  'text',
  'body',
  'intuitively',
  'given',
  'document',
  'particular',
  'to

In [17]:
bm25 = BM25(k=1.5, b=0.75)
bm25.fit(preprocessed)

In [18]:
query = 'topic modeling aim'
query = [word.lower() for word in word_tokenize(query) if word.isalpha() not in stop_words]

scores = bm25.search(query)

In [19]:
for score, document in zip(scores.values(), corpus):
    score = round(score, 3)
    print(document + '-----> ' + str(score))

Topic Modeling aims to find the topics (or clusters) inside a corpus of texts (like mails or news articles), without knowing those topics at first. -----> 2.0
Topic Modeling is a kind of a probabilistic generative model that has been used widely in the field of computer science with a specific focus on text mining and information retrieval in recent years. The aim of topic modeling is to discover the themes that run through a corpus by analyzing the words of the original texts. We call these themes \“topics.\” -----> 1.823
In machine learning and natural language processing, a topic model is a type of statistical model for discovering the abstract "topics" that occur in a collection of documents. Topic modeling is a frequently used text-mining tool for discovery of hidden semantic structures in a text body. Intuitively, given that a document is about a particular topic, one would expect particular words to appear in the document more or less frequently: "dog" and "bone" will appear mor