# Scratchpad for BM25

In [1]:
from __future__ import division
import math
import numpy as np

In [2]:
# Simple tokenize per word function
tokenize = lambda doc: doc.lower().split(" ")

In [3]:
# Sample documents 
# Note: Edited doc_1 purposely to show failure of tfidf later
document_0 = "China has a strong economy that is growing at a rapid pace. However politically it differs greatly from the US Economy."
document_1 = "At last, China seems serious about confronting an endemic problem: domestic violence and corruption.#Rohan edit China China China China"
document_2 = "Japan's prime minister, Shinzo Abe, is working towards healing the economic turmoil in his own country for his view on the future of his people."
document_3 = "Vladimir Putin is working hard to fix the economy in Russia as the Ruble has tumbled."
document_4 = "What's the future of Abenomics? We asked Shinzo Abe for his views"
document_5 = "Obama has eased sanctions on Cuba while accelerating those against the Russian Economy, even as the Ruble's value falls almost daily."
document_6 = "Vladimir Putin was found to be riding a horse, again, without a shirt on while hunting deer. Vladimir Putin always seems so serious about things - even riding horses."

# List of all documents
all_documents = [document_0, document_1, document_2, document_3, document_4, document_5, document_6]

# Tokenize each document
tokenized_documents = [tokenize(d) for d in all_documents]

# print tokenized_documents[0]

In [4]:
# Created 'query' assuming all the preprocessing is done.
query = ['china','strong','economy'] # A part of doc_0

In [5]:
# Get the term frequencies for each word in the query against all docs as a sparse matrix
def get_term_frequencies(query, tokenized_document):
    lis = []
    m = len(tokenized_document)
    n = len(query)
    for i in range(len(tokenized_document)):
        for j in query:
            lis.append(tokenized_document[i].count(j))
    term_frequencies = np.array(lis).reshape(m,n)
    return term_frequencies

In [6]:
tf = get_term_frequencies(query,tokenized_documents)
tf

array([[1, 1, 1],
       [5, 0, 0],
       [0, 0, 0],
       [0, 0, 1],
       [0, 0, 0],
       [0, 0, 0],
       [0, 0, 0]])

* Note:
> doc_1 has higher tf. But our query is for doc_0

In [7]:
# Calculate the IDF for each word against the all documents. Save it as a key-value pair
def inverse_document_frequencies(tokenized_documents):
    idf_values = dict()
    all_tokens_set = set([item for sublist in tokenized_documents for item in sublist])
    for tkn in all_tokens_set:
        contains_token = map(lambda doc: tkn in doc, tokenized_documents)
        idf_values[tkn] = 1 + math.log(len(tokenized_documents)/(sum(contains_token)))
    return idf_values

In [8]:
# Get the idf vector for our query from the IDF dictionary
def get_idfs_for_query(query=['china','strong','economy']):
    idf_values = inverse_document_frequencies(tokenized_documents)
    lis = []
    n = len(query)
    for token in query:
        for key, value in idf_values.items():
            if token == key:
#                 print token,value
                lis.append(value)
    idfs_for_query = np.array(lis).reshape(n,1)
    return idfs_for_query

In [9]:
idf = get_idfs_for_query()
idf

array([[ 2.25276297],
       [ 2.94591015],
       [ 2.25276297]])

In [10]:
# Get the tfidf scores for each token in the query
def get_tfidf(tf,idf):
    tfidf = np.dot(tf,idf) #initialise tfidf
    return tfidf

In [11]:
tfidf = get_tfidf(tf,idf)
tfidf

array([[  7.45143609],
       [ 11.26381484],
       [  0.        ],
       [  2.25276297],
       [  0.        ],
       [  0.        ],
       [  0.        ]])

* Note:
> According to TFIDF, doc_1 is the best recommendation, while our query is more relevant to doc_0

# BM25
***
Formula

> BM25 = IDF * ((k + 1) * tf) / (k * (1.0 - b + b * (L) + tf)

- k = 1.2
- b = 0.75
- L = |d|/avgDl  ---> DocumentLength/Avg(DocumentLength)

Ref : http://opensourceconnections.com/blog/2015/10/16/bm25-the-next-generation-of-lucene-relevation/

In [12]:
# Avg length of all documents <for BM25>
totlen = 0
for i in all_documents:
    totlen += len(i)
avglen = totlen/len(all_documents)

# L = (document length)/(average length of all doc) <for BM25>
L = list(len(i)/avglen for i in all_documents)

# Constants <for BM25>
k = 1.2
b = 0.75

In [13]:
# Get the bm25 scores 
def get_bm25_scores(tf,idf):
    bm25 = 0
    lis = []
    for i in range(len(tf)):
        for j in range(len(query)):
            x = (idf[j]*((k+1)*tf[i][j])/(k*(1.0-b+b*L[i]+tf[i][j])))
            bm25 = bm25 + x
        lis.append(bm25)
        x = 0
        bm25 = 0
    return lis

In [14]:
bm25vec = get_bm25_scores(tf,idf)
bm25vec

[array([ 6.87308484]),
 array([ 3.39265389]),
 array([ 0.]),
 array([ 2.32436241]),
 array([ 0.]),
 array([ 0.]),
 array([ 0.])]

* Note:
> BM25 gives high weightage to doc_0 as it is the most relevant for our query

# What next ?
***

- For Relevancy:
I was thinking, we could get these scores next to their corresponding documents, sort them and pick the best 3 ?

- For Sentiments:
Before bm25, sentiments would go as a new column altogether.