# Query Likelihood Model

You are provided with a corpus and a query:

In [1]:
document0 = 'click go the shears boys click click click'
document1 = 'click click'
document2 = 'metal here'
document3 = 'metal shears click here'
corpus = [document0, document1, document2, document3]

query = 'click shears'

## Calculate the probabilities $𝑃_{𝑀_𝑑}(𝑡)$ and $𝑃_{𝑀_𝑐}(𝑡)$ for all terms and documents

In [2]:
def get_unique_words(corpus):
    words = []
    for doc in corpus:
        words += doc.split()
        
    return list(set(words))

In [3]:
unique_words = get_unique_words(corpus)
unique_words

['here', 'go', 'shears', 'boys', 'the', 'click', 'metal']

In [4]:
def term_probability_for_document(term, document):
    """ 
        The probability of term occurring in document.
        
        The number of occurences of given term in given document 
        divided by total number of terms in this document.
    """
    terms_list = document.split()
    
    return terms_list.count(term)/len(terms_list)

In [5]:
for word in unique_words:
    for i in range(len(corpus)):
        print('P(',word,'| D',i+1,'): ', term_probability_for_document(word, corpus[i]))
    print('\n')

P( here | D 1 ):  0.0
P( here | D 2 ):  0.0
P( here | D 3 ):  0.5
P( here | D 4 ):  0.25


P( go | D 1 ):  0.125
P( go | D 2 ):  0.0
P( go | D 3 ):  0.0
P( go | D 4 ):  0.0


P( shears | D 1 ):  0.125
P( shears | D 2 ):  0.0
P( shears | D 3 ):  0.0
P( shears | D 4 ):  0.25


P( boys | D 1 ):  0.125
P( boys | D 2 ):  0.0
P( boys | D 3 ):  0.0
P( boys | D 4 ):  0.0


P( the | D 1 ):  0.125
P( the | D 2 ):  0.0
P( the | D 3 ):  0.0
P( the | D 4 ):  0.0


P( click | D 1 ):  0.5
P( click | D 2 ):  1.0
P( click | D 3 ):  0.0
P( click | D 4 ):  0.25


P( metal | D 1 ):  0.0
P( metal | D 2 ):  0.0
P( metal | D 3 ):  0.5
P( metal | D 4 ):  0.25




In [None]:
for i in range(len(corpus)):
    for word in unique_words:
        print('P(',word,'| D',i+1,'): ', term_probability_for_document(word, corpus[i]))
        
    print('\n')

P( here | D 1 ):  0.0
P( go | D 1 ):  0.125
P( shears | D 1 ):  0.125
P( boys | D 1 ):  0.125
P( the | D 1 ):  0.125
P( click | D 1 ):  0.5
P( metal | D 1 ):  0.0


P( here | D 2 ):  0.0
P( go | D 2 ):  0.0
P( shears | D 2 ):  0.0
P( boys | D 2 ):  0.0
P( the | D 2 ):  0.0
P( click | D 2 ):  1.0
P( metal | D 2 ):  0.0


P( here | D 3 ):  0.5
P( go | D 3 ):  0.0
P( shears | D 3 ):  0.0
P( boys | D 3 ):  0.0
P( the | D 3 ):  0.0
P( click | D 3 ):  0.0
P( metal | D 3 ):  0.5


P( here | D 4 ):  0.25
P( go | D 4 ):  0.0
P( shears | D 4 ):  0.25
P( boys | D 4 ):  0.0
P( the | D 4 ):  0.0
P( click | D 4 ):  0.25
P( metal | D 4 ):  0.25




In [None]:
def term_probability_for_corpus(term, corpus):
    """ 
        The number of occurences of given term in given corpus 
        divided by total number of terms in this corpus.
    """
    count = 0
    count_all = 0
    for doc in corpus:
        words = doc.split() 
        count_all += len(words)
        for word in words:
            if word == term:
                count += 1

    return count / count_all

In [None]:
for word in unique_words:
    print('P(',word,'|corpus): ', term_probability_for_corpus(word, corpus))

## Calculate the ranked result set according to the un-smoothed, uniform model $𝑃_{𝑢𝑛𝑖}(𝑞 ∣ 𝑑)$

In [None]:
def query_probability_unsmoothed_for_doc(query, document):
    """ 
    - for each query term multiply probabilities of term occurring in the document
    - if at least one of the query words doesn't appear in the document the result is 0
    """
    if query == '':
        return 0
    
    result = 1
    for term in query.split():
        result *= term_probability_for_document(term, document)
    
    return result

In [None]:
from operator import itemgetter

rankings = []
for i in range(len(corpus)):
    rankings += [(i, query_probability_unsmoothed_for_doc(query, corpus[i]))]
    
rankings.sort(key=itemgetter(1), reverse=True)

print('Ranked documents for unsmoothed uniform model:\n',['D'+str(d+1)+'('+str(score)+')' for d, score in rankings])

## Calculate the ranked result set according to the linear-interpolated, uniform model $𝑃_{𝑖𝑛𝑡𝑒𝑟𝑝−𝑢𝑛𝑖}(𝑞 ∣ 𝑑)$ with 𝜆 = 0.5

In [None]:
def query_probability_linear_interpolated(query, document, corpus, coef):
    if query == '':
        return 0
    
    result = 1
    for term in query.split():
        result *= coef * term_probability_for_document(term, document) + \
                (1 - coef) * term_probability_for_corpus(term, corpus)
    
    return result

In [None]:
from operator import itemgetter

rankings = []
coef = 0.5
for i in range(len(corpus)):
    rankings += [(i, query_probability_linear_interpolated(query, corpus[i], corpus, coef))]
    
rankings.sort(key=itemgetter(1), reverse=True)

print('Ranked documents for linear-interpolated uniform model\nwith lambda=', coef,':\n',['D'+str(d+1)+'('+str(score)+')' for d, score in rankings])