# Unigram LM with Jelinek-Mercer-Smoothing


In [5]:
import pandas as pd
import numpy as np
import pickle
# Add the directory containing collection_vocabulary.py to the module search path (see sys.path.append below); otherwise the col.pkl object cannot be unpickled
import sys
sys.path.append('../0_Collection_and_Inverted_Index/')
# Load the pickled collection object and the inverted index from the
# 0_Collection_and_Inverted_Index directory.
# The file handle is called `col_file` (not `input`) so the builtin `input()`
# is not shadowed for the remainder of the notebook.
# NOTE: unpickling can execute arbitrary code - only load pickle files you trust.
with open('../0_Collection_and_Inverted_Index/pickle/col.pkl', 'rb') as col_file:
    col = pickle.load(col_file)
inverted_index = pd.read_pickle('../0_Collection_and_Inverted_Index/pickle/inverted_index.pkl')

### Global Language Model
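The global language model describes how likely each word is across the whole corpus: $P(w \mid C) = \frac{\text{count of } w \text{ in the collection}}{\text{collection length}}$.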

In [6]:
# Collection-wide probability per row of the inverted index: the row total
# divided by the collection length.
# (Equivalent: inverted_index.sum(axis=1) / inverted_index.sum(axis=1).sum())
global_LM = inverted_index.sum(axis=1).div(col.collection_length)

### Local Language Models
We want to obtain a language model for each document in the collection; therefore, we normalise each column of the *inverted_index* dataframe (one column per document) by its column sum.

In [7]:
# Normalise every column by its own total, so each column becomes a
# probability distribution.
local_LMs = inverted_index.div(inverted_index.sum(axis=0), axis='columns')
# Sanity check: each column should sum to 1, so the grand total over all
# columns should equal the collection size (3633).
local_LMs.sum(axis=0).sum()

3633.0

### Unigram LM with J-M-Smoothing
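Jelinek-Mercer smoothing linearly interpolates the local (document) LM with the global (collection) LM: $P(w \mid d) = \lambda \, P_{local}(w \mid d) + (1 - \lambda) \, P_{global}(w)$. As introduced in the lecture, we assign equal weights to the global and local LMs, i.e. $\lambda = 0.5$.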

In [8]:
# Jelinek-Mercer smoothing: linear interpolation of the local and global models.
# JM_LAMBDA is the weight of the local model; 0.5 weights both models equally
# and is numerically the same as (local_LMs + global_LM) / 2.
JM_LAMBDA = 0.5
# Vectorised instead of column-wise .apply(lambda ...): with axis=0, global_LM
# is aligned on the row index and broadcast across every column.
unigram_LM = local_LMs.mul(JM_LAMBDA).add(global_LM.mul(1 - JM_LAMBDA), axis=0)
unigram_LM.to_pickle('pickle/unigramLM.pkl')  # write the unigram LM to disk

In [9]:
# Sanity check: the probabilities within every document should sum to one,
# so the total over all documents should come out at 3633.
unigram_LM.sum(axis=0).sum()

3633.0000000012674

In [10]:
# Sanity check: no probability may be negative, so this should evaluate to False.
smallest_probability = unigram_LM.min(axis=0).min()
smallest_probability < 0

False

In [11]:
# Sanity check: since we are smoothing, every probability must be strictly
# positive - in particular there must be no zeros. True intended.
# Note: isnull() would only detect NaN and can never find a zero, so the values
# are compared against 0 directly; a NaN also fails `> 0` and is still caught.
(unigram_LM > 0).all(axis=0).all()

True

In [12]:
# Disabled cell: the log-space variant below is wrapped in a string literal, so
# none of it is executed; the notebook only echoes the string as cell output.
# NOTE(review): if this is ever revived, `inverted_index.as_matrix` is
# referenced without being called, and `np.log(..., out=..., where=...)` is
# applied to individual scalars via `applymap` - verify before use.
'''
omitted: operating in log-space to avoid numerical instability
global_LM_log_space=np.log(global_LM)
local_LMs_log_space=local_LMs.applymap(lambda x: np.log(x, out=np.zeros_like(inverted_index.as_matrix),where=x!=0))

 '''

'\nomitted: operating in log-space to avoid numerical instability\nglobal_LM_log_space=np.log(global_LM)\nlocal_LMs_log_space=local_LMs.applymap(lambda x: np.log(x, out=np.zeros_like(inverted_index.as_matrix),where=x!=0))\n\n '