# Probabilistic Models

In [1]:
import os
import re
import pickle
import time
import numpy as np
from IndexerCACM import *
from RelevantParser import *
from Query import *
from IRmodel import *
from Vector import *
from copy import *

### Get Indexers

In [2]:
# Processed collections 
collectionPath = 'data/cacm/cacm.txt'
collectionPath2 = 'data/cisi/cisi.txt'
queriesPath = 'data/cacm/cacm.qry'
relevantPath = 'data/cacm/cacm.rel'

In [3]:
indexer = IndexerCACM(collectionPath, ParserCACM())

# If Index and Inv Index aren't already builded
#indexer.createRepIndex()
#indexer.createRepInvIndex()
#indexer.createRepInvFromAll()

queriesIndexer = IndexerCACM(queriesPath, ParserCACM())

# If Index isn't already builded
#queriesIndexer.createRepIndex()

relevantIndexer = Indexer(relevantPath, RelevantParser())

# If Index and Inv Index aren't already builded
#relevantIndexer.createIndex()

In [4]:
q = query(1, queriesIndexer, relevantIndexer)
q2 = query(10, queriesIndexer, relevantIndexer)
print(q, q2)

(<Query.Query object at 0x7fa817ae7d10>, <Query.Query object at 0x7fa844448410>)


$P_M(s) = \prod_{i=1}^n p_M(s_i)$

where $P_M(s)$ probability to observe sequence of words $s$ with language model $M$.

We want to use this measure to compare documents with a query so we can be satisfied with something proportional to this (so we can use $\log$).

It leads to this measure :

$f(q,d) = \log P_{M_d}(q) = \sum_{t\in q} tf(t,q)\log p_{M_d}(t)$

where $q$ is a query, $d$ a document, $t$ a term.

And we have $p_{M_d}(t)$ equal to $\frac{tf(t,d)}{L(d)}$

** Sometimes t from q isn't in d ** it leads to,

$\log p_M(t) = -\infty $ cad $\log P_{M_d}(q) = -\infty$ even if other terms from q give a good similarity with $M_d$.

Good method to avoid that is a smoothed prob :

$\log p_M(t) = \log(\lambda p_{M_d}(t)+(1-\lambda)p_{M_c}(t)))$

where $M_C$ is the model language on the whole corpus.

In [15]:
class LanguageModel(IRmodel):
    
    def __init__(self, indexer, lbd=1):

        IRmodel.__init__(self, indexer)
        self.lbd = lbd
    
    def logPm(self, id, element):
        
        freq = indexer.getEfFromDoc(id)
        if element in freq:
            return np.log(freq[element]/freq[-1])
        else:
            return 0
    
    def logPmAll(self, element):
        
        if element in indexer.repInvFromAll:
            return indexer.repInvFromAll[element] \
                /indexer.repInvFromAll[-1]
        else:
            return 0
            
    def getScores(self, query):
        
        scores = np.zeros(self.nDoc, [('id', 'a25'), ('score', 'float64')])
        
        i = 0
        for id in self.indexer.index:
            
            score = 0
            for element in query:
                
                score += np.log(self.lbd*self.logPm(id, element) \
                    + (1 - self.lbd)*self.logPmAll(element))
            
            scores[i]['score'] = score
            
            i += 1
                
        return np.array(scores)
                

In [16]:
Lm = LanguageModel(indexer)
scores =  Lm.getRanking(q.el)
scores[:10]



21.9315989017


array([('', nan), ('', nan), ('', nan), ('', nan), ('', nan), ('', nan),
       ('', nan), ('', nan), ('', nan), ('', nan)], 
      dtype=[('id', 'S25'), ('score', '<f8')])