# Probabilistic Models

In [2]:
import os
import re
import pickle
import time
import numpy as np
from IndexerCACM import *
from IndexerQuery import *
from RelevantParser import *
from Query import *
from IRmodel import *
from Vector import *
from IRList import *
from EvalMeasure import *
from EvalIRModel import *
from copy import *

### Get Indexers

In [3]:
# Locations of the pre-processed collections (CACM, CISI) and of the CACM
# query and relevance files, relative to the notebook directory.
collectionPath, collectionPath2 = 'data/cacm/cacm.txt', 'data/cisi/cisi.txt'
queriesPath, relevantPath = 'data/cacm/cacm.qry', 'data/cacm/cacm.rel'

In [4]:
# Indexer over the CACM document collection.
indexer = IndexerCACM(collectionPath, ParserCACM())

# Uncomment on the first run, when the index, the inverted index and the
# collection-wide index have not been built yet.
#indexer.createRepIndex()
#indexer.createRepInvIndex()
#indexer.createRepInvFromAll()

# Indexer over the CACM queries (same parser as the documents).
queriesIndexer = IndexerQuery(queriesPath, ParserCACM())

# Uncomment on the first run, when the query index has not been built yet.
#queriesIndexer.createRepIndex()

# Indexer over the relevance file (presumably query id -> relevant documents;
# confirm against RelevantParser).
relevantIndexer = Indexer(relevantPath, RelevantParser())

# Uncomment on the first run, when the relevance index has not been built yet.
#relevantIndexer.createIndex()

In [5]:
# Build the queries with identifiers 1 and 10 from the query and relevance indexers.
q = query(1, queriesIndexer, relevantIndexer)
q2 = query(10, queriesIndexer, relevantIndexer)
# Show the raw text of the first query and its term -> count mapping
# (in the output below, key -1 appears to hold the total number of terms).
print(q.text, q.el)

(' \n  \n   1. Richard Alexander, Comp Serv, Langmuir Lab (TSS)    \n  \n   What articles exist which deal with TSS (Time Sharing System), an operating system for IBM computers?', {'comput': 1.0, 'ibm': 1.0, 'deal': 1.0, 'share': 1.0, 'system': 2.0, 'articl': 1.0, 'exist': 1.0, 'operat': 1.0, 'time': 1.0, 'tss': 1.0, -1: 11.0})


## Language Model

$P_M(s) = \prod_{i=1}^n p_M(s_i)$

where $P_M(s)$ is the probability of observing the sequence of words $s = (s_1, \dots, s_n)$ under the language model $M$.

We only use this measure to rank documents against a query, so any monotonically increasing transformation of it gives the same ranking; we can therefore work with its $\log$.

This leads to the following measure:

$f(q,d) = \log P_{M_d}(q) = \sum_{t\in q} tf(t,q)\log p_{M_d}(t)$

where $q$ is a query, $d$ a document, $t$ a term.

Here $p_{M_d}(t)$ is estimated as $\frac{tf(t,d)}{L(d)}$, where $tf(t,d)$ is the number of occurrences of $t$ in $d$ and $L(d)$ is the length of $d$.

**Sometimes a term $t$ of $q$ does not occur in $d$**, which leads to

$\log p_{M_d}(t) = -\infty$, i.e. $\log P_{M_d}(q) = -\infty$, even if the other terms of $q$ match $M_d$ well.

A good way to avoid this is to use a smoothed probability:

$\log p_M(t) = \log\left(\lambda\, p_{M_d}(t)+(1-\lambda)\, p_{M_C}(t)\right)$

where $M_C$ is the language model estimated on the whole corpus and $\lambda \in [0, 1]$ is the smoothing weight.

In [6]:
class LanguageModel(IRmodel):
    """Query-likelihood language model with linear-interpolation smoothing.

    Each query term contributes
    log(lbd * p_doc(t) + (1 - lbd) * p_collection(t)) to the score of every
    document, where p_doc is the relative frequency of the term in the
    document and p_collection its relative frequency in the whole collection.

    NOTE(review): every distinct query term is counted once; the query term
    frequency tf(t, q) is not used as a weight.
    """

    def __init__(self, indexer, lbd=0.5):
        """Store the indexer (through IRmodel) and the smoothing weight.

        indexer -- indexer of the document collection
        lbd     -- weight of the document model; (1 - lbd) is the weight of
                   the collection model
        """
        IRmodel.__init__(self, indexer)
        self.lbd = lbd

    def probModelDoc(self, id, element):
        """Return the relative frequency of `element` in document `id`.

        The term counts come from `self.indexer.getEfFromDoc(id)`; the entry
        stored under key -1 is used as the normalising total. Returns 0 when
        the document does not contain the term.
        """
        # Use the indexer this model was built with, not the notebook global.
        freq = self.indexer.getEfFromDoc(id)
        if element in freq:
            return freq[element]/float(freq[-1])
        return 0

    def probModelAll(self, element):
        """Return the relative frequency of `element` in the whole collection.

        Reads `self.indexer.repInvFromAll`, whose entry under key -1 is used
        as the normalising total. Returns 0 for an unknown term.
        """
        collection = self.indexer.repInvFromAll
        if element in collection:
            return collection[element]/float(collection[-1])
        return 0

    def getScores(self, query):
        """Score every indexed document against `query`.

        query -- mapping term -> count; the optional total stored under
                 key -1 is ignored. The caller's mapping is not modified.

        Returns a structured array with fields 'id' and 'score', one row per
        document, in the iteration order of `self.indexer.index`. Query terms
        absent from the inverted index are skipped.
        """
        query = copy(query)
        # Key -1 is a total, not a term; tolerate queries that do not carry it.
        query.pop(-1, None)
        # 'S25' is the same bytes dtype as the legacy 'a25' alias, which
        # NumPy 2.0 no longer accepts.
        scores = np.zeros(self.nDoc, [('id', 'S25'), ('score', 'float64')])

        # Fill the ids once, so they are set even when no query term is indexed.
        for i, docId in enumerate(self.indexer.index):
            scores[i]['id'] = docId

        for element in query:
            if element not in self.indexer.invIndex:
                continue
            docFromElement = self.indexer.getDfFromEl(element)
            # Collection-model part of the mixture: identical for every document.
            background = (1 - self.lbd)*self.probModelAll(element)
            for i, docId in enumerate(self.indexer.index):
                if docId in docFromElement:
                    scores[i]['score'] += np.log(
                        self.lbd*self.probModelDoc(docId, element) + background)
                else:
                    # Document without the term: only the collection model
                    # contributes (this is log(0) = -inf when lbd == 1).
                    scores[i]['score'] += np.log(background)

        return np.array(scores)
                

In [7]:
# Rank the collection for the first query with the default smoothing weight
# (lbd=0.5) and show the first ten entries of the ranking.
Lm = LanguageModel(indexer)
scores =  Lm.getRanking(q.el)
scores[:10]

3.92251515388


array([('2319', -67.93184600084517), ('2621', -68.2297030905898),
       ('3025', -69.48244739727727), ('322', -69.71678322154878),
       ('1506', -70.26094887662569), ('2625', -70.54158428890301),
       ('1544', -70.66576729306766), ('2371', -70.77094753757163),
       ('1605', -71.20517610214819), ('2632', -71.4525942458842)], 
      dtype=[('id', 'S25'), ('score', '<f8')])

## Okapi BM25 Model

$f(d,q) = \sum_{t\in q}idf'(t)\frac{(k_1+1)\,tf(t,d)}{k_1\left((1-b)+b\,L(d)/L_{mean}\right)+tf(t,d)}$ where,

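$idf'(t) = \max\left(0, \log\frac{N-df(t)+0.5}{df(t)+0.5}\right)$ is the probabilistic inverse document frequency, with $N$ the number of documents in the collection and $df(t)$ the number of documents containing $t$.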

$L(d)$ is the length of document $d$ and $L_{mean}$ is the mean document length over the collection.

$k_1 \in [1.2, 2.0]$ and $b = 0.75$ are free parameters.

In [11]:
class Okapi(IRmodel):
    """Okapi BM25 ranking model.

    score(d, q) = sum over query terms t of
        idf'(t) * (k + 1) * tf(t, d) / (k * ((1 - b) + b * L(d) / lMean) + tf(t, d))
    """

    def __init__(self, indexer, k=1.5, b=0.75):
        """Store the parameters and precompute the collection statistics.

        indexer -- indexer of the document collection
        k       -- term-frequency saturation parameter (k_1 in the formula)
        b       -- weight of the document-length normalisation
        """
        IRmodel.__init__(self, indexer)
        self.k = k
        self.b = b

        # Number of documents, used in idf() and to size the score array.
        self.nDoc = len(indexer.indexFromCol)

        # Mean document length; the length of a document is the entry stored
        # under key -1 of its index record. float() prevents integer
        # truncation under Python 2.
        totalLength = sum(indexer.index[docId][-1] for docId in indexer.index)
        self.lMean = totalLength/float(len(indexer.index))

    def idf(self, elements):
        """Return a dict term -> probabilistic idf, clipped at 0.

        Terms absent from the inverted index get an idf of 0.
        """
        result = {}

        for element in elements:
            # Use the indexer this model was built with, not the notebook global.
            if element in self.indexer.invIndex:
                df = len(self.indexer.getDfFromEl(element))
                result[element] = \
                    max(0, np.log((self.nDoc-df+0.5) / (df+0.5)))
            else:
                result[element] = 0

        return result

    def getScores(self, query):
        """Score every indexed document against `query`.

        query -- mapping term -> count; the optional total stored under
                 key -1 is ignored. The caller's mapping is not modified.

        Returns a structured array with fields 'id' and 'score', one row per
        document, in the iteration order of `self.indexer.index`. Documents
        that contain none of the query terms keep a score of 0.
        """
        query = copy(query)
        # Key -1 is a total, not a term; tolerate queries that do not carry it.
        query.pop(-1, None)
        # 'S25' is the same bytes dtype as the legacy 'a25' alias, which
        # NumPy 2.0 no longer accepts.
        scores = np.zeros(self.nDoc, [('id', 'S25'), ('score', 'float64')])

        # Fill the ids once, so they are set even when no query term is indexed.
        for i, docId in enumerate(self.indexer.index):
            scores[i]['id'] = docId

        idf = self.idf(query)

        for element in query:
            if element not in self.indexer.invIndex:
                continue
            # Posting list of the term: looked up once per term, not per document.
            docFromElement = self.indexer.getDfFromEl(element)
            for i, docId in enumerate(self.indexer.index):
                if docId not in docFromElement:
                    continue
                # Term counts are only needed for documents containing the term.
                tf = self.indexer.getEfFromDoc(docId)
                scores[i]['score'] \
                    += idf[element] \
                    * ((self.k + 1) * tf[element]) \
                    / (self.k * ((1 - self.b) + self.b * tf[-1]
                                 / self.lMean) + tf[element])

        return np.array(scores)

In [12]:
# Rank the collection for the first query with BM25 (default k=1.5, b=0.75)
# and show the first ten entries of the ranking.
# NOTE(review): `Lm` and `scores` are rebound here and no longer refer to the
# language-model objects created in the earlier cell.
Lm = Okapi(indexer)
scores =  Lm.getRanking(q.el)
scores[:10]

21.2990880013


array([('4010', 22.76645960714305), ('2319', 21.018766465556922),
       ('3546', 20.688723765450447), ('2625', 19.951332108288458),
       ('4048', 19.79217919159696), ('3644', 19.528324547205344),
       ('2632', 19.309167312890654), ('3442', 19.126193736402666),
       ('2950', 18.282477629556013), ('4109', 18.222217182478776)], 
      dtype=[('id', 'S25'), ('score', '<f8')])

## Optimisation

In [None]:
# One LanguageModel per smoothing weight: 20 values evenly spaced in [0, 1].
# NOTE(review): both endpoints are degenerate. With lbd=0 the document model is
# ignored, so every document gets the same score; with lbd=1 there is no
# smoothing and log(0) is taken for documents missing a query term.
models = [LanguageModel(indexer, lbd=i) for i in np.linspace(0,1,20)]
# Keep only the queries whose identifier has an entry in the relevance index.
queries = [query(i, queriesIndexer, relevantIndexer) for i in queriesIndexer.index if i in relevantIndexer.indexFromCol]
measures = [EvalPrecisionAverage, EvalPrecisionRecall]

# Evaluate every model on every query with every measure.
EM = EvalIRModel(models, queries, measures)
results = EM.getResults()

  np.in1d(self.irlist.scores[:i]['id'], self.irlist.query.relevants).sum()


khachian not in language!
im not in language!
rap not in language!
adaba not in language!
co-cit not in language!
pram not in language!
ecl not in language!
class-complet not in language!
topographi not in language!
tcoll not in language!
multi-target not in language!
udo not in language!
window-manag not in language!
fault-toler not in language!


In [9]:
# Raw evaluation results returned by EvalIRModel.getResults().
# (pickle is already imported in the first cell and is not used here.)
print(results)


[[[  5.22672494e-04   1.19966509e-03   8.61168794e-04   1.14579745e-07]
  [  9.32051766e-04   1.74332957e-03   1.33769067e-03   1.64542918e-07]
  [  2.37414816e-03   3.19871118e-03   2.78642967e-03   1.69976044e-07]
  ..., 
  [  0.00000000e+00   5.42005420e-04   2.71002710e-04   7.34424688e-08]
  [  2.15081639e-03   3.78861887e-03   2.96971763e-03   6.70599237e-07]
  [  0.00000000e+00   1.36239782e-03   6.81198910e-04   4.64031955e-07]]

 [[  5.22672494e-04   1.19966509e-03   8.61168794e-04   1.14579745e-07]
  [  9.32051766e-04   1.74332957e-03   1.33769067e-03   1.64542918e-07]
  [  2.37414816e-03   3.19871118e-03   2.78642967e-03   1.69976044e-07]
  ..., 
  [  0.00000000e+00   5.42005420e-04   2.71002710e-04   7.34424688e-08]
  [  2.15081639e-03   3.78861887e-03   2.96971763e-03   6.70599237e-07]
  [  0.00000000e+00   1.36239782e-03   6.81198910e-04   4.64031955e-07]]

 [[  5.22672494e-04   1.19966509e-03   8.61168794e-04   1.14579745e-07]
  [  9.32051766e-04   1.74332957e-03   1.337

In [22]:
# NOTE(review): the value of this bare expression is discarded, because it is
# not the last statement of the cell.
len(queriesIndexer.index)
# Same list as in the optimisation cell: queries whose identifier has an
# entry in the relevance index.
queries = [query(i, queriesIndexer, relevantIndexer) for i in queriesIndexer.index if i in relevantIndexer.indexFromCol]
print(queries)

[<Query.Query object at 0x7f32240d1510>, <Query.Query object at 0x7f32240d1410>, <Query.Query object at 0x7f32240d1110>, <Query.Query object at 0x7f32241ed850>, <Query.Query object at 0x7f32241ed8d0>, <Query.Query object at 0x7f32241ed910>, <Query.Query object at 0x7f32241ed990>, <Query.Query object at 0x7f32241ed950>, <Query.Query object at 0x7f32241ed9d0>, <Query.Query object at 0x7f32241eda10>, <Query.Query object at 0x7f32241eda50>, <Query.Query object at 0x7f32241eda90>, <Query.Query object at 0x7f32241edad0>, <Query.Query object at 0x7f32241edb50>, <Query.Query object at 0x7f32241edb10>, <Query.Query object at 0x7f32241edc10>, <Query.Query object at 0x7f32241edc50>, <Query.Query object at 0x7f32241edbd0>, <Query.Query object at 0x7f32241edc90>, <Query.Query object at 0x7f32241edb90>, <Query.Query object at 0x7f32241edd10>, <Query.Query object at 0x7f32241edd50>, <Query.Query object at 0x7f32241edcd0>, <Query.Query object at 0x7f32241edd90>, <Query.Query object at 0x7f32241eddd0>,

In [11]:
# Building query 35 raises ValueError ('Bad Identifier'). Catch it so the
# notebook still runs from top to bottom, and report the error instead.
try:
    print(query(35, queriesIndexer, relevantIndexer))
except ValueError as err:
    print('ValueError: %s' % err)

ValueError: Bad Identifier