# Text Representation

In [1]:
import os
import re
import time
import pickle
import numpy as np
from IndexerCACM import *
from RelevantParser import *
from Query import *
from copy import *

### Get Indexers

In [2]:
# Processed collections 
collectionPath = 'data/cacm/cacm.txt'
collectionPath2 = 'data/cisi/cisi.txt'
queriesPath = 'data/cacm/cacm.qry'
relevantPath = 'data/cacm/cacm.rel'

In [3]:
indexer = IndexerCACM(collectionPath, ParserCACM())

# If Index and Inv Index aren't already builded
#indexer.createRepIndex()
#indexer.createRepInvIndex()
indexer.createRepInvFromAll()

queriesIndexer = IndexerCACM(queriesPath, ParserCACM())

# If Index isn't already builded
#queriesIndexer.createRepIndex()

relevantIndexer = Indexer(relevantPath, RelevantParser())

# If Index and Inv Index aren't already builded
#relevantIndexer.createIndex()

In [4]:
q = query(1, queriesIndexer, relevantIndexer)
q2 = query(10, queriesIndexer, relevantIndexer)
print(q, q2)

(<Query.Query object at 0x7fd0880a3690>, <Query.Query object at 0x7fd05c811850>)


# Weighter

In [5]:
class Weighter(object):
    
    def __init__(self, indexer):
        
        # Indexer is an Indexer object
        self.indexer = indexer
        
        self.nDoc = len(indexer.indexFromCol)
        self.loadIndex = {}
        
        
    def idf(self, elements):
        
        result = {}
        
        for element in elements:
            if element in indexer.invIndex:
                result[element] = \
                np.log(self.nDoc / float(len(indexer.getDfFromEl(element))-1))
            else:
                result[element] = 0
        
        return result
        
    def loadWeightsFromDoc(self, name):
        
        end = re.search(r'\..*?$', collectionPath).group(0)
        self.path = re.sub(r'\..*?$', name, indexer.collectionPath)+end
        self.indexPath = re.sub(r'\..*?$', 'Index', self.path)+end
        
        if os.path.isfile(self.indexPath):
            indexFile = open(self.indexPath)
            self.loadIndex = pickle.load(indexFile)
            indexFile.close()
        else:
            weightsFile = open(self.path, "w")
            pos = 0
            for id in self.indexer.index:
                toWrite = ''
                elements = self.computeWeightsFromDoc(id)
                elements.pop(-1)
                for element in elements:
                    toWrite += ':'+element+':'+str(elements[element])
                toWrite = toWrite[1:]
                # Get pos in index and size of current rep
                self.loadIndex[id] = [pos, len(toWrite)]
                
                weightsFile.write(toWrite)
                pos += len(toWrite)

            weightsFile.close()
            
            # Indexes'hashtable of doc in col
            indexFile = open(self.indexPath, "w")
            pickle.dump(self.loadIndex, indexFile)
        
            indexFile.close()
            
    def getWeightsFromDoc(self, id):
        
        data = self.indexer.getData(self.path, self.loadIndex, id)
        return self.indexer.freqFromData(data)
        
    def computeWeightsFromDoc(self, id):
        
        raise ValueError('Abstract method')
    
    def getWeightsFromQuery(self, query):
        
        raise ValueError('Abstract method')
        
            
class Weighter1(Weighter):
    
    def __init__(self, indexer):
        
        Weighter.__init__(self, indexer)
        name = 'Weighter1'
        self.loadWeightsFromDoc(name)
        
    def computeWeightsFromDoc(self, id):
        
        return indexer.getEfFromDoc(id)
    
    def getWeightsFromQuery(self, query):
        
        weights = copy(query)
        
        for element in weights:
            weights[element] = 1
        
        return weights
        
class Weighter2(Weighter):
    
    def __init__(self, indexer):
        
        Weighter.__init__(self, indexer)
        name = 'Weighter2'
        self.loadWeightsFromDoc(name)
        
    def computeWeightsFromDoc(self, id):
        
        return indexer.getEfFromDoc(id)
    
    def getWeightsFromQuery(self, query):
        
        weights = copy(query)
        
        return weights
        
class Weighter3(Weighter):
    
    def __init__(self, indexer):
        
        Weighter.__init__(self, indexer)
        name = 'Weighter3'
        self.loadWeightsFromDoc(name)
        
    def computeWeightsFromDoc(self, id):
        
        return indexer.getEfFromDoc(id)
    
    def getWeightsFromQuery(self, query):
        
        return self.idf(query)

class Weighter4(Weighter):
    
    def __init__(self, indexer):
        
        Weighter.__init__(self, indexer)
        name = 'Weighter4'
        self.loadWeightsFromDoc(name)
        
    def computeWeightsFromDoc(self, id):
        
        weights = indexer.getEfFromDoc(id)
        
        for element in weights:
            weights[element] = 1 + np.log(weights[element])
            
        return weights
    
    def getWeightsFromQuery(self, query):
        
        return self.idf(query)
                       
class Weighter5(Weighter):
    
    def __init__(self, indexer):
        
        Weighter.__init__(self, indexer)
        name = 'Weighter5'
        self.loadWeightsFromDoc(name)
        
    def computeWeightsFromDoc(self, id):
        
        weights = indexer.getEfFromDoc(id)
        
        idf = self.idf(weights)
        
        for element in weights:
            weights[element] = \
            (1 + np.log(weights[element])) * idf[element]
            
        return weights
    
    def getWeightsFromQuery(self, query):
        
        weights = copy(query)
        
        idf = self.idf(weights)
        
        for element in weights:
            weights[element] = \
            (1 + np.log(weights[element])) * idf[element]
            
        return weights


In [76]:
w = Weighter1(indexer)
print(w.getWeightsFromDoc(20))
print('\n')
print(w.getWeightsFromQuery(q.el))

{'and': 1.0, 'acceler': 1.0, 'process': 1.0, 'solut': 1.0, 'ca': 1.0, 'procedur': 1.0, 'an': 3.0, 'rate': 1.0, 'if': 2.0, 'wegstein': 1.0, 'techniqu': 1.0, 'for': 1.0, 'diverg': 1.0, 'when': 1.0, 'to': 1.0, 'which': 1.0, 'appli': 1.0, 'is': 2.0, 'pm': 1.0, 'march': 1.0, 'induc': 1.0, 'given': 1.0, 'discuss': 1.0, 'jb': 1.0, 'of': 3.0, 'iter': 4.0, 'converg': 4.0, 'exampl': 1.0, 'equat': 1.0, 'illustr': 1.0, 'accelerat': 1.0, 'the': 4.0, -1: 47.0}


{'comput': 1, 'deal': 1, 'comp': 1, 'share': 1, 'for': 1, 'lab': 1, 'an': 1, 'exist': 1, 'langmuir': 1, 'with': 1, 'serv': 1, 'what': 1, 'alexand': 1, 'ibm': 1, 'richard': 1, 'system': 1, 'articl': 1, 'operat': 1, 'which': 1, 'time': 1, 'tss': 1, -1: 1}


In [77]:
w = Weighter2(indexer)
print(w.getWeightsFromDoc(20))
print('\n')
print(w.getWeightsFromQuery(q.el))

{'and': 1.0, 'acceler': 1.0, 'process': 1.0, 'solut': 1.0, 'ca': 1.0, 'procedur': 1.0, 'an': 3.0, 'rate': 1.0, 'if': 2.0, 'wegstein': 1.0, 'techniqu': 1.0, 'for': 1.0, 'diverg': 1.0, 'when': 1.0, 'to': 1.0, 'which': 1.0, 'appli': 1.0, 'is': 2.0, 'pm': 1.0, 'march': 1.0, 'induc': 1.0, 'given': 1.0, 'discuss': 1.0, 'jb': 1.0, 'of': 3.0, 'iter': 4.0, 'converg': 4.0, 'exampl': 1.0, 'equat': 1.0, 'illustr': 1.0, 'accelerat': 1.0, 'the': 4.0, -1: 47.0}


{'comput': 1.0, 'deal': 1.0, 'comp': 1.0, 'share': 1.0, 'for': 1.0, 'lab': 1.0, 'an': 1.0, 'exist': 1.0, 'langmuir': 1.0, 'with': 1.0, 'serv': 1.0, 'what': 1.0, 'alexand': 1.0, 'ibm': 1.0, 'richard': 1.0, 'system': 2.0, 'articl': 1.0, 'operat': 1.0, 'which': 1.0, 'time': 1.0, 'tss': 2.0, -1: 23.0}


In [78]:
w = Weighter3(indexer)
print(w.getWeightsFromDoc(20))
print('\n')
print(w.getWeightsFromQuery(q.el))

{'and': 1.0, 'acceler': 1.0, 'process': 1.0, 'solut': 1.0, 'ca': 1.0, 'procedur': 1.0, 'an': 3.0, 'rate': 1.0, 'if': 2.0, 'wegstein': 1.0, 'techniqu': 1.0, 'for': 1.0, 'diverg': 1.0, 'when': 1.0, 'to': 1.0, 'which': 1.0, 'appli': 1.0, 'is': 2.0, 'pm': 1.0, 'march': 1.0, 'induc': 1.0, 'given': 1.0, 'discuss': 1.0, 'jb': 1.0, 'of': 3.0, 'iter': 4.0, 'converg': 4.0, 'exampl': 1.0, 'equat': 1.0, 'illustr': 1.0, 'accelerat': 1.0, 'the': 4.0, -1: 47.0}


{'comput': 1.054181210545674, 'deal': 2.6877999211769894, 'share': 2.4973529569391175, 'an': 1.2807435686086692, 'exist': 2.4329950879563147, 'langmuir': 0, 'what': 4.7062055722704557, 'richard': 2.771637699819077, 'system': 1.4300543813371573, 'articl': 2.7792713246741485, 'which': 1.6832165821571556, 'tss': 2.8715210583253672, 'comp': 0, 'lab': 0, 'with': 1.6832165821571556, 'ibm': 2.5177916246163914, 'serv': 2.6704684648253494, 'alexand': 2.8025281868384155, 'for': 1.0239268021878718, 'operat': 3.5075098250453638, 'time': 1.65419246281787

In [79]:
w = Weighter4(indexer)
print(w.getWeightsFromDoc(20))
print('\n')
print(w.getWeightsFromQuery(q.el))

{'and': 1.0, 'acceler': 1.0, 'process': 1.0, 'solut': 1.0, 'ca': 1.0, 'procedur': 1.0, 'an': 2.09861228867, 'rate': 1.0, 'if': 1.69314718056, 'wegstein': 1.0, 'techniqu': 1.0, 'for': 1.0, 'diverg': 1.0, 'when': 1.0, 'to': 1.0, 'which': 1.0, 'appli': 1.0, 'is': 1.69314718056, 'pm': 1.0, 'march': 1.0, 'induc': 1.0, 'given': 1.0, 'discuss': 1.0, 'jb': 1.0, 'of': 2.09861228867, 'iter': 2.38629436112, 'converg': 2.38629436112, 'exampl': 1.0, 'equat': 1.0, 'illustr': 1.0, 'accelerat': 1.0, 'the': 2.38629436112, -1: 39.742402021820006}


{'comput': 1.054181210545674, 'deal': 2.6877999211769894, 'share': 2.4973529569391175, 'an': 1.2807435686086692, 'exist': 2.4329950879563147, 'langmuir': 0, 'what': 4.7062055722704557, 'richard': 2.771637699819077, 'system': 1.4300543813371573, 'articl': 2.7792713246741485, 'which': 1.6832165821571556, 'tss': 2.8715210583253672, 'comp': 0, 'lab': 0, 'with': 1.6832165821571556, 'ibm': 2.5177916246163914, 'serv': 2.6704684648253494, 'alexand': 2.802528186838415

In [80]:
w = Weighter5(indexer)
print(w.getWeightsFromDoc(20))
print('\n')
print(w.getWeightsFromQuery(q.el))

{'and': 0.986873489641, 'acceler': 7.24517944333, 'process': 1.78701337584, 'solut': 2.20390717977, 'ca': 0.224393142385, 'procedur': 2.07469544829, 'an': 2.68778419171, 'rate': 2.65343227767, 'if': 6.31318915518, 'wegstein': 2.69481749384, 'techniqu': 1.9518746186, 'for': 1.02392680219, 'diverg': 2.80252818684, 'when': 3.32651189518, 'to': 1.1052948911, 'which': 1.68321658216, 'appli': 2.3674408227, 'is': 1.5825089658, 'pm': 0.521347002508, 'march': 0.823557175522, 'induc': 2.68779992118, 'given': 1.8745414152, 'discuss': 1.88388727762, 'jb': 0.337090886556, 'of': 1.47773667653, 'iter': 6.07240105947, 'converg': 6.53364930007, 'exampl': 2.23232439249, 'equat': 2.29641955295, 'illustr': 2.57860062921, 'accelerat': 6.55203226277, 'the': 2.02549977368, -1: 82.61147528798199}


{'comput': 1.054181210545674, 'deal': 2.6877999211769894, 'comp': 0.0, 'share': 2.4973529569391175, 'for': 1.0239268021878718, 'lab': 0.0, 'an': 1.2807435686086692, 'exist': 2.4329950879563147, 'langmuir': 0.0, 'wi

# Similarity measure

In [10]:
class IRmodel(object):
    
    def __init__(self, indexer):
        
        # indexer is an indexer object
        self.indexer = indexer
        self.nDoc = len(indexer.indexFromCol)
        
    def getScores(self, query):
        
        raise ValueError('Abstract method')
    
    def getRanking(self, query):
        
        start = time.time()
        
        scores = self.getScores(query)
        sorted_scores = (np.sort(scores, order='score'))[::-1]
        
        end = time.time()
        print(end - start)
        
        return sorted_scores

class Vector(IRmodel):
    
    def __init__(self, indexer, weighter=Weighter1, normalized=False):
    
        IRmodel.__init__(self, indexer)
        
        # weighter is a Weighter object
        self.weighter = weighter(indexer)
        
        # normalized is a boolean
        self.normalized = normalized
    
    def dotProduct(self, vector1, vector2):
        
        result = 0
        
        if len(vector1)>len(vector2):
            tmp = vector1
            vector1 = vector2
            vector2 = tmp
        
        for element in vector1:
            if element in vector2:
                result += vector1[element]*vector2[element]
        
        return result
                
    def norm1(self, vector):
        
        result = 0
        
        for element in vector:
            result += abs(vector[element])
        
        return result
    
    def getScores(self, query):
        
        vecQuery = self.weighter.getWeightsFromQuery(query)
        norm1VecQuery = self.norm1(vecQuery)
        
        doc = {}
        
        for id in query:
            
            if id in self.indexer.invIndex:
                for element in self.indexer.getDfFromEl(id):
                    doc[element] = 1
        doc.pop(-1)
        
        scores = np.zeros(len(doc), [('id', 'a25'), ('score', 'float64')])
        
        i = 0
        for id in doc:
            
            scores[i]['id'] = str(id)
            vecDoc = self.weighter.getWeightsFromDoc(id)
            dotProduct = self.dotProduct(vecDoc, vecQuery)
            
            if self.normalized: 
                scores[i]['score'] = dotProduct/float(self.norm1(vecDoc)*norm1VecQuery)
            else:
                scores[i]['score'] = dotProduct
            
            i += 1
        
        return np.array(scores)
        

In [9]:
vector1 = Vector(indexer, Weighter1)
scores = vector1.getRanking(q.el)
scores[:10]

1.07602405548


array([('3742', 4547.0), ('3444', 4545.0), ('3512', 4526.0),
       ('3480', 4526.0), ('3215', 4526.0), ('3415', 4521.0),
       ('3843', 4520.0), ('3619', 4520.0), ('4141', 4515.0),
       ('3634', 4515.0)], 
      dtype=[('id', 'S25'), ('score', '<f8')])

In [83]:
vectorNorm1 = Vector(indexer, Weighter1, normalized=True)
scores = vectorNorm1.getRanking(q.el)[:10]
scores[:10]

1.34476089478


array([('594', 0.028409090909090908), ('2796', 0.028409090909090908),
       ('2917', 0.02807486631016043), ('2329', 0.02807486631016043),
       ('143', 0.02807486631016043), ('275', 0.027972027972027972),
       ('9', 0.027777777777777776), ('80', 0.027777777777777776),
       ('557', 0.027777777777777776), ('34', 0.027777777777777776)], 
      dtype=[('id', 'S25'), ('score', '<f8')])

In [84]:
vectorNorm2 = Vector(indexer, Weighter2, normalized=True)
scores = vectorNorm2.getRanking(q.el)[:10]
scores[:10]

1.27697682381


array([('594', 0.2536231884057971), ('294', 0.2536231884057971),
       ('2796', 0.2536231884057971), ('2690', 0.2536231884057971),
       ('2311', 0.2536231884057971), ('197', 0.2536231884057971),
       ('1461', 0.2536231884057971), ('2853', 0.25339673913043476),
       ('1304', 0.25337331334332835), ('275', 0.25334448160535117)], 
      dtype=[('id', 'S25'), ('score', '<f8')])

In [85]:
vectorNorm3 = Vector(indexer, Weighter3, normalized=True)
scores = vectorNorm3.getRanking(q.el)[:10]
scores[:10]

1.37277293205


array([('195', 0.006994187441045503), ('1461', 0.006522688023303906),
       ('2796', 0.006160944944785141), ('3068', 0.0046455647822142955),
       ('1069', 0.004427273171593947), ('31', 0.004276397872586725),
       ('143', 0.0042143757311462506), ('234', 0.004210856012993603),
       ('1247', 0.003934211393161682), ('2371', 0.003906939044365772)], 
      dtype=[('id', 'S25'), ('score', '<f8')])

In [86]:
vectorNorm4 = Vector(indexer, Weighter4, normalized=True)
scores = vectorNorm4.getRanking(q.el)
scores[:10]

1.53560519218


array([('195', 0.006994187441045503), ('1461', 0.006522688023303906),
       ('2796', 0.006160944944785141), ('1069', 0.004427273171593947),
       ('31', 0.004276397872586725), ('143', 0.0042143757311462506),
       ('234', 0.004210856012993603), ('61', 0.003741848138513384),
       ('289', 0.0037022337888048037), ('594', 0.0037002725523431726)], 
      dtype=[('id', 'S25'), ('score', '<f8')])

In [87]:
vectorNorm5 = Vector(indexer, Weighter5, normalized=True)
scores = vectorNorm5.getRanking(q.el)
scores[:10]

1.6320130825


array([('195', 0.01962876435762631), ('1461', 0.01362311543995687),
       ('2796', 0.008666266953654524), ('1069', 0.006805058212962663),
       ('289', 0.006468487030047001), ('163', 0.005396430103933236),
       ('718', 0.005059766743091413), ('325', 0.004963377368845642),
       ('339', 0.0048272901333964394), ('2312', 0.004703600168072482)], 
      dtype=[('id', 'S25'), ('score', '<f8')])

# Evaluation

In [88]:
class IRList(object):
    
    def __init__(self, query, scores):
        
        self.query = query
        self.scores = scores

irlist = IRList(q, scores)

In [89]:
class EvalMeasure():
    
    def __init__(self, irlist):
        
        self.irlist = irlist
        
    def recall(self, i):
        
        recall = \
        np.in1d(irlist.scores[:i]['id'], irlist.query.relevants).sum()
        
        return recall/float(len(irlist.query.relevants))
    
    def precision(self, i):
        
        precision = \
        np.in1d(irlist.scores[:i]['id'], irlist.query.relevants).sum()
    
        return precision/float(i)
    
    def eval(self, k):
    
        raise ValueError('Abstract method')
        
class EvalPrecisionRecall(EvalMeasure):
    
    def __init__(self, irlist):
        
        EvalMeasure.__init__(self, irlist)
        
        size = len(self.irlist.scores)
        self.recalls = np.zeros(size)
        self.precisions = np.zeros(size)
        for i in range(1, size):
            self.recalls[i] = self.recall(i)
            self.precisions[i] = self.precision(i)
        self.recalls = np.array(self.recalls)
        self.precisions = np.array(self.precisions)
        
    def eval(self, k=1):
        
        measures = []
        
        # gives good levels between 0 and 1
        levels = [(1/float(k+1))*l for l in range(1, k+1)]
        
        i = 0
        for level in levels:
            measures.append((level, np.max(self.precisions[np.where(self.recalls >= level)])))
            i += 1
            
        if k==1:
            return measures[0][1]
        
        return measures

class EvalPrecisionAverage(EvalMeasure):
    
    def __init__(self, irlist):
        
        EvalMeasure.__init__(self, irlist)
        
    def eval(self):
        
        self.irlist.query.relevants = np.array(self.irlist.query.relevants)
        return np.mean([self.precision(i) for i in np.argwhere(np.in1d(self.irlist.scores['id'], self.irlist.query.relevants[:,0]))])
            
            
        

In [90]:
EM = EvalPrecisionRecall(irlist)
print(EM.eval(k=5))

[(0.16666666666666666, 0.012605042016806723), (0.3333333333333333, 0.012605042016806723), (0.5, 0.012605042016806723), (0.6666666666666666, 0.0106951871657754), (0.8333333333333333, 0.0072992700729927005)]


In [91]:
EM = EvalPrecisionAverage(irlist)
print(EM.eval())

0.00509925501757




In [93]:
class EvalIRModel(object):
    
    def __init__(self, models, queries, measures):
        
        self.models = models
        self.queries = queries
        self.measures = measures
        self.results()
        
    def results(self):
        
        results = np.zeros((len(self.models), len(self.queries), len(self.measures)+2))
        
        i = 0
        for model in self.models:
            j = 0
            for query in self.queries:
                k = 0
                scores = model.getScores(query.el)
                irlist = IRList(query, scores)
                for measure in self.measures:
                    measure = measure(irlist)
                    results[i,j,k] = measure.eval()
                    k += 1
                results[i,j,-2] = np.mean(results[i,j,:-2])
                results[i,j,-1] = np.var(results[i,j,:-2])
                j += 1
            i += 1
        
        self.outcome = results
    
    def getResults(self):
        
        return self.outcome
        

In [94]:
EM = EvalIRModel([vectorNorm1, vectorNorm2], [q, q2], [EvalPrecisionAverage, EvalPrecisionRecall])
EM.getResults()



array([[[  4.83268942e-03,   1.26050420e-02,   8.71886572e-03,
           1.51023662e-05],
        [  4.86343544e-03,   1.26050420e-02,   8.73423873e-03,
           1.49831181e-05]],

       [[  4.83268942e-03,   1.26050420e-02,   8.71886572e-03,
           1.51023662e-05],
        [  4.86343544e-03,   1.26050420e-02,   8.73423873e-03,
           1.49831181e-05]]])