# Text Representation

In [1]:
import os
import re
import time
import pickle
import numpy as np
import matplotlib.pyplot as plt
from IndexerCACM import *
from IndexerQuery import *
from RelevantParser import *
from Query import *
from copy import *

### Get Indexers

In [2]:
# Processed collections 
collectionPath = 'data/cacm/cacm.txt'
collectionPath2 = 'data/cisi/cisi.txt'
queriesPath = 'data/cacm/cacm.qry'
relevantPath = 'data/cacm/cacm.rel'

In [3]:
indexer = IndexerCACM(collectionPath, ParserCACM())

# If Index and Inv Index aren't already builded
#indexer.createRepIndex()
#indexer.createRepInvIndex()
#indexer.createRepInvFromAll()

queriesIndexer = IndexerCACM(queriesPath, ParserCACM())

# If Index isn't already builded
#queriesIndexer.createRepIndex()

relevantIndexer = Indexer(relevantPath, RelevantParser())

# If Index and Inv Index aren't already builded
#relevantIndexer.createIndex()

In [4]:
q = query(1, queriesIndexer, relevantIndexer)
q2 = query(2, queriesIndexer, relevantIndexer)
q3 = query(3, queriesIndexer, relevantIndexer)
q4 = query(4, queriesIndexer, relevantIndexer)
print(q, q2, q3, q4)

(<Query.Query object at 0x7f478b2df310>, <Query.Query object at 0x7f4769648210>, <Query.Query object at 0x7f4769648250>, <Query.Query object at 0x7f4769648290>)


# Weighter

In [5]:
class Weighter(object):
    
    def __init__(self, indexer):
        
        # Indexer is an Indexer object
        self.indexer = indexer
        
        self.nDoc = len(indexer.indexFromCol)
        self.loadIndex = {}
        
        
    def idf(self, elements):
        
        result = {}
        
        for element in elements:
            if element in indexer.invIndex:
                result[element] = \
                self.nDoc / float(len(indexer.getDfFromEl(element))-1)
            else:
                result[element] = 0
        
        return result
        
    def loadWeightsFromDoc(self, name):
        
        end = re.search(r'\..*?$', collectionPath).group(0)
        self.path = re.sub(r'\..*?$', name, indexer.collectionPath)+end
        self.indexPath = re.sub(r'\..*?$', 'Index', self.path)+end
        
        if os.path.isfile(self.indexPath):
            indexFile = open(self.indexPath)
            self.loadIndex = pickle.load(indexFile)
            indexFile.close()
        else:
            weightsFile = open(self.path, "w")
            pos = 0
            for id in self.indexer.index:
                toWrite = ''
                elements = self.computeWeightsFromDoc(id)
                elements.pop(-1)
                for element in elements:
                    toWrite += ':'+element+':'+str(elements[element])
                toWrite = toWrite[1:]
                # Get pos in index and size of current rep
                self.loadIndex[id] = [pos, len(toWrite)]
                
                weightsFile.write(toWrite)
                pos += len(toWrite)

            weightsFile.close()
            
            # Indexes'hashtable of doc in col
            indexFile = open(self.indexPath, "w")
            pickle.dump(self.loadIndex, indexFile)
        
            indexFile.close()
            
    def getWeightsFromDoc(self, id):
        
        data = self.indexer.getData(self.path, self.loadIndex, id)
        return self.indexer.freqFromData(data)
        
    def computeWeightsFromDoc(self, id):
        
        raise ValueError('Abstract method')
    
    def getWeightsFromQuery(self, query):
        
        raise ValueError('Abstract method')
        
            
class Weighter1(Weighter):
    
    def __init__(self, indexer):
        
        Weighter.__init__(self, indexer)
        name = 'Weighter1'
        self.loadWeightsFromDoc(name)
        
    def computeWeightsFromDoc(self, id):
        
        return indexer.getEfFromDoc(id)
    
    def getWeightsFromQuery(self, query):
        
        weights = copy(query)
        
        for element in weights:
            weights[element] = 1
        
        return weights
        
class Weighter2(Weighter):
    
    def __init__(self, indexer):
        
        Weighter.__init__(self, indexer)
        name = 'Weighter2'
        self.loadWeightsFromDoc(name)
        
    def computeWeightsFromDoc(self, id):
        
        return indexer.getEfFromDoc(id)
    
    def getWeightsFromQuery(self, query):
        
        weights = copy(query)
        
        return weights
        
class Weighter3(Weighter):
    
    def __init__(self, indexer):
        
        Weighter.__init__(self, indexer)
        name = 'Weighter3'
        self.loadWeightsFromDoc(name)
        
    def computeWeightsFromDoc(self, id):
        
        return indexer.getEfFromDoc(id)
    
    def getWeightsFromQuery(self, query):
        
        return self.idf(query)

class Weighter4(Weighter):
    
    def __init__(self, indexer):
        
        Weighter.__init__(self, indexer)
        name = 'Weighter4'
        self.loadWeightsFromDoc(name)
        
    def computeWeightsFromDoc(self, id):
        
        weights = indexer.getEfFromDoc(id)
        
        for element in weights:
            weights[element] = 1 + np.log(weights[element])
            
        return weights
    
    def getWeightsFromQuery(self, query):
        
        return self.idf(query)
                       
class Weighter5(Weighter):
    
    def __init__(self, indexer):
        
        Weighter.__init__(self, indexer)
        name = 'Weighter5'
        self.loadWeightsFromDoc(name)
        
    def computeWeightsFromDoc(self, id):
        
        weights = indexer.getEfFromDoc(id)
        
        idf = self.idf(weights)
        
        for element in weights:
            weights[element] = \
            (1 + np.log(weights[element])) * idf[element]
            
        return weights
    
    def getWeightsFromQuery(self, query):
        
        weights = copy(query)
        
        idf = self.idf(weights)
        
        for element in weights:
            weights[element] = \
            (1 + np.log(weights[element])) * idf[element]
            
        return weights


In [6]:
w = Weighter1(indexer)
print(w.getWeightsFromDoc(20))
print('\n')
print(w.getWeightsFromQuery(q.el))

{'Wegstein': 1.0, 'illustr': 1.0, 'appli': 1.0, 'acceler': 1.0, 'procedur': 1.0, 'rate': 1.0, 'techniqu': 1.0, 'diverg': 1.0, 'solut': 1.0, 'A': 1.0, 'H': 1.0, 'induc': 1.0, 'J': 1.0, 'Iter': 1.0, 'Converg': 1.0, 'Accelerat': 1.0, 'discuss': 1.0, 'Process': 1.0, 'iter': 2.0, 'An': 1.0, 'converg': 2.0, 'exampl': 1.0, 'equat': 1.0, -1: 25.0}


{'What': 1, 'comput': 1, 'IBM': 1, 'deal': 1, 'Share': 1, 'System': 1, 'articl': 1, 'exist': 1, 'operat': 1, 'Time': 1, 'TSS': 1, -1: 1}


In [7]:
w = Weighter2(indexer)
print(w.getWeightsFromDoc(111))
print('\n')
print(w.getWeightsFromQuery(q2.el))

{'comput': 1.0, 'On': 1.0, 'illustr': 1.0, 'appli': 1.0, 'Solut': 1.0, 'procedur': 1.0, 'Ellenberg': 1.0, 'high': 1.0, 'rule': 1.0, 'polynomial': 1.0, 'pitfal': 1.0, 'Bairstow': 1.0, 'shown': 1.0, 'rapid': 1.0, 'techniqu': 2.0, 'Numer': 3.0, 'invers': 1.0, 'accuraci': 1.0, 'circumv': 1.0, 'iter': 1.0, 'basic': 1.0, 'K': 1.0, 'By': 1.0, 'applic': 1.0, 'Equation': 1.0, 'recommend': 2.0, 'realiz': 1.0, 'scale': 1.0, 'W': 1.0, 'Programm': 1.0, 'present': 1.0, 'great': 1.0, 'reliabl': 1.0, 'Newton': 1.0, 'Raphson': 1.0, 'converg': 1.0, 'degre': 1.0, 'exampl': 1.0, 'Both': 1.0, 'equat': 1.0, 'Polynomial': 1.0, 'root': 1.0, -1: 46.0}


{'B': 1.0, 'I': 1.0, 'Priev': 2.0, 'written': 1.0, 'articl': 1.0, 'Pooch': 2.0, 'U': 1.0, 'interest': 1.0, 'Udo': 1.0, -1: 11.0}


In [8]:
w = Weighter3(indexer)
print(w.getWeightsFromDoc(400))
print('\n')
print(w.getWeightsFromQuery(q3.el))

{'Comment': 1.0, 'Feurzeig': 1.0, 'E': 1.0, 'W': 1.0, -1: 12.0, 'Procedur': 1.0, '60': 1.0, 'T': 1.0, 'Iron': 1.0, 'Implement': 1.0, 'Recurs': 1.0, 'ALGOL': 1.0, 'Block': 1.0}


{'compil': 10.057416267942584, 'multi': 14.250847457627119, 'target': 15.343065693430656, 'TCOLL': 0, 'Intermedi': 4204.0, 'construct': 10.457711442786069, 'languag': 6.780645161290322, -1: 0}


In [9]:
w = Weighter4(indexer)
print(w.getWeightsFromDoc(400))
print('\n')
print(w.getWeightsFromQuery(q4.el))

{'Comment': 1.0, 'Feurzeig': 1.0, 'E': 1.0, 'W': 1.0, -1: 12.0, 'Procedur': 1.0, '60': 1.0, 'T': 1.0, 'Iron': 1.0, 'Implement': 1.0, 'Recurs': 1.0, 'ALGOL': 1.0, 'Block': 1.0}


{'process': 6.2373887240356085, 'abstract': 13.261829652996845, 'procedur': 8.544715447154472, 'oppos': 1051.0, 'theoret': 12.73939393939394, 'pass': 13.605177993527509, 'disjoint': 15.231884057971014, 'messag': 13.828947368421053, 'Remot': 1051.0, 'call': 11.180851063829786, 'interest': 168.16, 'communicat': 420.4, 'complet': 12.437869822485206, 'distribut': 11.84225352112676, 'I': 25.17365269461078, 'mechan': 12.817073170731707, 'problem': 5.997146932952925, 'possibl': 14.013333333333334, 'work': 11.486338797814208, 'descript': 11.677777777777777, 'exampl': 9.93853427895981, 'environ': 8.275590551181102, 'exclus': 233.55555555555554, 'implement': 8.74012474012474, -1: 0}


In [10]:
w = Weighter5(indexer)
print(w.getWeightsFromDoc(111))
print('\n')
print(w.getWeightsFromQuery(q4.el))

{'comput': 3.24884080371, 'On': 32.5891472868, 'illustr': 13.3885350318, 'appli': 10.8911917098, 'Solut': 50.6506024096, 'procedur': 8.54471544715, 'Ellenberg': 2102.0, 'high': 13.346031746, 'rule': 11.5178082192, 'polynomial': 52.55, 'pitfal': 16.2945736434, 'Bairstow': 600.571428571, 'shown': 8.74012474012, 'rapid': 14.8028169014, 'techniqu': 12.9183135156, 'Numer': 180.052368603, 'invers': 105.1, 'accuraci': 15.3430656934, 'circumv': 16.421875, 'iter': 13.1375, 'basic': 11.8757062147, 'K': 22.9726775956, 'By': 93.4222222222, 'applic': 8.2431372549, 'Equation': 51.9012345679, 'recommend': 26.0732261798, 'realiz': 14.1073825503, 'scale': 14.1548821549, 'W': 9.64220183486, 'Programm': 22.847826087, 'present': 5.50261780105, 'great': 14.8551236749, 'reliabl': 13.6493506494, 'Newton': 135.612903226, 'Raphson': 600.571428571, 'converg': 15.6865671642, 'degre': 89.4468085106, 'exampl': 9.93853427896, 'Both': 127.393939394, 'equat': 10.6972010178, 'Polynomial': 56.0533333333, 'root': 13.096

# Similarity measure

In [22]:
class IRmodel(object):
    
    def __init__(self, indexer):
        
        # indexer is an indexer object
        self.indexer = indexer
        self.nDoc = len(indexer.indexFromCol)
        
    def getScores(self, query):
        
        raise ValueError('Abstract method')
    
    def getRanking(self, query):
        
        start = time.time()
        
        scores = self.getScores(query)
        sorted_scores = (np.sort(scores, order='score'))[::-1]
        
        end = time.time()
        print(end - start)
        
        return sorted_scores

class Vector(IRmodel):
    
    def __init__(self, indexer, weighter=Weighter1, normalized=False):
    
        IRmodel.__init__(self, indexer)
        
        # weighter is a Weighter object
        self.weighter = weighter(indexer)
        
        # normalized is a boolean
        self.normalized = normalized
    
    def dotProduct(self, vector1, vector2):
        
        result = 0
        
        if len(vector1)>len(vector2):
            tmp = vector1
            vector1 = vector2
            vector2 = tmp
        
        for element in vector1:
            if element in vector2:
                result += vector1[element]*vector2[element]
        
        return result
                
    def norm2(self, vector):
        
        result = 0
        
        for element in vector:
            result += np.power(vector[element], 2)
        
        return np.sqrt(result)
    
    def getScores(self, query):
        
        vecQuery = self.weighter.getWeightsFromQuery(query)
        vecQuery.pop(-1)
        norm2VecQuery = self.norm2(vecQuery)
        
        doc = {}
        
        for id in query:
            
            if id in self.indexer.invIndex:
                for element in self.indexer.getDfFromEl(id):
                    doc[element] = id
        doc.pop(-1)
        
        scores = np.zeros(self.nDoc, [('id', 'a25'), ('score', 'float64')])
        
        i = 0 
        for id in self.indexer.index:
            
            scores[i]['id'] = str(id)
            
            if id in doc:
                vecDoc = self.weighter.getWeightsFromDoc(id)
                vecDoc.pop(-1)
                dotProduct = self.dotProduct(vecDoc, vecQuery)

                if self.normalized: 
                    scores[i]['score'] = dotProduct/float(self.norm2(vecDoc)*norm2VecQuery)
                else:
                    scores[i]['score'] = dotProduct

            else:
                scores[i]['score'] = 0
                
            i += 1
        
        return np.array(scores)
        

In [23]:
vector1 = Vector(indexer, Weighter1)
scores = vector1.getRanking(q4.el)
scores[:10]

2.38761115074


array([('1922', 24.0), ('4084', 19.0), ('4048', 19.0), ('3332', 19.0),
       ('4152', 18.0), ('3461', 18.0), ('3911', 17.0), ('3840', 17.0),
       ('3637', 17.0), ('3372', 17.0)], 
      dtype=[('id', 'S25'), ('score', '<f8')])

In [24]:
vectorNorm1 = Vector(indexer, Weighter1, normalized=True)
scores = vectorNorm1.getRanking(q4.el)
scores[:10]

14.3645188808


array([('3128', 0.26854307776478736), ('1601', 0.21320071635561047),
       ('3043', 0.20272121351984582), ('1588', 0.20257967806063143),
       ('3101', 0.199204768222399), ('2376', 0.18609684207969418),
       ('1530', 0.17201561551404668), ('2377', 0.1689343445998715),
       ('2152', 0.16835875742536846), ('824', 0.16666666666666669)], 
      dtype=[('id', 'S25'), ('score', '<f8')])

In [25]:
vectorNorm2 = Vector(indexer, Weighter2, normalized=True)
scores = vectorNorm2.getRanking(q4.el)
scores[:10]

14.3963270187


array([('3128', 0.2882306768491569), ('1588', 0.24590447683052283),
       ('1530', 0.2417728402451219), ('3059', 0.23939782946951918),
       ('3101', 0.2138089935299395), ('2377', 0.2115392598254248),
       ('2376', 0.20806259464411975), ('1601', 0.19069251784911845),
       ('2166', 0.18257418583505536), ('3043', 0.18131936556464984)], 
      dtype=[('id', 'S25'), ('score', '<f8')])

In [26]:
vectorNorm3 = Vector(indexer, Weighter3, normalized=True)
scores = vectorNorm3.getRanking(q4.el)
scores[:10]

14.4996881485


array([('1325', 0.0699317984268452), ('1550', 0.06714205438480218),
       ('1135', 0.06209709842107191), ('2519', 0.05817034745389737),
       ('2342', 0.05457254331254995), ('2501', 0.0501026359196736),
       ('1681', 0.046022018607429126), ('2359', 0.0449677594563381),
       ('1829', 0.04263605771675477), ('3041', 0.040472625634702934)], 
      dtype=[('id', 'S25'), ('score', '<f8')])

In [27]:
vectorNorm4 = Vector(indexer, Weighter4, normalized=True)
scores = vectorNorm4.getRanking(q4.el)
scores[:10]

13.2735040188


array([('1550', 0.07918145694508417), ('1325', 0.07449260761367496),
       ('1135', 0.06919815193666695), ('2501', 0.0650217011690884),
       ('2359', 0.06082364310271595), ('1829', 0.0607209242961512),
       ('1681', 0.05977021287621712), ('3041', 0.05901044457082188),
       ('2519', 0.057955028577259676), ('2342', 0.05281884886916953)], 
      dtype=[('id', 'S25'), ('score', '<f8')])

In [17]:
vectorNorm5 = Vector(indexer, Weighter5, normalized=True)
scores = vectorNorm5.getRanking(q4.el)
scores[:10]

14.9790391922


array([('1135', 0.14483061709178638), ('1325', 0.10260641141079291),
       ('1550', 0.09323034694287828), ('2519', 0.09094601526466026),
       ('1829', 0.07427957876369132), ('3073', 0.07257440143648919),
       ('2359', 0.06642339610218967), ('1681', 0.06560282928348392),
       ('2501', 0.05542593819472901), ('2342', 0.05192728693027786)], 
      dtype=[('id', 'S25'), ('score', '<f8')])

# Evaluation

In [28]:
class IRList(object):
    
    def __init__(self, query, scores):
        
        self.query = query
        self.scores = scores


In [29]:
class EvalMeasure():
    
    def __init__(self, irlist):
        
        self.irlist = irlist
        
    def recall(self, i):
        
        recall = \
        np.in1d(self.irlist.scores[:i]['id'], self.irlist.query.relevants).sum()
        
        return recall/float(len(self.irlist.query.relevants))
    
    def precision(self, i):
        
        precision = \
        np.in1d(self.irlist.scores[:i]['id'], self.irlist.query.relevants).sum()
    
        return precision/float(i)
    
    def eval(self, k):
    
        raise ValueError('Abstract method')
        
class EvalPrecisionRecall(EvalMeasure):
    
    def __init__(self, irlist):
        
        EvalMeasure.__init__(self, irlist)
        
        start = time.time()
        
        size = len(self.irlist.scores)
        self.recalls = np.zeros(size)
        self.precisions = np.zeros(size)
        for i in range(1, size):
            self.recalls[i] = self.recall(i)
            self.precisions[i] = self.precision(i)
        self.recalls = np.array(self.recalls)
        self.precisions = np.array(self.precisions)
        
        end = time.time()
        print(end - start)
        
    def eval(self, k=20):
        
        measures = np.zeros((k, 2))
        
        # gives good levels between 0 and 1
        levels = np.linspace(0, 1, k)
        
        i = 0
        for level in levels:
            measures[i,0] = level
            wh = self.precisions[np.where(self.recalls >= level)]
            if len(wh) > 0:
                measures[i,1] = np.max(wh)
            else:
                measures[i,1] = 0
            i += 1
            
        return measures

class EvalPrecisionAverage(EvalMeasure):
    
    def __init__(self, irlist):
        
        EvalMeasure.__init__(self, irlist)
        
    def eval(self):
        
        self.irlist.query.relevants = np.array(self.irlist.query.relevants)
        return np.mean([self.precision(i) for i in np.argwhere(np.in1d(self.irlist.scores['id'], self.irlist.query.relevants[:,0]))])
            

In [30]:
print(scores)
irlist = IRList(q4, scores)
EM = EvalPrecisionRecall(irlist)
scores = EM.eval(k=10)
print(scores)
plt.plot(scores[:,0], scores[:,1])
plt.show()

[('1550', 0.07918145694508417) ('1325', 0.07449260761367496)
 ('1135', 0.06919815193666695) ..., ('100', 0.0) ('10', 0.0) ('1', 0.0)]
8.45885705948
[[ 0.          0.16666667]
 [ 0.11111111  0.16666667]
 [ 0.22222222  0.16666667]
 [ 0.33333333  0.14814815]
 [ 0.44444444  0.04      ]
 [ 0.55555556  0.02734375]
 [ 0.66666667  0.0230179 ]
 [ 0.77777778  0.01148106]
 [ 0.88888889  0.00351662]
 [ 1.          0.00349548]]


In [29]:
EM = EvalPrecisionAverage(irlist)
print(EM.eval())

0.00896619066302




In [31]:
class EvalIRModel(object):
    
    def __init__(self, models, queries, measures):
        
        self.models = models
        self.queries = queries
        self.measures = measures
        self.results()
        
    def results(self):
        
        results = np.zeros((len(self.models), len(self.queries), len(self.measures)+2))
        
        i = 0
        for model in self.models:
            j = 0
            for query in self.queries:
                k = 0
                scores = model.getScores(query.el)
                irlist = IRList(query, scores)
                for measure in self.measures:
                    measure = measure(irlist)
                    results[i,j,k] = measure.eval()
                    k += 1
                results[i,j,-2] = np.mean(results[i,j,:-2])
                results[i,j,-1] = np.var(results[i,j,:-2])
                j += 1
            i += 1
        
        self.outcome = results
    
    def getResults(self):
        
        return self.outcome
        

In [None]:
queries = [query(i, queriesIndexer, relevantIndexer) for i in queriesIndexer.index if i in relevantIndexer.indexFromCol]
EM = EvalIRModel([vectorNorm1, vectorNorm2], queries, [EvalPrecisionAverage, EvalPrecisionRecall])
results = EM.getResults()