# Documents Extraction

In [7]:
import os
import re
import pickle
import string
import numpy as np
from ParserCACM import *
from porter import *
from copy import *
from nltk.corpus import stopwords

In [8]:
# Paths of the collection files handled by this notebook
collectionPath = "data/cacm/cacm.txt"
collectionPath2 = "data/cisi/cisi.txt"

In [9]:
# Sanity check: open the CACM collection and print the first document the parser yields
parser = ParserCACM()
parser.initFile(collectionPath)
docExample = parser.nextDocument()
print(docExample)

id=1
 Preliminary Report-International Algebraic Language 
  Perlis, A. J. Samelson,K. 
  CA581203 JB March 22, 1978  8:28 PM 
  
 
{'from': '/home/mcrilo33/Master/Master2/RI/TP1/data/cacm/cacm.txt;0;402', 'links': '100;123;164;205;210;214;1982;398;642;669;165;196;196;1273;1883;324;43;53;91;410;3184;', 'author': ' Perlis, A. J. Samelson,K.', 'text': '', 'title': ' Preliminary Report-International Algebraic Language', 'answer': ' CA581203 JB March 22, 1978  8:28 PM', 'keywords': ''}


# Documents Indexing

In [30]:
class Indexer(object):
    '''Build an Index from a Collection

    Each data file is paired with a pickled hashtable mapping a key to
    [offset, size] inside that file:
      - the collection itself     -> indexFromCol, keyed by document id;
      - the representation file   -> index, keyed by document id, each record
        being "element:count:element:count...";
      - the inverted file         -> invIndex, keyed by element, each record
        being "docId:count:docId:count...";
      - the collection-wide file  -> invIndexFromAll, keyed by element, each
        record being "element:count" (key '-1' holds the grand total).

    Subclasses must implement elementsFromDoc.
    '''

    def __init__(
        self, collectionPath,
        parser,
        fromCol="",
        repPath="",
        repIndexPath="",
        repInvPath="",
        repInvIndexPath="",
        repInvFromAllPath="",
        repInvIndexFromAllPath=""
    ):
        '''Set up every file path and load the hashtables already on disk.

        collectionPath -- path of the collection file (String)
        parser         -- Parser object used to read the collection
        Every other argument is an optional path override; when left to ""
        the path is derived from collectionPath (or from repPath) by inserting
        a suffix before the extension.
        '''

        # parser is a Parser object
        self.parser = parser

        # collectionPath is a String object
        self.collectionPath = collectionPath
        # The pattern matches from the FIRST '.' of the path: a path without
        # any '.' makes re.search return None (AttributeError below), and a
        # '.' inside a directory name is taken for the extension start.
        end = re.search(r'\..*?$', collectionPath).group(0)

        def choose(given, base, suffix):
            # Keep an explicit override, otherwise derive "<base><suffix><ext>"
            if given == "":
                return re.sub(r'\..*$', suffix+end, base)
            return given

        # Path names (repPath first: the following ones derive from it)
        self.repPath = choose(repPath, self.collectionPath, 'Rep')
        self.fromCol = choose(fromCol, self.collectionPath, 'Index')
        # The original read the undefined name `repindexPath` here
        self.repIndexPath = choose(repIndexPath, self.repPath, 'Index')
        self.repInvIndexPath = \
            choose(repInvIndexPath, self.repPath, 'InvIndex')
        self.repInvFromAllPath = \
            choose(repInvFromAllPath, self.repPath, 'InvFromAll')
        self.repInvIndexFromAllPath = \
            choose(repInvIndexFromAllPath, self.repPath, 'InvIndexFromAll')
        # The original never assigned the override to self.repInvPath
        self.repInvPath = choose(repInvPath, self.repPath, 'Inv')

        # Loads Hashtables if they exist
        self.index = self.__loadTable(self.repIndexPath)
        self.indexFromCol = self.__loadTable(self.fromCol)
        self.invIndex = self.__loadTable(self.repInvIndexPath)
        # The original default was stored under `InvIndexFromAll` (capital I),
        # leaving self.invIndexFromAll undefined when no file existed
        self.invIndexFromAll = self.__loadTable(self.repInvIndexFromAllPath)

        self.elements = {} # elements in self for optimisation reasons

    def __loadTable(self, path):
        '''Return the hashtable pickled at path, or {} when path is no file'''

        if not os.path.isfile(path):
            return {}
        # NOTE: pickle.load can run arbitrary code; only point these paths at
        # files written by this class.
        with open(path) as tableFile:
            return pickle.load(tableFile)

    def __dumpTable(self, table, path):
        '''Pickle table into the file at path'''

        with open(path, "w") as tableFile:
            pickle.dump(table, tableFile)

    def __filters(self, doc):
        '''Filters applied to each document of the collection'''

        # Always accepts. The double underscore name-mangles this method, so
        # a subclass defining its own __filters does not replace this one.
        return True

    def __getData(self, rep, index, id):
        '''Return the raw record stored in file rep for identifier id.

        index maps str(id) to [offset, size] inside rep.
        Raises ValueError when index is empty, rep is not a file, or id is
        not a key of index.
        '''

        id = str(id)

        if index=={}:
            raise ValueError('Index undefined')
        if not(os.path.isfile(rep)):
            raise ValueError('Rep file does no exist')
        if not(id in index):
            raise ValueError('Bad Identifier')

        pos = index[id]
        with open(rep, 'r') as repFile:
            repFile.seek(pos[0])
            return repFile.read(pos[1])

    def __freqFromData(self, data):
        '''Parse a "key:count:key:count..." record into a hashtable.

        The returned hashtable also maps -1 to the sum of all counts.
        An empty record yields {-1: 0}.
        '''

        freq = {}
        total = 0

        # ''.split(':') is [''], which has no count to pair with: an empty
        # record (e.g. a document without any element) has no entries.
        if data != '':
            rep = data.split(':')
            for i in range(0, len(rep), 2):
                added = int(rep[i+1])
                freq[rep[i]] = added
                total += added
        freq[-1] = total

        return freq

    def __updatePosElements(self, doc):
        '''Update pos and size of each element in inv index'''

        elements = self.elementsFromDoc(doc)
        id = len(doc.getId())

        for element in elements:

            # Size of one "docId:count:" entry in the inverted file
            added = id+len(str(elements[element]))+2
            if element in self.elements:
                self.elements[element] += added
            else:
                # One character less overall: the last entry of a record is
                # written without its trailing ':' (see createRepInvIndex)
                self.elements[element] = added-1

        return self.elements

    def elementsFromDoc(self, doc):
        '''Return an hashtable of the count of each element in a doc'''

        raise ValueError('Abstract method')

    def getEfFromDoc(self, id):
        '''Return the element frequencies of a doc with identifier==id'''

        data = self.__getData(self.repPath, self.index, str(id))
        return self.__freqFromData(data)

    def getDfFromEl(self, element):
        '''Return the document frequencies of an element'''

        data = self.__getData(self.repInvPath, self.invIndex, str(element))
        return self.__freqFromData(data)

    def getStrFromDoc(self, id):
        '''Return the string of a doc with identifier==id in Col'''

        return self.__getData(self.collectionPath, self.indexFromCol, str(id))

    def getObjFromDoc(self, id):
        '''Return the object of a doc with identifier==id in Col'''

        return self.parser.getDocument(self.getStrFromDoc(id))

    def createIndex(self):
        '''Record [offset, size] of each document of the collection and
        pickle the resulting hashtable to self.fromCol'''

        self.parser.initFile(self.collectionPath)
        posCol = self.parser.file.tell()
        doc = self.parser.nextDocument()
        self.indexFromCol = {}

        while doc!=None:

            posCol2 = self.parser.file.tell()

            if self.__filters(doc)==True:
                # Get pos in col and size of current doc
                self.indexFromCol[doc.getId()] = [posCol, posCol2-posCol]

            doc = self.parser.nextDocument()
            posCol = posCol2

        # Indexes'hashtable of doc in col
        self.__dumpTable(self.indexFromCol, self.fromCol)

    def createRepIndex(self):
        '''Write the representation file (element counts of each document)
        and pickle both the rep index and the collection index'''

        self.parser.initFile(self.collectionPath)
        posCol = self.parser.file.tell()
        doc = self.parser.nextDocument()
        pos = 0
        self.index = {}
        self.indexFromCol = {}

        with open(self.repPath, "w") as repFile:

            while doc!=None:

                posCol2 = self.parser.file.tell()

                if self.__filters(doc)==True:
                    # Get rep of current doc
                    elements = self.elementsFromDoc(doc)
                    toWrite = ':'.join(
                        element+':'+str(elements[element])
                        for element in elements
                    )
                    # Get pos in index and size of current rep
                    self.index[doc.getId()] = [pos, len(toWrite)]
                    # Get pos in col and size of current doc
                    self.indexFromCol[doc.getId()] = [posCol, posCol2-posCol]

                    repFile.write(toWrite)
                    pos += len(toWrite)

                doc = self.parser.nextDocument()
                posCol = posCol2

        # Indexes'hashtable of rep
        self.__dumpTable(self.index, self.repIndexPath)
        # Indexes'hashtable of doc in col
        self.__dumpTable(self.indexFromCol, self.fromCol)

    def createRepInvIndex(self):
        '''Write the inverted file in two passes over the collection.

        The first pass computes the size of each element's record so that
        every record gets a fixed [offset, size]; the second pass writes each
        "docId:count" entry at its place inside those records.
        '''

        # First pass
        self.parser.initFile(self.collectionPath)
        doc = self.parser.nextDocument()
        self.elements = {}
        self.invIndex = {}

        while doc!=None:

            if self.__filters(doc)==True:
                # Updates pos and size of each element in inv index
                self.__updatePosElements(doc)

            doc = self.parser.nextDocument()

        # Get pos and size of each elements from there size in inv index
        cumsum = 0
        for element in self.elements:
            tmp = self.elements[element]
            self.elements[element] = [cumsum, tmp]
            cumsum += tmp

        # Indexes'hashtable of elements in inv index. Each span is copied
        # because the second pass consumes self.elements in place.
        self.invIndex = dict(
            (element, list(span)) for element, span in self.elements.items()
        )
        self.__dumpTable(self.invIndex, self.repInvIndexPath)

        # Second pass
        with open(self.repInvPath, "w") as repInvFile:
            self.parser.initFile(self.collectionPath)
            doc = self.parser.nextDocument()

            while doc!=None:

                if self.__filters(doc)==True:
                    elements = self.elementsFromDoc(doc)
                    for element in elements:
                        toWrite = doc.getId()+':'+str(elements[element])+':'
                        repInvFile.seek(self.elements[element][0])
                        # Last entry of the record: no room was reserved for
                        # its trailing ':'
                        if (self.elements[element][1]-len(toWrite)) < 0:
                            toWrite = toWrite[:-1]
                        repInvFile.write(toWrite)
                        # Advance the write cursor, shrink the remaining room
                        self.elements[element][0] += len(toWrite)
                        self.elements[element][1] -= len(toWrite)

                doc = self.parser.nextDocument()

    def createRepInvIndexFromAll(self):
        '''Write the collection-wide count of each element.

        Requires the rep index and the inverted index to be built; raises
        ValueError otherwise. The entry of key -1 is the sum of the totals of
        all documents.
        '''

        if self.index == {}:
            raise ValueError('Rep index undefined')

        if self.invIndex == {}:
            raise ValueError('Rep invIndex undefined')

        self.invIndexFromAll = {}
        repInvFromAll = {}

        for element in self.invIndex:
            # The -1 entry already is the sum over all documents; summing
            # every value (as the original did) counted it twice.
            repInvFromAll[element] = self.getDfFromEl(element)[-1]

        repInvFromAll[-1] = 0
        for id in self.index:
            freq = self.getEfFromDoc(id)
            repInvFromAll[-1] += freq[-1]

        pos = 0
        with open(self.repInvFromAllPath, "w") as repInvFromAllFile:
            for element in repInvFromAll:
                toWrite = str(element)+':'+str(repInvFromAll[element])
                # Get pos in index and size of current record. Keyed by the
                # element itself (the original reused the stale loop variable
                # `id`, so all entries overwrote one another); str() matches
                # the string keys __getData looks up.
                self.invIndexFromAll[str(element)] = [pos, len(toWrite)]

                repInvFromAllFile.write(toWrite)
                pos += len(toWrite)

        # Indexes'hashtable of elements in the collection-wide file
        self.__dumpTable(self.invIndexFromAll, self.repInvIndexFromAllPath)
                
            

In [31]:
class IndexerCACM(Indexer):
    '''Indexer whose elements are the stemmed words of a document text'''

    def __init__(self, collectionPath, parser):
        '''Build the indexer with every path left to its derived default'''

        Indexer.__init__(self, collectionPath, parser)

    def elementsFromDoc(self, doc):
        '''Return an hashtable of the count of each stemmed word in a doc'''

        # Every punctuation character is dropped, except the hyphen
        dropped = set(string.punctuation) - set("-")

        # preprocessing: lower case, then character-level punctuation filter
        cleaned = ''.join(
            char for char in doc.getText().lower() if char not in dropped
        )
        # removes digit and \n
        cleaned = re.sub(r'\d+|\n', '', cleaned)
        # removes one letter words
        cleaned = re.sub(r'(^| )(\w( |$))+', ' ', cleaned)

        counts = {}
        for token in cleaned.split():
            root = stem(token)
            counts[root] = counts.get(root, 0) + 1

        return counts
        

## Demo

### Construction of the Index and Inverted Index

In [32]:
indexer = IndexerCACM(collectionPath, ParserCACM())

# Uncomment the two calls below if the index and the inverted index are not built yet
#indexer.createRepIndex()
#indexer.createRepInvIndex()
# Needs both indexes above: raises ValueError when either hashtable is empty
indexer.createRepInvIndexFromAll()

print(indexer)

<__main__.IndexerCACM object at 0x7f6eebbd8a90>


### Get string from document

In [57]:
# Raw text of document 20, read back from the collection file through its [offset, size] entry
print(indexer.getStrFromDoc(20))

.I 20
.T
Accelerating Convergence of Iterative Processes
.W
A technique is discussed which, when applied
to an iterative procedure for the solution of
an equation, accelerates the rate of convergence if
the iteration converges and induces convergence if
the iteration diverges.  An illustrative example is given.
.B
CACM June, 1958
.A
Wegstein, J. H.
.N
CA580602 JB March 22, 1978  9:09 PM
.X
20	5	20
20	5	20
20	5	20




### Get object from document

In [58]:
# Re-parse the raw text of document 20 into a document object and show its text
obj = indexer.getObjFromDoc(20)
print(obj.getText())

 Accelerating Convergence of Iterative Processes 
  Wegstein, J. H. 
  CA580602 JB March 22, 1978  9:09 PM 
  
  A technique is discussed which, when applied to an iterative procedure for the solution of an equation, accelerates the rate of convergence if the iteration converges and induces convergence if the iteration diverges.  An illustrative example is given.


### Get element frequencies from document

In [59]:
# Count of each element in document 20; the -1 key holds the sum of all counts
print(indexer.getEfFromDoc(20))

{'and': 1, 'acceler': 1, 'process': 1, 'solut': 1, 'ca': 1, 'procedur': 1, 'an': 3, 'rate': 1, 'if': 2, 'given': 1, 'techniqu': 1, 'for': 1, 'diverg': 1, 'when': 1, 'to': 1, 'which': 1, 'appli': 1, 'is': 2, 'pm': 1, 'march': 1, 'induc': 1, 'wegstein': 1, 'discuss': 1, 'jb': 1, 'of': 3, 'iter': 4, 'converg': 4, 'exampl': 1, 'equat': 1, 'illustr': 1, 'accelerat': 1, 'the': 4, -1: 47}


### Get document frequencies from element

In [60]:
# Count of the element 'wegstein' per document id; the -1 key holds the sum of all counts
print(indexer.getDfFromEl('wegstein'))

{'3715': 1, '4003': 2, '3520': 2, '3712': 2, '3496': 2, '3571': 1, '3494': 1, '3495': 1, '3492': 1, '3491': 1, '3320': 1, '3322': 1, '3326': 1, '3656': 1, '3437': 1, '3652': 1, '3808': 1, '3554': 1, '20': 1, '3551': 1, '3250': 2, '3794': 1, '3797': 1, '3254': 2, '3255': 1, '4143': 1, '4199': 1, '3805': 1, '3853': 1, '3999': 1, '3750': 1, '3996': 2, '4138': 1, '3723': 1, '3993': 1, '3928': 1, '3489': 1, '3761': 2, '3762': 1, '3837': 2, '3764': 1, '3938': 1, '3767': 1, '3480': 1, '3936': 1, '3838': 1, '3930': 1, '3486': 1, '3401': 1, '3332': 1, '3817': 1, '3407': 1, '4070': 1, '4039': 1, '4186': 1, '3642': 1, '4178': 2, '3562': 1, '4026': 1, '3248': 1, '3566': 1, '3883': 1, '3564': 1, '3243': 1, '3241': 1, '3569': 1, '4024': 1, '3245': 1, '3814': 1, '3444': 3, '4180': 2, '3568': 1, '3948': 1, '3772': 1, '3680': 1, '3777': 1, '3410': 1, '3821': 1, '3820': 1, '3687': 1, '3942': 1, '3649': 1, '3709': 1, '3947': 1, '4109': 1, '3273': 1, '3308': 1, '4108': 1, '3672': 1, '3661': 1, '4200': 1, 

# Queries Indexing

### Queries Indexer

In [61]:
queriesPath = 'data/cacm/cacm.qry'
# NOTE(review): this `parser` is not the one handed to the indexer below,
# which receives its own ParserCACM() instance
parser = ParserCACM()
queriesIndexer = IndexerCACM(queriesPath, ParserCACM())

# Uncomment if the query index is not built yet
#queriesIndexer.createRepIndex()

### Relevant Indexer

In [62]:
class RelevantParser(Parser):
    '''Parser of a relevance file.

    Each line of the file is read as four whitespace-separated fields, the
    first one being a query identifier; consecutive lines sharing the same
    identifier form one document.
    '''

    def __init__(self):
        '''Build the parser; curId is the identifier of the last group read'''

        Parser.__init__(self, '')
        self.curId = None

    def nextDocument(self):
        '''Return the next group of lines as a Document, or None at the end.

        On return the file position is exactly the end of the group, so that
        a caller measuring documents with file.tell() before and after this
        call gets the span of this document only. (Reading one line ahead
        without seeking back, as the original did, shifted every span by one
        line: a document read back from its span lost its first line and
        gained the first line of the following document.)
        '''

        # readline returns '' (never None) at the end of the file
        line = self.file.readline()
        # Skip blank lines, which carry no identifier
        while line!='' and line.strip()=='':
            line = self.file.readline()
        if line=='':
            return None

        self.curId = line.split()[0]
        text = ''

        # A blank line splits into [], hence also ends the group
        while line.split()[:1]==[self.curId]:
            text += line
            boundary = self.file.tell()
            line = self.file.readline()

        # Give back the look-ahead line: it belongs to the next document
        self.file.seek(boundary)

        return self.getDocument(text)

    def getDocument(self, text):
        '''Return the Document described by the lines of text.

        Its identifier is the first field of text converted through int (so
        that leading zeros are dropped); its 'tab' entry lists, for each
        non-blank line, [second field, int(third field), int(fourth field)].
        '''

        tab = []
        # splitlines keeps the last line even without a trailing newline
        for line in text.splitlines():
            fields = line.split()
            if not fields:
                continue
            tab.append([fields[1], int(fields[2]), int(fields[3])])
        identifier = str(int(text.split()[0]))

        return Document(identifier, others={'tab': tab})


In [63]:
relevantPath = 'data/cacm/cacm.rel'
# The base Indexer is used directly: its elementsFromDoc is abstract, so only
# the methods that do not call it (e.g. createIndex, getObjFromDoc) are usable
relevantIndexer = Indexer(relevantPath, RelevantParser())

# Uncomment if the collection index of the relevance file is not built yet
#relevantIndexer.createIndex()

### Query Object

In [64]:
class Query(object):
    '''Plain record of a query: identifier, text, elements and relevants'''

    def __init__(self, id, text="", el=None, relevants=None):
        '''Store each argument, untouched, under the attribute of same name'''

        self.id, self.text = id, text
        self.el, self.relevants = el, relevants
        
def query(id, queriesIndexer, relevantIndexer):
    '''Return the Query of identifier id.

    Its text and element frequencies come from queriesIndexer, its relevants
    from the 'tab' entry of the matching document of relevantIndexer. The
    identifier is stored as a string.
    '''

    key = str(id)

    # Arguments are evaluated left to right: text, elements, then relevants
    return Query(
        key,
        queriesIndexer.getObjFromDoc(key).getText(),
        queriesIndexer.getEfFromDoc(key),
        relevantIndexer.getObjFromDoc(key).get('tab')
    )

In [65]:
# Build the Query of identifier 1 and show each of its four fields
q = query(1, queriesIndexer, relevantIndexer)
print('Id : ', q.id)
print('Text : ', q.text)
print('Elements : ', q.el)
print('Relevants : ', q.relevants)

('Id : ', '1')
('Text : ', ' \n  \n   1. Richard Alexander, Comp Serv, Langmuir Lab (TSS)    \n  \n   What articles exist which deal with TSS (Time Sharing System), an operating system for IBM computers?')
('Elements : ', {'comput': 1, 'deal': 1, 'share': 1, 'an': 1, 'exist': 1, 'langmuir': 1, 'what': 1, 'richard': 1, 'system': 2, 'articl': 1, 'which': 1, 'tss': 2, 'comp': 1, 'lab': 1, 'with': 1, 'ibm': 1, 'serv': 1, 'alexand': 1, 'for': 1, 'operat': 1, 'time': 1, -1: 23})
('Relevants : ', [['1410', 0, 0], ['1572', 0, 0], ['1605', 0, 0], ['2020', 0, 0], ['2358', 0, 0], ['2434', 0, 0]])
