#### Indexing for wt10g

In [39]:
# Read the spec file that lists the data files to index, one path per line.
with open('./wt10g.spec') as file:
    # Trailing whitespace/newlines are trimmed from every entry. Lines that
    # are empty after trimming are skipped so that no empty path is later
    # handed to processFile(), where open('') would abort the indexing run.
    files = [path for path in (line.rstrip() for line in file) if path]

In [40]:
import lucene
from org.apache.lucene.analysis.en import EnglishAnalyzer
from org.apache.lucene.index import IndexWriter, IndexWriterConfig
from org.apache.lucene.store import FSDirectory
import org.apache.lucene.document as document
from java.io import File
import itertools

In [41]:
# Initialise the JVM through PyLucene. The call is the last expression of the
# cell, so the returned jcc.JCCEnv object is echoed as the cell output.
# NOTE(review): presumably this has to run once per kernel before any of the
# Lucene classes imported above are instantiated -- confirm against the
# PyLucene/JCC documentation.
lucene.initVM()

<jcc.JCCEnv at 0x7fc38c39f0b0>

In [42]:
# Analysis chain: the English analyzer is handed to the writer configuration.
analyzer = EnglishAnalyzer()
writerConfig = IndexWriterConfig(analyzer)

# On-disk index location, given relative to the current working directory.
indexPath = File("index_wt10g_cleaned").toPath()
indexDir = FSDirectory.open(indexPath)

# The writer is created here at module level and closed by
# make_inverted_index() once all files have been added.
# NOTE(review): no open mode is set on writerConfig; if the default appends to
# an existing index, re-running the notebook would add every document a second
# time -- confirm and consider setting the open mode explicitly.
writer = IndexWriter(indexDir, writerConfig)

In [43]:
import re

# Number of documents handed to process() so far. Kept at module level so the
# count accumulates across files; re-running this cell resets it to zero.
docCount = 0

# Non-greedy match of everything from a '<' up to the next '>'. DOTALL lets
# '.' match newlines as well, so a tag broken across lines is still one match.
tag_exp = re.compile('<.*?>', flags=re.DOTALL)

def cleanTag(rawDoc):
    """Strip markup from a raw document and return the remaining text.

    Every match of the module-level ``tag_exp`` pattern is replaced by a
    single space instead of being deleted. Deleting the tag outright glues
    together words that were separated only by markup (for example
    ``<td>foo</td><td>bar</td>`` would turn into ``foobar``), which makes the
    index contain tokens that never occurred in the page.

    Args:
        rawDoc: document text, possibly containing ``<...>`` tags.

    Returns:
        The text with each tag replaced by one space character.
    """
    return tag_exp.sub(' ', rawDoc)

def process(oneDoc):
    """Register one more processed document and return its tag-free text.

    Side effect: the module-level ``docCount`` is increased by one on every
    call, before the text is cleaned.

    Args:
        oneDoc: raw text of a single document.

    Returns:
        Whatever ``cleanTag`` returns for ``oneDoc``.
    """
    global docCount
    docCount = docCount + 1
    strippedText = cleanTag(oneDoc)
    return strippedText


# This function needs to be called for each of the files in the directory.
def processFile(filePath):
    """Split one collection file into parallel lists of doc IDs and texts.

    The file is read line by line as ISO-8859-1. Each span between a
    ``<DOC>`` line and a ``</DOC>`` line is one document:

    * the line starting with ``<DOCNO>`` supplies the document ID;
    * ``<DOCHDR>`` and ``<DOCOLDNO>`` sections (opening line, everything up
      to and including the closing-tag line) are left out of the text;
    * all remaining lines are joined and passed through ``process()``.

    Args:
        filePath: path of the file to parse.

    Returns:
        ``(docids, contents)``: two lists of equal length in which
        ``contents[i]`` is the processed text of the document whose ID is
        ``docids[i]``. A document without a well-formed ``<DOCNO>`` line gets
        the empty string as its ID. A trailing document that is never closed
        by ``</DOC>`` is not returned.
    """
    headerOpen = ("<DOCHDR>", "<DOCOLDNO>")
    headerClose = ("</DOCHDR>", "</DOCOLDNO>")

    # All documents of the file, with IDs and texts in index-wise correspondence.
    docids, contents = [], []
    with open(filePath, 'r', encoding='ISO-8859-1') as f:
        inDoc = False       # between <DOC> and </DOC>
        inHeader = False    # inside a header section that is being skipped
        docid = ""
        bodyLines = []      # collected as a list and joined once per document
        for line in f:
            if not inDoc:
                if line.strip() == "<DOC>":
                    # Reset all per-document state so that an unterminated
                    # header or a missing <DOCNO> in the previous document
                    # cannot leak into this one.
                    inDoc = True
                    inHeader = False
                    docid = ""
                    bodyLines = []
                continue

            if line.startswith("<DOCNO>"):
                m = re.search('<DOCNO>(.+?)</DOCNO>', line)
                # A malformed line (no closing tag on the same line) is
                # ignored rather than raising AttributeError on None.
                if m is not None:
                    docid = m.group(1)
                continue

            if line.strip() == "</DOC>":
                inDoc = False
                contents.append(process("".join(bodyLines)))
                docids.append(docid.strip())
                continue

            if inHeader:
                # Skip header lines; the closing-tag line ends the section
                # and is itself not part of the document text.
                if line.startswith(headerClose):
                    inHeader = False
                continue

            if line.startswith(headerOpen):
                # A section opened and closed on the same line, such as
                # "<DOCOLDNO>...</DOCOLDNO>", must not leave skipping enabled.
                inHeader = not any(tag in line for tag in headerClose)
                continue

            bodyLines.append(line)
    return docids, contents

In [44]:
# Smoke test on the first listed file only: di receives the document IDs and
# dc the processed texts. The call also advances docCount through process(),
# and the same file is parsed again when make_inverted_index(files, ...) runs.
di, dc = processFile(filePath=files[0])

In [50]:
from org.apache.lucene.document import FieldType
from org.apache.lucene.index import IndexOptions

# Field type later passed as `ftc` and used for the 'ID' field: only
# setStored(True) is called, every other option keeps the FieldType default.
ftc = FieldType()
ftc.setStored(True)

# Field type later passed as `fieldType` and used for the 'CONTENTS' field:
# stored, tokenized, indexed with docs + freqs + positions, term vectors on.
ft = FieldType()
ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
ft.setTokenized(True)
ft.setStored(True)
ft.setStoreTermVectors(True)
# setStoreTermVectorOffsets / setStoreTermVectorPositions are deliberately not
# called, so term-vector offsets and positions stay at their defaults.

# Main Indexer function
def make_inverted_index(filePaths, fieldType, ftc):
    """Index every document of every listed file, then close the writer.

    Each file is parsed with ``processFile``; every returned (ID, text) pair
    becomes one Lucene document with an ``ID`` field and a ``CONTENTS`` field
    that is added to the module-level ``writer``.

    Args:
        filePaths: iterable of file paths to parse and index.
        fieldType: ``FieldType`` used for the ``CONTENTS`` field.
        ftc: ``FieldType`` used for the ``ID`` field.

    Side effects:
        Adds documents to the module-level ``writer`` and closes it on exit,
        whether or not indexing succeeded, so a new ``IndexWriter`` has to be
        created before this function is called again. Prints a confirmation
        message only when every file was indexed without an exception.
    """
    try:
        for filePath in filePaths:
            docids, contents = processFile(filePath)
            # processFile returns the two lists in one-to-one correspondence.
            for docid, content in zip(docids, contents):
                doc = document.Document()
                doc.add(document.Field('ID', docid, ftc))
                doc.add(document.Field('CONTENTS', content, fieldType))
                writer.addDocument(doc)
    finally:
        # Close the writer even when parsing or indexing raised, so a failed
        # run does not leave the index held open by this kernel.
        writer.close()
    print('Indexing completed successfully!')

# Build the index over every file listed in the spec; this also closes `writer`.
make_inverted_index(filePaths=files, fieldType=ft, ftc=ftc)

Indexing completed successfully!
