In [13]:
'''
Remove STOP WORDS
Unicode normalization to string decoding
No entity yet index
'''

'\nRemove STOP WORDS\nUnicode normalization to string decoding\nNo entity yet index\n'

In [1]:
import xapian
import os
import io
from zipfile import ZipFile
import shutil
import csv
import re
import pandas as pd
import numpy as np
import json
import time
from nltk.corpus import stopwords
import nltk

In [2]:
pd.options.display.max_colwidth = 100

In [3]:
dbpath = "index_V6_nostop"

In [4]:
zf = ZipFile("wiki-pages-text.zip")

In [5]:
files = [item.filename for item in zf.filelist]
len(files)

110

In [6]:
files[:10]

['wiki-pages-text/',
 'wiki-pages-text/wiki-009.txt',
 'wiki-pages-text/wiki-021.txt',
 'wiki-pages-text/wiki-035.txt',
 'wiki-pages-text/wiki-034.txt',
 'wiki-pages-text/wiki-020.txt',
 'wiki-pages-text/wiki-008.txt',
 'wiki-pages-text/wiki-036.txt',
 'wiki-pages-text/wiki-022.txt',
 'wiki-pages-text/wiki-023.txt']

In [7]:
import unicodedata
import spacy
import en_core_web_sm
#import en_core_web_lg

def remove_extra(wordlist):
    text = re.sub(r"(-LRB-|-LSB-|-RSB-|-RRB-|-COLON-)*", "", wordlist)
    return text

def unicodes(string):
    nfd_string = unicodedata.normalize("NFD", string)
    nfd = nfd_string.encode('WINDOWS-1252', 'ignore')
    strg=nfd.decode('latin-1')#"utf-8") #utf-8 didn't work as the data is in latin-1
    return remove_extra(strg)

def titleunicodes(string):
    nfd_string = unicodedata.normalize("NFD", string)
    nfd = nfd_string.encode('WINDOWS-1252', 'ignore')
    strg = (nfd.decode('latin-1')) #"utf-8")
    strg = remove_extra(strg)
    cleanString = re.sub('[-_]',' ', strg)
    #cleanString = re.sub('\W+','', str(cleanString))
    return cleanString

nlp = en_core_web_sm.load()
def get_entity(document):
    doc = nlp(document)
    entity = []
    for ent in doc.ents:
        entity.append(str(ent).lower())
    return entity

In [8]:
def read_doc(zf, path):
    items = []
    fp = zf.open(path, mode='r')
    tfp = io.TextIOWrapper(fp)
    cnt=0
    for line in tfp.readlines():
        #match = re.match("(\w+)\s(\d+)\s(.*)\n", line)
        match = re.match("([\S]+)\s(\d+)\s(.*)\n", line)
        #if cnt <= 2:
            #print("line",line)
            #print("match",match)
            #print("match0",match[0])
            #print("match1",match[1])
            #cnt += 1
        if match:
            combine = match[0]
            unicodetext = unicodes(combine)
            unicodetitle = titleunicodes(match[1])
            items.append([match[1],combine,unicodetitle,unicodetext])
    fp.close()
    tfp.close()
    return items

In [9]:
def read_doc_as_df(zf, path):
    items = read_doc(zf, path)
    raw_df = pd.DataFrame(data=items, columns=['doc_id', 'sentence_text','unicode_doc_id','unicode_text'])
    func = lambda x: " ".join(x)
    func2 = lambda x: min(x)
    return raw_df.groupby('doc_id')['sentence_text','unicode_doc_id','unicode_text'].agg(
        {'sentence_text': func, 'unicode_doc_id': func2,'unicode_text':func})

In [10]:

def mprint(text):
    print(time.strftime("%a, %d %b %Y %H:%M:%S +0000"),text)

def getstopper():
    stopper = xapian.SimpleStopper()
    for s in stopwords.words('english'):
        stopper.add(s)
    return stopper



In [12]:
%%time

#Imporvements done:
# Remove stop words
# add STOP_STEMMED
# Create or open the database we're going to be writing to.
db = xapian.WritableDatabase(dbpath, xapian.DB_CREATE_OR_OPEN)

# Set up a TermGenerator that we'll use in indexing.
termgenerator = xapian.TermGenerator()
termgenerator.set_stemmer(xapian.Stem("en"))
stopper = getstopper()
termgenerator.set_stopper(stopper)
termgenerator.set_stopper_strategy(termgenerator.STOP_ALL)

#termgenerator.set_stopper()
#termgenerator.set_stopper_strategy(xapian.Stopper.STOP_ALL)
for path in sorted(files): #['wiki-pages-text/wiki-056.txt']: # sorted(files):
    mprint(path)
    documents_df = read_doc_as_df(zf, path)
    #for doc_id, text in documents_df.items():
    for row in documents_df.itertuples(index=True, name='Pandas'):
        #get the fields we will index
        doc_id=row[0]
        text=row[1]
        unicode_doc_id=row[2]
        unicode_text=row[3]
        
        # We make a document and tell the term generator to use this.
        doc = xapian.Document()
        termgenerator.set_document(doc)
        
        # Index each field with a suitable prefix.
        termgenerator.index_text(doc_id, 1, 'S') #this will git stemmed as we insert it using termgenerator
        termgenerator.index_text(unicode_doc_id, 2, 'B') #B => Topic
        #termgenerator.index_text(' '.join([x for x in get_entity(unicode_text)]), 3, 'K') #K => Keyward
        
        # Index fields without prefixes for general search.
        termgenerator.index_text(doc_id)
        termgenerator.increase_termpos()
        termgenerator.index_text(unicode_text)

        # Store all the fields for display purposes.
        doc.set_data(json.dumps(text))

        # We use the identifier to ensure each object ends up in the
        # database only once no matter how many times we run the
        # indexer.
        idterm = u"Q" + doc_id
        doc.add_boolean_term(idterm)
        db.replace_document(idterm, doc)
db.commit()
db.close()

Tue, 21 May 2019 00:09:58 +0000 wiki-pages-text/
Tue, 21 May 2019 00:09:58 +0000 wiki-pages-text/wiki-001.txt
Tue, 21 May 2019 00:10:46 +0000 wiki-pages-text/wiki-002.txt
Tue, 21 May 2019 00:11:45 +0000 wiki-pages-text/wiki-003.txt
Tue, 21 May 2019 00:12:47 +0000 wiki-pages-text/wiki-004.txt
Tue, 21 May 2019 00:13:50 +0000 wiki-pages-text/wiki-005.txt
Tue, 21 May 2019 00:15:12 +0000 wiki-pages-text/wiki-006.txt
Tue, 21 May 2019 00:16:25 +0000 wiki-pages-text/wiki-007.txt
Tue, 21 May 2019 00:17:56 +0000 wiki-pages-text/wiki-008.txt
Tue, 21 May 2019 00:19:39 +0000 wiki-pages-text/wiki-009.txt
Tue, 21 May 2019 00:20:54 +0000 wiki-pages-text/wiki-010.txt
Tue, 21 May 2019 00:22:01 +0000 wiki-pages-text/wiki-011.txt
Tue, 21 May 2019 00:23:32 +0000 wiki-pages-text/wiki-012.txt
Tue, 21 May 2019 00:25:11 +0000 wiki-pages-text/wiki-013.txt
Tue, 21 May 2019 00:26:54 +0000 wiki-pages-text/wiki-014.txt
Tue, 21 May 2019 00:29:11 +0000 wiki-pages-text/wiki-015.txt
Tue, 21 May 2019 00:30:45 +0000 wiki

For Verification

In [14]:
!xapian-delve $dbpath

UUID = 555aff35-a522-4e95-ab76-c6208b823d91
number of documents = 5396106
average document length = 130.881
document length lower bound = 3
document length upper bound = 136226
highest document id ever used = 5396106
has positional information = true
revision = 540
currently open for writing = false


In [15]:
dbpath

'index_V6_nostop'

In [16]:
!xapian-delve -t QLara_Croft-COLON-_Tomb_Raider $dbpath

Posting List for term 'QLara_Croft-COLON-_Tomb_Raider' (termfreq 1, collfreq 0, wdf_max 0): 2782041


In [27]:
d=2782041
a = !xapian-delve -r $d $dbpath

Query Verification Testing

In [13]:
def get_doc_id(match):
    for term in match.document.termlist():
        term = term.term.decode("utf-8") 
        m = re.match("Q(.*)", term)
        if m:
            return m[1]
    return None

In [14]:
# Prepare enquiry object

# Open the database we're going to search.
db = xapian.Database(dbpath)

# Set up a QueryParser with a stemmer and suitable prefixes
queryparser = xapian.QueryParser()
queryparser.set_stemmer(xapian.Stem("en"))
queryparser.set_stemming_strategy(queryparser.STEM_SOME)
queryparser.add_prefix('docid', 'S')

# Use an Enquire object on the database to run the query
enquire = xapian.Enquire(db)    

In [24]:
#Search within selected document list
claim_text = "Drake Bell released an album"
pagesize = 100
doc_list = ['1994–95_Venezuelan_Primera_División_season','Lara_Croft-COLON-_Tomb_Raider']

db = xapian.Database(dbpath)
print("db=",db,dbpath)
qp = xapian.QueryParser()
qp.set_stemmer(xapian.Stem("en"))
qp.set_stemming_strategy(qp.STEM_SOME)
stopper = getstopper()
qp.set_stopper(stopper)
print(qp.stoplist)
qp.add_prefix('docid', 'S') #useless

##def doclist_query(query_string, doc_list):
#doc_queries=[qp.parse_query('{}'.format(doc), 0, 'S') for doc in doc_list]
doc_queries = ['Q{}'.format(rated) for rated in doc_list] #this could to ensure the exact full id text is taken
doc_query = xapian.Query(xapian.Query.OP_OR, doc_queries)

claim_query = qp.parse_query(claim_text)
#use filter to search only in selected docs
if len(doc_list)>0:
    final_query = xapian.Query(xapian.Query.OP_FILTER, claim_query, doc_query)
else:
    final_query = claim_query
#join_query = xapian.Query(xapian.Query.OP_AND, query1, query2)
print(final_query)

# setup the enquire object to perform the query
enq = xapian.Enquire(db)

# set the weighting model for ranking
enq.set_weighting_scheme(xapian.BM25Weight())
enq.set_weighting_scheme(xapian.TfIdfWeight())
enq.set_query(final_query)

# iterate through the matched set and display the stored json dup
print("Doc Level Results:\n")

matches = enq.get_mset(0, pagesize)
query_results = []
doc_title = []
for match in matches:
    print(match)
    result = dict(
        found_doc = get_doc_id(match),
        rank = match.rank + 1, 
        term = [t for t in match.document.termlist()],
        percent = match.percent,
        weight = match.weight,
        docid = match.docid,
        text = match.document.get_data()
    )
    query_results.append(result)
    doc_title.append(get_doc_id(match))

query_results

db= Database() index_V6_nostop
<bound method _queryparser_gen_stoplist_iter of <xapian.QueryParser; proxy of <Swig Object of type 'Xapian::QueryParser *' at 0x11e1c13f0> >>
Query(((drake@1 OR bell@2 OR Zreleas@3 OR Zalbum@5) FILTER (Q1994–95_Venezuelan_Primera_División_season OR QLara_Croft-COLON-_Tomb_Raider)))
Doc Level Results:

<xapian.MSetItem object at 0x125afef98>


[{'docid': 2782041,
  'found_doc': 'Lara_Croft-COLON-_Tomb_Raider',
  'percent': 25,
  'rank': 1,
  'term': [<xapian.TermListItem at 0x126008990>,
   <xapian.TermListItem at 0x126008900>,
   <xapian.TermListItem at 0x1260089d8>,
   <xapian.TermListItem at 0x126008ab0>,
   <xapian.TermListItem at 0x126008af8>,
   <xapian.TermListItem at 0x126008a68>,
   <xapian.TermListItem at 0x126008a20>,
   <xapian.TermListItem at 0x126008b40>,
   <xapian.TermListItem at 0x126008bd0>,
   <xapian.TermListItem at 0x126008b88>,
   <xapian.TermListItem at 0x126008c18>,
   <xapian.TermListItem at 0x126008ca8>,
   <xapian.TermListItem at 0x126008c60>,
   <xapian.TermListItem at 0x126008cf0>,
   <xapian.TermListItem at 0x126008d80>,
   <xapian.TermListItem at 0x126008d38>,
   <xapian.TermListItem at 0x126008dc8>,
   <xapian.TermListItem at 0x126008ee8>,
   <xapian.TermListItem at 0x126008ea0>,
   <xapian.TermListItem at 0x126008e58>,
   <xapian.TermListItem at 0x126008e10>,
   <xapian.TermListItem at 0x1260

In [73]:
doc_list = ['1994–95_Venezuelan_Primera_División_season','Lara_Croft-COLON-_Tomb_Raider']
rated_queries = ['Q{}'.format(rated) for rated in doc_list]
rated_query = xapian.Query(xapian.Query.OP_OR, rated_queries)

print(rated_queries)
print(rated_query)

['Q1994–95_Venezuelan_Primera_División_season', 'QLara_Croft-COLON-_Tomb_Raider']
Query((Q1994–95_Venezuelan_Primera_División_season OR QLara_Croft-COLON-_Tomb_Raider))
