### Defining paths to data and retrieving filename-source list

In [23]:
import os
# Directories holding the XML article files, one per newspaper.
GM_dirpath = '/home/ec2-user/SageMaker/data/The_Globe_and_Mail_with_DP_filter_by_article_type/'
TS_dirpath = '/home/ec2-user/SageMaker/data/Toronto_Star_Publication_with_query/'

# List each directory exactly once: `filenames` and `sources` are then built
# from the same snapshot (so they cannot get out of step if a directory changes
# between calls) and each large directory is scanned a single time.
GM_filenames = os.listdir(GM_dirpath)
TS_filenames = os.listdir(TS_dirpath)

filenames = GM_filenames + TS_filenames
sources = ['GM']*len(GM_filenames) + ['TS']*len(TS_filenames)
print(f'{len(filenames):,} files detected.')

# (filename, source-code) pairs; the source code selects the directory later on.
data = list(zip(filenames, sources))
data[:3]

192,433 files detected.


[('1237469385.xml', 'GM'), ('1352134238.xml', 'GM'), ('1323706441.xml', 'GM')]

### Defining `Schema` for index (only useful if creating new index)

In [25]:
from whoosh.fields import Schema, TEXT, ID, STORED, DATETIME

# Field types used below:
#   TEXT     - indexed and, by default, not stored; positions are kept so that
#              phrase searches work.
#   ID       - indexed and, by default, not stored; the whole value is indexed
#              as a single untokenized unit with no frequency information
#              (so it is a poor choice for scoring).
#   STORED   - stored with the document, but neither indexed nor searchable.
#   DATETIME - holds datetime objects in a compact storable format.
schema = Schema(
    body=TEXT,                    # article text: searchable, not stored
    file_id=ID(stored=True),      # source XML filename, returned with each hit
    title=TEXT(field_boost=2.0),  # searchable headline with a 2.0 field boost
    source=STORED,                # newspaper name, returned with each hit
    date=DATETIME,                # publication date
)

### Loading index (if exists) or creating new empty index

In [2]:
import os
from lxml import etree

from whoosh import index

index_dirpath = '/home/ec2-user/SageMaker/mariano/notebooks/03. High Recall Retrieval System/index'

# makedirs(exist_ok=True) replaces the exists()/mkdir() pair: it also creates
# any missing parent directories and does not fail if the directory already
# exists (or is created between the check and the call).
os.makedirs(index_dirpath, exist_ok=True)

if index.exists_in(index_dirpath):
    print('Loading index')
    ix = index.open_dir(index_dirpath)
else:
    print('Creating new index')
    # `schema` is defined in the schema cell, which must have been run before
    # this branch can create a new index.
    ix = index.create_in(index_dirpath, schema)

print(f'Number of documents in index: {ix.doc_count()}')

Loading index
Number of documents in index: 192427


### Adding all documents to the open index (only if the index is empty, i.e. `doc_count() == 0`)

In [27]:
%%time
from lxml import etree
from bs4 import BeautifulSoup
import datetime

# We define a function to get the text content that we need from the XML articles available in our dataset
def getxmlcontent(root):
    """Return the raw article text held in a parsed XML document.

    The first ``HiddenText`` descendant of ``root`` is preferred; if there is
    none, the first ``Text`` descendant is used instead.

    Args:
        root: Root element of the parsed article (anything exposing
            ``find(path)`` that returns an element or ``None``).

    Returns:
        The ``.text`` of the selected element (which may itself be ``None``
        when the element is empty), or ``None`` if neither element exists.
    """
    for xpath in ('.//HiddenText', './/Text'):
        node = root.find(xpath)
        if node is not None:
            # The first matching tag wins, even if its text is empty.
            return node.text
    return None
    
# Populate the index only when it is empty, and only after explicit confirmation
# (the captured run below took roughly an hour and a half of wall time).
if ix.doc_count() == 0:
    q = input('Are you sure you want to add new elements to the index?')
    if 'yes' in q.lower():

        # (filename, reason) for every file that could not be indexed, so that
        # skipped documents are reported instead of disappearing silently.
        skipped = []

        # Used as a context manager, the writer commits when the block ends
        # normally and cancels (releasing the index write lock) if an
        # exception escapes.
        with ix.writer() as writer:
            for filename, source in data:
                source_dir = GM_dirpath if source == 'GM' else TS_dirpath
                full_filename = os.path.join(source_dir, filename)

                try:
                    root = etree.parse(full_filename).getroot()
                except (etree.XMLSyntaxError, OSError) as err:
                    skipped.append((filename, f'could not parse XML: {err}'))
                    continue

                # Extract the raw content once and reuse it.
                raw_content = getxmlcontent(root)
                if raw_content is None:
                    skipped.append((filename, 'no HiddenText/Text content'))
                    continue

                date_node = root.find('.//NumericDate')
                if date_node is None or date_node.text is None:
                    skipped.append((filename, 'missing NumericDate'))
                    continue
                try:
                    y, m, d = date_node.text.split('-')
                    date = datetime.datetime(int(y), int(m), int(d))
                except ValueError as err:
                    skipped.append((filename, f'bad NumericDate {date_node.text!r}: {err}'))
                    continue

                title_node = root.find('.//Title')
                title = title_node.text if title_node is not None else None

                # The article body is markup; strip the tags to keep plain text.
                # The parser is named explicitly so the result does not depend
                # on which parsers happen to be installed.
                text = BeautifulSoup(raw_content, 'lxml').get_text()
                source_name = 'Toronto Star' if source == 'TS' else 'The Globe and Mail'

                writer.add_document(title=title,
                                    body=text,
                                    file_id=filename,
                                    source=source_name,
                                    date=date
                                   )

        print(f'{len(data) - len(skipped):,} documents indexed, {len(skipped):,} files skipped.')
        # Show only the first few; the full list remains available in `skipped`.
        for skipped_filename, reason in skipped[:20]:
            print(f'  skipped {skipped_filename}: {reason}')



CPU times: user 1h 12min 20s, sys: 1min 43s, total: 1h 14min 4s
Wall time: 1h 32min 38s


In [3]:
# Shell out to report the total on-disk size of the index directory.
# NOTE(review): `index/` is relative to the kernel's working directory, whereas
# the index itself is opened through the absolute `index_dirpath` — confirm
# both refer to the same directory.
!du -hs index/

3.4G	index/


In [31]:
# searcher = ix.searcher()
# searcher.close()


In [13]:
from whoosh.qparser import MultifieldParser

# Parse the query against both the title and body fields, using the schema of
# the index that is actually open (not the `schema` variable from the cell above).
mp = MultifieldParser(['title','body'],schema=ix.schema)
# Per the parsed query shown in the output below, this becomes: 'refugee' in
# title OR body, AND a date range on the `date` field.
# NOTE(review): the lowercase 'and' may not be acting as the boolean operator
# here (the conjunction could come from the parser's default grouping) —
# consider writing 'AND' explicitly.
q = mp.parse(u'Refugee and date:[19941112 TO 19941113]')

# The searcher is closed automatically when the `with` block exits.
with ix.searcher() as s:
    results = s.search(q)
    for result in results:
        # Each hit prints only the stored fields (file_id and source).
        print(result)
# NOTE(review): `results` is displayed after its searcher has been closed; the
# summary repr works (see output), but reading hits from it here may not.
results

<Hit {'file_id': '1140932582.xml', 'source': 'The Globe and Mail'}>
<Hit {'file_id': '437132719.xml', 'source': 'Toronto Star'}>
<Hit {'file_id': '1140932027.xml', 'source': 'The Globe and Mail'}>
<Hit {'file_id': '1140923310.xml', 'source': 'The Globe and Mail'}>
<Hit {'file_id': '437138434.xml', 'source': 'Toronto Star'}>
<Hit {'file_id': '1140931445.xml', 'source': 'The Globe and Mail'}>
<Hit {'file_id': '1140926088.xml', 'source': 'The Globe and Mail'}>


<Top 7 Results for And([Or([Term('title', 'refugee'), Term('body', 'refugee')]), NumericRange('date', 62920195200000000, 62920367999999999, False, False, boost=1.0, constantscore=True)]) runtime=0.010276828000314708>

In [19]:
# Leftover interactive help lookup (IPython `?`).
# NOTE(review): `QueryParser` is not imported in any cell visible here (only
# `MultifieldParser` is) — import it first or remove this cell.
QueryParser?

In [20]:
# Leftover interactive help lookup for the `search` method.
# NOTE(review): this searcher is opened outside a `with` block and is never
# closed in the visible cells — call `s.close()` when done, or remove the cell.
s = ix.searcher()
s.search?