In [1]:
# RUN: 'pip install jsonlines' on: ModuleNotFoundError: No module named 'jsonlines'
import json
import jsonlines
from tqdm import tqdm
import pandas as pd
import pickle

# RUN: 'pip install whoosh' on: ModuleNotFoundError: No module named 'whoosh'
import os, os.path

#from whoosh import index
import whoosh
from whoosh.index import create_in
from whoosh.fields import *
from whoosh.writing import AsyncWriter

from datetime import datetime
from pytz import timezone

from IPython.display import clear_output

######################################

class Indexer:
    '''Builds a Whoosh index from a JSON-lines article collection.

    Typical use: call setIndexDirectory() and setDataPath(), then index().
    The class only keeps track of where the index lives and whether it has
    been created; searching is done elsewhere.
    '''

    index_created = False
    index_directory = ""
    data_path = ""

    def __init__(self):
        print('Indexer has been initiated.')

    def indexCreated(self):
        '''Return True once an index has been created or registered.'''
        return self.index_created

    def openIndex(self):
        '''Placeholder; not implemented.'''
        pass

    ## Count documents
    def countDocuments(self):
        '''Count the valid JSON objects in the data file, printing progress
        every 10000 objects and the final total.'''
        print("Counting documents, this might take a while...")
        counter = 0

        with jsonlines.open(self.data_path) as reader:
            for _ in tqdm(reader.iter(type=dict, skip_invalid=True)):
                counter += 1
                if counter % 10000 == 0:
                    print("Current count is: " + str(counter))
        print("Last count is: " + str(counter))
        print("Counting done.")

    def printDocumentData(self):
        '''Prints every 10000th object'''
        with jsonlines.open(self.data_path) as reader:
            for counter, obj in enumerate(
                    tqdm(reader.iter(type=dict, skip_invalid=True)), start=1):
                if counter % 10000 == 0:
                    print("Object number:" + str(counter))
                    print(type(obj))
                    # The raw dict is printed directly: the previous code
                    # wrapped it in a `Document` class that is not defined
                    # anywhere in this notebook and raised NameError.
                    print(obj)

    def extractDocumentContents(self, contents):
        '''Join the "sanitized_html" parts of an article into one string.

        contents: the article's `contents` value, expected to be a list of
            dicts that each carry a 'type' and a 'content' key.
        Returns the 'content' of every item whose 'type' is
        "sanitized_html", each followed by a newline. Returns an empty
        string when `contents` is missing or holds no such item.
        '''
        if not isinstance(contents, (list, tuple)):
            return ""

        parts = []
        for item in contents:
            # Entries that are not dicts (e.g. null placeholders) are skipped
            # instead of discarding everything collected so far.
            if not isinstance(item, dict):
                continue
            if item.get('type') != "sanitized_html":
                continue
            content = item.get('content')
            if content is None:
                continue
            parts.append(str(content) + "\n")
        return "".join(parts)

    def extractDocumentDate(self, epochTimestamp):
        '''Convert a UNIX epoch timestamp to a timezone-aware datetime.

        Only the first 10 characters of the value are used (presumably the
        source holds millisecond timestamps -- TODO confirm), and the result
        is expressed in the 'EST' zone.
        Returns datetime(1, 1, 1) as a "no valid date" marker when the value
        is empty or cannot be converted.
        '''
        fallback = datetime(1, 1, 1)
        if epochTimestamp == '':
            return fallback
        try:
            timestamp = int(str(epochTimestamp)[0:10])
            return datetime.fromtimestamp(timestamp, timezone('EST'))
        except (TypeError, ValueError, OverflowError, OSError) as err:
            print("ERROR: In extractDocumentDate()")
            print("ERROR: ", type(err))
            return fallback

    def setIndexDirectory(self, directory):
        self.index_directory = directory

    def getIndexLocation(self):
        return self.index_directory

    def setDataPath(self, file_location):
        self.data_path = file_location

    def getDataPath(self):
        return self.data_path

    def setIndexCreatedTrue(self):
        self.index_created = True

    def index(self, max_documents=None, commit_every=10000):
        '''Create the Whoosh index from the data file.

        max_documents: optional cap on the number of articles to index;
            None (the default) indexes the whole file.
        commit_every: number of indexed articles between intermediate
            commits.
        Articles without a usable published date are skipped and counted as
        wrongly formatted. Requires the index directory and data path to be
        set beforehand.
        '''
        # https://whoosh.readthedocs.io/en/latest/api/writing.html
        # https://appliedmachinelearning.blog/2018/07/31/developing-a-fast-indexing-and-full-text-search-engine-with-whoosh-a-pure-python-library/
        # Source fields: [id, article_url, title, author, published_date, contents, type, source]
        # TODO: critically look at different fields, which are important?, which need to be shown to the user? etc.

        assert self.index_directory and self.data_path, \
            "Both index_directory and data_path have to be set"

        # Define fields using whoosh's 'Schema' | https://whoosh.readthedocs.io/en/latest/schema.html#
        # Field boosts can be added here.
        schema = Schema(doc_id=whoosh.fields.ID(unique=True, stored=True),
                        article_url=whoosh.fields.STORED,
                        title=whoosh.fields.TEXT(stored=True),
                        author=whoosh.fields.ID,
                        published_date=whoosh.fields.DATETIME,
                        contents=whoosh.fields.TEXT)

        # Creating a index writer to add document as per schema
        myindex = whoosh.index.create_in(self.index_directory, schema)
        writer = whoosh.writing.AsyncWriter(myindex)

        print("Looping over data. Indexing each article.")
        print("This might take a few minutes...")
        counter = 0
        fault_counter = 0
        with jsonlines.open(self.data_path) as reader:
            for obj in tqdm(reader.iter(type=dict, skip_invalid=True)):
                retreived_date = self.extractDocumentDate(obj.get('published_date', ''))
                if retreived_date == datetime(1, 1, 1):
                    fault_counter += 1
                    continue

                writer.add_document(doc_id=obj['id'],
                                    article_url=obj['article_url'],
                                    title=obj['title'],
                                    author=obj['author'],
                                    published_date=retreived_date,
                                    contents=self.extractDocumentContents(obj['contents']))
                counter += 1

                if counter % commit_every == 0:
                    # Commit in batches and continue with a fresh writer.
                    writer.commit()
                    writer = whoosh.writing.AsyncWriter(myindex)
                    clear_output(wait=True)
                    print("Looping over data. Indexing each article.")
                    print("This might take a few minutes...")
                    print("Indexed " + str(counter) + " articles")
                    if fault_counter > 0:
                        print("Found " + str(fault_counter) + " wrongly formatted articles")

                if max_documents is not None and counter >= max_documents:
                    break

            print("Looping complete.")

        # Commit whatever was added since the last intermediate commit;
        # without this the final partial batch would never be written.
        writer.commit()
        print("Index created!")
        self.setIndexCreatedTrue()
        
    


# indexer = Indexer()
# #indexer.countDocuments()
# #indexer.printDocumentData()
# #indexer.createDocumentDataStore()
# indexer.index()

In [2]:
###############################################
from whoosh.qparser import QueryParser
from whoosh import scoring
from whoosh.index import open_dir

class Ranking:
    '''Runs queries against a Whoosh index using different scoring models.

    Every search method takes the user's query string plus an indexer object
    that exposes getIndexLocation(); queries are parsed against the "title"
    field of the index.
    '''

    show_n_results = 3
    index_directory = ""

    def __init__(self):
        print("Ranking class has been initiated.")

    def indexCreated(self):
        '''Return True when an index directory has been set on this object.

        NOTE(review): this only checks that a directory name is configured,
        not that an index actually exists on disk.
        '''
        return bool(self.index_directory)

    def createIndex(self):
        '''Placeholder; not implemented.'''
        pass

    def setIndexDirectory(self, directory):
        self.index_directory = directory

    def openIndex(self):
        '''Open and return the index stored in the configured directory.'''
        assert self.index_directory
        return open_dir(self.index_directory)

    def resultsToList(self, results):
        '''Convert search hits to a list of plain dictionaries.

        Each dictionary is the hit's fields() mapping with an extra 'score'
        key holding the hit's score.
        '''
        resultsList = []
        for result in results:
            result_dict = result.fields()
            result_dict['score'] = result.score
            resultsList.append(result_dict)
        return resultsList

    def _search(self, user_query, indexer, weighting, **search_kwargs):
        '''Open the indexer's index, run `user_query` on the title field with
        the given weighting and return the hits as a list of dictionaries.'''
        self.setIndexDirectory(indexer.getIndexLocation())
        index = self.openIndex()

        with index.searcher(weighting=weighting) as searcher:
            parsed_query = QueryParser("title", index.schema).parse(user_query)
            results = searcher.search(parsed_query, **search_kwargs)
            # Hits must be converted while the searcher is still open.
            return self.resultsToList(results)

    def searchTermFrequency(self, user_query, indexer):
        '''Returns results for a given query based on the Term Frequency search algorithm. Returned value is a list of dictionaries.

        At most `show_n_results` hits are returned.
        '''
        # TODO check if index has been created
        # TODO: whoosh.qparser.MultifieldParser(fieldnames, schema, fieldboosts=None, **kwargs)
        # TODO: get fieldnames and schema from indexer -> add getters in Indexer class
        # TODO: add parameter that dictates search algorithm
        return self._search(user_query, indexer, scoring.Frequency(),
                            limit=self.show_n_results)

    def searchTermFrequencyReturnResults(self, user_query, indexer):
        '''Term Frequency search using the searcher's default result limit.
        Returned value is a list of dictionaries.'''
        # TODO check if index has been created
        return self._search(user_query, indexer, scoring.Frequency())

    @staticmethod
    def searchTF_IDF():
        '''Returns results for a given query based on the TF-IDF search algorithm. Returned value is a list of dictionaries.

        Not implemented yet; currently returns None.
        '''
        pass

    def searchBM25FReturnResults(self, user_query, indexer):
        '''BM25F search using the searcher's default result limit.
        Returned value is a list of dictionaries.'''
        # The per-field B override is keyed on the field name; the schema
        # built by Indexer.index() calls the body field `contents`.
        weighting = scoring.BM25F(B=0.75, contents_B=1.0, K1=1.5)
        return self._search(user_query, indexer, weighting)

    @staticmethod
    def searchBM25F():
        '''Returns results for a given query based on the BM25F search algorithm. Returned value is a list of dictionaries.

        Not implemented yet; currently returns None.
        '''
        pass
        
    

In [3]:
################################################
import urllib

class Evaluation:
    '''Used to evaluate the ranking results.

    Workflow: load_queries() downloads the topics, addResults() formats the
    hits of each query, writeResults() stores them in results.txt and
    compareResults() checks them against the downloaded relevance
    judgements.
    '''

    # TODO test different algorithms
    # TODO display results, how?

    queries = []
    ranking = Ranking()

    def load_queries(self):
        '''Download the topics file and return a list of (title, number)
        tuples, one per topic.'''
        # `import urllib` alone does not load the `request` submodule.
        import urllib.request

        query_url = "https://trec.nist.gov/data/core/topics2018.txt"
        titles, numbers = [], []
        title = False  # True while reading lines between <title> and </title>

        with urllib.request.urlopen(query_url) as query_file:
            for line in query_file:
                decoded_line = line.decode("utf-8")

                if title and "</title>" not in decoded_line:
                    text = decoded_line.replace("\n", "").strip()
                    # Blank lines inside the title element would otherwise
                    # shift titles against their topic numbers.
                    if text:
                        titles.append(text)

                if "<title>" in decoded_line:
                    title = True

                if "</title>" in decoded_line:
                    title = False

                if "<num>" in decoded_line:
                    num = decoded_line.replace("<num>", "").replace("</num>", "").replace("\n", "").replace("Number: ", "").strip()
                    numbers.append(num)

        return list(zip(titles, numbers))

    def addResults(self, results_formatted, results, query, number):
        '''Append one "<number> 0 <doc_id> <score>" line per hit to
        `results_formatted` and return that same list.

        results: iterable of dicts with 'doc_id' and 'score' keys.
        '''
        count = 0

        for hit in results:
            print(hit)
            current_result = str(number) + " 0 " + str(hit['doc_id']) + " " + str(hit['score'])
            results_formatted.append(current_result)
            count += 1

        print("for query {} there are {} hits".format(query, count))
        return results_formatted

    def writeResults(self, results):
        '''Write every result line to results.txt, one per line.'''
        with open("results.txt", "w") as results_file:
            for result in results:
                results_file.write(result + "\n")

    def compareResults(self):
        '''Compare results.txt with the downloaded relevance judgements.

        A returned document counts as correct when the judgements list it for
        the same topic with a relevance above 0, and as incorrect when it is
        listed with relevance 0. Documents without a judgement are ignored.
        Prints the share of correct documents among the judged ones.
        '''
        import urllib.request

        # read in results.txt: "<topic> 0 <doc_id> <score>"
        predictions = []
        with open("results.txt", "r") as results_file:
            for line in results_file:
                fields = line.split()
                if len(fields) >= 3:
                    predictions.append(fields)

        # read in true scores file: "<topic> 0 <doc_id> <relevance>"
        judgements = {}
        true_scores_url = "https://trec.nist.gov/data/core/qrels2018.txt"
        with urllib.request.urlopen(true_scores_url) as true_scores_file:
            for line in true_scores_file:
                fields = line.decode("utf-8").split()
                if len(fields) < 4:
                    continue
                try:
                    relevance = int(fields[3])
                except ValueError:
                    continue
                # Keep the first judgement per (topic, document) pair.
                judgements.setdefault((fields[0], fields[2]), relevance)

        # compare results.txt (preds) with true scores
        correct_preds, incorrect_preds = 0, 0
        for pred in predictions:
            relevance = judgements.get((pred[0], pred[2]))
            if relevance is None:
                continue
            # The ranking score itself is not comparable to a relevance
            # label; a retrieved document is correct when judged relevant.
            if relevance > 0:
                correct_preds += 1
            else:
                incorrect_preds += 1

        if correct_preds == incorrect_preds == 0:
            print("No matches found, because doc id's do not yet work")
        else:
            judged = correct_preds + incorrect_preds
            print("Performance: {:.2f}%".format(100 * correct_preds / judged))
        


Ranking class has been initiated.


In [4]:
##################################################
class UserInterface:
    '''Class used to handle user interaction'''
    # TODOS:
    # implement methods that asks the user to set different options

    def __init__(self):
        '''This is the constructor method of the UI'''
        print("UserInterface has been initiated.")

    def _askYesNo(self, question):
        '''Print `question`, read an answer and return True for "y" or "yes"
        (any capitalisation, surrounding spaces ignored), False otherwise.'''
        print(question)
        userInput = input("Answer [y/n]: ")
        return userInput.strip().lower() in ('y', 'yes')

    def getUserQuery(self):
        '''Retrieves the query from the user and returns it'''
        userInput = input("Please enter a query: ")
        return userInput

    def indexAlreadyCreated(self, directory_location):
        '''Tell the user that an index is set and where it is stored.'''
        print("Index is set.")
        print("Index files are stored in directory:" + directory_location)
        return

    def shouldCreateIndex(self):
        '''Ask whether a new index should be created; returns a bool.'''
        return self._askYesNo("Do you wish to create a new index?")

    def indexIsNotSet(self):
        '''Tell the user that no index has been set yet.'''
        print("Index has not been set.")
        return

    def getIndexDirectory(self, default_directory):
        '''Ask for the index directory name; an empty answer selects
        `default_directory`.'''
        userInput = input("Please enter a directory name for the index (default = /"+ default_directory + "): ")
        if userInput == "":
            return default_directory
        return userInput

    def getDataPath(self, default_data_path):
        '''Ask for the path of the data file; an empty answer selects
        `default_data_path`.'''
        print("Please enter the path to the TREC_Washington_Post_collection.v2.jl file.")
        print("The default location is: /"+ default_data_path + "): ")
        userInput = input()
        if userInput == "":
            return default_data_path
        return userInput

    def creatingIndex(self, index_directory, data_path):
        '''Placeholder; not implemented.'''
        pass

    def shouldAddExistingIndex(self):
        '''Ask whether an existing index should be used; returns a bool.'''
        return self._askYesNo("Do you wish to add an existing index?")

    def stopSearchEngine(self):
        print("Search Engine is stopped.")

    def shouldTerminateSearchEngine(self):
        '''Ask whether the search engine should stop; returns a bool.'''
        return self._askYesNo("Do you wish to stop the search engine?")

    def printResults(self, results):
        '''Print every result on its own line; None is treated as empty.'''
        print("\nPrinting results:")
        for result in results or []:
            print(result)
    

In [5]:
# Block used to run everything:
# TODO expand this 'Controller' class that handles information flow
# TODO add querying functionality
# TODO implement choice of search algorithm
# TODO implement evaluation mode
import time

class SearchEngine:
    '''This class embodies the search engine and acts as a controller class.

    run() creates the UI, indexer and ranking objects, makes sure an index is
    available and then executes the enabled mode(s): EVALUATION_MODE runs all
    loaded queries and compares the results to the relevance judgements,
    USER_MODE answers queries typed by the user.
    '''

    UI = None
    Indexer = None
    Ranking = None

    RUNNING = False
    STOPPED = False
    EVALUATION_MODE = True
    USER_MODE = False

    user_query = ""
    DEFAULT_data_path = 'WP-corpus/data/TREC_Washington_Post_collection.v2.jl'
    DEFAULT_index_directory = "indexdir"

    def __init__(self):
        print("Search Engine has been initiated.")

    def stopSearchEngine(self, UI):
        '''Leave the main loop and let the UI report the stop.'''
        self.RUNNING = False
        UI.stopSearchEngine()

    def _createNewIndex(self, Indexer, UI):
        '''Ask for the index directory and data file, then build the index.'''
        index_directory = UI.getIndexDirectory(self.DEFAULT_index_directory)
        Indexer.setIndexDirectory(index_directory)
        # Make sure the directory exists before the index is written to it.
        if not os.path.exists(index_directory):
            os.makedirs(index_directory)

        data_path = UI.getDataPath(self.DEFAULT_data_path)
        Indexer.setDataPath(data_path)

        UI.creatingIndex(index_directory, data_path)
        Indexer.index()

    def _useExistingIndex(self, Indexer, UI):
        '''Ask for the location of an existing index and register it.

        The index is only marked as created when the directory exists;
        otherwise the user is informed and nothing is changed.
        '''
        # TODO communicate chosen directories and chosen data file
        index_directory = UI.getIndexDirectory(self.DEFAULT_index_directory)
        if not os.path.isdir(index_directory):
            # Creating an empty directory here would make every later search
            # fail, so the request is rejected instead.
            print("No index directory found at: " + str(index_directory))
            return

        Indexer.setIndexDirectory(index_directory)
        data_path = UI.getDataPath(self.DEFAULT_data_path)
        Indexer.setDataPath(data_path)
        Indexer.setIndexCreatedTrue()

    def setIndex(self, Indexer, UI):
        '''Make sure the indexer has an index, asking the user how to get one.

        If the indexer already has an index the UI just reports its location.
        Otherwise the user may create a new index, register an existing one,
        or decline both; in the last case the user is asked whether to stop,
        and STOPPED is set to True on a positive answer.
        '''
        if Indexer.indexCreated():
            UI.indexAlreadyCreated(Indexer.getIndexLocation())
            return

        UI.indexIsNotSet()
        if UI.shouldCreateIndex():
            self._createNewIndex(Indexer, UI)
        elif UI.shouldAddExistingIndex():
            self._useExistingIndex(Indexer, UI)
        elif UI.shouldTerminateSearchEngine():
            self.STOPPED = True

    def run(self):
        '''This function starts the search engine'''
        print('Search Engine started.')

        self.UI = UserInterface()
        self.Indexer = Indexer()
        self.Ranking = Ranking()

        self.RUNNING = True
        while self.RUNNING:
            if not self.Indexer.indexCreated():
                # TODO Inform user that an index needs to be set
                self.setIndex(self.Indexer, self.UI)
                if self.STOPPED:  # Stop SE if user did not want to continue
                    self.stopSearchEngine(self.UI)
                    break
                if not self.Indexer.indexCreated():
                    # Still no index: ask again instead of searching in a
                    # directory that has not been set.
                    print("Starting over.")
                    time.sleep(1)
                    continue
            # TODO ask user to choose a mode

            if self.EVALUATION_MODE:
                # This is used to evaluate SE against the TREC relevance judgements
                index_dir = self.Indexer.getIndexLocation()
                self.Ranking.setIndexDirectory(index_dir)

                evaluation = Evaluation()
                queries = evaluation.load_queries()
                print("Queries loaded")

                results_formatted = []

                for (query, number) in tqdm(queries):
                    #results = self.Ranking.searchTermFrequencyReturnResults(query, self.Indexer)
                    results = self.Ranking.searchBM25FReturnResults(query, self.Indexer)
                    results_formatted = evaluation.addResults(results_formatted, results, query, number)

                evaluation.writeResults(results_formatted)
                evaluation.compareResults()

                print("Done")
                self.RUNNING = False

            if self.USER_MODE:
                # This is used to query questions
                # TODO ask for search algorithm
                # TODO move printing results to UI
                print("Entering USER_MODE")
                user_query = self.UI.getUserQuery()
                print("Entered query: " + user_query)

                results = self.Ranking.searchTermFrequency(user_query, self.Indexer)
                # The search may return nothing; only hand real results to
                # the UI.
                if results is not None:
                    self.UI.printResults(results)

            if self.RUNNING:
                print("Starting over.")
                time.sleep(1)

In [6]:
# Notebook entry point: build the controller and start its main loop.
# run() is interactive (it prompts through input()) and, with the class
# defaults EVALUATION_MODE = True / USER_MODE = False, performs one
# evaluation pass before returning.
mySearchEngine = SearchEngine()
mySearchEngine.run()

Search Engine has been initiated.
Search Engine started.
UserInterface has been initiated.
Indexer has been initiated.
Ranking class has been initiated.
Index has not been set.
Do you wish to create a new index?
Answer [y/n]: n
Do you wish to add an existing index?
Answer [y/n]: y
Please enter a directory name for the index (default = /indexdir): 
Please enter the path to the TREC_Washington_Post_collection.v2.jl file.
The default location is: /WP-corpus/data/TREC_Washington_Post_collection.v2.jl): 



  0%|                                                                                                                                                                                      | 0/50 [00:00<?, ?it/s]

Queries loaded


  2%|███▍                                                                                                                                                                          | 1/50 [00:00<00:20,  2.40it/s]

for query Women in Parliaments there are 0 hits


  4%|██████▉                                                                                                                                                                       | 2/50 [00:00<00:20,  2.37it/s]

for query Black Bear Attacks there are 0 hits


  6%|██████████▍                                                                                                                                                                   | 3/50 [00:01<00:22,  2.11it/s]

{'article_url': 'https://www.washingtonpost.com/opinions/hypocrisy-on-airport-security/2013/01/31/ffe162ea-6a37-11e2-9a0b-db931670f35d_story.html', 'doc_id': 'ffe162ea-6a37-11e2-9a0b-db931670f35d', 'source': 'The Washington Post', 'title': 'Hypocrisy on airport security', 'type': 'article', 'score': 20.33511825376157}
{'article_url': 'https://www.washingtonpost.com/opinions/airport-security-is-there-for-a-reason/2013/01/22/47f292fa-6314-11e2-889b-f23c246aa446_story.html', 'doc_id': '47f292fa-6314-11e2-889b-f23c246aa446', 'source': 'The Washington Post', 'title': 'Airport security is there for a reason', 'type': 'article', 'score': 18.8430811633607}
{'article_url': 'https://www.washingtonpost.com/news/volokh-conspiracy/wp/2014/04/10/the-fourth-amendment-and-airport-security-contd/', 'doc_id': 'fca85901256c6079027319e585ed6922', 'source': 'The Washington Post', 'title': 'The Fourth Amendment and Airport Security, cont’d', 'type': 'blog', 'score': 17.55502619010065}
{'article_url': 'https

  8%|█████████████▉                                                                                                                                                                | 4/50 [00:01<00:21,  2.17it/s]

for query Wildlife Extinction there are 0 hits


 10%|█████████████████▍                                                                                                                                                            | 5/50 [00:02<00:20,  2.23it/s]

for query Health and Computer Terminals there are 0 hits


 12%|████████████████████▉                                                                                                                                                         | 6/50 [00:02<00:21,  2.02it/s]

{'article_url': 'https://www.washingtonpost.com/local/immigration/horrific-episode-of-human-smuggling-fuels-both-sides-of-immigration-debate/2017/07/24/58bbcc82-7098-11e7-9eac-d56bd5568db8_story.html', 'doc_id': '58bbcc82-7098-11e7-9eac-d56bd5568db8', 'source': 'The Washington Post', 'title': 'Horrific episode of human smuggling fuels both sides of immigration debate', 'type': 'article', 'score': 16.587387096460013}
{'article_url': 'https://www.washingtonpost.com/news/post-nation/wp/2017/07/25/trucking-company-associated-with-migrant-smuggling-case-has-history-of-complaints-legal-problems/', 'doc_id': '3864768a7d556cf62cf7f9b614bdc363', 'source': 'The Washington Post', 'title': 'Company whose trailer was used in human smuggling case has history of legal problems', 'type': 'blog', 'score': 14.286682575283795}
for query human smuggling there are 2 hits


 14%|████████████████████████▎                                                                                                                                                     | 7/50 [00:03<00:20,  2.08it/s]

for query transportation tunnel disasters there are 0 hits


 16%|███████████████████████████▊                                                                                                                                                  | 8/50 [00:03<00:21,  1.99it/s]

{'article_url': 'https://www.washingtonpost.com/news/wonk/wp/2013/03/03/the-economics-of-somali-piracy/', 'doc_id': '2c09d07808e0691170d3ac70f87814f0', 'source': 'The Washington Post', 'title': 'The economics of Somali piracy', 'type': 'blog', 'score': 14.740048814293074}
{'article_url': 'https://www.washingtonpost.com/news/wonk/wp/2013/03/03/the-economics-of-somali-piracy/feed/', 'doc_id': 'a7669b32d226e75c42687f7f78e6d993', 'source': 'The Washington Post', 'title': 'The economics of Somali piracy', 'type': 'blog', 'score': 14.740048814293074}
{'article_url': 'https://www.washingtonpost.com/opinions/a-fair-block-on-internet-piracy/2011/12/16/gIQAIkn3WP_story.html', 'doc_id': '4a3b86e4-2840-11e1-af61-6efac089e2f6', 'source': 'The Washington Post', 'title': 'A fair block on Internet piracy', 'type': 'article', 'score': 13.658535578382757}
{'article_url': 'https://www.washingtonpost.com/business/technology/google-fights-piracy-with-search/2012/08/10/f8d1be0e-e31f-11e1-a25e-15067bb31849_s

 18%|███████████████████████████████▎                                                                                                                                              | 9/50 [00:04<00:21,  1.87it/s]

{'article_url': 'https://www.washingtonpost.com/business/capitalbusiness/hydrogen-fuel-cell-clean-energy-concepts-reach-washington-auto-show/2014/01/22/f15df44c-8398-11e3-8099-9181471f7aaf_story.html', 'doc_id': 'f15df44c-8398-11e3-8099-9181471f7aaf', 'source': 'The Washington Post', 'title': 'Hydrogen fuel cell, clean energy concepts reach Washington Auto Show', 'type': 'article', 'score': 16.65276804929327}
for query hydrogen energy there are 1 hits


 20%|██████████████████████████████████▌                                                                                                                                          | 10/50 [00:04<00:19,  2.02it/s]

for query euro opposition there are 0 hits


 22%|██████████████████████████████████████                                                                                                                                       | 11/50 [00:05<00:20,  1.92it/s]

{'article_url': 'https://www.washingtonpost.com/news/morning-mix/wp/2015/05/29/a-murder-or-a-mercy-killing-the-tangled-and-troubling-trial-of-gigi-jordan/', 'doc_id': '30ce3bae124447f415898b4436ca904e', 'source': 'The Washington Post', 'title': 'A murder or a ‘mercy killing?’ The tangled and troubling trial of Gigi Jordan', 'type': 'blog', 'score': 17.232416063902953}
{'article_url': 'https://www.washingtonpost.com/news/morning-mix/wp/2015/05/29/a-murder-or-a-mercy-killing-the-tangled-and-troubling-trial-of-gigi-jordan/', 'doc_id': '70e1545dec7e946173c07bbb1ac1d88c', 'source': 'The Washington Post', 'title': 'The millionaire mom who poisoned her autistic son and called it a mercy killing', 'type': 'blog', 'score': 15.476612824304762}
for query mercy killing there are 2 hits


 24%|█████████████████████████████████████████▌                                                                                                                                   | 12/50 [00:05<00:18,  2.04it/s]

for query automobile recalls there are 0 hits


 26%|████████████████████████████████████████████▉                                                                                                                                | 13/50 [00:06<00:19,  1.90it/s]

{'article_url': 'https://www.washingtonpost.com/news/opinions/wp/2016/05/24/you-cant-see-the-amazon-rain-forest-for-the-lack-of-trees/', 'doc_id': '00d8d4fb51170bf791af249aeed2a342', 'source': 'The Washington Post', 'title': 'You can’t see the Amazon Rain-forest for the lack of trees', 'type': 'blog', 'score': 26.681009096515634}
{'article_url': 'https://www.washingtonpost.com/news/innovations/wp/2014/11/26/when-a-shaman-in-the-amazon-rain-forest-outperforms-western-medicine/', 'doc_id': '348deba4cb7148ec80698e5c64968ee3', 'source': 'The Washington Post', 'title': 'When a shaman in the Amazon rain forest outperforms Western medicine', 'type': 'blog', 'score': 25.076531771388726}
{'article_url': 'https://www.washingtonpost.com/news/speaking-of-science/wp/2015/03/02/zip-from-the-amazon-rainforests-canopy-to-the-forest-floor-with-google-street-view/', 'doc_id': 'c04684338e0abe7c5a8656cc4eacd7ac', 'source': 'The Washington Post', 'title': 'Zip from the Amazon rain forest’s canopy to the fo

 28%|████████████████████████████████████████████████▍                                                                                                                            | 14/50 [00:07<00:19,  1.84it/s]

{'article_url': 'https://www.washingtonpost.com/world/the_americas/mexicos-coasts-battered-by-deadly-storms/2013/09/16/95894ee6-1eee-11e3-9ad0-96244100e647_story.html', 'doc_id': '95894ee6-1eee-11e3-9ad0-96244100e647', 'source': 'The Washington Post', 'title': 'At least 30 dead as tropical storms rake Mexico', 'type': 'article', 'score': 17.379426913446622}
{'article_url': 'https://www.washingtonpost.com/news/capital-weather-gang/wp/2016/07/27/the-tropical-atlantic-ocean-is-toasty-warm-so-where-are-all-the-storms/', 'doc_id': '9f9ac16d3cd4a88d5052c50817c76a53', 'source': 'The Washington Post', 'title': 'The tropical Atlantic Ocean is toasty warm. So where are all the storms?', 'type': 'blog', 'score': 15.513590163521531}
{'article_url': 'https://www.washingtonpost.com/news/capital-weather-gang/wp/2016/09/16/the-atlantic-ocean-has-three-tropical-storms-in-it-for-the-first-time-since-2012/feed/', 'doc_id': 'f0e7b1c202f275e6611c47cf43b639ac', 'source': 'The Washington Post', 'title': 'The

 30%|███████████████████████████████████████████████████▉                                                                                                                         | 15/50 [00:07<00:17,  1.96it/s]

for query Cuba, sugar, exports there are 0 hits


 32%|███████████████████████████████████████████████████████▎                                                                                                                     | 16/50 [00:08<00:17,  1.99it/s]

for query art, stolen, forged there are 0 hits


 34%|██████████████████████████████████████████████████████████▊                                                                                                                  | 17/50 [00:08<00:17,  1.87it/s]

{'article_url': 'https://www.washingtonpost.com/local/trafficandcommuting/belvoir-veterinarians-keep-law-enforcements-dogs-in-good-health/2013/07/09/4ac2e312-dcf0-11e2-9218-bc2ac7cd44e2_story.html', 'doc_id': '4ac2e312-dcf0-11e2-9218-bc2ac7cd44e2', 'source': 'The Washington Post', 'title': 'Belvoir veterinarians keep law enforcement’s dogs ready for action', 'type': 'article', 'score': 23.1078851589932}
{'article_url': 'https://www.washingtonpost.com/local/crime/2-maryland-law-enforcement-dogs-die-of-heat-exhaustion-inside-car/2015/09/01/13afe080-50ae-11e5-933e-7d06c647a395_story.html', 'doc_id': '13afe080-50ae-11e5-933e-7d06c647a395', 'source': 'The Washington Post', 'title': '2 Maryland law enforcement dogs die of heat exhaustion inside car', 'type': 'article', 'score': 21.86746619818905}
for query law enforcement, dogs there are 2 hits


 36%|██████████████████████████████████████████████████████████████▎                                                                                                              | 18/50 [00:09<00:15,  2.03it/s]

for query UV damage, eyes there are 0 hits


 38%|█████████████████████████████████████████████████████████████████▋                                                                                                           | 19/50 [00:09<00:14,  2.07it/s]

for query Greek, philosophy, stoicism there are 0 hits


 40%|█████████████████████████████████████████████████████████████████████▏                                                                                                       | 20/50 [00:09<00:13,  2.17it/s]

for query inventions, scientific discoveries there are 0 hits


 42%|████████████████████████████████████████████████████████████████████████▋                                                                                                    | 21/50 [00:10<00:14,  2.01it/s]

{'article_url': 'https://www.washingtonpost.com/express/wp/2014/09/11/rory-kennedy-reveals-heroic-acts-with-last-days-in-vietnam/', 'doc_id': '4084a7627ab6a184983ad133b32deef3', 'source': 'The Washington Post', 'title': 'Rory Kennedy reveals heroic acts with  ‘Last Days in Vietnam’', 'type': 'blog', 'score': 19.375622115650863}
{'article_url': 'https://www.washingtonpost.com/news/early-lead/wp/2015/10/02/family-of-oregon-shooting-victim-chris-mintz-credits-wrestling-and-mma-background-for-heroic-acts/', 'doc_id': 'f7fda5121215b09571dd5e8d724228ee', 'source': 'The Washington Post', 'title': 'Family of Oregon shooting victim Chris Mintz credits wrestling and MMA background for heroic acts', 'type': 'blog', 'score': 15.792370886440816}
{'article_url': 'https://www.washingtonpost.com/news/federal-eye/wp/2014/09/10/three-heroes-among-u-s-postal-service-letter-carriers/', 'doc_id': '9903845e7e4773065f4fc0636f27524c', 'source': 'The Washington Post', 'title': 'Saving babies from fires, rescui

 44%|████████████████████████████████████████████████████████████████████████████                                                                                                 | 22/50 [00:11<00:14,  1.95it/s]

{'article_url': 'https://www.washingtonpost.com/news/acts-of-faith/wp/2015/07/05/seventh-day-adventists-to-decide-whether-women-can-become-clergy/', 'doc_id': '14b0cd1e1e136f67f14f0746adfef0fb', 'source': 'The Washington Post', 'title': 'Seventh-day Adventists to decide whether women can become clergy', 'type': 'blog', 'score': 15.786779359870991}
{'article_url': 'https://www.washingtonpost.com/news/acts-of-faith/wp/2016/05/12/pope-francis-discussed-the-possibility-of-women-deacons-which-other-religions-have-female-clergy/', 'doc_id': '017dd874357b107e3a897964af8d8698', 'source': 'The Washington Post', 'title': 'Pope Francis discussed the possibility of women deacons. Which other religions have female clergy?', 'type': 'blog', 'score': 12.86723457266996}
{'article_url': 'https://www.washingtonpost.com/news/acts-of-faith/wp/2016/05/12/pope-francis-discussed-the-possibility-of-women-deacons-which-other-religions-have-female-clergy/0/', 'doc_id': '1bd3bf9bb832af8a9ad1257b74f4083c', 'sourc

 46%|███████████████████████████████████████████████████████████████████████████████▌                                                                                             | 23/50 [00:11<00:13,  2.06it/s]

for query human stampede there are 0 hits


 48%|███████████████████████████████████████████████████████████████████████████████████                                                                                          | 24/50 [00:11<00:12,  2.11it/s]

for query food stamps increase there are 0 hits


 50%|██████████████████████████████████████████████████████████████████████████████████████▌                                                                                      | 25/50 [00:12<00:11,  2.13it/s]

for query college education advantage there are 0 hits


 52%|█████████████████████████████████████████████████████████████████████████████████████████▉                                                                                   | 26/50 [00:12<00:11,  2.13it/s]

for query Africa polio vaccination there are 0 hits


 54%|█████████████████████████████████████████████████████████████████████████████████████████████▍                                                                               | 27/50 [00:13<00:11,  1.95it/s]

{'article_url': 'https://www.washingtonpost.com/news/worldviews/wp/2013/10/28/saudi-arabias-oppression-of-women-goes-way-beyond-its-ban-on-driving/', 'doc_id': '6b90757387020a36d33dccc485867139', 'source': 'The Washington Post', 'title': 'Saudi Arabia’s oppression of women goes way beyond its ban on driving', 'type': 'blog', 'score': 26.54464063258013}
for query women driving in Saudi Arabia there are 1 hits


 56%|████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                            | 28/50 [00:13<00:10,  2.04it/s]

for query declining middle class in U.S. there are 0 hits


 58%|████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                        | 29/50 [00:14<00:09,  2.13it/s]

for query "Women on 20s" there are 0 hits


 60%|███████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                     | 30/50 [00:14<00:09,  2.00it/s]

{'article_url': 'https://www.washingtonpost.com/national/health-science/can-us-eliminate-invasive-species-by-eating-them/2014/05/25/e61cbb4c-e449-11e3-8f90-73e071f3d637_story.html', 'doc_id': 'e61cbb4c-e449-11e3-8f90-73e071f3d637', 'source': 'The Washington Post', 'title': 'Can U.S. eliminate invasive species by eating them?', 'type': 'article', 'score': 31.400425560231092}
for query eating invasive species there are 1 hits


 62%|███████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                 | 31/50 [00:15<00:09,  2.10it/s]

for query computers and paralyzed people there are 0 hits


 64%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                              | 32/50 [00:15<00:08,  2.17it/s]

for query Chavez medical treatment in Cuba there are 0 hits


 66%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                          | 33/50 [00:16<00:07,  2.25it/s]

for query Boston marathon bombing verdict there are 0 hits


 68%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                       | 34/50 [00:16<00:07,  2.07it/s]

{'article_url': 'https://www.washingtonpost.com/news/innovations/wp/2016/01/13/how-nasas-planetary-defense-officer-will-protect-earth-from-asteroids/', 'doc_id': '5e8d3671f7dcf4a27b0175f2153a970e', 'source': 'The Washington Post', 'title': 'How NASA’s planetary defense officer will protect Earth from asteroids', 'type': 'blog', 'score': 26.59283654323248}
for query protect Earth from asteroids there are 1 hits


 70%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                    | 35/50 [00:17<00:07,  2.10it/s]

for query diabetes and toxic chemicals there are 0 hits


 72%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                | 36/50 [00:17<00:07,  1.94it/s]

{'article_url': 'https://www.washingtonpost.com/news/morning-mix/wp/2015/10/28/hacking-your-car-is-cool-with-us-says-u-s-copyright-authority/', 'doc_id': '5a9d8850cdc264ec5419ec07187a4da0', 'source': 'The Washington Post', 'title': 'Hacking your car is cool with us, says U.S. copyright authority', 'type': 'blog', 'score': 15.863104276278591}
{'article_url': 'https://www.washingtonpost.com/news/innovations/wp/2017/03/08/what-we-know-about-car-hacking-the-cia-and-those-wikileaks-claims/', 'doc_id': '7756e722fa18120f9f5dba393312337f', 'source': 'The Washington Post', 'title': 'What we know about car hacking, the CIA and those WikiLeaks claims', 'type': 'blog', 'score': 14.160058308538758}
{'article_url': 'https://www.washingtonpost.com/news/morning-mix/wp/2015/07/22/car-hacking-just-got-real-hackers-disable-suv-on-busy-highway/', 'doc_id': '1cf0d9b500504e613bd2d151744761fd', 'source': 'The Washington Post', 'title': '‘Car hacking’ just got real: In experiment, hackers disable SUV on busy 

 74%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                             | 37/50 [00:18<00:06,  2.01it/s]

for query social media and teen suicide there are 0 hits


 76%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                         | 38/50 [00:18<00:06,  1.91it/s]

{'article_url': 'https://www.washingtonpost.com/local/public-safety/shatter-super-high-potency-marijuana-now-appearing-on-east-coast/2015/12/23/e09dfde4-a8fa-11e5-bff5-905b92f5f94b_story.html', 'doc_id': 'e09dfde4-a8fa-11e5-bff5-905b92f5f94b', 'source': 'The Washington Post', 'title': 'Shatter, a super-high-potency marijuana, is appearing on the East Coast', 'type': 'article', 'score': 19.94741734875734}
{'article_url': 'https://www.washingtonpost.com/news/to-your-health/wp/2015/06/24/many-marijuana-products-have-wildly-inaccurate-labeling-for-potency-study-says/', 'doc_id': 'f66966a49be9635488271b675e572a5d', 'source': 'The Washington Post', 'title': 'Many marijuana products have wildly inaccurate labeling for potency, study says', 'type': 'blog', 'score': 18.87665061574705}
for query marijuana potency there are 2 hits


 78%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                      | 39/50 [00:19<00:05,  1.96it/s]

for query China one-child impact there are 0 hits


 80%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                  | 40/50 [00:19<00:05,  2.00it/s]

for query Jason Rezaian released from Iran there are 0 hits


 82%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                               | 41/50 [00:20<00:04,  2.08it/s]

for query federal minimum wage increase there are 0 hits


 84%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                           | 42/50 [00:20<00:03,  2.12it/s]

for query Alan Gross released by Cuba there are 0 hits


 86%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                        | 43/50 [00:21<00:03,  2.22it/s]

for query eggs in a healthy diet there are 0 hits


 88%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                    | 44/50 [00:21<00:02,  2.25it/s]

for query U.S. age demographics there are 0 hits


 90%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                 | 45/50 [00:21<00:02,  2.26it/s]

for query bacterial infection mortality rate there are 0 hits


 92%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏             | 46/50 [00:22<00:01,  2.31it/s]

for query email scams there are 0 hits


 94%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌          | 47/50 [00:22<00:01,  2.11it/s]

{'article_url': 'https://www.washingtonpost.com/world/national-security/us-attributes-sony-attack-to-north-korea/2014/12/19/fc3aec60-8790-11e4-a702-fa31ff4ae98e_story.html', 'doc_id': 'fc3aec60-8790-11e4-a702-fa31ff4ae98e', 'source': 'The Washington Post', 'title': 'U.S. attributes cyberattack on Sony to North Korea', 'type': 'article', 'score': 21.46863457051561}
{'article_url': 'https://www.washingtonpost.com/news/the-switch/wp/2014/12/03/the-cyberattack-on-sony-pictures-made-employees-collateral-damage/', 'doc_id': '56dbef5d663082005716ca2cbadb0204', 'source': 'The Washington Post', 'title': 'The cyberattack on Sony Pictures made employees collateral damage', 'type': 'blog', 'score': 20.177606287244032}
{'article_url': 'https://www.washingtonpost.com/business/economy/energy-stocks-rally-even-as-oil-retreats/2014/12/02/8329da9e-7a65-11e4-b821-503cc7efed9e_story.html', 'doc_id': '8329da9e-7a65-11e4-b821-503cc7efed9e', 'source': 'The Washington Post', 'title': 'Experts say it is unlike

 96%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████       | 48/50 [00:23<00:00,  2.17it/s]

for query control of MRSA there are 0 hits


 98%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌   | 49/50 [00:23<00:00,  2.19it/s]

for query Bezos purchases Washington Post there are 0 hits


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [00:24<00:00,  2.07it/s]

for query ethanol and food prices there are 0 hits





Performance: 0.00%
Done
Starting over.
