In [102]:
# RUN: 'pip install jsonlines' on: ModuleNotFoundError: No module named 'jsonlines'
import json
import jsonlines
from tqdm import tqdm
import pandas as pd
import pickle

# RUN: 'pip install whoosh' on: ModuleNotFoundError: No module named 'whoosh'
import os, os.path

#from whoosh import index
import whoosh
from whoosh.index import create_in
from whoosh.fields import *
from whoosh.writing import AsyncWriter

from datetime import datetime
from pytz import timezone

from IPython.display import clear_output

######################################

class Indexer:
    '''This class is used to create index of the WP database'''
    
    index_created = False
    index_directory = ""
    data_path = ""
    # [id, article_url, title, author, publised_date, contents, type, source]
    # Define fields using whoosh's 'Schema' | https://whoosh.readthedocs.io/en/latest/schema.html#
    # Can add field boost here
    schema = Schema(doc_id = whoosh.fields.ID(unique=True, stored=True),\
                       article_url = whoosh.fields.STORED,\
                       title = whoosh.fields.TEXT(stored=True),\
                       author = whoosh.fields.ID,\
                       published_date = whoosh.fields.DATETIME,\
                       contents = whoosh.fields.TEXT)
    
    def __init__(self):
        print('Indexer has been initiated.')
        
    def indexCreated(self):
        return self.index_created
      
    ## Count documents
    def countDocuments(self):
        print("Counting documents, this might take a while...")
        counter = 0
        tenKCounter = 0

        with jsonlines.open(self.data_path) as reader:
            for obj in tqdm(reader.iter(type=dict, skip_invalid=True)):
                tenKCounter += 1
                counter += 1
            
                if tenKCounter >= 10000:
                    print("Current count is: " + str(counter))
                    tenKCounter = 0 
        print("Last count is: " + str(counter))
        print("Counting done.")
        
    def extractDocumentContents(self, contents):
        '''Extracts document contents from array of dicts to string'''
        build_string = ""
        json_array = json.dumps(str(contents))
        
        for item in json_array:
            try:
                print(str(item['type']))
                if str(item['type']) == "sanitized_html":
                    build_string = build_string + str(item['content'])
                    build_string = build_string + "\n"
            except:
                build_string = "Could not retreive content"
        
        print(build_string)
        return build_string
    
    def extractDocumentDate(self, epochTimestamp):
        '''Converst UNIX epoch timestamp to DATETIME'''
        if (epochTimestamp == ''):
            return datetime(1,1,1)
        try:
            # Test if source is within specified dates
            date_info = epochTimestamp
            removed_zeros = str(date_info)[0:10]
            timestamp = int(removed_zeros)
            return datetime.fromtimestamp(timestamp, timezone('EST'))
        except:
            print("ERROR: In extractDocumentDate()")
            print("ERROR: ", sys.exc_info()[0])
            #print("Caused by value: " + str(date_info))
            return datetime(1,1,1)
    
    def setIndexDirectory(self, directory):
        self.index_directory = directory
    
    def getIndexLocation(self):
        return self.index_directory
    
    def setDataPath(self, file_location):
        self.data_path = file_location
    
    def getDataPath(self):
        return self.data_path
    
    def setIndexCreatedTrue(self):
        self.index_created = True
    
    def getSchema(self):
        return self.schema
    
    def index(self):
        '''This method indexes the data'''
        # https://whoosh.readthedocs.io/en/latest/api/writing.html
        # https://appliedmachinelearning.blog/2018/07/31/developing-a-fast-indexing-and-full-text-search-engine-with-whoosh-a-pure-python-library/
        
        assert self.index_directory and self.data_path # Both variables have to be set
        
        schema = self.getSchema()
              
        # Creating a index writer to add document as per schema
        myindex = whoosh.index.create_in(self.index_directory, schema)
        writer = whoosh.writing.AsyncWriter(myindex)
        
        # Loop over data
        print("Looping over data. Indexing each article.")
        print("This might take a few minutes...")
        counter = 0
        checker = 10000
        fault_counter = 0
        with jsonlines.open(self.data_path) as reader:
            for obj in tqdm(reader.iter(type=dict, skip_invalid=True)):
                retreived_date = self.extractDocumentDate(obj['published_date'])
                if retreived_date != datetime(1,1,1):
                    counter = counter + 1
                    writer.add_document(doc_id=obj['id'],\
                                        article_url=obj['article_url'],\
                                        title=obj['title'],\
                                        author=obj['author'],\
                                        published_date=retreived_date,\
                                        contents = self.extractDocumentContents(obj['contents']))
                else:
                    fault_counter = fault_counter + 1
                    
                if counter > checker:
                    writer.commit()
                    writer = whoosh.writing.AsyncWriter(myindex)
                    checker = checker + 10000
                    clear_output(wait=True)
                    print("Looping over data. Indexing each article.")
                    print("This might take a few minutes...")
                    print("Indexed " + str(counter - 1) + " articles")
                    break
                    if fault_counter > 0:
                        print("Found " + str(fault_counter) + " wrongly formatted articles")
            
            print("Looping complete.")
        print("Index created!")
        self.setIndexCreatedTrue()
        

In [103]:
###############################################
from whoosh.qparser import QueryParser, MultifieldParser
from whoosh import scoring
from whoosh.index import open_dir

class Ranking:
    '''This class contains functions that are used to create a ranking based on different algorithms'''
    
    show_n_results = 350
    index_directory = ""
    
    def __init__(self):
        print("Ranking class has been initiated.")
    
    def indexCreated(self):
        return indexCreated
    
    def setIndexDirectory(self, directory):
        self.index_directory = directory
    
    def openIndex(self):
        assert self.index_directory
        return open_dir(self.index_directory)
    
   
    def resultsToList(self, results):
        results_list = []
        for result in results:
            result_dict = result.fields()
            result_dict['score'] = result.score
            results_list.append(result_dict)
        return results_list
            
    
    def searchWithSelectedAlgorithm(self, user_query, indexer, scoring_algorithm):
    #TODO check if index has been created
        index_dir = indexer.getIndexLocation()
        self.setIndexDirectory(index_dir)
        index = self.openIndex()
        results_list = []
        schema = indexer.getSchema()
        fields = schema.scorable_names()
        
        with index.searcher(weighting=scoring_algorithm) as searcher:
            #parsed_query = QueryParser("title", index.schema).parse(user_query)
            parsed_query = MultifieldParser(fields, schema).parse(user_query)
            results = searcher.search(parsed_query, limit=self.show_n_results)
            results_list = self.resultsToList(results)
        
        return results_list
    
    def searchTermFrequency(self, user_query, indexer):
        '''Returns results for a given query based on the TF-IDF search algorithm. Returned value is a list of dictionaries.'''
        scoring_algorithm = scoring.Frequency
        return self.searchWithSelectedAlgorithm(user_query, indexer, scoring_algorithm)            
    
    def searchTF_IDF(self, user_query, indexer):
        '''Returns results for a given query based on the TF-IDF search algorithm. Returned value is a list of dictionaries.'''
        scoring_algorithm = scoring.TF_IDF
        return self.searchWithSelectedAlgorithm(user_query, indexer, scoring_algorithm)
    
    def searchBM25F(self, user_query, indexer):
        '''Returns results for a given query based on the BM25F search algorithm. Returned value is a list of dictionaries.'''
        scoring_algorithm = scoring.BM25F(B=0.75, content_B=1.0, K1=1.5)
        return self.searchWithSelectedAlgorithm(user_query, indexer, scoring_algorithm)
        

In [113]:
################################################
import urllib
import matplotlib.pyplot as plt
from collections import defaultdict


class Evaluation:
    '''Used to evaluate the ranking results'''
    #should check if query used is in the 
    
    # TODO load evaluation document and queries
    # TODO pass queries on to ranking and obtain ranking results
    # TODO compare ranking results to TREC evaluation
    # TODO display results, how?
    # TODO test different algorithms
    
    queries = []
    query_url = "https://trec.nist.gov/data/core/topics2018.txt"
    
    def load_queries(self):
        '''Returns (query, number) pair where number is the query ID'''
        
        query_file = urllib.request.urlopen(self.query_url)
        titles, numbers = [], []
        title = False
        
        for line in query_file:
            decoded_line = line.decode("utf-8")
            
            if title and not "</title>" in decoded_line:
                titles.append(decoded_line.replace("\n", "").strip())

            if "<title>" in decoded_line:
                title = True
            
            if "</title>" in decoded_line:
                title = False
                
            if "<num>" in decoded_line:
                num = decoded_line.replace("<num>", "").replace("</num>", "").replace("\n", "").replace("Number: ", "").strip()
                numbers.append(num)
        
        queries = list(zip(titles, numbers))
        return queries 

        
    def addResults(self, results, results_formatted, query, query_id):
        count = 0
        for hit in results: 
            score = int(hit['score'])
                
            current_result = str(query_id) + " 0 "  + str(hit["doc_id"]) + " " + str(score)           
            results_formatted.append(current_result)
            
            count += 1
        #print("for query {} there are {} hits".format(query, count))
        return results_formatted
        
        
    def writeResults(self, results):
        
        with open("results.txt", "w") as results_file:
            for result in results:
                results_file.write(result + "\n")
    
    def plotResults(self):
        
        all_scores = []
        
        with open("results.txt", "r") as results_file:
            for line in results_file:
                current_result = line.split(" ")
                current_result = [i.strip("\n") for i in current_result]                    
                all_scores.append(int(float(current_result[-1])))
                
        _ = plt.hist(all_scores, bins='auto')  
        plt.title("Histogram with 'auto' bins")
        
        plt.show()
        
        print("Max score ", max(all_scores))
        print("Min score ", min(all_scores))
        
    def getRelevantDocumentsFromTREC(self):
        '''Returns a dict of {ID, [documents]} where the ID indicates the query ID and documents contains a list of relevant documents.'''
        #NOTE: this does NOT take into account differences between 1 and 2 judgments
        #Can add tuple of (doc_id, score) in dict: {Query_id: [(doc_id, score)]}
        # read in true scores file
        true_scores = []
        true_scores_url = "https://trec.nist.gov/data/core/qrels2018.txt"
        true_scores_file = urllib.request.urlopen(true_scores_url)
        stored_lines = []
        for line in true_scores_file:
            decoded_line = line.decode("utf-8")
            decoded_line = decoded_line.split(" ")
            decoded_line = [i.strip("\n") for i in decoded_line]
            
            stored_lines.append([decoded_line[0], decoded_line[2], decoded_line[3]]) # Creates list of lines of [query_id, document_id, judgement]
        
        #Init dict
        relevant_documents_dict = {}
        for line in stored_lines:
            query_id = line[0]
            relevant_documents_dict[query_id] = []   
        #Fill dict
        for line in stored_lines:
            query_id = line[0]
            document_id = line[1]
            judgement = line[2]
            if judgement == '1' or judgement == '2':
                relevant_documents_dict[query_id].append(document_id)    
        #print(relevant_documents_dict)
        return relevant_documents_dict
    
    def getResultsFromTXT(self):
        # read in results.txt
        #Can add tuple of (doc_id, score) in dict: {Query_id: [(doc_id, score)]}, but should convert score
        stored_lines = []
        with open("results.txt", "r") as results_file:
            for line in results_file:
                current_result = line.split(" ")
                current_result = [i.strip("\n") for i in current_result]
                
                stored_lines.append([current_result[0], current_result[2], current_result[3]]) # Creates list of lines of [query_id, document_id, judgement]
        # Init dict
        result_documents_dict = {}
        for line in stored_lines:
            query_id = line[0]
            result_documents_dict[query_id] = []   
        #Fill dict
        for line in stored_lines:
            query_id = line[0]
            document_id = line[1]
            judgement = line[2]
            result_documents_dict[query_id].append(document_id)
        #print(result_documents_dict)
        return result_documents_dict
    
    def calculateFmeasure(self, recall, precision, B=1):
        if (recall + precision) == 0:
            return 0
        else:
            return ((B*B + 1)*precision*recall)/((B*B)*precision + recall)
    
    def calculateEvaluationMeasuresPerQuery(self, search_engine_results_dict, relevant_documents_dict):
        resultEvalList = [] #holds results for each query
        for query_id, documents in relevant_documents_dict.items():       
            true_positives = 0
            false_positives = 0
            false_negatives = 0
            #calculates tp, fp, fn
            if query_id in search_engine_results_dict.keys(): #SE results contains matches for this query
                for document in search_engine_results_dict[query_id]:
                    if document in relevant_documents_dict[query_id]: 
                        true_positives += 1
                    else: # document from results is not relevant
                        false_positives += 1
                # false negatives are number of documents that are relevant but are not in the results
                # so, number of relevant documents minus the found matches
                false_negatives = len(set(relevant_documents_dict[query_id]) - set(search_engine_results_dict[query_id]))
            
            else: # SE results did not contain matches for current query
                true_positives = 0
                false_positives = 0
                false_negatives = len(relevant_documents_dict[query_id])
            
            # Recall: (number of retrieved docs that are relevant) / (number of retrieved docs)
            if (true_positives + false_positives) != 0: # avoid dividing by 0
                recall = true_positives / (true_positives + false_positives) 
            else:
                recall = 0
            
            # Precision: (number of retrieved docs that are relevant) / (number of relevant docs)
            if (true_positives + false_negatives) != 0: # avoid dividing by 0
                precision = true_positives / (true_positives + false_negatives)
            else:
                precision = 0
            
            f_1_measure = self.calculateFmeasure(recall, precision, 1)
            
            queryEval = {}
            queryEval['query_id'] = query_id
            queryEval['true_positives'] = true_positives
            queryEval['false_positives'] = false_positives
            queryEval['false_negatives'] = false_negatives
            queryEval['recall'] = recall # round(recall, 4) #round to 4 decimals
            queryEval['precision'] = precision #round(precision, 4)
            queryEval['f_1_measure'] = f_1_measure #round(f_1_measure, 4)
            
            resultEvalList.append(queryEval)
        return resultEvalList
    
    def evaluateResults(self):
                
        search_engine_results_dict = self.getResultsFromTXT() # contains results from our search engine
        relevant_documents_dict = self.getRelevantDocumentsFromTREC() # contains relevant results defined by TREC
        
        resultList = self.calculateEvaluationMeasuresPerQuery(search_engine_results_dict, relevant_documents_dict)
        
        recall_list = []
        precision_list = []
        f_measure_list = []
        for result in resultList:
            recall_list.append(result['recall'])
            precision_list.append(result['precision'])
            f_measure_list.append(result['f_1_measure'])
        
        average_recall= sum(recall_list) / len(recall_list)
        average_precision= sum(precision_list) / len(precision_list)
        average_f_measure= sum(f_measure_list) / len(f_measure_list)
        
        # calculate average precision

        print("Average Recall", average_recall)
        print("Average Precision", average_precision)
        print("Average F_1-Measure", average_f_measure)

# # ---------------
# #             all_true_scores.append(int(decoded_line[-1]))
            
# #             true_scores.append(decoded_line)
# #             scores_per_query.append(int(decoded_line[-1]))
# #             if judgement == '1' or judgement == '2':
# #                 judgements_per_query.append(query_id)
    
# #             if query != current_query:
# #                 if count != 0:
# #                     judgements_per_query.append(count)
# #                     count = 0
# #                 current_query = query
                
                
# #             else:
# #                 judgements_per_query.append(decoded_line[0])
# #                 count += 1
            
#         _ = plt.hist(judgements_per_query, bins=50)
#         plt.title("Nr of judgements per query")
#         plt.xticks(rotation='vertical')
#         plt.show()
        
# #         _ = plt.hist(all_true_scores, bins='auto')  
# #         plt.title("Histogram true relevance scores with 'auto' bins")
        
# #         plt.show()
# #         print("In total there are {} query-doc pairs".format(len(all_true_scores)))
# #         print("of which {}: 0, {}: 1, {}: 2".format(all_true_scores.count(0), all_true_scores.count(1), all_true_scores.count(2)))
#         # compare results.txt (preds) with true scores
#         #correct_preds, incorrect_preds = 0, 0
        
#         best_performance = 0
        
        
#         for lower_bound in [0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24]:
#             for upper_bound in [26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46, 48]:
                
#                 correct_preds, incorrect_preds = 0, 0
#                 matches_found = 0
#                 for pred in predictions:
#                     for true_score in true_scores:

#                         if pred[0] == true_score[0] and pred[2] == true_score[2]:
#                             matches_found += 1

#                             pred_score = 0
#                             current_score = int(float(pred[3]))

#                             if lower_bound < int(float(pred[3])) < upper_bound:
#                                 pred_score = 1
#                             if upper_bound < int(float(pred[3])):
#                                 pred_score = 2

#                             if pred_score == int(true_score[3]):
#                                 correct_preds += 1
#                             else:
#                                 incorrect_preds += 1

#                             break

#                 if correct_preds == incorrect_preds == 0:
#                     print("No matches found, because doc id's do not yet work")

#                 else:
#                     performance = correct_preds/(correct_preds+incorrect_preds) * 100
#                     if performance > best_performance:
                        
#                         print("For boundaries: [{}, {}]".format(lower_bound, upper_bound))
#                         print("Matches found {} of {} predictions ".format(matches_found, len(predictions)))

#                         print("Performance: {:.2f}%".format(performance))
#                         print("Correct predictions: ", correct_preds)
#                         print("Incorrect predictions: ", incorrect_preds)

#                         best_performance = performance

        


In [105]:
##################################################
class UserInterface:
    '''Class used to handle user interaction'''
          
    def __init__(self):
        '''This is the constructor method of the UI'''
        print("UserInterface has been initiated.")       
        
    def getUserQuery(self):
        ''''Retrieves the query from the user and returns it'''
        userInput = input("Please enter a query: ")
        return userInput
    
    def indexAlreadyCreated(self, directory_location):
        print("Index is set.")
        print("Index files are stored in directory:" + directory_location)
        return
    
    def shouldCreateIndex(self):
        print("Do you wish to create a new index?")
        userInput = input("Answer [y/n]: ")
        if userInput == ('y' or 'Y' or "yes"):
            return True
        return False
    
    def indexIsNotSet(self):
        print("Index has not been set.")
        return
    
    def getIndexDirectory(self, default_directory):
        userInput = input("Please enter a directory name for the index (default = /"+ default_directory + "): ")
        if userInput == "":
            return default_directory
        return userInput
    
    def getDataPath(self, default_data_path):
        print("Please enter the path to the TREC_Washington_Post_collection.v2.jl file.")
        print("The default location is: /"+ default_data_path + "): ")
        userInput = input()
        if userInput == "":
            return default_data_path
        return userInput
    
    def creatingIndex(self, index_directory, data_path):
        print("Creating Index.")
        print("Selected index directory:\t" + index_directory)
        print("Selected file to index:\t" + data_path)
        
    def openingIndex(self, index_directory):
        print("Opening Index.")
        print("Selected index directory to use:\t" + index_directory)
    
    def shouldAddExistingIndex(self):
        print("Do you wish to add an existing index?")
        userInput = input("Answer [y/n]: ")
        if userInput == ('y' or 'Y' or "yes"):
            return True
        return False
    
    def stopSearchEngine(self):
        print("Search Engine is stopped.")
        
    def shouldTerminateSearchEngine(self):
        print("Do you wish to stop the search engine?")
        userInput = input("Answer [y/n]: ")
        if userInput == ('y' or 'Y' or "yes"):
            return True
        return False
    
    def printResults(self, results):
        if len(results) > 0:
            print("\nPrinting results:")
            counter = 1
            for result in results:
                print("Result " + str(counter) + ":")
                print(result)
                print("")
                counter += 1
            return
        print("No results found")
        return
            
    

In [123]:
import time

class SearchEngine:
    '''This class embodies the search engine and acts a a controller class'''
    
    UI = None
    Indexer = None
    Ranking = None
    
    RUNNING = False
    STOPPED = False
    EVALUATION_MODE = True
    USER_MODE = False
    
    user_query = ""
    DEFAULT_data_path = 'WP-corpus/data/TREC_Washington_Post_collection.v2.jl'
    DEFAULT_index_directory = "indexdir"
    
    
    def __init__(self):
        print("Search Engine has been initiated.")
    
    def stopSearchEngine(self, UI):
        self.RUNNING = False
        UI.stopSearchEngine()        
        
    def setIndex(self, Indexer, UI):
        if Indexer.indexCreated():
            directory_location = Indexer.getIndexLocation()
            UI.indexAlreadyCreated(directory_location)
            return
        else:
            UI.indexIsNotSet()
            if UI.shouldCreateIndex(): # This is for creating a new index
                # Index directory (setDir)
                index_directory = UI.getIndexDirectory(self.DEFAULT_index_directory)
                Indexer.setIndexDirectory(index_directory)
                # Check if index_dir exist and makes one if it doesn't
                if not os.path.exists(index_directory):
                    os.mkdir(index_directory)
                
                # Data file path (setPath)    
                data_path = UI.getDataPath(self.DEFAULT_data_path)
                Indexer.setDataPath(data_path)
                
                # Creating Index
                UI.creatingIndex(index_directory, data_path)
                Indexer.index()
                return
                
            elif UI.shouldAddExistingIndex(): # This is for adding an existing index
                
                # TODO add checks to check for existence of index?
                
                # Index directory, duplicate code -> (setDir)
                index_directory = UI.getIndexDirectory(self.DEFAULT_index_directory)
                Indexer.setIndexDirectory(index_directory)
                # Check if index_dir exist and makes one if it doesn't
                if not os.path.exists(index_directory):
                    os.mkdir(index_directory)
                
                # Data file path, duplicate code -> (setPath)    
                data_path = UI.getDataPath(self.DEFAULT_data_path)
                Indexer.setDataPath(data_path)
                
                # Set index created to true
                UI.openingIndex(index_directory)
                Indexer.setIndexCreatedTrue()
                return
                                
            else: # TODO: Ask if the user wants to continue or stop
                if UI.shouldTerminateSearchEngine(): 
                    self.STOPPED = True
                return
            return
        
    def evaluateTermFrequency(self, evaluation):
        print("\nEvaluating Term Frequency")
        queries  = evaluation.load_queries()
        print("Queries loaded")
        results_formatted = []
        print("Performing searches on index")

        for (query, query_id) in tqdm(queries):
            results = self.Ranking.searchTermFrequency(query, self.Indexer)
            results_formatted = evaluation.addResults(results, results_formatted, query, query_id)

        print("Writing search results to results.txt")
        evaluation.writeResults(results_formatted)
        print("Results for Term Frequency: ")
        evaluation.evaluateResults()
        print('')
    
    def evaluateTF_IDF(self, evaluation):
        print("\nEvaluating TF IDF")
        queries  = evaluation.load_queries()
        print("Queries loaded")
        results_formatted = []
        print("Performing searches on index")

        for (query, query_id) in tqdm(queries):
            results = self.Ranking.searchTF_IDF(query, self.Indexer)
            #results = self.Ranking.searchBM25F(query, self.Indexer)
            results_formatted = evaluation.addResults(results, results_formatted, query, query_id)

        print("Writing search results to results.txt")
        print("Results for TF IDF:")
        evaluation.writeResults(results_formatted)
        evaluation.evaluateResults()
        print('')
        
    def evaluateBM25F(self, evaluation):
        print("\nEvaluating BM25F")
        queries  = evaluation.load_queries()
        print("Queries loaded")
        results_formatted = []
        print("Performing searches on index")

        for (query, query_id) in tqdm(queries):
            results = self.Ranking.searchBM25F(query, self.Indexer)
            results_formatted = evaluation.addResults(results, results_formatted, query, query_id)

        print("Writing search results to results.txt")
        evaluation.writeResults(results_formatted)
        print("Results for BM25F:")
        evaluation.evaluateResults()
        print('')
       
    def run(self):
        '''This function start the search engine'''
        print('Search Engine started.')
        
        self.UI = UserInterface()
        self.Indexer = Indexer()
        self.Ranking = Ranking()
                
        self.RUNNING = True
        while self.RUNNING:
            if not self.Indexer.indexCreated():
                #TODO Inform user that an index needs to be set
                self.setIndex(self.Indexer, self.UI)
                if self.STOPPED == True: # Stop SE if user did not want to continue
                    self.stopSearchEngine(self.UI)
                    break
            #TODO ask user to choose a mode
                       
            if self.EVALUATION_MODE:
            # This is used to evaluate SE against the TREC relevance judgements
                index_dir = self.Indexer.getIndexLocation()
                self.Ranking.setIndexDirectory(index_dir)
                
                evaluation = Evaluation()
                
                self.evaluateTermFrequency(evaluation)
                self.evaluateTF_IDF(evaluation)
                self.evaluateBM25F(evaluation)
                
                print("Done")
                self.RUNNING = False
 
            if self.USER_MODE:
                # This is used to query questions
                #TODO ask for search algorithm
                print("Entering USER_MODE")
                user_query = self.UI.getUserQuery()
                print("Entered query: " + user_query)
                
                results = self.Ranking.searchTermFrequency(user_query, self.Indexer)
                self.UI.printResults(results)
                print("Starting over.")
                
           
            print("------------------------------------------------------------------------")
            time.sleep(1)

In [124]:
mySearchEngine = SearchEngine()
mySearchEngine.run()

Search Engine has been initiated.
Search Engine started.
UserInterface has been initiated.
Indexer has been initiated.
Ranking class has been initiated.
Index has not been set.
Do you wish to create a new index?
Answer [y/n]: n
Do you wish to add an existing index?
Answer [y/n]: y
Please enter a directory name for the index (default = /indexdir): 
Please enter the path to the TREC_Washington_Post_collection.v2.jl file.
The default location is: /WP-corpus/data/TREC_Washington_Post_collection.v2.jl): 

Opening Index.
Selected index directory to use:	indexdir

Evaluating Term Frequency


  0%|          | 0/50 [00:00<?, ?it/s]

Queries loaded
Performing searches on index


100%|██████████| 50/50 [00:14<00:00,  3.57it/s]


Writing search results to results.txt
Results for Term Frequency: 
Average Recall 0.2803283498522676
Average Precision 0.19691598885904463
Average F_1-Measure 0.1806866382333958


Evaluating TF IDF


  0%|          | 0/50 [00:00<?, ?it/s]

Queries loaded
Performing searches on index


100%|██████████| 50/50 [00:14<00:00,  3.54it/s]


Writing search results to results.txt
Results for TF IDF:
Average Recall 0.2803283498522676
Average Precision 0.19691598885904463
Average F_1-Measure 0.1806866382333958


Evaluating BM25F


  0%|          | 0/50 [00:00<?, ?it/s]

Queries loaded
Performing searches on index


100%|██████████| 50/50 [00:14<00:00,  3.52it/s]


Writing search results to results.txt
Results for BM25F:
Average Recall 0.2803854927094105
Average Precision 0.19706983501289077
Average F_1-Measure 0.18076997156672914

Done
------------------------------------------------------------------------
