In [None]:
class Document:
    '''A document object is a dictionary'''
    # Constructer method
    def __init__(self, **dictionary_entries):
        '''Initiates Document object with values from dictionary'''
        # [id, article_url, title, author, publised_date, contents, type, source]
        self.__dict__.update(dictionary_entries)
    
    # Getters (Should update: https://docs.python.org/2/library/functions.html#property)
    def id(self):
        return self.id
    def article_url(self):
        return self.article_url
    def title(self):
        return self.title
    def author(self):
        return self.author
    def published_date(self):
        return self.published_date
    def contents(self):
        return self.contents
    def type(self):
        return self.type
    def source(self):
        return self.source
    
    # Method for printing the object
    def __str__(self):
        # TODO: implement printing function
        return "Document Identifier: " + self.id()
        

In [None]:
class DocumentDataStore:
    '''This class is used to house all the document data'''
    #https://www.youtube.com/watch?v=YBz3ERXQw_M
    
    def __init__(self, id_document_tuples):
        '''Expects a list of: [{id, document_object},...]'''
        self.id_document_tuples = id_document_tuples
        
    

In [4]:
# RUN: 'pip install jsonlines' on: ModuleNotFoundError: No module named 'jsonlines'
import jsonlines
from tqdm import tqdm
import pandas as pd
import pickle

# RUN: 'pip install whoosh' on: ModuleNotFoundError: No module named 'whoosh'
import os, os.path

#from whoosh import index
import whoosh
from whoosh.index import create_in
from whoosh.fields import *
from whoosh.writing import AsyncWriter

from datetime import datetime
from pytz import timezone



######################################

class Indexer:
    '''This class is used to create index of the WP database'''
    
    def __init__(self, data_path):
        self.data_path = data_path
      
    ## Count documents
    def countDocuments(self):
        print("Counting documents, this might take a while...")
        counter = 0
        tenKCounter = 0

        with jsonlines.open(self.data_path) as reader:
            for obj in tqdm(reader.iter(type=dict, skip_invalid=True)):
                tenKCounter += 1
                counter += 1
            
                if tenKCounter >= 10000:
                    print("Current count is: " + str(counter))
                    tenKCounter = 0 
        print("Last count is: " + str(counter))
        print("Counting done.")
    
    def printDocumentData(self):
        '''Prints every 10000th object'''
        #dataframe = pd.DataFrame(columns = ['Title', 'Content', 'Year'])
        with jsonlines.open(self.data_path) as reader:
            counter = 0
            for obj in tqdm(reader.iter(type=dict, skip_invalid=True)):
                counter += 1
                if (counter % 10000) == 0:
                    print("Object number:" + str(counter))
                    print(type(obj))
                    doc = Document(**obj)
                    print(doc)
                    
    def createDocumentDataStore(self):
        '''This method should read data and create the document data store'''
        
        with jsonlines.open(self.data_path) as reader:
            #  PRODUCES MEMORY ERROR
            #  PRODUCES MEMORY ERROR
            id_doc_list = []
            print("Start iteration")
            for obj in tqdm(reader.iter(type=dict, skip_invalid=True)):
                document = Document(**obj)
                id_doc_list.append({document.id, document})
            
            print("End iteration")
            print("Create store")
            dds = DocumentDataStore(id_doc_list)
            print("Write file")
            outfile = open('docstore.pkl', 'wb')
            pickle.dump(dds, outfile)
            outfile.close()
            print("Done")
        #  PRODUCES MEMORY ERROR
        
    def extractDocumentContents(self, contents):
        '''Extracts document contents from array of dicts to string'''
        #TODO IMPLEMENT
        return ""
    
    def extractDocumentDate(self, epochTimestamp):
        '''Converst UNIX epoch timestamp to DATETIME'''
        if (epochTimestamp == ''):
            return datetime(1,1,1)# TODO IMPLENT Error Handling: ValueError: invalid literal for int() with base 10: 'None'
        
        try:
            date_info = epochTimestamp
            removedZeros = str(date_info)[0:10]
            timestamp = int(removedZeros)
            return datetime.fromtimestamp(timestamp, timezone('EST'))
        except:
            print("ERROR: In extractDocumentDate()")
            print("ERROR: ", sys.exc_info()[0])
            print("Caused by value: " + str(date_info))
            return datetime(1,1,1)
        
    
    def index(self):
        '''This method indexes the data'''
        # https://whoosh.readthedocs.io/en/latest/api/writing.html
        # https://appliedmachinelearning.blog/2018/07/31/developing-a-fast-indexing-and-full-text-search-engine-with-whoosh-a-pure-python-library/
        
        
        #  [id, article_url, title, author, publised_date, contents, type, source]
        # Define fields using whoosh's 'Schema' | https://whoosh.readthedocs.io/en/latest/schema.html#
        # Can add field boost here
        schema = Schema(doc_id = whoosh.fields.ID(unique=True),\
                       article_url = whoosh.fields.STORED,\
                       title = whoosh.fields.TEXT(stored=True),\
                       author = whoosh.fields.ID,\
                       published_date = whoosh.fields.DATETIME,\
                       contents = whoosh.fields.TEXT,\
                       type = whoosh.fields.STORED,\
                       source = whoosh.fields.STORED)
        
        # Create directory
        if not os.path.exists("indexdir"):
            os.mkdir("indexdir")
        
        # Creating a index writer to add document as per schema
        myindex = whoosh.index.create_in("indexdir",schema)
        writer = whoosh.writing.AsyncWriter(myindex)
        
        # Loop over data
        with jsonlines.open(self.data_path) as reader:
            for obj in tqdm(reader.iter(type=dict, skip_invalid=True)):
                writer.add_document(doc_id=obj['id'],\
                                   article_url=obj['article_url'],\
                                   title=obj['title'],\
                                   author=obj['author'],\
                                   published_date=self.extractDocumentDate(obj['published_date']),\
                                   contents = self.extractDocumentContents(obj['contents']),\
                                   type=obj['type'],\
                                   source=obj['source'])
            writer.commit()
        print("Index created!")  
        
    

data_path = 'WP-corpus/data/TREC_Washington_Post_collection.v2.jl'
indexer = Indexer(data_path)
#indexer.countDocuments()
#indexer.printDocumentData()
#indexer.createDocumentDataStore()
indexer.index()

409428it [02:22, 4631.40it/s]

ERROR: In extractDocumentDate()
ERROR:  <class 'ValueError'>
Caused by value: None
ERROR: In extractDocumentDate()
ERROR:  <class 'ValueError'>
Caused by value: None
ERROR: In extractDocumentDate()
ERROR:  <class 'ValueError'>
Caused by value: None
ERROR: In extractDocumentDate()
ERROR:  <class 'ValueError'>
Caused by value: None


410363it [02:22, 4403.97it/s]

ERROR: In extractDocumentDate()
ERROR:  <class 'ValueError'>
Caused by value: None
ERROR: In extractDocumentDate()
ERROR:  <class 'ValueError'>
Caused by value: None
ERROR: In extractDocumentDate()
ERROR:  <class 'ValueError'>
Caused by value: None


588968it [07:45, 3896.53it/s]

ERROR: In extractDocumentDate()
ERROR:  <class 'ValueError'>
Caused by value: None
ERROR: In extractDocumentDate()
ERROR:  <class 'ValueError'>
Caused by value: None


595037it [07:47, 1273.38it/s]

Index created!





In [None]:
# import nltk
# import string
# from nltk.tokenize import word_tokenize
# from nltk.corpus import stopwords
# #nltk.download('stopwords')
# from nltk.stem.porter import PorterStemmer


# # This function processes the user query
# def query_representation_function(query):
#     # Convert to lowercase
#     query = query.lower()
#     # Remove Punctuation ( !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~ )
#     query_removed_punctuation = "".join([char for char in query if char not in string.punctuation])
#     # Tokenization
#     query_tokenized = word_tokenize(query_removed_punctuation)
#     # Stopword Filtering
#     english_stopwords = stopwords.words('english')
#     query_removed_stopwords = [word for word in query_tokenized if word not in english_stopwords]
#     # Stemming
#     porter = PorterStemmer()
#     query_stemmed = [porter.stem(word) for word in query_removed_stopwords]
    
#     return query_stemmed

# def test_print():
#     test_query = "This isn't a very long test query... I hope that I will be receiving a good answer! Give me results from the 2016 elections"
#     print("Test query:\n" + test_query)
#     print("Processed query:")
#     print(query_representation_function(test_query))
#     print("Test query after processing:\n" + test_query)

# test_print()

###############################################

class Ranking:
    '''This class is used to create a ranking'''
    
    def __init__(self):
        print("Ranking class has been initiated")

In [None]:
# # scoring function
# def get_score(row, query):
#     score = 0
#     for word in query:
#         for col in ['Title', 'Genre', 'Description', 'Director', 'Actors', 'Year']:
#             if word in str(row[col]):
#                 # for now matching a keyword in any column counts the same (insufficient for now ofc)
#                 score += 1    
#     return score


# def rank_movies(data, query):
#     data['Score'] = 0
#     for i, row in tqdm(data.iterrows()):
#         score = get_score(row, query)
#         data.at[i,'Score'] = score

#     # first sort on Score then Rating
#     data.sort_values(['Score', 'Rating'], ascending = False, inplace = True)
    
#     return data
    
# ranked_data = rank_movies(data, query)

# display top 5 movies
#ranked_data.head(5)

################################################
class Evaluation:
    '''Used to evaluate the ranking results'''
    #should check if query used is in the 

In [None]:
# display results

##################################################
class UserInteraction:
    '''Class used to handle user interaction'''
    shouldEvaluate = False
    userQuery = ""
      
    def __init__(self):
        '''This is the constructor method'''
        print("UserInteraction class has been initiated.")
        # Retrieving query
        self.userQuery = self.get_user_query()
        print("Query entered: " + self.userQuery)
        
        
    def get_user_query(self):
        ''''Retrieves the query from the user and returns it'''
        userInput = input("Please enter a query: ")
        return userInput
        
        
    

In [None]:
# Block used to run everything:

UI = UserInteraction()