In [1]:
#imports and downloads
import nltk
import spacy
import json
import math
import glob
import time
import regex as re

from pathlib import Path
from bs4 import BeautifulSoup
from spacy import displacy
from nltk import pos_tag
from nltk import PorterStemmer
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.metrics.distance import jaccard_distance 
from nltk.util import ngrams

nlp = spacy.load("en_core_web_sm")

nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('tagsets')




[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\mtmyd\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mtmyd\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\mtmyd\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\mtmyd\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\mtmyd\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package tagsets to
[nltk_data]     C:\Users\mtmyd\AppData\Roaming\nltk_data...
[nltk_data]   Package tagsets is already up-to-date!


True

In [2]:
#setup
stop_words  = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
ps = PorterStemmer()

In [3]:
#Reads and returns the contents of a file
#@param file_path the path name of the file
#@returns contents_unfiltered unfiltered contents of the file
def readFileContents (file_path):
    with open(file_path) as file_object:
        contents_unfiltered = file_object.read()
    file_object.close()
    return contents_unfiltered

#Checks the spelling of a word not in corpus using the Jaccard Distance
def spell_check(vocab, word):
    correct_words = vocab
    temp = [(jaccard_distance(set(ngrams(word, 2)), 
                              set(ngrams(w, 2))),w) 
            for w in correct_words if w[0]==word[0]] 
    return(sorted(temp, key = lambda val:val[0])[0][1])

#Uses Beautiful Soup to parse the contents of an html file
#@param contents the contents to soupify
#@title_weight1 the weight of content tagged heading1 
#@title_weight2 the weight of content tagged heading2
#@title_weight3 the weight of content tagged heading3
#@title_weight4 the weight of content tagged title - usually wieghted highest
#@returns string of content
def soupify (contents, title_weight1 = 1, title_weight2 = 1, title_weight3 = 1, title_weight4 = 1):

    #grabbing everything from the soup
    soup = BeautifulSoup(contents, 'html.parser')
    data = soup.find_all('p')
    data.append(soup.find_all('span'))

    #adds specified weighting to titles
    for i in range(title_weight1):
        data.append(soup.find_all('h1'))
    for i in range(title_weight2):
        data.append(soup.find_all('h2'))
    for i in range(title_weight3):
        data.append(soup.find_all('h3'))
    for i in range(title_weight4):
        data.append(soup.find_all('title'))
        
    data.append(soup.find_all('li'))

    return str(data)

#Performs lemmatization on non-tokenized contents
#@param contents_not_tokenized contents that have not yet been tokenized
#@returns string of contents      
def lemmatize (contents_not_tokenized):
    lemmed_string = ''
    for w in contents_not_tokenized:
        lemmed_string += '' + lemmatizer.lemmatize(w)
    
    return lemmed_string

#Performs stemming on non-tokenized contents
#@param contents_not_tokenized contents that have not yet been tokenized
#@returns string of contents  
def stemmatize (contents_not_tokenized):
    stemmed_string = ''
    for w in contents_not_tokenized:
        stemmed_string += '' + ps.stem(w)
        
    return stemmed_string

#Removes stop words from tokenized content
#@param contents_tokenized the contents to remove stop words from
#@returns list of tokenized strings with stop words removed
def stopWords (contents_tokenized):
    return [w for w in contents_tokenized if not w in stop_words]

#Finds the high frequency bigrams from non-tokenized contents
#High cost - requires tokenizing just to find freq dist of bigrams
#Contents must be tokenized later too
#@param contents_not_tokenized the non-tokenized contents
#@param freq_requirement the amount of times a bigram must appear in a document to be included in the final lsit
#@returns important_bigrams a dictionary with important bigrams as keys and their frequencies as values
def bigrams (contents_not_tokenized, freq_requirement):
    unfiltered_token = word_tokenize(contents_not_tokenized)
    bigrams = nltk.bigrams(unfiltered_token)
    frequency = nltk.FreqDist(bigrams)

    important_bigrams = {}

    for key, value in frequency.items():
        if value > freq_requirement:
            if not key[0] in stop_words and not key[1] in stop_words:  #or vs and here 
                important_bigrams[key[0] +"_" + key[1]] = value
   
    return important_bigrams
        
#Expands the query by producing a list of synonyms for any noun present
#@param query tokenized version of query
#@returns query_expansion list of synonyms for any noun in query
def queryExpansion (query):
    query_expansion = []
    #expands the query
    pos_tag(query)
    nouns = [token for token, pos in pos_tag(query) if pos.startswith('N')]
    for noun in nouns:
        grouping = wordnet.synsets(noun)
        for group in grouping:
            lemmas = group.lemmas()
            #can experiment with only appending top synonyms
            for lemma in lemmas:
                query_expansion.append(lemma.name())

    return query_expansion
            
#Creates a list of Named Entities
#@param contents string of contents - preferably after preprocessing to avoid numbers and punctuation
#@returns named_entities list of Named Entities
def namedEntityRecog (contents):
    named_entities_list = []
    processed = nlp(contents)
    named_entities = processed.ents
    for entity in named_entities:
        named_entities_list.append(entity)
    return named_entities_list

#Removes HTMl tags, punctuation, and digits
#Makes all letters lowercase
#Allows words in query to easily match words in documents
#@param contents_unfiltered non-tokenized contents that have not been filtered
#@returns contents_not_tokenized string of filtered, non-tokenized contents
def basicPreProcesssing (contents_unfiltered):
    
    #remove HTMl tags and newlines
    contents_not_tokenized = re.sub("<.+?>|\n", "", contents_unfiltered)
    #remove punctuation and digits
    contents_not_tokenized = re.sub("[0-9]|[.?!,:;\-(){}\[\]'\"—]", " ", contents_not_tokenized)  #Removing punctuation from text
    #make all letters lowercase
    contents_not_tokenized = contents_not_tokenized.lower()
    
    return contents_not_tokenized

#Spacy tokenizes contents
#@param contents_not_tokenized string of contents
#@returns list of spacy tokens
def tokenize (contents_not_tokenized):
    tokens = word_tokenize(contents_not_tokenized)
    return tokens

#Saves the title of the html file and creates a description using tagged sections with query words present
#Sections with more instances of the query words will appear first in the description
#@param path the path of the file
#@param query_words the tokenized list of the words in the query
#@returns (title, final) tuple of 
    #title a string of the title 
    #final a list of tuples of strings of tagged sections and query word frequency 
def dynamicDescript(path, query_words):
    # because path is object not string
    path_in_str = str(path)

    f = open(path, "r", encoding='utf-8')
    contents = f.read()
    f.close()

    soup = BeautifulSoup(contents, 'html.parser')

    title = str(soup.find_all('title'))

    #removes the tags and brackets so that just the title of the video game is returned
    title = re.sub("</*title>|\n", "", title)
    title = re.sub("\]", "", title)
    title = re.sub(".+?:|\n", "", title)
    

    #prints the first two sentences of the document
    description = str(soupify(contents))
    description = re.sub("\(([0-9]*/*)*\)|\n|\b|\t|\s+-\s+", '', description)

    #split on HTMl tags and newlines
    description = re.split("<.+?>|\n", description)
    final_description = {}
    for item in description:
        for word in query_words:
            if word.lower() in item.lower():
                if item not in final_description.keys():
                    final_description[item] = 1
                else:
                    final_description[item] += 1

    final = []
    for item in final_description:
        final.append((final_description[item], item))
    final.sort(reverse=True)
    

    return (title, final)

In [4]:
#tokenize the documents according to the respective functions
#while tokenizing, create a dictionary

#https://stackoverflow.com/questions/10377998/how-can-i-iterate-over-files-in-a-given-directory
from pathlib import Path
import glob
directory_in_str = "C:videogames"
pathList = []
docIDs = {}
i = 0
tokens_full = []
bigrams_list = []
pathlist = Path(directory_in_str).glob('*.html')
count = 0

#initalizes the docIDs list and then tokenizes each webpage
for path in pathlist:

     # path is object not string
    path_in_str = str(path)
    pathList.append(path_in_str)

    #creates docIDs list
    docIDs[i] = path_in_str
    i += 1

    f = open(path_in_str, "r", encoding='utf-8')
    contents_not_tokenized = f.read()
    f.close()
    
    #soupify contents to remove unnecessary text - such as font information
    contents_not_tokenized = soupify(contents_not_tokenized, 5, 5, 5, 10)

    #processes contents by making all letters lowercase and by removing punctuation and digits
    contents_not_tokenized = basicPreProcesssing (contents_not_tokenized)

    #creates a list of Named Entities from non-tokenized contents
    #must make list before lemmatization or stemming in order to properly work
    named_entities = namedEntityRecog(contents_not_tokenized) 

    #creates dictionary of bigrams as keys and respective frequencies as values
    bigrams_list.append(bigrams(contents_not_tokenized, 5))

    #choose lemmatization or stemming or none (not both - will cause chaos!)
    # contents_not_tokenized = lemmatize (contents_not_tokenized)
    # contents_not_tokenized = stemmatize (contents_not_tokenized)
    
    #tokenizes contents
    contents_tokenized = tokenize (contents_not_tokenized)
    
    #removes stopwords from tokenized contents
    contents_tokenized = stopWords (contents_tokenized)
    
    #add Named Entities to the end of contents_tokenized
    #important to be placed after tokenization of regular contents to keep multi-word names together
    #weights Named Entities higher
    for item in named_entities:
        contents_tokenized.append(str(item))
    
    vocab = []
    update = []
    #counts frequency of each term in contents_tokenized and appends to updates
    for t in contents_tokenized:
        if t not in vocab:
            vocab.append(t)
            update.append((t, contents_tokenized.count(t)))
    
    #list of lists of (tokens, count) in order of document
    tokens_full.append(update)



In [5]:
#creates postings list and vocabID list
vocab = {}
count = 0
vocabID = 0
postings = {}
docIdCounter = 0
length = []
for doc in tokens_full:
    length.append(0)
    for (token, vocabCount) in doc:

        #counts the number of terms in each document, to be used in cosine similarity
        #does not count the length of the tf-idf of each term of the vector
        #does not count bigrams or named entity recognition weightings
        #does count different weightings of tokens in titles and headings
        length[docIdCounter] += vocabCount*vocabCount
        
        if token not in vocab:
            vocab[token] = count
            vocabID = count
            count += 1
        else:
            vocabID = vocab[token]
        if vocabID in postings.keys():
            postings[vocabID][docIdCounter] = vocabCount
            
        else:
            postings[vocabID] = {docIdCounter : vocabCount}
            
    docIdCounter += 1

#take the square root of every item in length list - relevant to cosine similarity
for i in range (len(length)):
    length[i] = math.sqrt(length[i] )

#adds bigrams to postings list 
docIdCounter = 0
for doc_bigrams in bigrams_list:
    for key in doc_bigrams:
        vocab[key] = count
        vocabID = count
        count += 1
        postings[vocabID] = { docIdCounter : doc_bigrams[key]}
    docIdCounter += 1
    
#creates txt files for vocab, postings, and docIDs lists for efficient storage and access
with open('vocab.txt', 'w') as f: json.dump(vocab, f)
with open('postings.txt', 'w') as f: json.dump(postings, f)
with open('docids.txt', 'w') as f: json.dump(docIDs, f)        

In [6]:
#creates a string of the top ten titles and respective descriptions based on final list
#@param final the list of documents in order determined by tf-idf or cosine similarity
#@param query_words the tokenized terms in the query
def print_results_in_order(final, query_words):
    
    i = 0
    result_in_string = ''
    for (score, d) in final:
        if i < 10:
            (title, final_descript) = dynamicDescript(docIDs[str(d)], query_words)
            # print(str(i+1) + ' ' + title)
        
            result_in_string += (str(i+1) + ' ' + title + '\t')
            result_in_string += (docIDs[str(d)] + '\n')
            
            if len(final_descript) > 1:
                # print('\t' + final_descript[0][1] + '; ' , end="")
                result_in_string += ('\t' + final_descript[0][1] + '; ')
                if len(final_descript) > 2:
                    # print(final_descript[1][1] + '; ' , end="")
                    # print(final_descript[2][1])
                    result_in_string += (final_descript[1][1] + '; ' + final_descript[2][1]+ '\n')
                else:
                    # print(final_descript[1][1])
                    result_in_string += (final_descript[1][1]+ '\n')
            elif (len(final_descript) == 1):
                # print(final_descript[0][1])
                result_in_string += (final_descript[0][1] + '\n' )
            else:
                break


        i += 1
    return(result_in_string)

#prints the string from @function print_results_in_order and writes results to a file
#@param final the list of documents in order determined by tf-idf or cosine similarity
#@param query the string of the query input by the user
#@param query_words the tokenized terms in the query
def print_results_and_write_to_file(final, query, query_words, filename):
    if final == []:
        print('no results for this query')
    else:
        print(print_results_in_order(final, query_words))
        with open(filename + '.txt', 'a') as f:  
            f.write(query + '\n')
            f.write(print_results_in_order(final, query_words))         
        f.close()

#performs a cosine similarity analysis between the query and each document in the corpus
#@param query_words the tokenized terms in the query
#@param docIdCounter the number of documents in the corpus
#@returns dot_product a list of tuples (cosine_similarity, documentID) sorted by cosine similarity score with the highest first
def cosine_similarity(query_words, docIdCounter):

    #setup
    results = []
    scores_dot = []
    for i in range(docIdCounter):
        scores_dot.append([])
        scores_dot[i] = []
    scores_query = []
    query_length_count = 0

    for word in query_words:
        query_length_count += 1 
        if word == "quit":
            return ['quit']
            break

        if word in vocab:

            termFreqTermQuery = 1 + math.log(query_words.count(word), 10)
            scores_query.append(termFreqTermQuery)

            vocabID = vocab[word]
            i = 0

            #calculate the tf for the query in each document in postings list
            #don't have to calculate tf of non-query words bc we do the dot product with the query
            for d in postings[str(vocabID)]:
                
                #TF-IDF
                #1 plus the log of the frequency of the term in each document
                #weighted
                termFreqTermInDoc = 1 + math.log(postings[str(vocabID)][d], 10)
                #dot product with query tf
                scores_dot[int(d)].append(termFreqTermQuery*termFreqTermInDoc)
    
    dot_product = []
    for i in range (len(scores_dot)):
        total = 0
        for score in scores_dot[i]:
            if length[i] != 0:
                total += score/(length[i]*query_length_count)
        dot_product.append((total, i)) #total is the cosine similarity and i corresponds to the documentID
    dot_product.sort(reverse=True)
    return dot_product

#performs a tf-idf analysis between the query and each document in the corpus
#@param query_words the tokenized terms in the query
#@param documentsInCorpus the amount of documents in the corpus
#@returns final the list of tuples (tf-idf score, docID) sorted with highest tf-idf scores first
def tf_idf(query_words, documentsInCorpus):
    results = []

        # retrieve documents that contain at least one of each term
    for word in query_words:
        if word == "quit":
            return ['quit']
            break

        if word in vocab:
            vocabID = vocab[word]
            
            corpusFreq = 0
            
            #count the number of documents in which this term occurs
            for d in postings[str(vocabID)]:
                corpusFreq += 1
            
            #if the word does not occur at all in the corpus
            if corpusFreq == 0:
                docFreqTerm = 0
            
            #if the word does occur in corpus, calculate its idf
            else:
                docFreqTerm = math.log(documentsInCorpus/corpusFreq, 10)
                
            #calculates the term frequency in each document
            
            #setup
            output = {}
            i = 0
            
            #the documents containing the word
            for d in postings[str(vocabID)]:

                  #TF-IDF
                #1 plus the log of the frequency of the term in each document
                #weighted
                termFreqTerm = 1 + math.log(postings[str(vocabID)][d], 10)
                
                #unweighted
#                 termFreqTerm = postings[str(vocabID)][d]
                
                #the product - the tf-idf
                product = docFreqTerm*termFreqTerm
                
                #creates a dictionary with keys as document ids and values s tf-idf
                output[d] = product
                i+=1                  
            
            #appends to a list of outputs
            results.append(output)
            
#         else:
# #           query_words.append(spellcheck(word))

    
    #sums the tf-idf for each document for all words in query
    second = {}
    final = []

    if len(results) == 0:
        print("No results for this query")
        return []
    elif len(results) == 1:
        second = results[0]
    else:
        first = results[0]

        for i in range(len(results)-1):
            second = results[i+1]
            for key1 in first:
                for key2 in second:
                    #if the key occurs in both dictionaries, update the second dictionary with the sum
                    if key1 == key2:
                        sum = first[key1] + second[key2]
                        second[key2] = sum
                    #if the key only occurs in the first dictionary, add it to the second
                if key1 not in second:
                    second[key1] = first[key1]
            first = second

    for doc in second:
            final.append((second[doc], doc))

    final.sort(reverse=True)  

    return final



In [9]:
with open('vocab.txt', 'r') as f: vocab = json.load(f)
with open('postings.txt', 'r') as f: postings = json.load(f)
with open('docids.txt', 'r') as f: docIDs = json.load(f)
filename = 'results'
while True:
    query = input("What would you like to query?")
    if query == 'quit':
        break
    print("You searched: " + query)
    query_words = basicPreProcesssing(query)

    # named_entities = namedEntityRecog(query_words) 
    # print(named_entities)

    # query_words = stemmatize(query_words)
    query_words = lemmatize(query_words)

    query_words = word_tokenize(query_words)

    #spell_check
    for word in query_words:
        if word not in vocab and word != None:
            
            if spell_check(vocab,word) != None:
                index = query_words.index(word)
                query_words.remove(word)
                query_words.insert(index, tokenize(spell_check(vocab,word))[0])

    
    #query expansion using synonyms for nouns
    #will increase the length of the query during cosine similarity
    query_expansion = queryExpansion(query_words)
    query_words += query_expansion

    #add bigrams to query
    query_bigrams = bigrams(query, 0)
    # query_words += query_bigrams
    
    # # # #add named entities to query_words
    # for item in named_entities:
    #     query_words.append(str(item))

    documentsInCorpus = len(docIDs)

    #choose one
    final = tf_idf(query_words, documentsInCorpus)
    if final ==  ['quit']:
        print('quitting')
        break

    print_results_and_write_to_file(final, query, query_words, filename)

    # final = cosine_similarity(query_words, documentsInCorpus)
    # if final ==  ['quit']:
    #     print('quitting')
    #     break


    # print_results_and_write_to_file(final, query, query_words, filename)
    

You searched: Action-Adventure Games
1  Streets of L.A.	C:videogames\true-crime-streets-of-la.html
	True Crime: Streets of L.A. , PS2, Action, Driving; A veteran developer of car-combat games prepares to release what you might call a GTA clone, but what Activision insists is a "driving/fighting/shooting hybrid." Okay, cool, whatever! ; Action, Adventure
2  X-Men Legends	C:videogames\x-men-legends.html
	GameSpy gets down and dirty with Activision's X-Men action-RPG. It features awesome multiplayer action and loads of little details that comic-book fans will love.; Play Games; Most Wanted Games of 2009
3  Lockdown	C:videogames\rainbow-six-4-tentative-title.html
	We go hands-on with the single and multiplayer action that you'll find in Ding Chavez's latest adventure.; The world's greatest counter-terrorism squad is back in action, and we got a chance to be the point man.; Play Games
4  God of War	C:videogames\god-of-war.html
	With some truly amazing combat and a dark, deep storyline, Sony