<a href="https://colab.research.google.com/github/michaelwnau/ai_academy_notebooks/blob/main/DocQuery_Skeleton.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# DocQuery-example.py
# Collin Lynch and Travis Martin
# 9/8/2021

# The DocQuery code is tasked with taking a query and a set of
# saved documents and then returning the document that is closest
# to the query using either TF/IDF scoring or the sum vector.
# When called this code will load the docs into memory and deal
# with the distance one at a time.

# Imports
# ---------------------------------------------
import spacy
import os
import scipy.spatial
import nltk
import gensim as gm
import requests
from bs4 import BeautifulSoup
import numpy as np

# Load the spaCy pipeline once at import time; the driver code passes this
# shared MODEL object to the functions that need to process text.
# If "en_core_web_sm" is not installed yet, run the next line once to fetch it:
#spacy.cli.download("en_core_web_sm")
MODEL = spacy.load("en_core_web_sm")
# Seed NumPy's global random number generator with a fixed value.
np.random.seed(seed=42)

import warnings
# Suppress every warning for the rest of the session (this also hides
# deprecation notices from the libraries imported above).
warnings.filterwarnings("ignore")

In [None]:
# Core Code.
# ---------------------------------------------
def load_webpages(Directory, url_file):
    """Download every page listed in a URL file and tokenize its text.

    Steps:
    1. read the URL file, one URL per line (blank lines are skipped)
    2. request each page and extract its visible text
    3. tokenize the text into words

    Args:
        Directory: Folder that contains the URL file.
        url_file: Name of the text file holding one URL per line.

    Returns:
        A tuple ``(Loaded_Docs_words, Loaded_Docs_page)`` of dictionaries
        keyed by URL. The first maps each URL to its list of word tokens,
        the second maps each URL to the raw text of the page.
    """
    Loaded_Docs_words = {}
    Loaded_Docs_page = {}

    # Use the ``url_file`` argument. The previous code read a global named
    # ``URL_File`` and failed with NameError whenever it was not defined.
    url_path = os.path.join(Directory, url_file)

    with open(url_path, 'r', encoding='utf-8') as InFile:
        # strip() rather than slicing off the last character: slicing
        # truncated the final URL when the file had no trailing newline.
        stripped_lines = (line.strip() for line in InFile)
        webpage_names = [url for url in stripped_lines if url]

    for page in webpage_names:
        # A timeout keeps one unresponsive server from hanging the whole run.
        Req = requests.get(page, timeout=30)
        SoupText = BeautifulSoup(Req.text, features="lxml")
        PageText = SoupText.get_text()
        Loaded_Docs_words[page] = nltk.tokenize.word_tokenize(PageText)
        Loaded_Docs_page[page] = PageText

    return Loaded_Docs_words, Loaded_Docs_page

In [None]:
def build_model(Loaded_Docs_page, MODEL):
    """Run every raw page text through the given model.

    Args:
        Loaded_Docs_page: Dictionary mapping a page key to its raw text.
        MODEL: Callable applied to each raw text (the spaCy pipeline in the
            driver code).

    Returns:
        Dictionary with the same keys, each mapped to ``MODEL(raw_text)``.
    """
    return {
        page_key: MODEL(page_text)
        for page_key, page_text in Loaded_Docs_page.items()
    }

In [None]:
def find_doc_vec(spacey_model_page, vector_size=96):
    """Build one averaged vector per webpage from its token vectors.

    Steps:
    1. iterate through the webpages
    2. sum the vectors of all tokens in the webpage
    3. normalize the sum by the number of tokens

    Args:
        spacey_model_page: Dictionary mapping a page key to an iterable of
            tokens, each exposing a ``vector`` attribute.
        vector_size: Length of the zero vector returned for a page with no
            tokens. Pages with tokens take their dimension from the token
            vectors themselves.

    Returns:
        Dictionary mapping each page key to its mean token vector
        (a float64 NumPy array).
    """
    webpage_vectors = {}

    for webpage, doc in spacey_model_page.items():
        vector_sum = None
        word_count = 0
        for word in doc:
            if vector_sum is None:
                # Start from zeros. np.empty returns uninitialised memory,
                # which made the accumulated sum meaningless.
                vector_sum = np.zeros(np.shape(word.vector), dtype=np.float64)
            vector_sum += word.vector
            word_count += 1

        if word_count == 0:
            # No tokens: avoid dividing by zero and return a neutral vector.
            webpage_vectors[webpage] = np.zeros(vector_size)
        else:
            webpage_vectors[webpage] = vector_sum / word_count

    return webpage_vectors

In [None]:
def _webpage_pair_distances(vec_a, vec_b):
    """Return the (manhattan, euclidean, cosine) distances between two vectors."""
    difference = vec_a - vec_b
    manhattan = np.sum(np.abs(difference))
    euclidean = np.linalg.norm(difference)

    norm_product = np.linalg.norm(vec_a) * np.linalg.norm(vec_b)
    if norm_product == 0:
        # A zero vector has no direction; rank it behind every real pair.
        cosine = float('inf')
    else:
        # Cosine *distance* is 1 - similarity, so smaller means closer,
        # matching the other two metrics.
        cosine = 1.0 - np.dot(vec_a, vec_b) / norm_product

    return manhattan, euclidean, cosine


def find_closest_two_webpages(webpage_vectors):
    """Print the closest pair of webpages under three distance metrics.

    Steps:
    1. start with the first two webpages as the closest pair
    2. iterate through all unordered pairs of webpages
    3. compute the manhattan, euclidean, and cosine distance for each pair
    4. keep the pair with the smallest distance seen so far, per metric
    5. print the winning pair for each metric

    Args:
        webpage_vectors: Dictionary mapping a page key to its vector.

    Raises:
        ValueError: If fewer than two webpages are supplied.
    """
    keys = list(webpage_vectors)
    if len(keys) < 2:
        raise ValueError('At least two webpages are required to find the closest pair.')

    # One slot per metric, in the order manhattan, euclidean, cosine.
    best_pairs = [(keys[0], keys[1])] * 3
    best_distances = [float('inf')] * 3

    for outer_index, key1 in enumerate(keys[:-1]):
        for key2 in keys[outer_index + 1:]:
            distances = _webpage_pair_distances(webpage_vectors[key1],
                                                webpage_vectors[key2])
            for slot, distance in enumerate(distances):
                if distance <= best_distances[slot]:
                    # Record the new minimum. The old code never updated it,
                    # so later pairs were compared to the first pair only.
                    best_distances[slot] = distance
                    best_pairs[slot] = (key1, key2)

    (low_key1, low_key2), (low_key1E, low_key2E), (low_key1C, low_key2C) = best_pairs

    print('Two closest webpages using Manhattan distance:', low_key1, 'and', low_key2)
    print('Two closest webpages using Eulcidean distance:', low_key1E, 'and', low_key2E)
    print('Two closest webpages using Cosine distance:', low_key1C, 'and', low_key2C)
    print()

In [None]:
def find_closest_page_to_query(Query_String, MODEL, webpage_vectors):
    """Print the webpage closest to a query under three distance metrics.

    Steps:
    1. run the query through the model to get its tokens
    2. average the token vectors into a single query vector
    3. compare the query vector with every webpage vector
    4. use manhattan, euclidean, and cosine distances
    5. print the closest webpage for each metric

    Args:
        Query_String: Free-text query.
        MODEL: Callable that turns text into an iterable of tokens, each
            exposing a ``vector`` attribute.
        webpage_vectors: Dictionary mapping a page key to its vector.

    Prints ``None`` as the result when the query yields no tokens or when
    there are no webpages to compare against.
    """
    low_keyM = None
    low_keyE = None
    low_keyC = None

    query_token_vectors = [token.vector for token in MODEL(Query_String)]

    if query_token_vectors:
        # Average the token vectors so the query is represented as a mean
        # of token vectors.
        query_vector = np.mean(
            np.asarray(query_token_vectors, dtype=np.float64), axis=0)
        query_norm = np.linalg.norm(query_vector)

        best_manh = float('inf')
        best_eucl = float('inf')
        best_cos = float('inf')

        for page, page_vector in webpage_vectors.items():
            difference = page_vector - query_vector
            distance_manh = np.sum(np.abs(difference))
            distance_eucl = np.linalg.norm(difference)

            norm_product = query_norm * np.linalg.norm(page_vector)
            if norm_product == 0:
                # A zero vector has no direction; never rank it as closest.
                distance_cos = float('inf')
            else:
                # 1 - similarity, so smaller means closer for all metrics.
                distance_cos = 1.0 - np.dot(query_vector, page_vector) / norm_product

            if distance_manh < best_manh:
                best_manh = distance_manh
                low_keyM = page
            if distance_eucl < best_eucl:
                best_eucl = distance_eucl
                low_keyE = page
            if distance_cos < best_cos:
                best_cos = distance_cos
                low_keyC = page

    print('Closest Manhattan distance webpage to query is:', low_keyM)
    print('Closest Euclidean distance webpage to query is:', low_keyE)
    print('Closest Cosine distance webpage to query is:', low_keyC)
    print()

In [None]:
def compute_tfidf_value(Loaded_Docs_page):
    """Compute a TF-IDF score for every word of every loaded page.

    Steps:
    1. remember the page keys and collect the page texts in the same order
    2. tokenize the texts and build a gensim bag-of-words corpus
    3. fit a TF-IDF model on that corpus
    4. turn each page's TF-IDF vector into a {word: score} dictionary
    5. pair every page key with its dictionary

    Args:
        Loaded_Docs_page: Dictionary mapping a page key to its raw text.

    Returns:
        Dictionary mapping each page key to a {word: tfidf score} dictionary.
    """
    page_keys = list(Loaded_Docs_page)
    page_texts = [Loaded_Docs_page[page_key] for page_key in page_keys]

    tokenized_pages = [gm.utils.simple_preprocess(text) for text in page_texts]

    # allow_update=True grows the vocabulary while the corpus is being built.
    vocabulary = gm.corpora.Dictionary()
    bow_corpus = [
        vocabulary.doc2bow(tokens, allow_update=True)
        for tokens in tokenized_pages
    ]
    tfidf_model = gm.models.TfidfModel(bow_corpus, smartirs='ntc')

    scores_per_page = [
        {vocabulary[term_id]: weight for term_id, weight in weighted_page}
        for weighted_page in tfidf_model[bow_corpus]
    ]

    return dict(zip(page_keys, scores_per_page))

In [None]:
def get_keywords(tfidf_dict_all_files, num_keywords):
    """Print the highest-scoring keywords of every page.

    Args:
        tfidf_dict_all_files: Dictionary mapping a page key to a
            {word: tfidf score} dictionary.
        num_keywords: How many top-scoring words to print per page.
    """
    for page, word_scores in tfidf_dict_all_files.items():
        # Highest score first; the stable sort keeps tied words in their
        # original order.
        ranked = sorted(word_scores.items(), key=lambda pair: pair[1], reverse=True)
        top_words = dict(ranked[:num_keywords])
        heading = f"The top {num_keywords} keywords for {page} are  "
        print(heading + str(top_words), '\n')

In [None]:
def keyword_search(tfidf_dict_all_files, Query_String):
    """Print the webpage whose TF-IDF scores best match the query.

    Steps:
    1. split the query string into a list of lower-cased words
    2. iterate through the per-page tfidf dictionaries
    3. add up the scores of the query words found in each page
    4. pick the page with the highest total
    5. print that page to the screen

    Args:
        tfidf_dict_all_files: Dictionary mapping a page key to a
            {word: tfidf score} dictionary.
        Query_String: Free-text query; words are separated by whitespace.

    Prints 'not found yet' when no page contains any of the query words.
    """
    # Lower-case the query so capitalised words such as "The" can match
    # lower-case dictionary keys.
    query_words = Query_String.lower().split()

    result = 'not found yet'
    best_score = 0.0

    for page, word_scores in tfidf_dict_all_files.items():
        # Words missing from a page contribute nothing to its total.
        page_score = sum(word_scores.get(word, 0.0) for word in query_words)
        # Strict comparison: a page must actually match to be reported,
        # and the first page wins a tie.
        if page_score > best_score:
            best_score = page_score
            result = page

    print('The webpage that is closest to the query is:', result)

In [None]:
if __name__ == "__main__":

    # Initial declarations.
    URL_File = 'url_file.txt'

    # Build the folder path with os.path.join so it works on every platform;
    # the hard-coded '\\webpages\\' only resolved on Windows. The trailing ''
    # keeps a trailing separator, so a file name can be appended directly.
    Directory = os.path.join(os.getcwd(), 'webpages', '')
    Query_String = 'The happiest place on earth'
    #Query_String = 'Newton Gravity Einstein Physics'
    num_keywords = 10

    # Read the URL file, download each page, and keep its text and tokens.
    Loaded_Docs_words, Loaded_Docs_page = load_webpages(Directory, URL_File)

    # Run every page's raw text through the spaCy model.
    spacey_model_page = build_model(Loaded_Docs_page, MODEL)

    # Create one document vector per page from the spaCy output.
    webpage_vectors = find_doc_vec(spacey_model_page)

    # Iterate through all available webpages and find the two closest.
    find_closest_two_webpages(webpage_vectors)

    # Find the closest webpage to the query.
    find_closest_page_to_query(Query_String, MODEL, webpage_vectors)

    # Compute the tfidf scores for all the webpages.
    tfidf_dict_all_files = compute_tfidf_value(Loaded_Docs_page)

    # Iterate through all the webpages and print the top N keywords.
    get_keywords(tfidf_dict_all_files, num_keywords)

    # Search all the documents and find the closest to the query string.
    keyword_search(tfidf_dict_all_files, Query_String)

FileNotFoundError: ignored