# Information Retrieval

## Final Project

Lokesh Roopkumar
lroopk2@uic.edu

Please contact via email if there are any issues while executing the code.

Thank you so much for your time!

### Part 1 - Web Crawler

Please note that the crawler takes a long time to run. To give you an estimate, it took 1 hour and 36 minutes to crawl 4000 pages. 

The crawled data (4000 pages) already exists in the CrawledPages directory, so the cell below does not need to be run for the project to work. Re-running the crawler overwrites the existing results file (CrawledPages/4000_crawled_pages.json), so please make a copy of that file before running the crawler again.

In [149]:
from bs4 import BeautifulSoup   
from bs4.element import Comment
import requests                                                             
requests.packages.urllib3.disable_warnings()
from collections import defaultdict, deque
import time
import re
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import json
from os import makedirs

# browser-like User-Agent header sent with every crawl request
headers = {
  'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Mobile Safari/537.36'
}

pages_folder = './CrawledPages/'                                            # directory that receives the crawl results
makedirs(pages_folder, exist_ok=True)
crawl_limit = 4000                                                          # the crawl stops once this many urls have been fetched
domain = 'uic.edu'                                                          # only links containing this string are followed
seed_url = 'https://cs.uic.edu/'

stemmer = PorterStemmer()
# stored as a set: stop words are looked up once per token, and a set makes
# each lookup O(1) instead of a scan over the whole list
stop_words = set(stopwords.words('english'))

# file to store error logs; opening it in write mode creates the file or
# clears the log left by a previous run

error_file = 'error_logs.txt'
with open(error_file, 'w+') as file_writer:
    pass

# filters unnecessary extensions

def extension_found(url):
    """Return True when ``url`` contains any of the ignored extensions or path markers.

    The check is a plain substring test, so a marker counts wherever it
    appears in the url, not only at the end.
    """
    blocked_markers = [
        '.css', '.js', '.aspx', '.pdf', '.doc', '.docx', '.ppt', '.pptx',
        '.xls', '.xlsx', '.tar', '.gz', '.tgz', '.zip', '.png', '.jpg',
        '.jpeg', '.gif', '.svg', '.ico', '.mp4', '.avi', '.ics',
        '/googlepublish', '/ical', '.xml',
    ]
    return any(marker in url for marker in blocked_markers)

# ensures that text inside certain tags is ignored

def filter_tags(element):
    """Decide whether a parsed text node should be kept as visible page text.

    Args:
        element: a text node of the parsed page; it must expose
            ``parent.name`` and be convertible with ``str()``.

    Returns:
        False for text whose parent is a style/script/head/meta/[document]
        node, for HTML comments, for empty or whitespace-only text, and for
        text containing an ``encoding="utf-8"`` declaration; True otherwise.
    """
    if element.parent.name in ['style', 'script', 'head', 'meta', '[document]']:
        return False
    if isinstance(element, Comment):
        return False

    text = str(element)
    # Drop only nodes that consist entirely of whitespace.  The earlier
    # re.match(r'[\s\r\n]+', ...) anchored at the start of the string only,
    # so any text that merely began with a space or newline was discarded.
    if not text.strip():
        return False
    if 'encoding=\"utf-8\"' in text.lower():
        return False
    return True

urls_visited = set()
bfs_queue = deque()
bfs_queue.append(seed_url)
# every url that has ever been enqueued; allows O(1) duplicate checks instead
# of scanning the whole deque for each extracted link
urls_queued = {seed_url}

url_content = dict()                                                        # url -> list of cleaned, stemmed tokens
num_pages_crawled = 0

# seconds to wait on a server before giving up, so that a single
# unresponsive host cannot stall the whole crawl
request_timeout = 30

# timer is started to track how long it takes to crawl

start_time = time.time()

# the main web crawling part: breadth-first traversal starting at seed_url

while bfs_queue:

    # reset on every iteration so the error handler below never reads an
    # undefined name (first request failing) or a link left over from an
    # earlier page
    curr_outgoing_link = None

    try:

        curr_url = bfs_queue.popleft()                                          # urls are popped from the queue
        urls_visited.add(curr_url)
        get_request = requests.get(curr_url, headers=headers, verify=False,
                                   timeout=request_timeout)                     # get request is made
        if get_request.status_code == 200:
            soup = BeautifulSoup(get_request.text, 'lxml')                      # Beautiful Soup is used to parse html
            soup_text = soup.find_all(text=True)

            visible_text = filter(filter_tags, soup_text)                       # filtering out text in unnecessary tags
            visible_text = ' '.join(term.strip() for term in visible_text)
            visible_text = visible_text.lower()
            visible_text = re.sub('[^a-z]+', ' ', visible_text)                 # text cleaning
            tokens = visible_text.split()

            clean_tokens = list()
            for token in tokens:
                if token in stop_words:                                         # tokenize, stem and remove stopwords
                    continue
                temp_token = stemmer.stem(token)
                if temp_token not in stop_words and len(temp_token) > 1:
                    clean_tokens.append(temp_token)
            url_content[curr_url] = clean_tokens

            for link in soup.find_all('a'):                                     # extract all links
                curr_outgoing_link = link.get('href')
                if not curr_outgoing_link:
                    continue

                curr_outgoing_link = curr_outgoing_link.strip()                 # clean and normalize urls
                curr_outgoing_link = curr_outgoing_link.lower()
                curr_outgoing_link = curr_outgoing_link.replace('http://', 'https://')
                curr_outgoing_link = curr_outgoing_link.split("#")[0]
                curr_outgoing_link = curr_outgoing_link.split("?", maxsplit=1)[0]

                if len(curr_outgoing_link) > 0 and curr_outgoing_link[-1] != '/':
                    curr_outgoing_link += '/'

                if len(curr_outgoing_link) > 0 and curr_outgoing_link[0] == '/':    # expanding relative urls
                    # keep everything up to and including the '/' that follows '.edu'
                    index = curr_url.find('.edu')
                    index += 5
                    curr_outgoing_link = curr_url[:index] + curr_outgoing_link[1:]

                if (domain in curr_outgoing_link
                        and 'https' in curr_outgoing_link
                        and curr_outgoing_link not in urls_queued
                        and curr_outgoing_link not in urls_visited
                        and not extension_found(curr_outgoing_link)):
                    bfs_queue.append(curr_outgoing_link)
                    urls_queued.add(curr_outgoing_link)

        # counts every url that returned a response, including non-200 ones;
        # urls that raised an exception are not counted
        num_pages_crawled += 1
        if num_pages_crawled == crawl_limit:
            break

    except Exception as e:                                               # handling exceptions and writing to error logs file
        with open(error_file, 'a+') as file_writer:
            file_writer.write(f'Error either with {curr_url} or {curr_outgoing_link}')
            file_writer.write(f'\nError: {e}\n\n')

        print(f'Could not connect to {curr_url} or {curr_outgoing_link}')
        print('Error occurred: ', e, '\n')

# the urls and their corresponding tokens are stashed in a json file

with open(pages_folder + '4000_crawled_pages.json', 'w') as fw:
    json.dump(url_content, fw)


print(f'Crawled {num_pages_crawled} pages in {(time.time() - start_time)/60} minutes')


Could not connect to https://securityigert.uic.edu/ or https://www.uis.edu/
Error occurred:  ('Connection aborted.', FileNotFoundError(2, 'No such file or directory')) 

Could not connect to https://www.cmhsrp.uic.edu/nrtc/ or https://dscc.uic.edu/terms-of-use/
Error occurred:  ('Connection aborted.', ConnectionResetError(10054, 'An existing connection was forcibly closed by the remote host', None, 10054, None)) 





Could not connect to https://hpsc.org.uic.edu/ or https://www.uis.edu/
Error occurred:  ('Connection aborted.', FileNotFoundError(2, 'No such file or directory')) 

Could not connect to https://nlp.cs.uic.edu/ or https://www.uis.edu/
Error occurred:  HTTPSConnectionPool(host='nlp.cs.uic.edu', port=443): Max retries exceeded with url: / (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x00000231363C63D0>: Failed to establish a new connection: [WinError 10060] A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond')) 

Could not connect to https://projectheal.uic.edu/ or https://www.uis.edu/
Error occurred:  HTTPSConnectionPool(host='projectheal.uic.edu', port=443): Max retries exceeded with url: / (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x000002312F3A6E50>: Failed to establish a new connection: [E

### Part 2 - Cosine Similarity

Just running the cell below would suffice. This is where I read in all the stored crawled pages data and calculate cosine similarity. Please ensure that there is a folder called CrawledPages in the same directory with a file inside it called 4000_crawled_pages.json.

In [None]:
from json import load
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from math import log2
from collections import Counter
from re import sub
from math import sqrt

crawled_folder_path = './CrawledPages/'

# load the url -> token list mapping from the crawl results file
with open(crawled_folder_path + '4000_crawled_pages.json', 'r') as fr:
    url_tokens = load(fr)

porter_stemmer = PorterStemmer()
nltk_stopwords = stopwords.words('english')

# keep only the urls that have more than 10 tokens

url_clean_tokens = {
    url_key: tokens
    for url_key, tokens in url_tokens.items()
    if len(tokens) > 10
}

# construct the inverted index (word -> {url: term frequency}) and record the
# frequency of the most occurring word in every url

index_urls = dict()
most_freq_url = dict()
for url_key, tokens in url_clean_tokens.items():
    term_counts = Counter(tokens)
    most_freq_url[url_key] = term_counts.most_common(1)[0][1]
    for word, count in term_counts.items():
        index_urls.setdefault(word, dict())[url_key] = count

# compute tf-idf weights for every word in every url and store them;
# tf is normalised by the url's most frequent term, idf is log2(N / df)

num_urls = len(url_clean_tokens)
weights_urls = dict()
for word, postings in index_urls.items():
    idf = round(log2(num_urls / len(postings)), 8)
    for every_url, term_freq in postings.items():
        weights_urls.setdefault(every_url, dict())[word] = round(
            (term_freq / most_freq_url[every_url]) * idf, 8)

# take the query as input, tokenize, stem and remove stop words i.e. input query cleaning

print()
input_query = input('Enter a search query: ')
print()
clean_query = input_query.lower()
clean_query = sub('[^a-z]+', ' ', clean_query)
query_tokens = clean_query.split()
clean_query_tokens = list()
for token in query_tokens:
    if token in nltk_stopwords:
        continue
    stemmed_token = porter_stemmer.stem(token)
    if stemmed_token not in nltk_stopwords and len(stemmed_token) > 1:
        clean_query_tokens.append(stemmed_token)

# construct query term frequencies index

index_query = dict(Counter(clean_query_tokens))

# find frequency of most occurring term in the query; default=0 keeps an
# empty query (blank input or only stop words) from raising

query_high_freq = max(index_query.values(), default=0)

# compute weights for every word in the query and store them;
# words that never occur in the crawled pages get an idf of 0

weights_query = dict()
for word, query_freq in index_query.items():
    if word in index_urls:
        idf = round(log2(num_urls / len(index_urls[word])), 8)
    else:
        idf = 0
    weights_query[word] = round((query_freq / query_high_freq) * idf, 8)

# squared query vector length for cosine similarity (square root is taken below)

query_length = sum(round(weight ** 2, 8) for weight in weights_query.values())

# squared url vector lengths for cosine similarity

url_lengths = dict()
for each_url, term_weights in weights_urls.items():
    url_lengths[each_url] = sum(round(weight ** 2, 8) for weight in term_weights.values())

# cosine similarity

url_cosine_sim = dict()
for url, term_weights in weights_urls.items():
    numerator = 0
    for term, query_weight in weights_query.items():
        if term in term_weights:
            numerator += round(query_weight * term_weights[term], 8)
    denominator = round(sqrt(query_length * url_lengths[url]), 8)
    if denominator == 0:
        # a zero-length query or url vector has no defined similarity;
        # treat it as "no match" instead of dividing by zero
        continue
    final_val = round(numerator / denominator, 8)
    if final_val > 0:
        url_cosine_sim[url] = final_val

urls_cosine_sorted = dict(sorted(url_cosine_sim.items(), key=lambda item: item[1], reverse=True))

# UI: results are shown ten at a time

print('Search Results: ')
if not urls_cosine_sorted:
    print('No matching pages found.')

print_limit = 10
total_results = len(urls_cosine_sorted)
# enumerate supplies the rank, so it keeps increasing after the user chooses
# to continue (the manual counter used to repeat the last number of a page)
for i, url in enumerate(urls_cosine_sorted, start=1):
    print(f'{i} {url}')
    # only ask when there is actually something left to show
    if i == print_limit and i < total_results:
        input_cont = input('Do you want to continue (y/n): ')
        if input_cont.lower() != 'y':
            break
        print_limit += 10


Enter a search query: covid

Search Results: 
1 https://today.uic.edu/coronavirus/
2 https://ehso.uic.edu/covid19/
3 https://today.uic.edu/uic-fall-covid-19-guidance/
4 https://today.uic.edu/covid-19-vaccination-guidelines/
5 https://law.uic.edu/coronavirus/
6 https://dos.uic.edu/community-standards/uic-covid-19-guidance/
7 https://today.uic.edu/frequently-asked-questions/
8 https://research.uic.edu/covid-19ovcr/
9 https://today.uic.edu/updates-to-covid-19-university-sponsored-travel-approvals/
10 https://vcha.uic.edu/about/vcha-initiatives/covid-19-university-travel-request-authorization-form/
