In [32]:
from url_normalize import url_normalize
from urllib.parse import urlparse, urljoin
import pickle
import urllib.robotparser
import time
import requests
from bs4 import BeautifulSoup
import hashlib 


In [2]:
# Module-level crawl state. These objects are handed to the Crawler, which mutates them in place.
LINK_GRAPH = {} # urls to link objects for saving later
DOMAINS = {} # no longer populated, but still passed to the Crawler constructor, so it must exist
INLINKS = {} # maps url to list of urls that point to it
OUTLINKS = {} # maps url to list of urls that it points to
VISITED = set() # urls we have crawled
FRONTIER = None # replaced by a Queue once the seed urls are loaded
I = 0 # number given to the next stored document

In [3]:
class Link():
    """
       A node of the crawl's link graph.

       Holds the url as a string, the outlinks that this link points to, the inlinks that
       point to this link, the wave number that this link belongs to in the crawl, the
       relevance score this link is given, and the domain (netloc) of the link.
    """
    def __init__(self, link, inlinks, outlinks, wave, relevance_score):
        self.link = link
        # Copy the supplied lists so the node never shares a list object with its caller.
        # None or an empty list both mean "no links known yet".
        self.outlinks = list(outlinks) if outlinks else []
        self.inlinks = list(inlinks) if inlinks else []
        self.wave = wave
        self.relevance_score = relevance_score
        self.domain = urlparse(link).netloc.strip()

    def update_outlinks(self, outlinks):
        "Replace the stored outlinks with the given list"
        self.outlinks = outlinks

    def update_inlinks(self, inlinks):
        "Replace the stored inlinks with the given list"
        self.inlinks = inlinks

    def update_score(self, relevance_score):
        "Replace the relevance score"
        self.relevance_score = relevance_score

In [4]:
class Queue:
    """
      A container with a first-in-first-out (FIFO) queuing policy.

      self.list holds the queued items; the item at the END of the list is the next one
      to be dequeued. self.set indexes the urls of everything that has been pushed, so
      has_link can answer without scanning the list.
    """

    def __init__(self):
        self.list = []
        self.set = set()

    @staticmethod
    def _key(item):
        "The value an item is indexed by: its url when it has a .link attribute, else the item itself"
        return getattr(item, 'link', item)

    def push(self,item):
        "Enqueue the 'item' into the queue"
        self.list.insert(0,item)
        # index by url string: has_link is asked about urls, not about Link objects
        self.set.add(self._key(item))

    def pop(self):
        """
          Dequeue the earliest enqueued item still in the queue. This
          operation removes the item from the queue.
          Raises IndexError when the queue is empty.
        """
        return self.list.pop()

    def isEmpty(self):
        "Returns true if the queue is empty"
        return len(self.list) == 0

    def sort_queue(self, window=1000):
        """
          Re-order the `window` items that are next to be dequeued. Of those, the items that
          share the wave of the very next item are sorted by relevance score so the highest
          score is dequeued first; items of other waves are moved behind them, keeping their
          relative order. Nothing happens when the queue holds `window` items or fewer.
        """
        if len(self.list) <= window:
            return

        # the end of the list is the front of the queue
        head = self.list[-window:]
        rest = self.list[:-window]

        # wave of the next item to be dequeued
        current_wave = head[-1].wave
        in_wave = [x for x in head if x.wave == current_wave]
        other_waves = [x for x in head if x.wave != current_wave]

        # ascending sort puts the best score last, i.e. first to be popped
        in_wave.sort(key=lambda x: x.relevance_score)

        self.list = rest + other_waves + in_wave

    def print_queue(self):
        "Prints each link in the queue"
        for each in self.list:
            print(each.link)

    def has_link(self, url):
        """
          Check whether a url has been enqueued. The url index is not pruned by pop,
          so this also reports urls that have since been dequeued.
        """
        return url in self.set


In [5]:
class Crawler:
    """
       Crawl the web to create a corpus of documents pertaining to the topic of World War II.
       The crawler supports the saving of the documents as txt files, scoring documents based
       on relevance, and sorting the frontier so that the best documents are chosen. 
    """
    def __init__(self, frontier, visited, domains, inlinks, outlinks, link_graph, stop_criteria, i):
        self.frontier = frontier
        self.visited = visited
        self.domains = domains
        self.link_graph = link_graph
        self.inlinks = inlinks
        self.outlinks = outlinks
        self.stop_criteria = stop_criteria
        self.blacklist = set(['Citation_needed', 'Special:BookSources', 'Wikimedia_sister_projects', 'doi', 'CorpusID',
                         'universitypublishingonline', 'index.php?', 'Category:', 'archive.org', 'Terms_of_Use',
                         'wiktionary', 'omniatlas', 'Special:', 'Index_of', 'Panther%', 'SELIBR', 'Codebook', 'Help:',
                         'S2CID', 'Press', 'MyContributions', 'Basic_Books', 'NOTRS', 'www.worldcat.org/oclc/278029256',
                         'Weidenfeld_&_Nicolson', 'Template_talk:', 'index.php', 'Random_House', 'Contact_us', 'Publishing',
                         'Creative_Commons', 'License', 'General_disclaimer', 'Simon_&_Schuster', 'Rodopi',
                         'Houghton_Mifflin_Company', 'Template:', 'Portal:', 'Publish', 'Manual_of_Style', 'Talk:','OCLC',
                             'Rowman_&_Littlefield', 'Wikiversity', 'wikiversity', 'youtube', 'YouTube'])
        self.keywords = set(['war', 'world', 'ii', 'battle', 'army', 'united', 'states', 'us', 'u.s', 'germany', 'austria',
                        'france', 'england', 'fought', 'fight', 'seige', 'poland', 'holocaust', 'hitler', 'japan', 'nazi',
                            'jewish', 'second', 'concentration'])
        self.i = i
        self.files_to_write = {} # i --> text to put in file
        
    def get_doc_details(self, url, response, s):
        """
         Retrives the document details from the request response that are needed to save the 
         document in the corpus, and crawl its outlinks. 
        """
        details = {}
        details['doc_id'] = url
        details['headers'] = response.headers
        details['raw_html'] = s
        if s.title == None:
            details['title'] = ""
        else:
            details['title'] = s.title.string
        if s.get_text() == None:
            details['text'] = ""
        else:
            details['text'] = s.get_text() #strip=True
        temp_list = [a['href'] for a in s.find_all('a', href=True) if a.text]
        temp_list = list(set(temp_list))
        details['outlinks'] = temp_list
        return details
    
    
    def write_doc_to_file(self, doc_dict, i):
        """
          Writes a file in the corpus with the appropriate fields. One document for the corpus file,
          and another to hold the headers and the raw html (each will be named with the same id number).
        """
        f = open('C:/6200-IR/hw3-mplatt27/docs/' + "_" + str(i) + ".txt", "w", encoding="utf-8")
        in_w = ""
        out_w = ""
        f.write('<DOC>\n')
        f.write('<DOCNO>' + doc_dict['doc_id'] + "</DOCNO>\n")
        if doc_dict['title'] != None:
            f.write('<TITLE>' + doc_dict['title'] + '</TITLE>\n')
        else:
            f.write('<TITLE></TITLE>\n')
        f.write('<AUTHOR>MELANIE PLATT</AUTHOR>\n')
        if doc_dict['text'] != None:
            f.write('<TEXT>' + doc_dict['text'] + '</TEXT>\n')
        else:
            f.write('<TEXT></TEXT>\n')
        f.write('<OUTLINKS>' + out_w.join(self.link_graph[doc_dict['doc_id']].outlinks) + '</OUTLINKS>\n')
        f.write('<INLINKS>' + in_w.join(self.inlinks[doc_dict['doc_id']]) + '</INLINKS>\n')
        f.write('<HEADERS>' + str(doc_dict['headers']) + '</HEADERS>\n')
        f.write('<RAW_HTML>' + str(doc_dict['raw_html']) + '</RAW_HTML>\n')
        f.write('</DOC>')
        f.close()
        
    
    def canonicalize_url(self, outlink_url, original_url=None):
        """
          Normalize the urls so that they are all in the same format. Make relative urls
          absolute, change scheme and host to lower case, remove port, and dup slashes.
          Get rid of everything after #, make all http (not https). Check that they are
          not pdfs or another unreadable type of page. 
        """
        # make all relative urls absolute
        if original_url != None:
            if urlparse(outlink_url).netloc.strip() == "":
                outlink_url = urljoin(original_url, outlink_url)
                
#         pth = urlparse(outlink_url).path
#         bad_paths = ['.jpeg', '.pdf', '.javascript', 'png', 'mpeg', 'mp4']
#         for each in bad_paths:
#             if each in pth:
#                 return ""

        # use url_normalize library to change scheme and host to lowercase, remove the port and dup slashes
        outlink_url = url_normalize(outlink_url)

        # get rid of everything after #
        start_index = outlink_url.find("#")
        if start_index >= 0:
            outlink_url = outlink_url[:start_index]

        # make all http (not https)
        outlink_url = outlink_url.replace("https", "http")
        return outlink_url 
    
    
    def is_blacklisted(self,url):
        "Check if the link is a type we know we don't want"
        for word in self.blacklist:
            if word in url:
                return True
        return False
    
    
    def filter_links(self, url, links):
        """
          Filters outlinks that we want to keep by canonicalizing them, checking if they are in visited or the 
          frontier already, and they are not blacklisted. Creates an empty Link object for each link we are keeping
          and initializes it with an empty in and outlink list and a score of 0. Appends each link object to the
          link graph.
        """
        new_links = []
        for each in links:
            try:
                each = self.canonicalize_url(each, url)
            except:
                pass
            else:
                if each == "":
                    continue
                elif each not in self.visited: # and not self.frontier.has_link(each):
                    if not self.is_blacklisted(each):
                        new_links.append(each)
                if self.frontier.has_link(each): # added so that even if we don't want the link in frontier again, we know
                    if each not in self.inlinks:
                        self.inlinks[each] = [url]
                    else:
                        if url not in self.inlinks[each]:
                            self.inlinks[each].append(url) # another link points to it
            
        return new_links
                
    
    def update_inlink_counts(self, curr_link, new_outlinks):
        """
          For each new outlink, if it is in the inlink list already, append the current link to that list, else
          initialize the list as the current link (i.e, the current link points to the outlink). Once all inlink
          lists are updated, update the object for each outlink in the linkgraph (so that it holds the updated
          inlink list for them).
        """
        
        # update self.new_outlinks dictionary
        for each in new_outlinks:
            if each not in self.inlinks:
                self.inlinks[each] = [curr_link]
            else:
                if curr_link not in self.inlinks[each]:
                    self.inlinks[each].append(curr_link)
        
    
    def has_x_keywords(self,link):
        "Checks if the link contains keywords that give it a higher score"
        count = 0
        for each in self.keywords:
            if each in link.lower():
                count += 1
        return count
        
        
    def score_new_outlinks(self, links, wave):
        """
         Scores links before adding them to the frontier based on the number of
         inlinks, the key words that they have, if they are from wikipedia (good),
         and their wave number. 
        """
        scored_objs = []
        for each in links:
            temp_obj = Link(each, [], [], wave+1, 0)
            
            # calculate score
            score = 0
            score += len(self.inlinks[each])
            score -= wave+1
            score += (self.has_x_keywords(each) * 5)
            if 'wikipedia' in each.lower():
                score += 10

            # add score to object
            temp_obj.update_score(score)
            scored_objs.append(temp_obj)

        return scored_objs
    
    
    def save_variables(self):
        "Saves class variables to global variables so we have them in if program crashes and at end"
        
        with open(save_path + "inlinks.pickle", 'wb') as handle:
            pickle.dump(self.inlinks, handle, protocol=pickle.HIGHEST_PROTOCOL)
            
        with open(save_path + "visited.pickle", 'wb') as handle:
            pickle.dump(self.visited, handle, protocol=pickle.HIGHEST_PROTOCOL)
            
        temp_front = self.frontier.list[-5000:]
        with open(save_path + "frontier_list.pickle", 'wb') as handle:
            pickle.dump(temp_front, handle, protocol=pickle.HIGHEST_PROTOCOL)
            
        with open(save_path + "link_graph.pickle", 'wb') as handle:
            pickle.dump(self.link_graph, handle, protocol=pickle.HIGHEST_PROTOCOL)
            
    def save_frontier(self):
        
        with open('C:/6200-IR/hw3-mplatt27/frontier.pickle', 'wb') as handle:
            pickle.dump(self.frontier, handle, protocol=pickle.HIGHEST_PROTOCOL)

        
    def crawl(self):
        """
          Crawl the web for 40000 documents using a frontier Queue structure. For each link popped off the queue,
          get the robots.txt file, make sure we sleep if we have recently crawled that domain and that we can crawl
          that link. If we can, make a get reqest for the information and write the file. Explore the outlinks to 
          add to the frontier. Keep track of where we have been in a visited list. Periodically sort the frontier
          and save variables. 
        """
        #rp = urllib.robotparser.RobotFileParser()
        # i = 0
        save_count = 0
        sort_count = 0
        write_count = 0
        safe_stop = 0
        front_save = 0
        last_domain = None
        last_rp_input = None
        first_round = True
        second_round = True
        last_delay = 1
        
        while len(self.visited) != self.stop_criteria:
            if safe_stop > 42000: # for testing
                break
            
            if self.i % 100 == 0:
                print("visited is this long: ", len(self.visited))
            
            current_link_obj = self.frontier.pop()
            current_link = current_link_obj.link
            current_wave = current_link_obj.wave
            current_domain = current_link_obj.domain
            
            if current_link not in self.visited:
                
                # built input for robots.txt reader
                sch = urlparse(current_link).scheme.strip()
                rp_input = sch + '://' + current_domain + '/robots.txt'
                can_crawl = False
                
                # if we crawled same domain last time, use rp that we have
                if rp_input == last_rp_input:
                    can_crawl = rp.can_fetch("*", current_link)
                    delay = last_delay
                else:
                    # we need to make a new rp
                    try: 
                        rp = urllib.robotparser.RobotFileParser()
                        rp.set_url(current_link)
                        rp.read()
                    except:
                        pass
                    else:
                        can_crawl = rp.can_fetch("*", current_link)
                    
                # check if we can crawl the current url
                if can_crawl:

                    # if new domain, get the sleep delay
                    if current_domain != last_domain:
                        delay = rp.crawl_delay("*")
                        if delay == None:
                            delay = 1
                            last_delay = delay
                        
                    if delay < 3:
                        # get current domain to see if we have to sleep
                        if current_domain == last_domain:
                            time.sleep(delay)

                        # GET response for url
                        try:
                            resp = requests.get(current_link, timeout=3)
                        except:
                            pass
                        else:
                            
                            # check that it is html and in english
                            cont_type = resp.headers.get('Content-Type',0)
                            language = resp.headers.get('Content-Language',0)
                            if cont_type == 0:
                                cont_type = "None"

                            # create soup for later, and as a second check for language
                            soup = BeautifulSoup(resp.text, 'html.parser')
                            if soup != None:
                                if language == 0:
                                    if soup.html != None:
                                        language = soup.html.get('lang',0)
                                if 'text/html' in cont_type and language == 'en':

                                    # get details from the response for file
                                    doc_details = self.get_doc_details(current_link, resp, soup)
                                    
                                    # get outlinks and filter them
                                    outlinks = self.filter_links(current_link, doc_details['outlinks'])
                                    doc_details['outlinks'] = outlinks
                                    
                                    # save response to write file later
                                    self.files_to_write[self.i] = doc_details

                                    # update inlinks with the new outlinks (link objects will have empty inlink lists)
                                    # until they are crawled
                                    self.update_inlink_counts(current_link, outlinks)
                                    
                                    # score each outlink, create the object so it has the appropriate score while in front.
                                    outlink_objects = self.score_new_outlinks(outlinks, current_wave)
                                    
                                    # put the link objects (which have updated inlinks and scores) in frontier
                                    for each in outlink_objects:
                                        self.frontier.push(each)
                                    
                                    # add outlinks to link graph, then update current url's oulinks to what we found
                                    current_link_obj.update_outlinks(outlinks)
                                    current_link_obj.update_inlinks(self.inlinks[current_link])
                                    self.link_graph[current_link] = current_link_obj

                                    # append link to visited and increase count we are on
                                    self.visited.add(current_link)
                                    self.i += 1
                                    save_count += 1
                                    sort_count += 1
                                    write_count += 1
                                    front_save += 1
                                    last_domain = current_domain
                                    last_rp_input = rp_input
                            
             
            safe_stop += 1
            
            # save variables and write files
            if save_count == 500:
                self.save_variables()
                save_count = 0
                print("variables saved")
                
            if front_save == 7000:
                self.save_frontier()
                front_save = 0
                print("saved frontier")
                
            if write_count == 50:
                for num, dat in self.files_to_write.items():
                    self.write_doc_to_file(dat, num)
                self.files_to_write = {}
                write_count = 0
                print("wrote files")
                
                
            # sort frontier
            if sort_count == 5 and first_round:
                self.frontier.sort_queue()
                sort_count = 0
                print("frontier was sorted")
                first_round = False
            elif sort_count == 100 and second_round:
                self.frontier.sort_queue()
                sort_count = 0
                print("frontier was sorted")
                second_round = False
            elif sort_count == 500:
                self.frontier.sort_queue()
                sort_count = 0
                print("frontier was sorted")
        
        # save variables if finished running, save remaining files
        self.save_variables()
        for num, dat in self.files_to_write.items():
            self.write_doc_to_file(dat, num)
    

In [6]:
"""Main code to crawl the web"""

# obtain seed urls and save
seed_urls = ['http://en.wikipedia.org/wiki/World_War_II', 'http://www.history.com/topics/world-war-ii',
            'http://en.wikipedia.org/wiki/List_of_World_War_II_battles_involving_the_United_States', 
            'http://en.wikipedia.org/wiki/Military_history_of_the_United_States_during_World_War_II',
            'https://en.wikipedia.org/wiki/List_of_military_engagements_of_World_War_II']

# canonicalize seeds and create link objects for each
seed_link_objects = []
for each in seed_urls:
    each = url_normalize(each)

    # drop the fragment
    start_index = each.find("#")
    if start_index >= 0:
        each = each[:start_index]

    # make all http (not https) - rewrite the scheme only, "https" elsewhere in the url stays
    if each.startswith("https://"):
        each = "http://" + each[len("https://"):]

    obj = Link(each, [], [], 1, 10000) # give seeds a very high score
    seed_link_objects.append(obj)
    
print("Clean seed urls: \n")
for each in seed_link_objects:
    print(each.link)
    
# create a fresh frontier holding the seeds; register each seed in the global link graph,
# with empty inlink and outlink lists
FRONTIER = Queue()
for each in seed_link_objects:
    FRONTIER.push(each)
    LINK_GRAPH[each.link] = each
    INLINKS[each.link] = []
    OUTLINKS[each.link] = []  


Clean seed urls: 

http://en.wikipedia.org/wiki/World_War_II
http://www.history.com/topics/world-war-ii
http://en.wikipedia.org/wiki/List_of_World_War_II_battles_involving_the_United_States
http://en.wikipedia.org/wiki/Military_history_of_the_United_States_during_World_War_II
http://en.wikipedia.org/wiki/List_of_military_engagements_of_World_War_II


In [7]:
# sanity check: show every node of the link graph so the seeds can be verified by eye
for url, node in LINK_GRAPH.items():
    fields = (
        ("link: ", url),
        ("outlinks: ", node.outlinks),
        ("inlinks: ", node.inlinks),
        ("wave: ", node.wave),
        ("score: ", node.relevance_score),
    )
    for label, shown in fields:
        print(label, shown)
    print("*******************")

link:  http://en.wikipedia.org/wiki/World_War_II
outlinks:  []
inlinks:  []
wave:  1
score:  10000
*******************
link:  http://www.history.com/topics/world-war-ii
outlinks:  []
inlinks:  []
wave:  1
score:  10000
*******************
link:  http://en.wikipedia.org/wiki/List_of_World_War_II_battles_involving_the_United_States
outlinks:  []
inlinks:  []
wave:  1
score:  10000
*******************
link:  http://en.wikipedia.org/wiki/Military_history_of_the_United_States_during_World_War_II
outlinks:  []
inlinks:  []
wave:  1
score:  10000
*******************
link:  http://en.wikipedia.org/wiki/List_of_military_engagements_of_World_War_II
outlinks:  []
inlinks:  []
wave:  1
score:  10000
*******************


In [8]:
# Write the starting crawl state to disk (FRONTIER, VISITED, LINK_GRAPH, INLINKS, OUTLINKS).
# Domains are no longer used (we just keep track of the last domain visited), so they are not saved.

save_path = 'C:/6200-IR/hw3-mplatt27/'

initial_state = (
    ("frontier.pickle", FRONTIER, "frontier saved!"),
    ("visited.pickle", VISITED, "visited list saved!"),
    ("link_graph.pickle", LINK_GRAPH, "link graph saved!"),
    ("inlinks.pickle", INLINKS, "inlinks saved!"),
    ("outlinks.pickle", OUTLINKS, "outlinks saved!"),
)
for filename, state, message in initial_state:
    with open(save_path + filename, 'wb') as handle:
        pickle.dump(state, handle, protocol=pickle.HIGHEST_PROTOCOL)
    print(message)

frontier saved!
visited list saved!
link graph saved!
inlinks saved!
outlinks saved!


In [9]:
# Build the crawler around the shared state; it stops once 40000 urls have been visited.
c = Crawler(
    frontier=FRONTIER,
    visited=VISITED,
    domains=DOMAINS,
    inlinks=INLINKS,
    outlinks=OUTLINKS,
    link_graph=LINK_GRAPH,
    stop_criteria=40000,
    i=I,
)

In [10]:
# Time the whole crawl. `end` is set in a finally block so the elapsed time can still be
# printed when the crawl raises or is interrupted part-way through.
start = time.time()
try:
    c.crawl()
finally:
    end = time.time()

visited is this long:  0
frontier was sorted
wrote files
wrote files
visited is this long:  100
frontier was sorted
wrote files
wrote files
visited is this long:  200
visited is this long:  200
wrote files
wrote files
visited is this long:  300
wrote files
wrote files
visited is this long:  400
wrote files
variables saved
wrote files
visited is this long:  500
wrote files
wrote files
visited is this long:  600
visited is this long:  600
frontier was sorted
wrote files
wrote files
visited is this long:  700
wrote files
wrote files
visited is this long:  800
wrote files
wrote files
visited is this long:  900
wrote files
variables saved
wrote files
visited is this long:  1000
wrote files
wrote files
visited is this long:  1100
frontier was sorted
wrote files
wrote files
visited is this long:  1200
visited is this long:  1200
wrote files
wrote files
visited is this long:  1300
wrote files
wrote files
visited is this long:  1400
wrote files
variables saved
wrote files
visited is this long: 

wrote files
visited is this long:  7700
wrote files
wrote files
visited is this long:  7800
wrote files
wrote files
visited is this long:  7900
wrote files
variables saved
wrote files
visited is this long:  8000
visited is this long:  8000
wrote files
wrote files
visited is this long:  8100
visited is this long:  8100
visited is this long:  8100
frontier was sorted
wrote files
wrote files
visited is this long:  8200
wrote files
wrote files
visited is this long:  8300
wrote files
wrote files
visited is this long:  8400
wrote files
variables saved
wrote files
visited is this long:  8500
wrote files
wrote files
visited is this long:  8600
frontier was sorted
wrote files
wrote files
visited is this long:  8700
wrote files
wrote files
visited is this long:  8800
wrote files
wrote files
visited is this long:  8900
wrote files
variables saved
wrote files
visited is this long:  9000
wrote files
wrote files
visited is this long:  9100
frontier was sorted
wrote files
wrote files
visited is this 

In [11]:
# wall-clock duration of the crawl cell
elapsed_seconds = end - start
print("Crawl took {} seconds".format(elapsed_seconds))

Crawl took 38653.04254245758 seconds


In [6]:
# load in pickles if program crashed
# NOTE(review): pickle.load can run arbitrary code from the file - only load checkpoints
# that this notebook wrote itself.
save_path = 'C:/6200-IR/hw3-mplatt27/'

# each file is opened in a with-block so the handle is closed even when loading fails
with open(save_path + "frontier.pickle", 'rb') as handle:
    FRONTIER = pickle.load(handle)

with open(save_path + "visited.pickle", 'rb') as handle:
    VISITED = pickle.load(handle)

with open(save_path + "link_graph.pickle", 'rb') as handle:
    LINK_GRAPH = pickle.load(handle)

# domains are no longer tracked, so there is no domains.pickle to load
# with open(save_path + "domains.pickle", 'rb') as handle:
#     DOMAINS = pickle.load(handle)

with open(save_path + "inlinks.pickle", 'rb') as handle:
    INLINKS = pickle.load(handle)

with open(save_path + "outlinks.pickle", 'rb') as handle:
    OUTLINKS = pickle.load(handle)

# the document counter is restored by hand rather than from a file
# with open(save_path + "i_value.pickle", 'rb') as handle:
#     I = pickle.load(handle)
# NOTE(review): 814 looks specific to one interrupted run - set it to the number of the
# next document to write before resuming.
I = 814

In [None]:
#########################################