crawler.py


# Copyright (C) 2011 by Peter Goodman
# 
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
# 
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
# 
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.

import urllib2
import urlparse
from BeautifulSoup import *
from collections import defaultdict
import re
import sqlite3 as lite
import operator

def attr(elem, attr):
    """An html attribute from an html element. E.g. <a href="">, then
    attr(elem, "href") will get the href or an empty string."""
    try:
        return elem[attr]
    except:
        return ""

WORD_SEPARATORS = re.compile(r'\s|\n|\r|\t|[^a-zA-Z0-9\-_]')

class crawler(object):
    """Represents 'Googlebot'. Populates a database by crawling and indexing
    a subset of the Internet.

    This crawler keeps track of font sizes and makes it simpler to manage word
    ids and document ids."""

    def __init__(self, db_conn, url_file):
        """Initialize the crawler with a connection to the database to populate
        and with the file containing the list of seed URLs to begin indexing."""

        # ID counters
        self._word_id_count = 0
        self._doc_id_count = 0
        self._temp_doc_id = 0   # Current doc_id count stored in self._temp_doc_id_count
	
        # Inverted data structures
        self._inverted_index = {}
        self._inverted_doc_cache = {}   # doc_ids are keys, full URLs are values
        self._inverted_word_cache = {}  # word_ids are keys, word strings are values

        self._url_queue = [ ]
        self._doc_id_cache = { }        # full URLs are keys, doc_ids are values
        self._word_id_cache = { }       # word strings are values, word_ids are keys
        self._simp_rank = {}            # full URLs are keys, hit counts are values

        # functions to call when entering and exiting specific tags
        self._enter = defaultdict(lambda *a, **ka: self._visit_ignore)
        self._exit = defaultdict(lambda *a, **ka: self._visit_ignore)

        # add a link to our graph, and indexing info to the related page
        self._enter['a'] = self._visit_a

        # record the currently indexed document's title an increase
        # the font size
        def visit_title(*args, **kargs):
            self._visit_title(*args, **kargs)
            self._increase_font_factor(7)(*args, **kargs)

        # increase the font size when we enter these tags
        self._enter['b'] = self._increase_font_factor(2)
        self._enter['strong'] = self._increase_font_factor(2)
        self._enter['i'] = self._increase_font_factor(1)
        self._enter['em'] = self._increase_font_factor(1)
        self._enter['h1'] = self._increase_font_factor(7)
        self._enter['h2'] = self._increase_font_factor(6)
        self._enter['h3'] = self._increase_font_factor(5)
        self._enter['h4'] = self._increase_font_factor(4)
        self._enter['h5'] = self._increase_font_factor(3)
        self._enter['title'] = visit_title

        # decrease the font size when we exit these tags
        self._exit['b'] = self._increase_font_factor(-2)
        self._exit['strong'] = self._increase_font_factor(-2)
        self._exit['i'] = self._increase_font_factor(-1)
        self._exit['em'] = self._increase_font_factor(-1)
        self._exit['h1'] = self._increase_font_factor(-7)
        self._exit['h2'] = self._increase_font_factor(-6)
        self._exit['h3'] = self._increase_font_factor(-5)
        self._exit['h4'] = self._increase_font_factor(-4)
        self._exit['h5'] = self._increase_font_factor(-3)
        self._exit['title'] = self._increase_font_factor(-7)

        # never go in and parse these tags
        self._ignored_tags = set([
            'meta', 'script', 'link', 'meta', 'embed', 'iframe', 'frame', 
            'noscript', 'object', 'svg', 'canvas', 'applet', 'frameset', 
            'textarea', 'style', 'area', 'map', 'base', 'basefont', 'param',
        ])

        # set of words to ignore
        self._ignored_words = set([
            '', 'the', 'of', 'at', 'on', 'in', 'is', 'it',
            'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j',
            'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't',
            'u', 'v', 'w', 'x', 'y', 'z', 'and', 'or',
        ])

        # TODO remove me in real version
        self._mock_next_doc_id = 1
        self._mock_next_word_id = 1

        # keep track of some info about the page we are currently parsing
        self._curr_depth = 0
        self._curr_url = ""
        self._curr_doc_id = 0
        self._font_size = 0
        self._curr_words = None

        # get all urls into the queue
        try:
            with open(url_file, 'r') as f:
                for line in f:
                    self._url_queue.append((self._fix_url(line.strip(), ""), 0))
        except IOError:
            pass

    def _insert_document(self, url):
        """A function that inserts a new url into a document db table
        and then returns that newly inserted document's id."""

        # Increment ID count when a new page is visited
        self._doc_id_count += 1
        self._doc_id_cache[url] = self._doc_id_count
        self._inverted_doc_cache[self._doc_id_count] = url
        return self._doc_id_count
    
    def _insert_word(self, word):
        """A function that inserts a word into the lexicon db table,
        increases word_count by 1 and then returns that newly 
        inserted word's id."""

        # Increment ID count when a new word is found
        self._word_id_count +=1
        self._word_id_cache[word] = self._word_id_count
        self._inverted_word_cache[self._word_id_count] = word
        return self._word_id_count
    
    def word_id(self, word):
        """Get the word id of some specific word."""

        # Encode words to get rid of ('u...') in word string
        word = word.encode('ascii')
        if word in self._word_id_cache:

            # Add to inverted index
            if self._temp_doc_id not in self._inverted_index[self._word_id_cache[word]]:
                self._inverted_index[self._word_id_cache[word]].add(self._temp_doc_id)
            
            return self._word_id_cache[word]

        word_id = self._insert_word(word)
        self._word_id_cache[word] = word_id
        self._inverted_word_cache[word_id] = word

        # Add to inverted index
        if word_id not in self._inverted_index:
            # Create empty set for the new word
            self._inverted_index[word_id] = set()

        if self._temp_doc_id not in self._inverted_index[word_id]:
            self._inverted_index[word_id].add(self._temp_doc_id)

        return word_id
    
    def document_id(self, url):
        """Get the document id for some url."""

        url = url.encode('ascii')

        if url in self._doc_id_cache:
            self._temp_doc_id = self._doc_id_cache[url]
            return self._doc_id_cache[url]
        
        doc_id = self._insert_document(url)
        self._doc_id_cache[url] = doc_id
        self._inverted_doc_cache[doc_id] = url
        self._temp_doc_id = doc_id
        #Insert into ranker
        if doc_id not in self._simp_rank.keys():
            self._simp_rank[doc_id] = 0
        return doc_id
    
    def _fix_url(self, curr_url, rel):
        """Given a url and either something relative to that url or another url,
        get a properly parsed url."""

        rel_l = rel.lower()
        if rel_l.startswith("http://") or rel_l.startswith("https://"):
            curr_url, rel = rel, ""
            
        # compute the new url based on import 
        curr_url = urlparse.urldefrag(curr_url)[0]
        parsed_url = urlparse.urlparse(curr_url)
        return urlparse.urljoin(parsed_url.geturl(), rel)

    def add_link(self, from_doc_id, to_doc_id):
        """Add a link into the database, or increase the number of links between
        two pages in the database."""
        # TODO
        if to_doc_id in self._simp_rank.keys():
            self._simp_rank[to_doc_id] += 1
        else:
            self._simp_rank[to_doc_id] = 1

    def _visit_title(self, elem):
        """Called when visiting the <title> tag."""
        title_text = self._text_of(elem).strip()
        print "document title="+ repr(title_text)

        # TODO update document title for document id self._curr_doc_id
    
    def _visit_a(self, elem):
        """Called when visiting <a> tags."""
        dest_url = self._fix_url(self._curr_url, attr(elem,"href"))

        #print "href="+repr(dest_url), \
        #      "title="+repr(attr(elem,"title")), \
        #      "alt="+repr(attr(elem,"alt")), \
        #      "text="+repr(self._text_of(elem))

        # add the just found URL to the url queue
        self._url_queue.append((dest_url, self._curr_depth))
        
        # add a link entry into the database from the current document to the
        # other document
        self.add_link(self._curr_doc_id, self.document_id(dest_url))
        
        # TODO add title/alt/text to index for destination url
    
    def _add_words_to_document(self):
        # TODO: knowing self._curr_doc_id and the list of all words and their
        #       font sizes (in self._curr_words), add all the words into the
        #       database for this document
        print "    num words="+ str(len(self._curr_words))

    def _increase_font_factor(self, factor):
        """Increade/decrease the current font size."""
        def increase_it(elem):
            self._font_size += factor
        return increase_it
    
    def _visit_ignore(self, elem):
        """Ignore visiting this type of tag"""
        pass

    def _add_text(self, elem):
        """Add some text to the document. This records word ids and word font sizes
        into the self._curr_words list for later processing."""
        words = WORD_SEPARATORS.split(elem.string.lower())
        for word in words:
            word = word.strip()
            if word in self._ignored_words:
                continue
            self._curr_words.append((self.word_id(word), self._font_size))
        
    def _text_of(self, elem):
        """Get the text inside some element without any tags."""
        if isinstance(elem, Tag):
            text = [ ]
            for sub_elem in elem:
                text.append(self._text_of(sub_elem))
            
            return " ".join(text)
        else:
            return elem.string

    def _index_document(self, soup):
        """Traverse the document in depth-first order and call functions when entering
        and leaving tags. When we come accross some text, add it into the index. This
        handles ignoring tags that we have no business looking at."""
        class DummyTag(object):
            next = False
            name = ''
        
        class NextTag(object):
            def __init__(self, obj):
                self.next = obj
        
        tag = soup.html
        stack = [DummyTag(), soup.html]

        while tag and tag.next:
            tag = tag.next

            # html tag
            if isinstance(tag, Tag):

                if tag.parent != stack[-1]:
                    self._exit[stack[-1].name.lower()](stack[-1])
                    stack.pop()

                tag_name = tag.name.lower()

                # ignore this tag and everything in it
                if tag_name in self._ignored_tags:
                    if tag.nextSibling:
                        tag = NextTag(tag.nextSibling)
                    else:
                        self._exit[stack[-1].name.lower()](stack[-1])
                        stack.pop()
                        tag = NextTag(tag.parent.nextSibling)
                    
                    continue
                
                # enter the tag
                self._enter[tag_name](tag)
                stack.append(tag)

            # text (text, cdata, comments, etc.)
            else:
                self._add_text(tag)

    def store_rank_info(self):
        """Creates and stores a databse 'page_rank.db' with table 'rankTable'
        based on resolved_inverted_index and _simp_rank"""

        # Resolved Inverted Index
        res_ind = self.get_resolved_inverted_index()
        # Rank Index
        rnk_ind = self.get_rank()
        con = lite.connect("page_rank.db")
        cur = con.cursor()
        cur.execute("DROP TABLE IF EXISTS rankTable")
        cur.execute("CREATE TABLE rankTable(id INTEGER PRIMARY KEY, url TEXT, rank INTEGER, words TEXT)")
        # IF NOT EXISTS
        
        # Populate database
        for url_id in self._inverted_doc_cache.keys():
            url_text = self._inverted_doc_cache[url_id]
            url_rank = rnk_ind[url_id]
            cur.execute('INSERT INTO rankTable(url, rank) VALUES (?, ?)', (url_text, url_rank))
            con.commit()

        # Insert words
        for word in res_ind:
            for url_text in res_ind[word]:
                cur.execute('SELECT * FROM rankTable WHERE url=?', (url_text,))
                row = cur.fetchall()
                
                # Initializing 'words' for the table
                if row[0][3] == None:
                    cur_words = ""
                else:
                    cur_words = str(row[0][3])
                # Space separate words
                cur_words += (word + " ")
                cur.execute('UPDATE rankTable SET words=? WHERE url=?', (cur_words, url_text))
                cur.execute('SELECT * FROM rankTable WHERE url=?', (url_text,))
                con.commit()
        con.close()
        return

    def crawl(self, depth=2, timeout=3):
        """Crawl the web!"""
        seen = set()

        while len(self._url_queue):

            url, depth_ = self._url_queue.pop()

            # skip this url; it's too deep
            if depth_ > depth:
                continue

            doc_id = self.document_id(url)

            # we've already seen this document
            if doc_id in seen:
                continue
	    
	    seen.add(doc_id) # mark this document as haven't been visited
            
            socket = None
            try:
                socket = urllib2.urlopen(url, timeout=timeout)
                soup = BeautifulSoup(socket.read())

                self._curr_depth = depth_ + 1
                self._curr_url = url
                self._curr_doc_id = doc_id
                self._font_size = 0
                self._curr_words = [ ]
                self._index_document(soup)
                self._add_words_to_document()
                print "    url="+repr(self._curr_url)

            except Exception as e:
                print e
                pass
            finally:
                if socket:
                    socket.close()
        self.store_rank_info()

    def get_inverted_index(self):
        """Returns inverted _inverted_index"""
        
        return self._inverted_index

    def get_resolved_inverted_index(self):
        """Returns resolved_inverted_index"""

        resolved_inverted_index = {}

        for key_id in self._inverted_index:
            resol_word = self._inverted_word_cache[key_id]
            if resol_word not in resolved_inverted_index:
                    resolved_inverted_index[resol_word] = set()

            word_doc_ids = self._inverted_index[key_id]

            for doc_id in word_doc_ids:

                if doc_id not in resolved_inverted_index[resol_word]:
                    resol_doc = self._inverted_doc_cache[doc_id]
                    resolved_inverted_index[resol_word].add(resol_doc)

        return resolved_inverted_index

    def get_rank(self):
        """Returns rank"""
        return self._simp_rank

    def get_doc_cache(self):
        """Returns self_inverted_doc_cache"""
        return self._inverted_doc_cache

def get_url_list_for_word(word):
    """Returns a list of tuples. Tuples contain the URL that contained 'word', as well as other info"""

    con = lite.connect("page_rank.db")
    cur = con.cursor()
    cur.execute('SELECT * FROM rankTable')
    search_res = []
    # Get entire table
    rows = cur.fetchall()
    for row in rows:
        # 'row[3]' is a single string of all the words at its given URL, they are space seperated
        all_words = row[3]
        all_split = all_words.split()
        # Adds row to 'search_res' if 'word' exists anywhere in 'all_words'
        if word in all_split:
            search_res.append((row[0], str(row[1]), row[2], str(row[3])))
    return search_res

def get_url_list(words):
    """Returns a list of tuples. Tuples contain the URL that contained any space sperated word in 'words', as 
    well as other info. Changes rank, in returned the list, for a related URL if the URL contains more than
    one word from 'words'""" 

    all_words = words.split()
    final_res = []
    for word in all_words:
        # Get a list of results for each word in 'words'
        cur_res = get_url_list_for_word(word)
        for res in cur_res:
            # Check for if the current url entry is already in 'final_res'
            if res not in final_res:
                final_res.append(res)
            else:
                final_res.remove(res)
                # Updating rank of result if a url has more than one word from 'words'
                rep_res = (res[0],res[1],(res[2] + 1), res[3])
                final_res.append(rep_res)
    return final_res


if __name__ == "__main__":
    bot = crawler(None, "urls.txt")
    bot.crawl(depth=1)
    print "\nURL list\n"
    # Should return only 2 pages (...test1, ...test2) and test 1 rank should be higher
    print get_url_list("onetwo uniquethree")
    print "\nDone\n"