In [40]:
import urllib.request, time, re, random, hashlib
from bs4 import BeautifulSoup 
import string
from sklearn.feature_extraction.text import CountVectorizer
from scipy.spatial.distance import cosine
from itertools import combinations
from scipy.cluster import hierarchy
from scipy.spatial import distance
%matplotlib osx


In [41]:
# Compassionate Caching inspired by 
# http://lethain.com/an-introduction-to-compassionate-screenscraping/

# Timestamp (time.time(), seconds) of the most recent network request made by
# fetch(); None until the first request is issued.
last_fetched_at = None

def fetch(url):
    """Load the url compassionately.

    The body of every downloaded page is cached in the current working
    directory under ``cache-file-<sha1 of url>``; a non-empty cache entry is
    returned without touching the network.  Otherwise the page is downloaded,
    waiting so that consecutive network requests are separated by a random
    3-10 second gap.

    Args:
        url: Absolute URL to retrieve.

    Returns:
        The response body decoded to ``str``.

    Raises:
        urllib.error.URLError: If the download fails.
        OSError: If the cache file cannot be written.

    Note:
        Cache files written by the earlier version of this function contain
        the ``repr`` of the raw bytes (``b'...'``); delete them to get
        properly decoded text.
    """
    global last_fetched_at

    url_hash = hashlib.sha1(url.encode()).hexdigest()
    filename = 'cache-file-{}'.format(url_hash)
    try:
        with open(filename, 'r', encoding='utf-8') as f:
            result = f.read()
    except (OSError, UnicodeDecodeError):
        # Missing or unreadable cache entry: fall through and download again.
        result = ''
    if result:
        print("Retrieving from cache:", url)
        return result

    print("Loading:", url)
    # Random politeness gap, drawn in milliseconds and converted to seconds so
    # it is directly comparable with time.time() differences.
    wait_seconds = random.randint(3000, 10000) / 1000
    if last_fetched_at is not None:
        elapsed = time.time() - last_fetched_at
        if elapsed < wait_seconds:
            # Only sleep for whatever part of the gap has not already passed.
            time.sleep(wait_seconds - elapsed)

    user_agent = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64)'
    headers = {'User-Agent': user_agent}
    req = urllib.request.Request(url, headers=headers)
    last_fetched_at = time.time()
    with urllib.request.urlopen(req) as response:
        raw = response.read()
        # Honour the charset announced by the server; default to UTF-8.
        charset = response.headers.get_content_charset() or 'utf-8'
    try:
        result = raw.decode(charset, errors='replace')
    except LookupError:
        # Server announced a charset Python does not know.
        result = raw.decode('utf-8', errors='replace')

    with open(filename, 'w', encoding='utf-8') as f:
        f.write(result)
    return result

In [42]:
class Entity:
    """A named subject whose news coverage is analysed.

    Instances start with only a name; the derived fields are filled in later
    by other cells.
    """

    def __init__(self, name):
        # name: search phrase used to look the entity up.
        self.name = name
        # words: bag of words for the entity; unset until computed.
        # vector: presumably a numeric representation built from the words;
        # nothing in the visible cells assigns it (TODO confirm).
        self.words = self.vector = None

In [43]:
def get_search_results(entity):
    """Return the USA Today search-results HTML for the given entity.

    Args:
        entity: Object with a ``name`` attribute holding the search phrase.

    Returns:
        Whatever ``fetch`` returns for the search URL (the page HTML).
    """
    # Local import keeps this cell self-contained.
    from urllib.parse import quote

    # Percent-encode the whole phrase, not just spaces.  safe='' also escapes
    # '/', which would otherwise be read as a path separator.
    url_encoded_name = quote(entity.name, safe='')
    return fetch('http://www.usatoday.com' + '/search/' + url_encoded_name + '/')

In [44]:
def get_articles(results_html, n=3):
    """Return a list of article htmls for given search results html.

    Args:
        results_html: HTML of a search-results page.
        n: Maximum number of articles to download.

    Returns:
        A list with at most ``n`` items, each the value returned by ``fetch``
        for one result link, in page order.
    """
    domain = 'http://www.usatoday.com'
    soup = BeautifulSoup(results_html, 'html.parser')
    link_results = soup.find_all('a', attrs={'class': 'search-result-item-link'})

    articles = []
    for link in link_results:
        if len(articles) >= n:
            # Enough articles collected; no need to scan the remaining links.
            break
        # .get() tolerates anchors without an href instead of raising KeyError.
        href = link.get('href') or ''
        # Keeping only '/story' links excludes video and audio results.
        if href.startswith('/story'):
            # Result links are site-relative; make them absolute before fetching.
            articles.append(fetch(domain + href))
    return articles

In [45]:
from nltk.corpus import stopwords
from nltk import word_tokenize

def get_words(articles):
    """Return the set of representative words from a list of article htmls.

    For every ``<p>`` element without a class attribute whose direct parent is
    not an ``<a>`` tag, the text is tokenized; purely alphabetic tokens are
    lower-cased and kept unless they are English stop words.

    Args:
        articles: Iterable of article HTML strings.

    Returns:
        A ``set`` of lower-case words (duplicates across articles collapse).
    """
    # Load the stop-word list once; a frozenset gives O(1) membership tests
    # instead of re-reading and scanning the list for every token.
    english_stop_words = frozenset(stopwords.words('english'))

    bag_of_words = set()
    for article in articles:
        soup = BeautifulSoup(article, 'html.parser')
        for p in soup.find_all('p', attrs={'class': None}):
            if p.parent.name == 'a':
                # Paragraphs wrapped in a link are skipped.
                continue
            for token in word_tokenize(p.text):
                if not token.isalpha():
                    continue
                word = token.lower()
                if word not in english_stop_words:
                    bag_of_words.add(word)
    return bag_of_words

In [46]:
def get_bag_of_words(entity):
    """Return the bag of words for an entity.

    Runs the full pipeline: search for the entity, download up to three
    articles from the results, and extract their words.
    """
    search_html = get_search_results(entity)
    return get_words(get_articles(search_html, 3))

In [47]:
# Demo run: build the bag of words for one entity and show it.
# get_bag_of_words() goes through fetch(), so a first run downloads the search
# page and the articles; later runs read them from the on-disk cache.
entity = Entity('hillary clinton')
entity.words = get_bag_of_words(entity)
# NOTE: prints the whole set, which makes for a very long cell output.
print(entity.words)

Loading: http://www.usatoday.com/search/hillary%20clinton/
Loading: http://www.usatoday.com/story/news/politics/elections/2016/2016/02/02/cruz-topples-trump-clinton-sanders-await-final-tally/79685690/
Loading: http://www.usatoday.com/story/news/politics/onpolitics/2016/02/02/hillary-clinton-new-hampshire-iowa-democrats/79700614/
Loading: http://www.usatoday.com/story/news/politics/elections/2016/2016/02/03/young-supporters-drive-sanders-virtual-tie-clinton/79739492/
{'jeffrey', 'folks', 'laughter', 'loss', 'respective', 'registering', 'since', 'showed', 'city', 'helped', 'won', 'theme', 'declined', 'worked', 'focus', 'ames', 'reminded', 'economic', 'slimmest', 'embarrassment', 'volunteers', 'nashua', 'photo', 'juggernaut', 'questions', 'revolution', 'age', 'separating', 'number', 'polling', 'cedar', 'maryland', 'move', 'strong', 'error', 'described', 'ago', 'thus', 'willing', 'vote', 'jubilant', 'reflection', 'big', 'total', 'together', 'introduced', 'sound', 'worth', 'political', 'res