In [5]:
import urllib.request, time, re, random, hashlib
from bs4 import BeautifulSoup 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from nltk.corpus import stopwords
from nltk import word_tokenize
%matplotlib osx

In [6]:
# Compassionate Caching inspired by 
# http://lethain.com/an-introduction-to-compassionate-screenscraping/

last_fetched_at = None

def fetch(url):
    """Load the url compassionately."""
    
    global last_fetched_at
    
    url_hash = hashlib.sha1(url.encode()).hexdigest()
    filename = 'cache-file-{}'.format(url_hash)
    try:
        with open(filename, 'r') as f:
            result = f.read()
            if len(result) > 0:
                print("Retrieving from cache:", url)
                return result
    except:
        pass
    
    print("Loading:", url)
    wait_interval = random.randint(3000,10000)
    if last_fetched_at is not None:
        now = time.time()
        elapsed = now - last_fetched_at
        if elapsed < wait_interval:
            time.sleep((wait_interval - elapsed)/1000)
        
    user_agent = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64)'
    headers = { 'User-Agent' : user_agent }
    req = urllib.request.Request(url, headers = headers)
    last_fetched_at = time.time()
    with urllib.request.urlopen(req) as response:
        result = str(response.read())
        with open(filename, 'w') as f:
            f.write(result)
        return result

In [7]:
# Get a set of links for news articles
def get_news_links(link):
    articles = fetch(link)
    soup = BeautifulSoup(articles, 'html.parser')
    links = set([])
    pattern = re.compile(r"^http://www\.factcheck\.org/\d{4}/\d{2}/")
    for a in soup.find_all("a"):
        sub_link = a.get("href")
        if sub_link != None and pattern.match(sub_link):
            links.add(sub_link)
    return links

def news_scraper(n=10):
    links = set([])
    for i in range(1, n+1):
        link = "http://www.factcheck.org/askfactcheck/page/{}/".format(i)
        links = links.union(get_news_links(link))
    return links

In [95]:
len(news_scraper())

Retrieving from cache: http://www.factcheck.org/askfactcheck/page/1/
Retrieving from cache: http://www.factcheck.org/askfactcheck/page/2/
Retrieving from cache: http://www.factcheck.org/askfactcheck/page/3/
Retrieving from cache: http://www.factcheck.org/askfactcheck/page/4/
Retrieving from cache: http://www.factcheck.org/askfactcheck/page/5/
Retrieving from cache: http://www.factcheck.org/askfactcheck/page/6/
Retrieving from cache: http://www.factcheck.org/askfactcheck/page/7/
Retrieving from cache: http://www.factcheck.org/askfactcheck/page/8/
Retrieving from cache: http://www.factcheck.org/askfactcheck/page/9/
Retrieving from cache: http://www.factcheck.org/askfactcheck/page/10/


100

In [8]:
# Get a set of links for opinion articles
def get_opinion_links(link):
    articles = fetch(link)
    soup = BeautifulSoup(articles, 'html.parser')
    links = set([])
    for a in soup.find_all("a"):
        sub_link = a.get("href")
        if sub_link != None and sub_link.startswith("/story/opinion/"):
            links.add('http://www.usatoday.com' + sub_link)
    return links

def opinion_scraper():
    links = get_opinion_links("http://www.usatoday.com/opinion/")
    for link in links:
        sub_set = get_opinion_links(link)
        links = links.union(sub_set)
    return links

In [109]:
opinion_scraper()

Retrieving from cache: http://www.usatoday.com/opinion/
Retrieving from cache: http://www.usatoday.com/story/opinion/2016/02/10/hillary-clintons-woman-problem-column/80175130/
Retrieving from cache: http://www.usatoday.com/story/opinion/2016/02/10/anthem-cruise-ship-storm-your-say/80202290/
Retrieving from cache: http://www.usatoday.com/story/opinion/2016/02/10/exonerations-dna-convicted-forensic-criminal-justice-column/80056392/
Retrieving from cache: http://www.usatoday.com/story/opinion/2015/05/25/caption-contest-youtoon/1568271/
Retrieving from cache: http://www.usatoday.com/story/opinion/2016/02/09/military-medical-battlefield-training-live-tissue-training-animal-rights-column/80018116/
Retrieving from cache: http://www.usatoday.com/story/opinion/2016/02/04/trump-sanders-mccain-new-hampshire-mavericks-column/79832920/
Retrieving from cache: http://www.usatoday.com/story/opinion/2016/02/08/martin-shkreli-drug-prices-your-say/80026236/
Retrieving from cache: http://www.usatoday.com/

{'http://www.usatoday.com/story/opinion/2013/09/16/syria-irs-lerner-column/2816277/',
 'http://www.usatoday.com/story/opinion/2014/05/12/president-obama-irs-scandal-watergate-column/8968317/',
 'http://www.usatoday.com/story/opinion/2015/05/25/caption-contest-youtoon/1568271/',
 'http://www.usatoday.com/story/opinion/2015/07/23/highway-funding-oil-gasoline-tax-fix-congress-editorials-debates/30579385/',
 'http://www.usatoday.com/story/opinion/2015/12/21/cdc-opioids-heroin-overdoses-doctors-editorials-debates/77708774/',
 'http://www.usatoday.com/story/opinion/2016/02/01/mia-love-single-subject-rule-constitutional-amendment--reynolds-column/79605158/',
 'http://www.usatoday.com/story/opinion/2016/02/01/super-bowl-football-brain-damage-immoral-watch-column/79654086/',
 'http://www.usatoday.com/story/opinion/2016/02/04/obama--wrong-solitary-confinement-column/79649416/',
 'http://www.usatoday.com/story/opinion/2016/02/04/trump-sanders-mccain-new-hampshire-mavericks-column/79832920/',
 'ht

In [9]:
def get_words(article_html):
    """Return list of representative words from an article. """
    bag_of_words = []
    soupify_article = BeautifulSoup(article_html, 'html.parser')
    paragraphs = soupify_article.find_all('p',attrs={'class':None})
    for p in paragraphs:
        if p.parent.name != 'a':
            words = word_tokenize(p.text)
            filtered_words = [w.lower() for w in words if w.isalpha()]
            filtered_stop_words = [w for w in filtered_words if w not in stopwords.words('english')]
            bag_of_words += filtered_stop_words
    return bag_of_words

In [10]:
OPINION = 1
NEWS = 0
def build_corpus():
    opinion = [(get_words(fetch(link)), OPINION) for link in opinion_scraper()]
    news = [(get_words(fetch(link)), NEWS) for link in news_scraper()]
    corpus = news + opinion
    random.shuffle(corpus)
    return corpus

In [11]:
corpus = build_corpus()

Retrieving from cache: http://www.usatoday.com/opinion/
Retrieving from cache: http://www.usatoday.com/story/opinion/2016/02/10/new-hampshire-primary-donald-trump-bernie-sanders-tellusatoday-your-say/80202062/
Retrieving from cache: http://www.usatoday.com/story/opinion/2016/02/08/primary-voting-presidential-election-tellusatoday-your-say/80026468/
Retrieving from cache: http://www.usatoday.com/story/opinion/2016/02/10/anthem-cruise-ship-storm-your-say/80202290/
Retrieving from cache: http://www.usatoday.com/story/opinion/2016/02/10/hillary-clinton-women-voters-millennials-new-hampshire-column/80190950/
Retrieving from cache: http://www.usatoday.com/story/opinion/2016/02/10/marco-rubio-hip-hop-ben-carson-trump-bush-young-minority-voters-column/76387044/
Retrieving from cache: http://www.usatoday.com/story/opinion/2016/02/09/bernie-sanders-hillary-clinton-new-hampshire-column/80094342/
Retrieving from cache: http://www.usatoday.com/story/opinion/2016/02/07/new-hampshire-primary-100-year

In [12]:
def vectorize(list_of_texts):
    """Return feature vectors for each entity given list of texts."""
    vectorizer = TfidfVectorizer(analyzer = "word",
                                 tokenizer = None,
                                 preprocessor = None,
                                 stop_words = None,
                                 max_features = 500)
    compressed_vectors = vectorizer.fit_transform(list_of_texts)
    return compressed_vectors.toarray()

In [44]:
def compute_scores_for_n(data, k=5):
    
    num_instances = len(data)
    num_in_fold = num_instances // 5
    
    scores = []
    for i in range(k):
        training_set = (data[0:i*num_in_fold] + 
                    data[(i+1)*num_in_fold:])
        test_set = data[i*num_in_fold: (i+1)*num_in_fold]

        X_train = vectorize([" ".join(article) for article, tag in training_set])
        y_train = [tag for article, tag in training_set]
        X_test = vectorize([" ".join(article) for article, tag in test_set])
        y_true = [tag for article, tag in test_set]
        
        clf = RandomForestClassifier().fit(X_train, y_train)
        y_pred = clf.predict(X_test)

        score = metrics.f1_score(y_true, y_pred, average="weighted")
        print(score)
        scores.append(score)

    return sum(scores)/len(scores)

In [45]:
compute_scores_for_n(corpus)



0.529411764706
0.75
0.484848484848




0.387096774194
0.484848484848




0.52724110171928007

In [41]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics