In [1]:
import urllib.request, time, re, random, hashlib
from bs4 import BeautifulSoup 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from nltk.corpus import stopwords
from nltk import word_tokenize
%matplotlib osx

In [2]:
# Compassionate Caching inspired by 
# http://lethain.com/an-introduction-to-compassionate-screenscraping/

last_fetched_at = None

def fetch(url):
    """Load the url compassionately."""
    
    global last_fetched_at
    
    url_hash = hashlib.sha1(url.encode()).hexdigest()
    filename = 'cache-file-{}'.format(url_hash)
    try:
        with open(filename, 'r') as f:
            result = f.read()
            if len(result) > 0:
                print("Retrieving from cache:", url)
                return result
    except:
        pass
    
    print("Loading:", url)
    wait_interval = random.randint(3000,10000)
    if last_fetched_at is not None:
        now = time.time()
        elapsed = now - last_fetched_at
        if elapsed < wait_interval:
            time.sleep((wait_interval - elapsed)/1000)
        
    user_agent = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64)'
    headers = { 'User-Agent' : user_agent }
    req = urllib.request.Request(url, headers = headers)
    last_fetched_at = time.time()
    with urllib.request.urlopen(req) as response:
        result = str(response.read())
        with open(filename, 'w') as f:
            f.write(result)
        return result

In [3]:
# Get a set of links for news articles
def get_news_links(link):
    articles = fetch(link)
    soup = BeautifulSoup(articles, 'html.parser')
    links = set([])
    pattern = re.compile(r"^http://www\.factcheck\.org/\d{4}/\d{2}/")
    for a in soup.find_all("a"):
        sub_link = a.get("href")
        if sub_link != None and pattern.match(sub_link):
            links.add(sub_link)
    return links

def news_scraper(n=10):
    links = set([])
    for i in range(1, n+1):
        link = "http://www.factcheck.org/askfactcheck/page/{}/".format(i)
        links = links.union(get_news_links(link))
    return links

In [4]:
len(news_scraper())

Loading: http://www.factcheck.org/askfactcheck/page/1/
Loading: http://www.factcheck.org/askfactcheck/page/2/
Loading: http://www.factcheck.org/askfactcheck/page/3/
Loading: http://www.factcheck.org/askfactcheck/page/4/
Loading: http://www.factcheck.org/askfactcheck/page/5/
Loading: http://www.factcheck.org/askfactcheck/page/6/
Loading: http://www.factcheck.org/askfactcheck/page/7/
Loading: http://www.factcheck.org/askfactcheck/page/8/
Loading: http://www.factcheck.org/askfactcheck/page/9/
Loading: http://www.factcheck.org/askfactcheck/page/10/


100

In [10]:
# Get a set of links for opinion articles
def get_opinion_links(link):
    articles = fetch(link)
    soup = BeautifulSoup(articles, 'html.parser')
    links = set([])
    for a in soup.find_all("a"):
        sub_link = a.get("href")
        if sub_link != None and sub_link.startswith("/story/opinion/"):
            links.add('http://www.usatoday.com' + sub_link)
    return links

def opinion_scraper():
    links = get_opinion_links("http://www.usatoday.com/opinion/")
    for link in links:
        sub_set = get_opinion_links(link)
        links = links.union(sub_set)
    return links

In [11]:
opinion_scraper()

Retrieving from cache: http://www.usatoday.com/opinion/
Retrieving from cache: http://www.usatoday.com/story/opinion/2016/02/09/trump-sanders-wins-new-hampshire-economic-anxiety-column/80088548/
Retrieving from cache: http://www.usatoday.com/story/opinion/2016/02/15/jim-wallis-getting-personal-racism-black-lives-matter/79977654/
Retrieving from cache: http://www.usatoday.com/story/opinion/2016/02/14/martin-shkreli-cam-newton-second-look-your-say/80383482/
Retrieving from cache: http://www.usatoday.com/story/opinion/2016/02/10/hillary-clintons-woman-problem-column/80175130/
Retrieving from cache: http://www.usatoday.com/story/opinion/2016/02/10/colbert-noah-fallon-kimmel-corden-sanders-trump-punchlines-new-hampshire/80179418/
Retrieving from cache: http://www.usatoday.com/story/opinion/2016/02/12/valentines-day-jimmy-kimmel-james-corden-punchlines-funny/80289898/
Retrieving from cache: http://www.usatoday.com/story/opinion/2016/02/11/hillary-clinton-speeches-goldman-sachs-wall-street-sp

{'http://www.usatoday.com/story/opinion/2014/08/28/russia-ukraine-nato-vladimir-putin-president-obama-editorials-debates/14766425/',
 'http://www.usatoday.com/story/opinion/2015/05/25/caption-contest-youtoon/1568271/',
 'http://www.usatoday.com/story/opinion/2015/07/23/highway-funding-oil-gasoline-tax-fix-congress-editorials-debates/30579385/',
 'http://www.usatoday.com/story/opinion/2016/02/08/cal-thomas-elections-2016-god-religion-politics-evangelical-voters-column/79943324/',
 'http://www.usatoday.com/story/opinion/2016/02/08/martin-shkreli-drug-prices-your-say/80026236/',
 'http://www.usatoday.com/story/opinion/2016/02/09/bernie-sanders-hillary-clinton-new-hampshire-column/80094342/',
 'http://www.usatoday.com/story/opinion/2016/02/09/new-hampshire-primary-donald-trump-bernie-sanders-editorials-debates/80091284/',
 'http://www.usatoday.com/story/opinion/2016/02/09/our-votes-matter-voter-id-citizens-united-voting-rights-act-democracy-awakens-column/80068028/',
 'http://www.usatoday.

In [7]:
def get_words(article_html):
    """Return list of representative words from an article. """
    bag_of_words = []
    soupify_article = BeautifulSoup(article_html, 'html.parser')
    paragraphs = soupify_article.find_all('p',attrs={'class':None})
    for p in paragraphs:
        if p.parent.name != 'a':
            words = word_tokenize(p.text)
            filtered_words = [w.lower() for w in words if w.isalpha()]
            filtered_stop_words = [w for w in filtered_words if w not in stopwords.words('english')]
            bag_of_words += filtered_stop_words
    return bag_of_words

In [10]:
OPINION = 1
NEWS = 0
def build_corpus():
    opinion = [(get_words(fetch(link)), OPINION) for link in opinion_scraper()]
    news = [(get_words(fetch(link)), NEWS) for link in news_scraper()]
    corpus = news + opinion
    random.shuffle(corpus)
    return corpus

In [12]:
def build_word_corpus():
    '''for each word record if it appears in opinion or in news'''
    opinion = [(word, OPINION) for link in opinion_scraper() for word in get_words(fetch(link)) ]
    news = [(word, NEWS) for link in news_scraper() for word in get_words(fetch(link)) ]
    return opinion
d = build_word_corpus()
print(d[:10])

Retrieving from cache: http://www.usatoday.com/opinion/
Retrieving from cache: http://www.usatoday.com/story/opinion/2016/02/09/trump-sanders-wins-new-hampshire-economic-anxiety-column/80088548/
Retrieving from cache: http://www.usatoday.com/story/opinion/2016/02/15/jim-wallis-getting-personal-racism-black-lives-matter/79977654/
Retrieving from cache: http://www.usatoday.com/story/opinion/2016/02/14/martin-shkreli-cam-newton-second-look-your-say/80383482/
Retrieving from cache: http://www.usatoday.com/story/opinion/2016/02/10/hillary-clintons-woman-problem-column/80175130/
Retrieving from cache: http://www.usatoday.com/story/opinion/2016/02/10/colbert-noah-fallon-kimmel-corden-sanders-trump-punchlines-new-hampshire/80179418/
Retrieving from cache: http://www.usatoday.com/story/opinion/2016/02/12/valentines-day-jimmy-kimmel-james-corden-punchlines-funny/80289898/
Retrieving from cache: http://www.usatoday.com/story/opinion/2016/02/11/hillary-clinton-speeches-goldman-sachs-wall-street-sp

NameError: name 'get_words' is not defined

In [11]:
corpus = build_corpus()

Retrieving from cache: http://www.usatoday.com/opinion/
Retrieving from cache: http://www.usatoday.com/story/opinion/2016/02/10/oil-tax-10-barrel-infrastructure-president-obama-climate-change-editorials-debates/80056688/
Retrieving from cache: http://www.usatoday.com/story/opinion/2016/02/11/hillary-clinton-bernie-sanders-wall-street-lanny-davis-editorials-debates/80253414/
Retrieving from cache: http://www.usatoday.com/story/opinion/2016/02/14/why-i-wrote-play-antonin-scalia-originalist-john-strand/80374808/
Retrieving from cache: http://www.usatoday.com/story/opinion/2016/02/11/federal-budget-obama-deficits-debt-tellusatoday-your-say/80253310/
Retrieving from cache: http://www.usatoday.com/story/opinion/2016/02/14/antonin-scalia-2016-presidential-election-voters-editorials-debates/80382050/
Retrieving from cache: http://www.usatoday.com/story/opinion/2016/02/11/obama-budget-children-summer-food-hope-change-david-cay-johnston/80199860/
Retrieving from cache: http://www.usatoday.com/st

In [12]:
def vectorize(list_of_texts):
    """Return feature vectors for each entity given list of texts."""
    vectorizer = TfidfVectorizer(analyzer = "word",
                                 tokenizer = None,
                                 preprocessor = None,
                                 stop_words = None,
                                 max_features = 500)
    compressed_vectors = vectorizer.fit_transform(list_of_texts)
    return compressed_vectors.toarray()

In [5]:

sample = 'project annenberg public policy center is not good'
test = sample.split()
print(test)

['project', 'annenberg', 'public', 'policy', 'center', 'is', 'not', 'good']


In [29]:
# Generate Part-Of-Speech tag and associated features 

def gen_pos_tag(article):
    '''Param: article is a list of words from an individual article
       Return: A dictionary of the ratio of each tag type'''
    modifiers = ['JJ', 'JJR', 'JJS', 'RB', 'RBR', 'RBS']
    past_tense = ['VBD', 'VBN']
    modifiers
    total = len(article)
    tags = nltk.pos_tag(article)
    tags_count = {}
    for (w,t) in tags:
        tags_count[t] = tags_count.get(t, 0) + 1
    for (w,t) in tags:
        tags_count[t] /= total
    return tags_count


{'JJ': 0.03125, 'NN': 0.005859375, 'VBZ': 0.03125, 'RB': 0.125}


In [10]:
# Negation
import nltk
from nltk.sentiment import SentimentAnalyzer
from nltk.sentiment.util import *

def mark_negations(article):
    '''attach NEG tag to the negation'''
    sentim_analyzer = SentimentAnalyzer()
    article = mark_negation(article)
    return article

l = mark_negations(test)
print(l)

['project', 'annenberg', 'public', 'policy', 'center', 'is', 'not', 'good_NEG']


In [44]:
def compute_scores_for_n(data, k=5):
    
    num_instances = len(data)
    num_in_fold = num_instances // 5
    
    scores = []
    for i in range(k):
        training_set = (data[0:i*num_in_fold] + 
                    data[(i+1)*num_in_fold:])
        test_set = data[i*num_in_fold: (i+1)*num_in_fold]

        X_train = vectorize([" ".join(article) for article, tag in training_set])
        y_train = [tag for article, tag in training_set]
        X_test = vectorize([" ".join(article) for article, tag in test_set])
        y_true = [tag for article, tag in test_set]
        
        clf = RandomForestClassifier().fit(X_train, y_train)
        y_pred = clf.predict(X_test)

        score = metrics.f1_score(y_true, y_pred, average="weighted")
        print(score)
        scores.append(score)

    return sum(scores)/len(scores)

In [45]:
compute_scores_for_n(corpus)



0.529411764706
0.75
0.484848484848




0.387096774194
0.484848484848




0.52724110171928007

In [41]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics