In [48]:
import urllib.request, time, re, random, hashlib, nltk
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from nltk.corpus import stopwords
from nltk.sentiment import SentimentAnalyzer
from nltk.sentiment.util import *
from nltk import word_tokenize
from sklearn.svm import LinearSVC as LSVC
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.neighbors import KNeighborsClassifier as KNC
from sklearn.tree import DecisionTreeClassifier as DTC
from sklearn.linear_model import LogisticRegression as LR
from sklearn.ensemble import BaggingClassifier as BC
from sklearn.ensemble import GradientBoostingClassifier as GBC
from sklearn.naive_bayes import GaussianNB as NB
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve, average_precision_score
%matplotlib osx

### 1.Get data

In [3]:
# Compassionate Caching inspired by 
# http://lethain.com/an-introduction-to-compassionate-screenscraping/

last_fetched_at = None

def fetch(url):
    """Load the url compassionately."""
    
    global last_fetched_at
    
    url_hash = hashlib.sha1(url.encode()).hexdigest()
    filename = 'cache-file-{}'.format(url_hash)
#     print(url_hash)
    try:
        with open(filename, 'r') as f:
            result = f.read()
            if len(result) > 0:
                print("Retrieving from cache:", url)
                return result
    except:
        pass
    
    print("Loading:", url)
    wait_interval = random.randint(3000,10000)
    if last_fetched_at is not None:
        now = time.time()
        elapsed = now - last_fetched_at
        if elapsed < wait_interval:
            time.sleep((wait_interval - elapsed)/1000)
        
    user_agent = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64)'
    headers = { 'User-Agent' : user_agent }
    req = urllib.request.Request(url, headers = headers)
    last_fetched_at = time.time()
    with urllib.request.urlopen(req) as response:
        result = str(response.read())
        with open(filename, 'w') as f:
            f.write(result)
        return result

In [4]:
# Get a set of links for news articles
def get_news_links(link):
    articles = fetch(link)
    soup = BeautifulSoup(articles, 'html.parser')
    links = set([])
    pattern = re.compile(r"^http://www\.factcheck\.org/\d{4}/\d{2}/")
    for a in soup.find_all("a"):
        sub_link = a.get("href")
        if sub_link != None and pattern.match(sub_link):
            links.add(sub_link)
    return links

def news_scraper(n=10):
    links = set([])
    for i in range(1, n+1):
        link = "http://www.factcheck.org/askfactcheck/page/{}/".format(i)
        links = links.union(get_news_links(link))
    return links

In [5]:
news_set = news_scraper()

Retrieving from cache: http://www.factcheck.org/askfactcheck/page/1/
Retrieving from cache: http://www.factcheck.org/askfactcheck/page/2/
Retrieving from cache: http://www.factcheck.org/askfactcheck/page/3/
Retrieving from cache: http://www.factcheck.org/askfactcheck/page/4/
Retrieving from cache: http://www.factcheck.org/askfactcheck/page/5/
Retrieving from cache: http://www.factcheck.org/askfactcheck/page/6/
Retrieving from cache: http://www.factcheck.org/askfactcheck/page/7/
Retrieving from cache: http://www.factcheck.org/askfactcheck/page/8/
Retrieving from cache: http://www.factcheck.org/askfactcheck/page/9/
Retrieving from cache: http://www.factcheck.org/askfactcheck/page/10/


In [6]:
# Get a set of links for opinion articles
def get_opinion_links(link):
    articles = fetch(link)
    soup = BeautifulSoup(articles, 'html.parser')
    links = set([])
    for a in soup.find_all("a"):
        sub_link = a.get("href")
        if sub_link != None and sub_link.startswith("/story/opinion/"):
            links.add('http://www.usatoday.com' + sub_link)
    return links

def opinion_scraper():
    links = get_opinion_links("http://www.usatoday.com/opinion/")
    for link in links:
        sub_set = get_opinion_links(link)
        links = links.union(sub_set)
    return links

In [11]:
opinion_set = {'http://www.usatoday.com/story/opinion/2013/09/16/syria-irs-lerner-column/2816277/',
 'http://www.usatoday.com/story/opinion/2013/09/25/grocery-store-detroit-irs-column/2868797/',
 'http://www.usatoday.com/story/opinion/2014/05/12/president-obama-irs-scandal-watergate-column/8968317/',
 'http://www.usatoday.com/story/opinion/2014/08/28/russia-ukraine-nato-vladimir-putin-president-obama-editorials-debates/14766425/',
 'http://www.usatoday.com/story/opinion/2015/05/25/caption-contest-youtoon/1568271/',
 'http://www.usatoday.com/story/opinion/2015/07/23/highway-funding-oil-gasoline-tax-fix-congress-editorials-debates/30579385/',
 'http://www.usatoday.com/story/opinion/2015/12/21/cdc-opioids-heroin-overdoses-doctors-editorials-debates/77708774/',
 'http://www.usatoday.com/story/opinion/2016/02/01/mia-love-single-subject-rule-constitutional-amendment--reynolds-column/79605158/',
 'http://www.usatoday.com/story/opinion/2016/02/01/super-bowl-football-brain-damage-immoral-watch-column/79654086/',
 'http://www.usatoday.com/story/opinion/2016/02/04/obama--wrong-solitary-confinement-column/79649416/',
 'http://www.usatoday.com/story/opinion/2016/02/04/trump-sanders-mccain-new-hampshire-mavericks-column/79832920/',
 'http://www.usatoday.com/story/opinion/2016/02/07/journalists-jail-murder-censorship-turkey-editorials-debates/79844586/',
 'http://www.usatoday.com/story/opinion/2016/02/07/new-hampshire-primary-100-years-old-rebel-role-dante-scala-column/79967400/',
 'http://www.usatoday.com/story/opinion/2016/02/07/police-use-of-lethal-force-tellusatoday-your-say/79978876/',
 'http://www.usatoday.com/story/opinion/2016/02/07/turkish-ambassador-journalists-turkey-editorials-debates/79845450/',
 'http://www.usatoday.com/story/opinion/2016/02/07/voter-anger-elections-super-bowl-second-look/79967622/',
 'http://www.usatoday.com/story/opinion/2016/02/08/bill-de-blasio-chirlane-mccray-opioid-crisis-treatment-naloxone-overdoses-column/79972594/',
 'http://www.usatoday.com/story/opinion/2016/02/08/cal-thomas-elections-2016-god-religion-politics-evangelical-voters-column/79943324/',
 'http://www.usatoday.com/story/opinion/2016/02/08/federal-deficit-our-view-editorials-debates/80024164/',
 'http://www.usatoday.com/story/opinion/2016/02/08/federal-deficits-economy-governemtn-spending-editorials-debates/80032380/',
 'http://www.usatoday.com/story/opinion/2016/02/08/irs-tea-party-targeting-lois-lerner-corruption--obama-glenn-reynolds-column/79967098/',
 'http://www.usatoday.com/story/opinion/2016/02/08/martin-shkreli-drug-prices-your-say/80026236/',
 'http://www.usatoday.com/story/opinion/2016/02/08/primary-voting-presidential-election-tellusatoday-your-say/80026468/',
 'http://www.usatoday.com/story/opinion/2016/02/09/bernie-sanders-hillary-clinton-new-hampshire-column/80094342/',
 'http://www.usatoday.com/story/opinion/2016/02/09/beyonce-ads-super-bowl-colbert-corden-meyers-conan-jessica-williams/80052554/',
 'http://www.usatoday.com/story/opinion/2016/02/09/military-medical-battlefield-training-live-tissue-training-animal-rights-column/80018116/',
 'http://www.usatoday.com/story/opinion/2016/02/09/new-hampshire-primary-donald-trump-bernie-sanders-editorials-debates/80091284/',
 'http://www.usatoday.com/story/opinion/2016/02/09/obama-administration-least-transparent-epa-state-doj-clinton-benghazi-column/80050428/',
 'http://www.usatoday.com/story/opinion/2016/02/09/our-votes-matter-voter-id-citizens-united-voting-rights-act-democracy-awakens-column/80068028/',
 'http://www.usatoday.com/story/opinion/2016/02/09/solitary-confinement-federal-prisons-tellusatoday-your-say/80086320/',
 'http://www.usatoday.com/story/opinion/2016/02/09/super-bowl-50-your-say/80086738/',
 'http://www.usatoday.com/story/opinion/2016/02/09/trump-sanders-wins-new-hampshire-economic-anxiety-column/80088548/',
 'http://www.usatoday.com/story/opinion/2016/02/10/anthem-cruise-ship-storm-your-say/80202290/',
 'http://www.usatoday.com/story/opinion/2016/02/10/colbert-noah-fallon-kimmel-corden-sanders-trump-punchlines-new-hampshire/80179418/',
 'http://www.usatoday.com/story/opinion/2016/02/10/exonerations-dna-convicted-forensic-criminal-justice-column/80056392/',
 'http://www.usatoday.com/story/opinion/2016/02/10/hillary-clinton-women-voters-millennials-new-hampshire-column/80190950/',
 'http://www.usatoday.com/story/opinion/2016/02/10/hillary-clintons-woman-problem-column/80175130/',
 'http://www.usatoday.com/story/opinion/2016/02/10/marco-rubio-hip-hop-ben-carson-trump-bush-young-minority-voters-column/76387044/',
 'http://www.usatoday.com/story/opinion/2016/02/10/new-hampshire-primary-donald-trump-bernie-sanders-tellusatoday-your-say/80202062/',
 'http://www.usatoday.com/story/opinion/2016/02/10/oil-prices-gasoline-revenue-american-petroleum-institute-editorials-debates/80193760/',
 'http://www.usatoday.com/story/opinion/2016/02/10/oil-tax-10-barrel-infrastructure-president-obama-climate-change-editorials-debates/80056688/',
 'http://www.usatoday.com/story/opinion/2016/02/10/why-supreme-court-put-new-climate-rules-hold-column/80169792/',
 'http://www.usatoday.com/story/opinion/2016/02/11/federal-budget-obama-deficits-debt-tellusatoday-your-say/80253310/',
 'http://www.usatoday.com/story/opinion/2016/02/11/glenn-reynolds-socialism-bernie-sanders-young-millennial-voters-column/80169668/',
 'http://www.usatoday.com/story/opinion/2016/02/11/hillary-clinton-bernie-sanders-wall-street-lanny-davis-editorials-debates/80253414/',
 'http://www.usatoday.com/story/opinion/2016/02/11/hillary-clinton-speeches-goldman-sachs-wall-street-speaking-fees-editorials-debates/80233010/',
 'http://www.usatoday.com/story/opinion/2016/02/11/obama-budget-children-summer-food-hope-change-david-cay-johnston/80199860/',
 'http://www.usatoday.com/story/opinion/2016/02/11/wesley-clark-russia-assadsyria-obama-conflict-column/80228140/',
 'http://www.usatoday.com/story/opinion/2016/02/12/ligo-discovery-impossible-without-public-funding-gravitational-waves-column/80253446/',
 'http://www.usatoday.com/story/opinion/2016/02/12/lindberg-draft-conscription-women-all-volunteer-force-courage-virtue-column/80169484/',
 'http://www.usatoday.com/story/opinion/2016/02/12/top-threat-kurds-economy-not-isil-column/80228512/',
 'http://www.usatoday.com/story/opinion/2016/02/12/valentines-day-jimmy-kimmel-james-corden-punchlines-funny/80289898/',
 'http://www.usatoday.com/story/opinion/2016/02/13/scalia-death-appreciation-politics-nomination-glenn-reynolds-column/80350008/',
 'http://www.usatoday.com/story/opinion/2016/02/13/scalia-text-legacy-clerk-steven-calabresi-column/80349810/',
 'http://www.usatoday.com/story/opinion/2016/02/13/valentines-day-romance-marraige-flowers-fracking-column/80234586/',
 'http://www.usatoday.com/story/opinion/2016/02/14/antonin-scalia-2016-presidential-election-voters-editorials-debates/80382050/',
 'http://www.usatoday.com/story/opinion/2016/02/14/antonin-scalia-death-supreme-court-nomination-senate-obama-gonzales-column/80378246/',
 'http://www.usatoday.com/story/opinion/2016/02/14/bernie-sanders-henry-kissinger-richard-nixon-democratic-debate-column/80372646/',
 'http://www.usatoday.com/story/opinion/2016/02/14/justice-antonin-scalia-president-obama-mitch-mcconnell-editorials-debates/80375514/',
 'http://www.usatoday.com/story/opinion/2016/02/14/martin-shkreli-cam-newton-second-look-your-say/80383482/',
 'http://www.usatoday.com/story/opinion/2016/02/14/oil-tax-transportation-president-obama-your-say/80383560/',
 'http://www.usatoday.com/story/opinion/2016/02/14/religion-politics-gender-tellusatoday-your-say/80383622/',
 'http://www.usatoday.com/story/opinion/2016/02/14/scalia-defining-moment-minority-rights-stephen-henderson/80372366/',
 'http://www.usatoday.com/story/opinion/2016/02/14/why-i-wrote-play-antonin-scalia-originalist-john-strand/80374808/',
 'http://www.usatoday.com/story/opinion/2016/02/15/american-kennel-club-westminster-kennel-club-dog-show-editorials-debates/80401688/',
 'http://www.usatoday.com/story/opinion/2016/02/15/antonin-scalia-supreme-court-recess-appointment-nomination-politics-obama-column/80379796/',
 'http://www.usatoday.com/story/opinion/2016/02/15/dogs-breeding-westminster-kennel-american-kennel-club-editorials-debates/80373002/',
 'http://www.usatoday.com/story/opinion/2016/02/15/donald-trump-torture-enhanced-interrogation-techniques-editorials-debates/80418458/',
 'http://www.usatoday.com/story/opinion/2016/02/15/donald-trump-waterboarding-torture-editorials-debates/80258136/',
 'http://www.usatoday.com/story/opinion/2016/02/15/gop-supreme-court-scalia-obama-nominee-tellusatoday-your-say/80425956/',
 'http://www.usatoday.com/story/opinion/2016/02/15/hillary-clinton-feminism-sexism-bernie-bros-democratic-primary-2016-column/80374526/',
 'http://www.usatoday.com/story/opinion/2016/02/15/jim-wallis-getting-personal-racism-black-lives-matter/79977654/',
 'http://www.usatoday.com/story/opinion/2016/02/15/john-oliver-colin-jost-michael-che-punchlines-democracy-voting/80405220/',
 'http://www.usatoday.com/story/opinion/2016/02/15/patrick-leahy-antonin-scalia-death-supreme-court-nomination-confirmation-column/80415542/',
 'http://www.usatoday.com/story/opinion/2016/02/15/supreme-court-fight-assures-ugly-end-obama-era-david-corn-antonin-scalia-column/80374474/',
 'http://www.usatoday.com/story/opinion/2016/02/15/trump-has-no-idea-how-to-be-president-stephen-hess/80401590/',
 'http://www.usatoday.com/story/opinion/2016/02/15/wealthy-donors-citizens-united-politics-your-say/80425588/',
 'http://www.usatoday.com/story/opinion/2016/02/16/doj-ferguson-lawsuit-police-tellusatoday-your-say/80479008/',
 'http://www.usatoday.com/story/opinion/2016/02/16/evangelicals-south-carolina-republican-primary-column/80414280/',
 'http://www.usatoday.com/story/opinion/2016/02/16/hillary-clinton-bernie-sanders-nevada-caucuses-jon-ralston/80450100/',
 'http://www.usatoday.com/story/opinion/2016/02/16/kirsten-powers-bernie-sanders-hillary-clinton-democratic-primary-2016-column/80407150/',
 'http://www.usatoday.com/story/opinion/2016/02/16/libya-islamic-state-isil-oil-terrorism-obama-daesh-column/80018234/',
 'http://www.usatoday.com/story/opinion/2016/02/16/mlb-lifetime-ban-jenrry-mejia-peds-your-say/80478800/',
 'http://www.usatoday.com/story/opinion/2016/02/16/scalia-supreme-court-alexander-hamilton-musical-nomination-senate-obama-column/80465232/',
 'http://www.usatoday.com/story/opinion/2016/02/16/scalia-supreme-court-obama-gop-punchlines-bee-meyers/80451096/',
 'http://www.usatoday.com/story/opinion/2016/02/17/best-supreme-court-nominee-depends-jonathan-turley/80516622/',
 'http://www.usatoday.com/story/opinion/2016/02/17/cable-tv-set-top-box-fcc-tom-wheeler-editorials-debates/80474618/',
 'http://www.usatoday.com/story/opinion/2016/02/17/irs-civil-asset-forfeiture-ken-quran-randy-sowers-institute-justice-column/80499524/',
 'http://www.usatoday.com/story/opinion/2016/02/17/kanye-swift-fallon-colbert-corden-grammys-punchlines/80503382/',
 'http://www.usatoday.com/story/opinion/2016/02/17/lawrence-lessig-scalia-set-principled-example/80448256/',
 'http://www.usatoday.com/story/opinion/2016/02/17/randy-barnett-antonin-scalia-new-originalism-heller-second-amendment-column/80450446/',
 'http://www.usatoday.com/story/opinion/2016/02/17/tevision-cable-fcc-tom-wheeler-google-editorials-debates/80519326/',
 'http://www.usatoday.com/story/opinion/2016/02/17/wwjd-vote-for-bernie-sanders-column/80426466/',
 'http://www.usatoday.com/story/opinion/2016/02/17/yoweri-mouseveni-uganda-african-leaders-term-limits-obama-column/79651582/',
 'http://www.usatoday.com/story/opinion/columnists/stephen-henderson/2016/02/13/moments-defined-scalia-and-should-define-legacy/80355476/',
 'http://www.usatoday.com/story/opinion/columnists/stephen-henderson/2016/02/16/alexander-hamilton-and-looming-high-court-battle/80459026/',
 'http://www.usatoday.com/story/opinion/voices/2016/02/08/voices-rise-and-fall-rand-paul/79875100/',
 'http://www.usatoday.com/story/opinion/voices/2016/02/09/voices-mexico-legalize-marijuana/79781382/',
 'http://www.usatoday.com/story/opinion/voices/2016/02/10/voices-staying-safe-dangerous-venues/80170178/',
 'http://www.usatoday.com/story/opinion/voices/2016/02/15/voices-gomez-honduras-violence-central-america-unaccompanied-minors-immigration/80212272/'}

In [79]:
## Run the scraper for several times to get enough number of opinion articles
# new_opinion_set = opinion_scraper()
# opinion_set = new_opinion_set.union(opinion_set)

d2f2d9f2e949ef64113055d5999f87b5aff518b4
Retrieving from cache: http://www.usatoday.com/opinion/
be130630e349331342063ac989f3849644cf2818
Retrieving from cache: http://www.usatoday.com/story/opinion/2016/02/13/valentines-day-romance-marraige-flowers-fracking-column/80234586/
508ad85d737d317d219d5ac54eb5b47e6250e3c1
Retrieving from cache: http://www.usatoday.com/story/opinion/2016/02/14/bernie-sanders-henry-kissinger-richard-nixon-democratic-debate-column/80372646/
02f8334a36e5d091fad85befec25b0cb2234ee55
Retrieving from cache: http://www.usatoday.com/story/opinion/2016/02/09/trump-sanders-wins-new-hampshire-economic-anxiety-column/80088548/
a943f948820e6dc9bcfdb38c10e0b424db3a7528
Retrieving from cache: http://www.usatoday.com/story/opinion/2016/02/15/supreme-court-fight-assures-ugly-end-obama-era-david-corn-antonin-scalia-column/80374474/
2d0b0ab432c3188fe7ffa5d9d4014eee93046776
Retrieving from cache: http://www.usatoday.com/story/opinion/2016/02/11/obama-budget-children-summer-food-h

### 2. Preprocessing & Feature Generation

In [13]:
STOP_WORDS = stopwords.words('english')
STOP_PHRASES = ["Ask FactCheck", "FULL QUESTION", "FULL ANSWER", '© Copyright 2016 FactCheck.org', 'A Project of the Annenberg Public Policy Center']
def get_words(article_html, is_opinion):
    """Return list of representative words from an article. """
    bag_of_words = []
    raw = []
    if not is_opinion:
        additional = re.search(r'<span style="color:.{,20}">(<strong>)?Sources(</strong>)?</span>', article_html)
        if additional:
            article_html = article_html[:additional.start()]
    soupify_article = BeautifulSoup(article_html, 'html.parser')
    paragraphs = soupify_article.find_all('p',attrs={'class':None})
    for p in paragraphs:
        if p.parent.name != 'a' and p.text not in STOP_PHRASES:
            p_text = p.text.lower().replace('usa today', ' ').replace('q: ', ' ').replace('a: ', ' ').replace('getelementbyid', ' ').replace('eet', ' ')
            raw += word_tokenize(p_text)
    for word in raw: 
        if '\\xc2\\xa0' in word:
            tmp = word.split('\\xc2\\xa0')
        else:
            tmp = [word]
        tmp = [re.sub(r"\\x..", "", w).replace("\\", "") for w in tmp]
        for w in tmp:
            bag_of_words += re.sub(r"[^a-zA-Z]", " ", w).split()
    
    bag_of_words = [w.lower() for w in bag_of_words if w.isalpha() and w not in STOP_WORDS]
    
    return bag_of_words

In [14]:
## Test the get_words function for opinion articles
# article_html = fetch('http://www.factcheck.org/2013/03/obamacare-to-cost-20000-a-family/')
# article_html = article_html[:re.search(r'<span style=".{,15}">(<strong>)?Sources(</strong>)?</span>', article_html).start()]
# soupify_article = BeautifulSoup(article_html, 'html.parser')
# paragraphs = [p.text for p in soupify_article.find_all('p',attrs={'class':None})]
# paragraphs

Retrieving from cache: http://www.factcheck.org/2013/03/obamacare-to-cost-20000-a-family/


['',
 'A Project of the Annenberg Public Policy Center',
 ' Q: Did the IRS say that the cheapest health insurance plan under the federal health care law would cost $20,000 per family?',
 'A: No. The IRS used $20,000 in a hypothetical example to illustrate how it will calculate the tax penalty for a family that fails to obtain health coverage as required by law. Treasury says the figure \\xe2\\x80\\x9cis not an estimate of premiums.\\xe2\\x80\\x9d',
 '',
 '\xa0',
 'FULL QUESTION',
 'The Internal Revenue Service issued a report in which it estimated that under Obamacare, the least expensive health insurance plan available to a family in 2016 would cost $20,000 annually, according to CNSNews.com.',
 'Is this a true report?',
 'FULL ANSWER',
 'This question \\xe2\\x80\\x94 and several more from readers \\xe2\\x80\\x94 was prompted by an article published by the Cybercast News Service (an \\xe2\\x80\\x9calternative\\xe2\\x80\\x9d news site run by the conservative Media Research Center) with

In [15]:
OPINION = 1
NEWS = 0
def build_corpus(opinion_set, news_set):
    opinion = [(get_words(fetch(link), OPINION), OPINION) for link in opinion_set]
    news = [(get_words(fetch(link), NEWS), NEWS) for link in news_set]
    corpus = news + opinion
    random.shuffle(corpus)
    return corpus

In [16]:
def build_word_corpus():
    '''for each word record if it appears in opinion or in news'''
    opinion_words = [word for link in opinion_scraper() for word in get_words(fetch(link), OPINION)]
    news_words = [word for link in news_scraper() for word in get_words(fetch(link), NEWS)]
    shared_words = (set.intersection(set(opinion_words), set(news_words)))
    return shared_words

In [17]:
def ambiguous_words(article):
    return len([w for w in article if w in shared_words])/len(article)

In [18]:
TRAIN = 1
TEST = 0
def vectorize(vectorizer, list_of_texts, is_train):
    """Return feature vectors for each entity given list of texts."""
    if is_train:
        compressed_vectors = vectorizer.fit_transform(list_of_texts)
    else:
        compressed_vectors = vectorizer.transform(list_of_texts)
    return compressed_vectors.toarray()

In [21]:
def gen_pos_tag(article):
    '''Param: article is a list of words from an individual article
       Return: A dictionary of the ratio of each tag type'''
    #modifiers = ['JJ', 'JJR', 'JJS', 'RB', 'RBR', 'RBS']
    #past_tense = ['VBD', 'VBN']
    tags = nltk.pos_tag(article)
    return [tag for word, tag in tags]

In [25]:
def mark_negations(article):
    '''attach NEG tag to the negation and return the proportion of negated words in an article'''
    sentim_analyzer = SentimentAnalyzer()
    article = mark_negation(article)
    return len([w for w in article if w[-3:] == 'NEG'])/len(article)

In [84]:
def add_features(features, article_html):
    # FIXME
    return np.append(features, np.array([[1],[1],[0]]), axis=1)

In [27]:
corpus = build_corpus(opinion_set, news_set)

Retrieving from cache: http://www.usatoday.com/story/opinion/2016/02/09/bernie-sanders-hillary-clinton-new-hampshire-column/80094342/
Loading: http://www.usatoday.com/story/opinion/2016/02/09/super-bowl-50-your-say/80086738/
Loading: http://www.usatoday.com/story/opinion/2016/02/07/journalists-jail-murder-censorship-turkey-editorials-debates/79844586/
Loading: http://www.usatoday.com/story/opinion/2016/02/09/obama-administration-least-transparent-epa-state-doj-clinton-benghazi-column/80050428/
Retrieving from cache: http://www.usatoday.com/story/opinion/2016/02/11/hillary-clinton-speeches-goldman-sachs-wall-street-speaking-fees-editorials-debates/80233010/
Retrieving from cache: http://www.usatoday.com/story/opinion/2016/02/15/john-oliver-colin-jost-michael-che-punchlines-democracy-voting/80405220/
Retrieving from cache: http://www.usatoday.com/story/opinion/2016/02/14/antonin-scalia-2016-presidential-election-voters-editorials-debates/80382050/
Retrieving from cache: http://www.usatod

In [41]:
' '.join(corpus[0][0] + gen_pos_tag(corpus[0][0]))

'new health care law biggest tax increase history n raw dollars perhaps several tax increases since larger percentages economy inflation adjusted dollars n obamacare largest tax hike us history several readers asked us since rush limbaugh made hugely exaggerated claim new health care law biggest tax increase history world re sure limbaugh meant statement taken seriously offered figures citations back said critics law made similar claims increase certainly large let take look taxes fees finance obamacare stack earlier increases big increase question package taxes revenue raisers law contains constitute large increase recent estimate nonpartisan joint committee taxation puts total dozen different tax increases revenue related provisions billion counting effect penalty payments individuals refuse take health insurance estimated nonpartisan congressional budget office billion period jct tax estimate also doesn count penalties paid large employers employees cbo estimates pay billion period 

### 3. Cross-Validation, train and evaluate

In [33]:
def evaluate(y, y_pred, y_pred_prob, train_time, test_time, threshold=0.5):
    '''generate evaluation results'''
    rv = {}
    y_pred_new = np.where(y_pred_prob >= threshold, 1, 0)
    rv["accuracy"] = np.mean(y == y_pred_new)
    rv["precision"] = precision_score(y, y_pred_new)
    rv["recall"] = recall_score(y, y_pred_new)
    rv["f1"] = f1_score(y, y_pred_new)
    rv["auc_roc"] = roc_auc_score(y, y_pred_prob)
    rv["average_precision_score"] = average_precision_score(y,y_pred_prob)
    rv["train_time"] = train_time
    rv["test_time"] = test_time
    return pd.Series(rv)

In [34]:
def compute_avg_scores(l):
    rv = l[0]
    for i in range(1, len(l)):
        rv += l[i]
    return rv/len(l)

In [42]:
def go(data, k=5):
    
    # cross validation
    num_instances = len(data)
    num_in_fold = num_instances // 5
    
    results = {}
    # implement parameter and model selection here
    models = {"LinearSVC": LSVC(), 
              "RandomForest": RFC(), 
              "KNeighbors": KNC(),
              "DecisionTree": DTC(),
              "LogisticReg": LR(),
              "NaiveBayes":NB(),
              "Bagging": BC(),
              "Boosting": GBC()}
    
    for i in range(k):
        training_set = (data[0:i*num_in_fold] + 
                    data[(i+1)*num_in_fold:])
        test_set = data[i*num_in_fold: (i+1)*num_in_fold]
        
#         vectorizer = TfidfVectorizer(analyzer = "word", stop_words = "english")
        vectorizer = TfidfVectorizer(analyzer = "word", stop_words = "english", 
                                     ngram_range=(1, 2), token_pattern=r'\b\w+\b', min_df=1)

#         X_train = vectorize(vectorizer, [' '.join(article) for article, tag in training_set], TRAIN)
#         y_train = [tag for article, tag in training_set]
#         X_test = vectorize(vectorizer, [' '.join(article) for article, tag in test_set], TEST)
#         y_true = [tag for article, tag in test_set]
        
        X_train = vectorize(vectorizer, [' '.join(article + gen_pos_tag(article)) for article, tag in training_set], TRAIN)
        y_train = [tag for article, tag in training_set]
        X_test = vectorize(vectorizer, [' '.join(article + gen_pos_tag(article)) for article, tag in test_set], TEST)
        y_true = [tag for article, tag in test_set]
        
        for clf_name in models:
            # train
            begin_train = time.time()        
            clf = models[clf_name].fit(X_train, y_train)
            end_train = time.time()

            # test
            begin_test = time.time()
            y_pred = clf.predict(X_test)
            end_test = time.time()
            if clf_name != "LinearSVC":
                y_pred_prob = clf.predict_proba(X_test)[:,1]
            else:
                y_pred_prob = y_pred

            # evaluate
            train_time = end_train - begin_train
            test_time = end_test - begin_test
            results[clf_name] = results.get(clf_name, []) + [evaluate(y_true, y_pred, y_pred_prob, train_time, test_time)]
    
    for clf_name in models:
        results[clf_name] = compute_avg_scores(results[clf_name])
        
    return results

In [43]:
def important_features(corpus):
    vectorizer = TfidfVectorizer(analyzer = "word", stop_words = "english", ngram_range=(1, 2), token_pattern=r'\b\w+\b', min_df=1)
#     vectorizer = TfidfVectorizer(analyzer = "word", stop_words = "english")
    X_train = vectorize(vectorizer, [" ".join(article + gen_pos_tag(article)) for article, tag in corpus], TRAIN)
    y_train = [tag for article, tag in corpus]
    model = RFC().fit(X_train, y_train)
    feature_importance = pd.Series(model.feature_importances_)
    labels = pd.DataFrame(vectorizer.get_feature_names(), columns=["features"])
    labels["importance"] = feature_importance
    return labels[labels["importance"] > 0].sort_values(by="importance",ascending=0)

In [44]:
# run the program
results = go(corpus)

In [37]:
# Unigram features
metrics = pd.Series(["accuracy","precision","recall","f1","auc_roc","average_precision_score","train_time","test_time"])
evaluation_result = pd.DataFrame(columns=metrics)
for clf_name in results:
    evaluation_result.loc[clf_name] = results[clf_name]
evaluation_result

Unnamed: 0,accuracy,precision,recall,f1,auc_roc,average_precision_score,train_time,test_time
RandomForest,0.93,0.894622,0.974,0.931214,0.987991,0.988756,0.028363,0.002705
DecisionTree,0.875,0.869827,0.886045,0.874321,0.879877,0.907936,0.075775,0.001093
LogisticReg,0.88,0.905195,0.874667,0.87478,0.97743,0.966179,0.00865,0.001199
LinearSVC,0.92,0.940909,0.888,0.910607,0.921975,0.941955,0.009555,0.000789
Boosting,0.97,0.96738,0.974,0.969387,0.997333,0.998499,11.348626,0.001804
KNeighbors,0.82,0.765635,0.918281,0.830655,0.880972,0.877867,0.030577,0.108627
Bagging,0.93,0.947895,0.919429,0.92804,0.974897,0.974445,0.363864,0.03447
NaiveBayes,0.75,0.84646,0.636421,0.709678,0.762135,0.836441,0.032398,0.01127


In [38]:
important = important_features(corpus)
important

Unnamed: 0,features,importance
3109,department,0.053748
8145,opinion,0.038387
6812,like,0.032620
7814,nexpand,0.027744
6839,link,0.023655
7966,number,0.021666
3503,document,0.020571
4320,factcheck,0.019657
1001,based,0.019618
3264,didn,0.016095


In [39]:
num_news = len([(a, tag) for a, tag in corpus if tag == NEWS])
num_opinion = len([(a, tag) for a, tag in corpus if tag == OPINION])
print('{0: <15}'.format("word"), '{0: <8}'.format("NEWS"), '{0: <8}'.format("OPINION"), '{0: <8}'.format("importance"))
for w,impt in important.to_records(index=False):
    print('{0: <15}'.format(w), 
          '{0: <8}'.format(sum([article.count(w) for article, tag in corpus if tag == NEWS])/num_news), 
          '{0: <8}'.format(sum([article.count(w) for article, tag in corpus if tag == OPINION])/num_opinion),
          '{0: <8}'.format(impt))

word            NEWS     OPINION  importance
department      1.75     0.25     0.05374790339023765
opinion         0.12     0.96     0.03838696683423041
like            1.07     1.5      0.03262028067526732
nexpand         0.38     0.0      0.02774369461486025
link            0.49     0.0      0.02365478210988829
number          1.32     0.13     0.02166624193317518
document        1.08     0.03     0.020571147279221817
factcheck       0.36     0.0      0.019657438928231526
based           0.99     0.15     0.01961782880277227
didn            0.55     0.06     0.016094987081237865
true            1.9      0.14     0.01590407415438665
eugene          0.23     0.02     0.01576679265632012
published       0.39     0.09     0.015565477621122578
better          0.18     0.38     0.014689363336628136
publishes       0.01     0.47     0.013316939589549156
percent         2.57     0.04     0.012951350481362487
right           0.62     0.71     0.012927221702813472
circulated      0.29     0.0 

In [45]:
# Unigrams and bigrams
metrics = pd.Series(["accuracy","precision","recall","f1","auc_roc","average_precision_score","train_time","test_time"])
evaluation_result = pd.DataFrame(columns=metrics)
for clf_name in results:
    evaluation_result.loc[clf_name] = results[clf_name]
evaluation_result

Unnamed: 0,accuracy,precision,recall,f1,auc_roc,average_precision_score,train_time,test_time
RandomForest,0.94,0.904065,0.97995,0.938963,0.981758,0.97772,0.130306,0.018674
DecisionTree,0.845,0.838556,0.832236,0.833949,0.846401,0.875396,0.647601,0.009259
LogisticReg,0.86,0.952941,0.777754,0.839793,0.978959,0.961661,0.041713,0.004511
LinearSVC,0.905,0.948889,0.846947,0.891098,0.906712,0.935418,0.045942,0.004903
Boosting,0.92,0.894305,0.92814,0.909634,0.974496,0.97078,86.807099,0.010164
KNeighbors,0.685,1.0,0.360782,0.52065,0.831928,0.876055,0.157895,0.738242
Bagging,0.925,0.929474,0.926426,0.922976,0.98542,0.982598,2.42782,0.280786
NaiveBayes,0.825,0.89985,0.735278,0.803706,0.831353,0.885064,0.278114,0.100358


In [46]:
important = important_features(corpus)
important

Unnamed: 0,features,importance
28897,expand,0.033588
17827,contributors,0.028777
17830,contributors read,0.026630
48486,look,0.024361
56883,oct,0.023499
25692,editorials publishes,0.017449
47287,like,0.016946
68026,readers,0.015275
24361,doesn,0.014359
93207,won,0.013430


In [47]:
num_news = len([(a, tag) for a, tag in corpus if tag == NEWS])
num_opinion = len([(a, tag) for a, tag in corpus if tag == OPINION])
print('{0: <15}'.format("word"), '{0: <8}'.format("NEWS"), '{0: <8}'.format("OPINION"), '{0: <8}'.format("importance"))
for w,impt in important.to_records(index=False):
    print('{0: <15}'.format(w), 
          '{0: <8}'.format(sum([" ".join(article).count(w) for article, tag in corpus if tag == NEWS])/num_news), 
          '{0: <8}'.format(sum([" ".join(article).count(w) for article, tag in corpus if tag == OPINION])/num_opinion),
          '{0: <8}'.format(impt))

word            NEWS     OPINION  importance
expand          1.45     0.07     0.033588443487078734
contributors    0.01     0.56     0.028776708734224665
contributors read 0.0      0.47     0.026630329216838937
look            0.63     0.98     0.024361494369415974
oct             1.59     0.16     0.023499345996656974
editorials publishes 0.0      0.47     0.017449171077939096
like            1.68     1.9      0.016945683528912552
readers         0.62     0.01     0.01527530643300017
doesn           0.73     0.21     0.014358972558920342
won             0.72     0.33     0.013429866719452743
voters          0.29     0.71     0.012754688233868683
org             1.57     0.53     0.012709497206703894
make            1.57     1.03     0.012380979078481392
addition        0.77     0.54     0.0114838175831521
vbd prp         0.0      0.0      0.01140215573123732
receiving       0.23     0.0      0.010844234047766767
told            1.36     0.16     0.010321047008547007
illegally       0

### 4. Build the model

In [51]:
def get_words_from_string(article):
    return [w.lower() for w in nltk.word_tokenize(article) if w.isalpha() and (w not in STOP_WORDS or w.lower() not in STOP_WORDS)]


def train(data):
    
    vectorizer = TfidfVectorizer(analyzer = "word", stop_words = "english", ngram_range=(1, 2), token_pattern=r'\b\w+\b', min_df=1)
    X_train = vectorize(vectorizer, [' '.join(article + gen_pos_tag(article)) for article, tag in data], TRAIN)
    y_train = [tag for article, tag in data]
        
    clf = LSVC().fit(X_train, y_train)
        
    return clf, vectorizer


def predict_sample(article, vectorizer, clf):
    article = get_words_from_string(article)
    sample = vectorize(vectorizer, [' '.join(article + gen_pos_tag(article))], TEST)
    y_pred = clf.predict(sample)
    return y_pred

In [None]:
clf, vectorizer = train(corpus)

In [57]:
t = """Wheeler’s plan would start a lengthy process aimed at letting consumers buy cable boxes that could combine 
cable or satellite channels with content from providers such as Netflix, Amazon, Hulu and others, all in one box 
and all searchable. If you wanted to watch the movie American Sniper, for example, you could search for it and compare
prices to find the cheapest way to rent or buy it. Supporters of the idea hope bare-bones boxes would be available for
less than $50, while boxes with DVRs and other features would go for hundreds more."""
predict_sample(t, vectorizer, clf)

array([1])

In [58]:
t = """President Obama will crown his historic rapprochement with Cuba with a visit to the island as soon as March, 
the first for a sitting U.S. president in nearly 90 years, administration sources said Wednesday.
The White House will announce on Thursday the details of a multi-stop presidential trip to Latin America — including 
Cuba — in the coming weeks, said senior administration officials speaking on condition of anonymity because the 
official announcement had not been made. The trip was first reported by ABC News and Reuters."""
predict_sample(t, vectorizer, clf)

array([0])