In [73]:
import urllib.request, time, re, random, hashlib
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from nltk.corpus import stopwords
from nltk.sentiment import SentimentAnalyzer
from nltk.sentiment.util import *
from nltk import word_tokenize
from sklearn.svm import LinearSVC as LSVC
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.neighbors import KNeighborsClassifier as KNC
from sklearn.tree import DecisionTreeClassifier as DTC
from sklearn.linear_model import LogisticRegression as LR
from sklearn.ensemble import BaggingClassifier as BC
from sklearn.ensemble import GradientBoostingClassifier as GBC
from sklearn.naive_bayes import GaussianNB as NB
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve, average_precision_score
%matplotlib osx

### 1.Get data

In [74]:
# Compassionate Caching inspired by 
# http://lethain.com/an-introduction-to-compassionate-screenscraping/

last_fetched_at = None

def fetch(url):
    """Load the url compassionately."""
    
    global last_fetched_at
    
    url_hash = hashlib.sha1(url.encode()).hexdigest()
    filename = 'cache-file-{}'.format(url_hash)
    print(url_hash)
    try:
        with open(filename, 'r') as f:
            result = f.read()
            if len(result) > 0:
                print("Retrieving from cache:", url)
                return result
    except:
        pass
    
    print("Loading:", url)
    wait_interval = random.randint(3000,10000)
    if last_fetched_at is not None:
        now = time.time()
        elapsed = now - last_fetched_at
        if elapsed < wait_interval:
            time.sleep((wait_interval - elapsed)/1000)
        
    user_agent = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64)'
    headers = { 'User-Agent' : user_agent }
    req = urllib.request.Request(url, headers = headers)
    last_fetched_at = time.time()
    with urllib.request.urlopen(req) as response:
        result = str(response.read())
        with open(filename, 'w') as f:
            f.write(result)
        return result

In [75]:
# Get a set of links for news articles
def get_news_links(link):
    articles = fetch(link)
    soup = BeautifulSoup(articles, 'html.parser')
    links = set([])
    pattern = re.compile(r"^http://www\.factcheck\.org/\d{4}/\d{2}/")
    for a in soup.find_all("a"):
        sub_link = a.get("href")
        if sub_link != None and pattern.match(sub_link):
            links.add(sub_link)
    return links

def news_scraper(n=10):
    links = set([])
    for i in range(1, n+1):
        link = "http://www.factcheck.org/askfactcheck/page/{}/".format(i)
        links = links.union(get_news_links(link))
    return links

In [76]:
news_set = news_scraper()

f3d58e1534e698cbca57ab2d3686e9ddf9fb2c45
Retrieving from cache: http://www.factcheck.org/askfactcheck/page/1/
3429eba6b9d6f6b1c08db6b73fdb8f733c8dd2e8
Retrieving from cache: http://www.factcheck.org/askfactcheck/page/2/
b423a8ff72cc56524a42422987510946189efed0
Retrieving from cache: http://www.factcheck.org/askfactcheck/page/3/
979391ba901adee8c089f9580cdbf0cc8a7e7565
Retrieving from cache: http://www.factcheck.org/askfactcheck/page/4/
d4532f9ed7fc430919b649a0e177ae53389d96e0
Retrieving from cache: http://www.factcheck.org/askfactcheck/page/5/
32104a8ff9da0c2bee6effc859f95d2ddc44c1a6
Retrieving from cache: http://www.factcheck.org/askfactcheck/page/6/
75947153ac86210aaec8fbee19ab1fcecd31fc28
Retrieving from cache: http://www.factcheck.org/askfactcheck/page/7/
db0952490b0d970fbb2f654df8d4d3640d7f5c0b
Retrieving from cache: http://www.factcheck.org/askfactcheck/page/8/
8152effbf0b7ee5ce73b3123fc7b0f709c8f090d
Retrieving from cache: http://www.factcheck.org/askfactcheck/page/9/
62e83badaf

In [6]:
news_set

{'http://www.factcheck.org/2010/10/stimulating-the-dead/',
 'http://www.factcheck.org/2010/10/the-chamber-and-foreign-contributions/',
 'http://www.factcheck.org/2010/11/ask-factcheck-trip-to-mumbai/',
 'http://www.factcheck.org/2010/11/blame-jane-falsehoods/',
 'http://www.factcheck.org/2010/11/muslims-appointed-to-homeland-security/',
 'http://www.factcheck.org/2010/12/fostering-a-fortune/',
 'http://www.factcheck.org/2010/12/health-care-law-waivers/',
 'http://www.factcheck.org/2010/12/tsa-not-to-blame-for-this/',
 'http://www.factcheck.org/2011/01/congress-not-exempt-from-student-loans/',
 'http://www.factcheck.org/2011/01/obama-guns-and-the-untouchables/',
 'http://www.factcheck.org/2011/01/obamas-not-invited-to-royal-wedding/',
 'http://www.factcheck.org/2011/02/irs-and-the-health-care-law-part-ii/',
 'http://www.factcheck.org/2011/02/millions-with-preexisting-conditions/',
 'http://www.factcheck.org/2011/02/sorry-statistics/',
 'http://www.factcheck.org/2011/03/congressional-ref

In [77]:
# Get a set of links for opinion articles
def get_opinion_links(link):
    articles = fetch(link)
    soup = BeautifulSoup(articles, 'html.parser')
    links = set([])
    for a in soup.find_all("a"):
        sub_link = a.get("href")
        if sub_link != None and sub_link.startswith("/story/opinion/"):
            links.add('http://www.usatoday.com' + sub_link)
    return links

def opinion_scraper():
    links = get_opinion_links("http://www.usatoday.com/opinion/")
    for link in links:
        sub_set = get_opinion_links(link)
        links = links.union(sub_set)
    return links

In [78]:
opinion_set = {'http://www.usatoday.com/story/opinion/2013/09/16/syria-irs-lerner-column/2816277/',
 'http://www.usatoday.com/story/opinion/2014/05/12/president-obama-irs-scandal-watergate-column/8968317/',
 'http://www.usatoday.com/story/opinion/2014/08/28/russia-ukraine-nato-vladimir-putin-president-obama-editorials-debates/14766425/',
 'http://www.usatoday.com/story/opinion/2015/05/25/caption-contest-youtoon/1568271/',
 'http://www.usatoday.com/story/opinion/2015/07/23/highway-funding-oil-gasoline-tax-fix-congress-editorials-debates/30579385/',
 'http://www.usatoday.com/story/opinion/2015/12/21/cdc-opioids-heroin-overdoses-doctors-editorials-debates/77708774/',
 'http://www.usatoday.com/story/opinion/2016/02/01/mia-love-single-subject-rule-constitutional-amendment--reynolds-column/79605158/',
 'http://www.usatoday.com/story/opinion/2016/02/01/super-bowl-football-brain-damage-immoral-watch-column/79654086/',
 'http://www.usatoday.com/story/opinion/2016/02/04/obama--wrong-solitary-confinement-column/79649416/',
 'http://www.usatoday.com/story/opinion/2016/02/04/trump-sanders-mccain-new-hampshire-mavericks-column/79832920/',
 'http://www.usatoday.com/story/opinion/2016/02/07/journalists-jail-murder-censorship-turkey-editorials-debates/79844586/',
 'http://www.usatoday.com/story/opinion/2016/02/07/new-hampshire-primary-100-years-old-rebel-role-dante-scala-column/79967400/',
 'http://www.usatoday.com/story/opinion/2016/02/07/police-use-of-lethal-force-tellusatoday-your-say/79978876/',
 'http://www.usatoday.com/story/opinion/2016/02/07/turkish-ambassador-journalists-turkey-editorials-debates/79845450/',
 'http://www.usatoday.com/story/opinion/2016/02/07/voter-anger-elections-super-bowl-second-look/79967622/',
 'http://www.usatoday.com/story/opinion/2016/02/08/bill-de-blasio-chirlane-mccray-opioid-crisis-treatment-naloxone-overdoses-column/79972594/',
 'http://www.usatoday.com/story/opinion/2016/02/08/cal-thomas-elections-2016-god-religion-politics-evangelical-voters-column/79943324/',
 'http://www.usatoday.com/story/opinion/2016/02/08/federal-deficit-our-view-editorials-debates/80024164/',
 'http://www.usatoday.com/story/opinion/2016/02/08/federal-deficits-economy-governemtn-spending-editorials-debates/80032380/',
 'http://www.usatoday.com/story/opinion/2016/02/08/irs-tea-party-targeting-lois-lerner-corruption--obama-glenn-reynolds-column/79967098/',
 'http://www.usatoday.com/story/opinion/2016/02/08/martin-shkreli-drug-prices-your-say/80026236/',
 'http://www.usatoday.com/story/opinion/2016/02/08/primary-voting-presidential-election-tellusatoday-your-say/80026468/',
 'http://www.usatoday.com/story/opinion/2016/02/09/bernie-sanders-hillary-clinton-new-hampshire-column/80094342/',
 'http://www.usatoday.com/story/opinion/2016/02/09/beyonce-ads-super-bowl-colbert-corden-meyers-conan-jessica-williams/80052554/',
 'http://www.usatoday.com/story/opinion/2016/02/09/military-medical-battlefield-training-live-tissue-training-animal-rights-column/80018116/',
 'http://www.usatoday.com/story/opinion/2016/02/09/new-hampshire-primary-donald-trump-bernie-sanders-editorials-debates/80091284/',
 'http://www.usatoday.com/story/opinion/2016/02/09/obama-administration-least-transparent-epa-state-doj-clinton-benghazi-column/80050428/',
 'http://www.usatoday.com/story/opinion/2016/02/09/our-votes-matter-voter-id-citizens-united-voting-rights-act-democracy-awakens-column/80068028/',
 'http://www.usatoday.com/story/opinion/2016/02/09/solitary-confinement-federal-prisons-tellusatoday-your-say/80086320/',
 'http://www.usatoday.com/story/opinion/2016/02/09/super-bowl-50-your-say/80086738/',
 'http://www.usatoday.com/story/opinion/2016/02/09/trump-sanders-wins-new-hampshire-economioc-anxiety-column/80088548/',
 'http://www.usatoday.com/story/opinion/2016/02/10/anthem-cruise-ship-storm-your-say/80202290/',
 'http://www.usatoday.com/story/opinion/2016/02/10/colbert-noah-fallon-kimmel-corden-sanders-trump-punchlines-new-hampshire/80179418/',
 'http://www.usatoday.com/story/opinion/2016/02/10/exonerations-dna-convicted-forensic-criminal-justice-column/80056392/',
 'http://www.usatoday.com/story/opinion/2016/02/10/hillary-clinton-women-voters-millennials-new-hampshire-column/80190950/',
 'http://www.usatoday.com/story/opinion/2016/02/10/hillary-clintons-woman-problem-column/80175130/',
 'http://www.usatoday.com/story/opinion/2016/02/10/marco-rubio-hip-hop-ben-carson-trump-bush-young-minority-voters-column/76387044/',
 'http://www.usatoday.com/story/opinion/2016/02/10/new-hampshire-primary-donald-trump-bernie-sanders-tellusatoday-your-say/80202062/',
 'http://www.usatoday.com/story/opinion/2016/02/10/oil-prices-gasoline-revenue-american-petroleum-institute-editorials-debates/80193760/',
 'http://www.usatoday.com/story/opinion/2016/02/10/oil-tax-10-barrel-infrastructure-president-obama-climate-change-editorials-debates/80056688/',
 'http://www.usatoday.com/story/opinion/2016/02/10/why-supreme-court-put-new-climate-rules-hold-column/80169792/',
 'http://www.usatoday.com/story/opinion/2016/02/11/federal-budget-obama-deficits-debt-tellusatoday-your-say/80253310/',
 'http://www.usatoday.com/story/opinion/2016/02/11/glenn-reynolds-socialism-bernie-sanders-young-millennial-voters-column/80169668/',
 'http://www.usatoday.com/story/opinion/2016/02/11/hillary-clinton-bernie-sanders-wall-street-lanny-davis-editorials-debates/80253414/',
 'http://www.usatoday.com/story/opinion/2016/02/11/hillary-clinton-speeches-goldman-sachs-wall-street-speaking-fees-editorials-debates/80233010/',
 'http://www.usatoday.com/story/opinion/2016/02/11/obama-budget-children-summer-food-hope-change-david-cay-johnston/80199860/',
 'http://www.usatoday.com/story/opinion/2016/02/11/wesley-clark-russia-assadsyria-obama-conflict-column/80228140/',
 'http://www.usatoday.com/story/opinion/2016/02/12/ligo-discovery-impossible-without-public-funding-gravitational-waves-column/80253446/',
 'http://www.usatoday.com/story/opinion/2016/02/12/lindberg-draft-conscription-women-all-volunteer-force-courage-virtue-column/80169484/',
 'http://www.usatoday.com/story/opinion/2016/02/12/top-threat-kurds-economy-not-isil-column/80228512/',
 'http://www.usatoday.com/story/opinion/2016/02/12/valentines-day-jimmy-kimmel-james-corden-punchlines-funny/80289898/',
 'http://www.usatoday.com/story/opinion/2016/02/13/scalia-death-appreciation-politics-nomination-glenn-reynolds-column/80350008/',
 'http://www.usatoday.com/story/opinion/2016/02/13/scalia-text-legacy-clerk-steven-calabresi-column/80349810/',
 'http://www.usatoday.com/story/opinion/2016/02/13/valentines-day-romance-marraige-flowers-fracking-column/80234586/',
 'http://www.usatoday.com/story/opinion/2016/02/14/antonin-scalia-2016-presidential-election-voters-editorials-debates/80382050/',
 'http://www.usatoday.com/story/opinion/2016/02/14/antonin-scalia-death-supreme-court-nomination-senate-obama-gonzales-column/80378246/',
 'http://www.usatoday.com/story/opinion/2016/02/14/bernie-sanders-henry-kissinger-richard-nixon-democratic-debate-column/80372646/',
 'http://www.usatoday.com/story/opinion/2016/02/14/justice-antonin-scalia-president-obama-mitch-mcconnell-editorials-debates/80375514/',
 'http://www.usatoday.com/story/opinion/2016/02/14/martin-shkreli-cam-newton-second-look-your-say/80383482/',
 'http://www.usatoday.com/story/opinion/2016/02/14/oil-tax-transportation-president-obama-your-say/80383560/',
 'http://www.usatoday.com/story/opinion/2016/02/14/religion-politics-gender-tellusatoday-your-say/80383622/',
 'http://www.usatoday.com/story/opinion/2016/02/14/scalia-defining-moment-minority-rights-stephen-henderson/80372366/',
 'http://www.usatoday.com/story/opinion/2016/02/14/why-i-wrote-play-antonin-scalia-originalist-john-strand/80374808/',
 'http://www.usatoday.com/story/opinion/2016/02/15/american-kennel-club-westminster-kennel-club-dog-show-editorials-debates/80401688/',
 'http://www.usatoday.com/story/opinion/2016/02/15/antonin-scalia-supreme-court-recess-appointment-nomination-politics-obama-column/80379796/',
 'http://www.usatoday.com/story/opinion/2016/02/15/dogs-breeding-westminster-kennel-american-kennel-club-editorials-debates/80373002/',
 'http://www.usatoday.com/story/opinion/2016/02/15/donald-trump-torture-enhanced-interrogation-techniques-editorials-debates/80418458/',
 'http://www.usatoday.com/story/opinion/2016/02/15/donald-trump-waterboarding-torture-editorials-debates/80258136/',
 'http://www.usatoday.com/story/opinion/2016/02/15/gop-supreme-court-scalia-obama-nominee-tellusatoday-your-say/80425956/',
 'http://www.usatoday.com/story/opinion/2016/02/15/hillary-clinton-feminism-sexism-bernie-bros-democratic-primary-2016-column/80374526/',
 'http://www.usatoday.com/story/opinion/2016/02/15/jim-wallis-getting-personal-racism-black-lives-matter/79977654/',
 'http://www.usatoday.com/story/opinion/2016/02/15/john-oliver-colin-jost-michael-che-punchlines-democracy-voting/80405220/',
 'http://www.usatoday.com/story/opinion/2016/02/15/patrick-leahy-antonin-scalia-death-supreme-court-nomination-confirmation-column/80415542/',
 'http://www.usatoday.com/story/opinion/2016/02/15/supreme-court-fight-assures-ugly-end-obama-era-david-corn-antonin-scalia-column/80374474/',
 'http://www.usatoday.com/story/opinion/2016/02/15/trump-has-no-idea-how-to-be-president-stephen-hess/80401590/',
 'http://www.usatoday.com/story/opinion/2016/02/15/wealthy-donors-citizens-united-politics-your-say/80425588/',
 'http://www.usatoday.com/story/opinion/columnists/stephen-henderson/2016/02/13/moments-defined-scalia-and-should-define-legacy/80355476/',
 'http://www.usatoday.com/story/opinion/voices/2016/02/08/voices-rise-and-fall-rand-paul/79875100/',
 'http://www.usatoday.com/story/opinion/voices/2016/02/09/voices-mexico-legalize-marijuana/79781382/',
 'http://www.usatoday.com/story/opinion/voices/2016/02/10/voices-staying-safe-dangerous-venues/80170178/',
 'http://www.usatoday.com/story/opinion/voices/2016/02/15/voices-gomez-honduras-violence-central-america-unaccompanied-minors-immigration/80212272/'}

In [79]:
new_opinion_set = opinion_scraper()
opinion_set = new_opinion_set.union(opinion_set)

d2f2d9f2e949ef64113055d5999f87b5aff518b4
Retrieving from cache: http://www.usatoday.com/opinion/
be130630e349331342063ac989f3849644cf2818
Retrieving from cache: http://www.usatoday.com/story/opinion/2016/02/13/valentines-day-romance-marraige-flowers-fracking-column/80234586/
508ad85d737d317d219d5ac54eb5b47e6250e3c1
Retrieving from cache: http://www.usatoday.com/story/opinion/2016/02/14/bernie-sanders-henry-kissinger-richard-nixon-democratic-debate-column/80372646/
02f8334a36e5d091fad85befec25b0cb2234ee55
Retrieving from cache: http://www.usatoday.com/story/opinion/2016/02/09/trump-sanders-wins-new-hampshire-economic-anxiety-column/80088548/
a943f948820e6dc9bcfdb38c10e0b424db3a7528
Retrieving from cache: http://www.usatoday.com/story/opinion/2016/02/15/supreme-court-fight-assures-ugly-end-obama-era-david-corn-antonin-scalia-column/80374474/
2d0b0ab432c3188fe7ffa5d9d4014eee93046776
Retrieving from cache: http://www.usatoday.com/story/opinion/2016/02/11/obama-budget-children-summer-food-h

### 2. Preprocessing & Feature Generation

In [10]:
STOP_WORDS = stopwords.words('english')
STOP_PHRASES = ["Ask FactCheck", "FULL QUESTION", "FULL ANSWER", '© Copyright 2016 FactCheck.org', 'A Project of the Annenberg Public Policy Center']
def get_words(article_html, is_opinion):
    """Return list of representative words from an article. """
    bag_of_words = []
    raw = []
    if not is_opinion:
        additional = re.search(r'<span style="color:.{,20}">(<strong>)?Sources(</strong>)?</span>', article_html)
        if additional:
            article_html = article_html[:additional.start()]
    soupify_article = BeautifulSoup(article_html, 'html.parser')
    paragraphs = soupify_article.find_all('p',attrs={'class':None})
    for p in paragraphs:
        if p.parent.name != 'a' and p.text not in STOP_PHRASES:
            p_text = p.text.lower().replace('usa today', ' ').replace('q: ', ' ').replace('a: ', ' ').replace('getelementbyid', ' ').replace('eet', ' ')
            raw += word_tokenize(p_text)
    for word in raw: 
        if '\\xc2\\xa0' in word:
            tmp = word.split('\\xc2\\xa0')
        else:
            tmp = [word]
        tmp = [re.sub(r"\\x..", "", w).replace("\\", "") for w in tmp]
        for w in tmp:
            bag_of_words += re.sub(r"[^a-zA-Z]", " ", w).split()
    
    bag_of_words = [w.lower() for w in bag_of_words if w.isalpha() and w not in STOP_WORDS]
    
    return bag_of_words

In [11]:
article_html = fetch('http://www.factcheck.org/2013/03/obamacare-to-cost-20000-a-family/')
article_html = article_html[:re.search(r'<span style=".{,15}">(<strong>)?Sources(</strong>)?</span>', article_html).start()]
soupify_article = BeautifulSoup(article_html, 'html.parser')
paragraphs = [p.text for p in soupify_article.find_all('p',attrs={'class':None})]
paragraphs

001059b52e12ee3c55fccb793dd9718523c1dcec
Retrieving from cache: http://www.factcheck.org/2013/03/obamacare-to-cost-20000-a-family/


['',
 'A Project of the Annenberg Public Policy Center',
 ' Q: Did the IRS say that the cheapest health insurance plan under the federal health care law would cost $20,000 per family?',
 'A: No. The IRS used $20,000 in a hypothetical example to illustrate how it will calculate the tax penalty for a family that fails to obtain health coverage as required by law. Treasury says the figure \\xe2\\x80\\x9cis not an estimate of premiums.\\xe2\\x80\\x9d',
 '',
 '\xa0',
 'FULL QUESTION',
 'The Internal Revenue Service issued a report in which it estimated that under Obamacare, the least expensive health insurance plan available to a family in 2016 would cost $20,000 annually, according to CNSNews.com.',
 'Is this a true report?',
 'FULL ANSWER',
 'This question \\xe2\\x80\\x94 and several more from readers \\xe2\\x80\\x94 was prompted by an article published by the Cybercast News Service (an \\xe2\\x80\\x9calternative\\xe2\\x80\\x9d news site run by the conservative Media Research Center) with

In [16]:
OPINION = 1
NEWS = 0
def build_corpus(opinion_set, news_set):
    opinion = [(get_words(fetch(link), OPINION), OPINION) for link in opinion_set]
    news = [(get_words(fetch(link), NEWS), NEWS) for link in news_set]
    corpus = news + opinion
    random.shuffle(corpus)
    return corpus

In [19]:
def build_word_corpus():
    '''for each word record if it appears in opinion or in news'''
    opinion_words = [word for link in opinion_scraper() for word in get_words(fetch(link), OPINION)]
    news_words = [word for link in news_scraper() for word in get_words(fetch(link), NEWS)]
    shared_words = (set.intersection(set(opinion_words), set(news_words)))
    return shared_words

d2f2d9f2e949ef64113055d5999f87b5aff518b4
Retrieving from cache: http://www.usatoday.com/opinion/
be130630e349331342063ac989f3849644cf2818
Retrieving from cache: http://www.usatoday.com/story/opinion/2016/02/13/valentines-day-romance-marraige-flowers-fracking-column/80234586/
508ad85d737d317d219d5ac54eb5b47e6250e3c1
Retrieving from cache: http://www.usatoday.com/story/opinion/2016/02/14/bernie-sanders-henry-kissinger-richard-nixon-democratic-debate-column/80372646/
02f8334a36e5d091fad85befec25b0cb2234ee55
Retrieving from cache: http://www.usatoday.com/story/opinion/2016/02/09/trump-sanders-wins-new-hampshire-economic-anxiety-column/80088548/
a943f948820e6dc9bcfdb38c10e0b424db3a7528
Retrieving from cache: http://www.usatoday.com/story/opinion/2016/02/15/supreme-court-fight-assures-ugly-end-obama-era-david-corn-antonin-scalia-column/80374474/
2d0b0ab432c3188fe7ffa5d9d4014eee93046776
Retrieving from cache: http://www.usatoday.com/story/opinion/2016/02/11/obama-budget-children-summer-food-h

In [80]:
def ambiguous_words(article):
    return len([w for w in article if w in shared_words])/len(article)

In [81]:
TRAIN = 1
TEST = 0
def vectorize(vectorizer, list_of_texts, is_train):
    """Return feature vectors for each entity given list of texts."""
    if is_train:
        compressed_vectors = vectorizer.fit_transform(list_of_texts)
    else:
        compressed_vectors = vectorizer.transform(list_of_texts)
    return compressed_vectors.toarray()

In [136]:
def gen_pos_tag(article):
    '''Param: article is a list of words from an individual article
       Return: A dictionary of the ratio of each tag type'''
    #modifiers = ['JJ', 'JJR', 'JJS', 'RB', 'RBR', 'RBS']
    #past_tense = ['VBD', 'VBN']
    #article[0] is the list of words
    #tags = [''.join(t) for (w, t) in nltk.pos_tag(article[0])]
    tags = nltk.pos_tag(article)
    return tags

In [83]:
def mark_negations(article):
    '''attach NEG tag to the negation and return the proportion of negated words in an article'''
    sentim_analyzer = SentimentAnalyzer()
    article = mark_negation(article)
    return len([w for w in article if w[-3:] == 'NEG'])/len(article)

In [84]:
def add_features(features, article_html):
    # FIXME
    return np.append(features, np.array([[1],[1],[0]]), axis=1)

In [85]:
corpus = build_corpus(opinion_set, news_set)

be130630e349331342063ac989f3849644cf2818
Retrieving from cache: http://www.usatoday.com/story/opinion/2016/02/13/valentines-day-romance-marraige-flowers-fracking-column/80234586/
02f8334a36e5d091fad85befec25b0cb2234ee55
Retrieving from cache: http://www.usatoday.com/story/opinion/2016/02/09/trump-sanders-wins-new-hampshire-economic-anxiety-column/80088548/
17a7670af6bdb6ae425443c6f5cb2afe048032c5
Retrieving from cache: http://www.usatoday.com/story/opinion/2016/02/09/military-medical-battlefield-training-live-tissue-training-animal-rights-column/80018116/
b6877cb860bcb777ab710bc7b02024f229d8fb8b
Retrieving from cache: http://www.usatoday.com/story/opinion/2016/02/08/federal-deficits-economy-governemtn-spending-editorials-debates/80032380/
08cca5170917b63e969b513dd37ab713a3b7c2ca
Loading: http://www.usatoday.com/story/opinion/2016/02/09/obama-administration-least-transparent-epa-state-doj-clinton-benghazi-column/80050428/
47c248f28ebd7c0f962c311d53aa2b0d485c53b3
Retrieving from cache: h

In [154]:
' '.join(corpus[0][0]) + ' ' + ' '.join( t for w,t in gen_pos_tag(corpus[0][0]))

'served general counsel texas gov george w bush tell rape victim young mother sexually assaulted bed knife point daughter lay asleep foot away wrong new dna evidence showed man identified rapist could guilty based testimony invalidated forensic testing man wrongly convicted spent years life sentence behind bars story rare according national registry exonerations updated last week convicted defendants exonerated last year states district columbia federal courts guam record even one mistake one many miscarriage justice individual wrongly incarcerated time also miscarriage justice victims like one sat office guilty gone free among many responsibilities attorney general united states everything power ensure justice remains blind dispensed without regard skin color zip code support tough justice justice guilty must punished experience growing data exonerations reveal troubling picture american justice today one requires action forensic science long relied upon determine guilt innocence coun

### 3. Cross-Validation, train and evaluate

In [86]:
def evaluate(y, y_pred, y_pred_prob, train_time, test_time, threshold=0.5):
    '''generate evaluation results'''
    rv = {}
    y_pred_new = np.where(y_pred_prob >= threshold, 1, 0)
    rv["accuracy"] = np.mean(y == y_pred_new)
    rv["precision"] = precision_score(y, y_pred_new)
    rv["recall"] = recall_score(y, y_pred_new)
    rv["f1"] = f1_score(y, y_pred_new)
    rv["auc_roc"] = roc_auc_score(y, y_pred_prob)
    rv["average_precision_score"] = average_precision_score(y,y_pred_prob)
    rv["train_time"] = train_time
    rv["test_time"] = test_time
    return pd.Series(rv)

In [87]:
def compute_avg_scores(l):
    rv = l[0]
    for i in range(1, len(l)):
        rv += l[i]
    return rv/len(l)

In [157]:
def go(data, k=5):
    
    # cross validation
    num_instances = len(data)
    num_in_fold = num_instances // 5
    
    results = {}
    # implement parameter and model selection here
    models = {"LinearSVC": LSVC(), 
              "RandomForest": RFC(), 
              "KNeighbors": KNC(),
              "DecisionTree": DTC(),
              "LogisticReg": LR(),
              "NaiveBayes":NB(),
              "Bagging": BC(),
              "Boosting": GBC()}
    
    for i in range(k):
        training_set = (data[0:i*num_in_fold] + 
                    data[(i+1)*num_in_fold:])
        test_set = data[i*num_in_fold: (i+1)*num_in_fold]
        
#         vectorizer = TfidfVectorizer(analyzer = "word", stop_words = "english")
        vectorizer = TfidfVectorizer(analyzer = "word", stop_words = "english", 
                                     ngram_range=(1, 2), token_pattern=r'\b\w+\b', min_df=1)
        
        X_train = vectorize(vectorizer, [' '.join(article) + ' ' + ' '.join(t for w,t in gen_pos_tag(article)) for article, tag in training_set], TRAIN)
        y_train = [tag for article, tag in training_set]
        X_test = vectorize(vectorizer, [' '.join(article) + ' ' + ' '.join(t for w,t in gen_pos_tag(article)) for article, tag in test_set], TEST)
        y_true = [tag for article, tag in test_set]
        
        for clf_name in models:
            # train
            begin_train = time.time()        
            clf = models[clf_name].fit(X_train, y_train)
            end_train = time.time()

            # test
            begin_test = time.time()
            y_pred = clf.predict(X_test)
            end_test = time.time()
            if clf_name != "LinearSVC":
                y_pred_prob = clf.predict_proba(X_test)[:,1]
            else:
                y_pred_prob = y_pred

            # evaluate
            train_time = end_train - begin_train
            test_time = end_test - begin_test
            results[clf_name] = results.get(clf_name, []) + [evaluate(y_true, y_pred, y_pred_prob, train_time, test_time)]
    
    for clf_name in models:
        results[clf_name] = compute_avg_scores(results[clf_name])
        
    return results

In [166]:
def important_features(corpus):
    vectorizer = TfidfVectorizer(analyzer = "word", stop_words = "english", ngram_range=(1, 2), token_pattern=r'\b\w+\b', min_df=1)
#     vectorizer = TfidfVectorizer(analyzer = "word", stop_words = "english")
    X_train = vectorize(vectorizer, [" ".join(article) + ' '+ ' '.join(t for t,w in gen_pos_tag(article)) for article, tag in corpus], TRAIN)
    y_train = [tag for article, tag in corpus]
    model = RFC().fit(X_train, y_train)
    feature_importance = pd.Series(model.feature_importances_)
    labels = pd.DataFrame(vectorizer.get_feature_names(), columns=["features"])
    labels["importance"] = feature_importance
    return labels[labels["importance"] > 0].sort_values(by="importance",ascending=0)

In [167]:
# run the program
results = go(corpus)

In [174]:
# Unigram features
metrics = pd.Series(["accuracy","precision","recall","f1","auc_roc","average_precision_score","train_time","test_time"])
evaluation_result = pd.DataFrame(columns=metrics)
for clf_name in results:
    evaluation_result.loc[clf_name] = results[clf_name]
evaluation_result

Unnamed: 0,accuracy,precision,recall,f1,auc_roc,average_precision_score,train_time,test_time
RandomForest,0.916667,0.890752,0.924444,0.903247,0.979166,0.977987,0.108875,0.014476
LogisticReg,0.677778,0.971429,0.297778,0.434424,0.964107,0.943485,0.046307,0.0052
Bagging,0.894444,0.903573,0.867778,0.884551,0.943635,0.956163,2.333103,0.210302
Boosting,0.922222,0.912143,0.937778,0.919526,0.9901,0.990417,72.15411,0.010701
DecisionTree,0.855556,0.812321,0.895556,0.847868,0.861071,0.878938,0.635052,0.007002
NaiveBayes,0.822222,0.906548,0.667778,0.76163,0.805417,0.859385,0.244413,0.10882
LinearSVC,0.911111,0.985714,0.807778,0.884688,0.899127,0.938413,0.059609,0.005894
KNeighbors,0.7,0.977778,0.353333,0.502409,0.864723,0.884445,0.220521,0.695767


In [175]:
important = important_features(corpus)
important

Unnamed: 0,features,importance
65831,report,0.029409
80305,told,0.028157
13938,collapse text,0.019418
79171,text,0.017957
22695,document,0.016384
27775,factcheck org,0.016132
63417,read,0.016110
63424,read columns,0.015098
86921,won,0.014089
80587,total,0.013952


In [176]:
num_news = len([(a, tag) for a, tag in corpus if tag == NEWS])
num_opinion = len([(a, tag) for a, tag in corpus if tag == OPINION])
print('{0: <15}'.format("word"), '{0: <8}'.format("NEWS"), '{0: <8}'.format("OPINION"))
for w,_ in important.to_records(index=False):
    print('{0: <15}'.format(w), 
          '{0: <8}'.format(sum([article.count(w) for article, tag in corpus if tag == NEWS])/num_news), 
          '{0: <8}'.format(sum([article.count(w) for article, tag in corpus if tag == OPINION])/num_opinion))

word            NEWS     OPINION 
report          2.02     0.14814814814814814
told            1.36     0.14814814814814814
collapse text   0.0      0.0     
text            0.48     0.012345679012345678
document        1.08     0.024691358024691357
factcheck org   0.0      0.0     
read            0.4      0.7160493827160493
read columns    0.0      0.0     
won             0.55     0.20987654320987653
total           1.14     0.024691358024691357
d angelo        0.0      0.0     
robert          0.19     0.06172839506172839
look race       0.0      0.0     
says            1.66     0.20987654320987653
vacation        0.36     0.012345679012345678
provide         0.61     0.1111111111111111
said            4.71     0.4444444444444444
u               3.75     0.6419753086419753
scalia          0.0      1.037037037037037
view            0.12     0.38271604938271603
sept            0.39     0.0     
org             0.38     0.0     
jan             0.66     0.04938271604938271
new hampsh

In [177]:
# Unigrams and bigrams
metrics = pd.Series(["accuracy","precision","recall","f1","auc_roc","average_precision_score","train_time","test_time"])
evaluation_result = pd.DataFrame(columns=metrics)
for clf_name in results:
    evaluation_result.loc[clf_name] = results[clf_name]
evaluation_result

Unnamed: 0,accuracy,precision,recall,f1,auc_roc,average_precision_score,train_time,test_time
RandomForest,0.916667,0.890752,0.924444,0.903247,0.979166,0.977987,0.108875,0.014476
LogisticReg,0.677778,0.971429,0.297778,0.434424,0.964107,0.943485,0.046307,0.0052
Bagging,0.894444,0.903573,0.867778,0.884551,0.943635,0.956163,2.333103,0.210302
Boosting,0.922222,0.912143,0.937778,0.919526,0.9901,0.990417,72.15411,0.010701
DecisionTree,0.855556,0.812321,0.895556,0.847868,0.861071,0.878938,0.635052,0.007002
NaiveBayes,0.822222,0.906548,0.667778,0.76163,0.805417,0.859385,0.244413,0.10882
LinearSVC,0.911111,0.985714,0.807778,0.884688,0.899127,0.938413,0.059609,0.005894
KNeighbors,0.7,0.977778,0.353333,0.502409,0.864723,0.884445,0.220521,0.695767


In [178]:
important = important_features(corpus)
important

Unnamed: 0,features,importance
62948,race justice,0.055365
54050,opinions,0.045390
22524,diverse,0.034250
81578,true,0.034140
45208,look race,0.027154
68465,rumor,0.018043
80305,told,0.017589
22774,doesn,0.017124
75529,statement,0.015385
12830,claim,0.014364


In [179]:
num_news = len([(a, tag) for a, tag in corpus if tag == NEWS])
num_opinion = len([(a, tag) for a, tag in corpus if tag == OPINION])
print('{0: <25}'.format("word"), '{0: <8}'.format("NEWS"), '{0: <8}'.format("OPINION"))
for w,_ in important.to_records(index=False):
    print('{0: <25}'.format(w), 
          '{0: <8}'.format(sum([" ".join(article).count(w) for article, tag in corpus if tag == NEWS])/num_news), 
          '{0: <8}'.format(sum([" ".join(article).count(w) for article, tag in corpus if tag == OPINION])/num_opinion))

word                      NEWS     OPINION 
race justice              0.0      0.4074074074074074
opinions                  0.04     0.6666666666666666
diverse                   0.0      0.4691358024691358
true                      1.96     0.13580246913580246
look race                 0.0      0.4074074074074074
rumor                     0.54     0.0     
told                      1.36     0.14814814814814814
doesn                     0.73     0.18518518518518517
statement                 0.75     0.06172839506172839
claim                     4.09     0.3333333333333333
plan                      2.52     0.4567901234567901
hillary clinton           0.08     0.2839506172839506
link                      0.67     0.0     
board contributors        0.0      0.5308641975308642
story                     1.27     0.32098765432098764
editorials opinion        0.0      0.0     
percent                   2.88     0.037037037037037035
factcheck                 0.36     0.0     
online           