In [None]:
import multiprocessing
import re
from pprint import pprint
from urllib.request import Request, urlopen

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import spacy
from bs4 import BeautifulSoup
from dateutil.parser import parse
from IPython.display import display
from joblib import Parallel, delayed
nlp = spacy.load('en')

plt.style.use('seaborn-paper')
%matplotlib inline

# Snopes.com

In [None]:
%%time

# Listing-page URL template; the literal '{pageNum}' placeholder is replaced
# with the page index by the crawl loop later in this cell.
url = 'https://www.snopes.com/fact-check/page/{pageNum}/'

# Number of listing pages to request (pages 1..numb_pages inclusive).
# NOTE(review): hard-coded; presumably the archive size when this was written —
# confirm before re-running.
numb_pages = 977
# Receives one list of article hrefs per listing page; flattened after the loop.
story_links = []

def get_soup(url):
    """Download ``url`` and parse the response body into a BeautifulSoup tree.

    The request carries a browser-like ``User-Agent`` header and the body is
    parsed with the built-in ``html.parser``.

    Parameters
    ----------
    url : str
        Address of the page to fetch.

    Returns
    -------
    bs4.BeautifulSoup
        The parsed document.

    Raises
    ------
    urllib.error.URLError
        Propagated from ``urlopen`` when the request fails.
    """
    req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    # Close the HTTP response as soon as the body is read instead of leaving
    # the open connection for the garbage collector.
    with urlopen(req) as response:
        html_doc = response.read()
    return BeautifulSoup(html_doc, 'html.parser')


    
    
def get_story_links(link):
    """Return the article hrefs found on one listing page.

    Downloads ``link`` with a browser-like ``User-Agent``, parses it with
    ``html.parser`` and collects the ``href`` of every ``<a>`` element that
    has an ``href`` and whose CSS class matches the pattern ``article-link``.

    Parameters
    ----------
    link : str
        URL of the listing page.

    Returns
    -------
    list of str
        The ``href`` values in document order (empty if nothing matches).

    Raises
    ------
    urllib.error.URLError
        Propagated from ``urlopen`` when the request fails.
    """
    print('processing', link)
    req = Request(link, headers={'User-Agent': 'Mozilla/5.0'})
    # Release the connection as soon as the body has been read.
    with urlopen(req) as response:
        html_doc = response.read()
    soup = BeautifulSoup(html_doc, 'html.parser')
    # A non-dict second argument to find_all() is matched against the CSS class.
    anchors = soup.find_all('a', re.compile('article-link'), href=True)
    return [a['href'] for a in anchors]

 
# Visit listing pages 1..numb_pages in order; every call yields that page's hrefs.
for page in range(1, numb_pages + 1):
    page_url = url.replace('{pageNum}', str(page))
    page_links = get_story_links(page_url)
    story_links.append(page_links)

# Collapse the list of per-page lists into one flat list of hrefs.
flattened = []
for page_links in story_links:
    flattened.extend(page_links)
story_links = flattened
print('found number of stories', len(story_links))


In [None]:
# Persist the collected hrefs (a single unnamed column plus the default index)
# so the crawl does not have to be repeated.
df_link = pd.DataFrame(data=story_links)
df_link.to_csv(path_or_buf='data/snopes_links.csv')


In [None]:
%%time
from tqdm import tqdm
def get_truth(snopes_href):
    """Scrape one Snopes fact-check page and label its claim as true or false.

    The rating is read from ``<span itemprop="alternateName">`` and counts as
    true when its text contains ``'rue'``.  When that span is absent, the
    ``<font class="status_color">`` element is checked for ``'TRUE'`` instead.
    Anything else keeps the default label ``'false'``.

    The claim text is the first ``<p>`` of the document, and is only read when
    a ``div`` with class ``entry-content article-text`` exists.

    Parameters
    ----------
    snopes_href : str
        Absolute URL of the fact-check article.

    Returns
    -------
    dict
        ``{'claim': str or None, 'lable': 'True' or 'false', 'link': snopes_href}``.
        ``claim`` is ``''`` when no article div is present and ``None`` when
        extracting it failed.  The key spelling ``'lable'`` and the mixed-case
        label values are kept as-is because they end up as column names and
        values in the saved TSV.
    """
    claim_text = ''
    truth = 'false'
    soup = get_soup(snopes_href)
    review = soup.find('span', {'itemprop': 'alternateName'})

    try:
        article_div = soup.find_all('div', class_='entry-content article-text')
        if article_div:
            claim_text = soup.findAll('p')[0].string.strip()
    except Exception:
        # Covers a missing <p> (IndexError) and a <p> with nested markup, whose
        # .string is None (AttributeError).  Narrowed from a bare except so
        # that KeyboardInterrupt / SystemExit still stop a long crawl.
        print ("Oops!")
        claim_text = None

    # get_text() always returns a str, whereas .string is None for a tag with
    # nested children and would make the `in` test raise TypeError, causing
    # the caller to drop the whole record.
    if review:
        if 'rue' in review.get_text():
            truth = 'True'
    else:
        review = soup.find('font', {'class': 'status_color'})
        if review:
            if 'TRUE' in review.get_text():
                truth = 'True'

    return {'claim': claim_text, 'lable': truth, 'link': snopes_href}

def processLink(story_link):
    """Best-effort wrapper around ``get_truth`` for use as a parallel job.

    Parameters
    ----------
    story_link : str
        URL of one Snopes article.

    Returns
    -------
    dict or None
        The record produced by ``get_truth``, or ``None`` when scraping the
        link raised, so that one bad page does not abort the whole batch.

    Notes
    -----
    A later cell defines another function with the same name for the
    Politifact scrape; whichever cell ran last wins.
    """
    try:
        return get_truth(story_link)
    except Exception as exc:
        # Catch Exception rather than everything so Ctrl-C still interrupts,
        # and say which link failed and why instead of a bare "Oops!".
        print ("Oops!", story_link, repr(exc))
        return None
    


# One worker process per CPU core.
num_cores = multiprocessing.cpu_count()
print(num_cores)

# Scrape every story in parallel; tqdm reports how many jobs have been dispatched.
results = Parallel(n_jobs=num_cores)(delayed(processLink)(i) for i in tqdm(story_links))

# processLink returns None for links that failed to scrape.  Those entries are
# not records and cannot be turned into rows, so leave them out of the frame.
# `results` itself is kept unfiltered for inspection.
df = pd.DataFrame([record for record in results if record is not None])
df.to_csv('data/snopes.tsv', sep='\t')

df.tail()

In [None]:
# results[:100]
# Drop the None placeholders left in `results` by links that failed to scrape.
# Identity comparison is the idiomatic (and unambiguous) test for None.
cleanedList = [x for x in results if x is not None]
df = pd.DataFrame(cleanedList)
df.to_csv('data/snopes.tsv', sep='\t')

df.tail()

In [None]:
# Politifact: build the URL of every ruling listing page, category by category.
url = 'http://www.politifact.com/truth-o-meter/rulings/{category}/?page={pageNum}'

# Ruling category slug -> number of listing pages to request for that category.
categories = {
    'true': 110,
    'mostly-true': 138,
    'half-true': 144,
    'barely-true': 121,
    'false': 136,
    'pants-fire': 74,
}

links = []
for category, pageNum in categories.items():
    # Fill in the category once, then vary only the page number.
    category_url = url.replace('{category}', category)
    links += [category_url.replace('{pageNum}', str(i)) for i in range(1, pageNum + 1)]

# Spot-check one generated URL (second page of the first category).
print(links[1])

In [None]:

def get_statement(link):
    """Scrape every statement listed on one Politifact rulings page.

    Parameters
    ----------
    link : str
        Listing-page URL.  Its second-to-last ``/``-separated segment is
        stored as the ruling of every record from that page.

    Returns
    -------
    list of dict
        One record per element with class ``statement``, with the keys
        ``quote``, ``statement__source``, ``statement__text``, ``link``,
        ``article__meta`` and ``ruling``.

    Raises
    ------
    IndexError
        If a statement lacks one of the expected child elements.
    urllib.error.URLError
        Propagated from ``urlopen`` when the request fails.
    """
    print('processing', link)
    req = Request(link, headers={'User-Agent': 'Mozilla/5.0'})
    # Release the connection as soon as the body has been read.
    with urlopen(req) as response:
        html_doc = response.read()
    soup = BeautifulSoup(html_doc, 'html.parser')

    # The ruling depends only on the page URL, so compute it once per page.
    ruling = link.split('/')[-2]

    records = []
    for statement in soup.find_all(class_='statement'):
        # The same anchor supplies both the statement text and its href.
        anchor = statement.find_all('a', class_='link')[0]
        records.append({
            'quote': statement.find_all('p', class_='quote')[0].text,
            'statement__source': statement.find_all('p', class_='statement__source')[0].text,
            'statement__text': anchor.text,
            'link': anchor['href'],
            'article__meta': statement.find_all('span', class_='article__meta')[0].text,
            'ruling': ruling,
        })

    return records

# records = get_statement(links[1])
# # print(records)
# df = pd.DataFrame(records)
# df.tail()

In [None]:
%%time

def processLink(link):
    """Parallel-job entry point: scrape one Politifact listing page.

    Delegates to ``get_statement`` and returns its list of records unchanged.
    This rebinds the name ``processLink`` that an earlier cell also defines.
    """
    records = get_statement(link)
    return records

# One worker process per CPU core.
num_cores = multiprocessing.cpu_count()
print(num_cores)

# Scrape every listing page in parallel; each job returns that page's list of records.
page_jobs = (delayed(processLink)(i) for i in links)
results = Parallel(n_jobs=num_cores)(page_jobs)

# Merge the per-page lists into a single flat list of records.
flat_records = []
for page_records in results:
    flat_records.extend(page_records)
results = flat_records

df = pd.DataFrame(results)
df.to_csv('data/politifact.tsv', sep='\t')


In [None]:
# Display the last rows of `df` as a quick sanity check of what was just built.
df.tail()

In [None]:
# Number of scraped statements per ruling category, printed and then plotted.
ruling_counts = df.groupby(['ruling']).size()
print(ruling_counts)
ruling_counts.plot(kind='barh')

In [None]:

def tokenizer(text):
    """Return ``text`` rebuilt from its spaCy tokens, joined by single spaces."""
    doc = nlp(text)
    token_texts = [token.text for token in doc]
    return u' '.join(token_texts)

def _apply_df(args):
    df, func, kwargs = args
    df['statement__text'] = df['statement__text'].apply(func, **kwargs)
    return df#df.apply(func, **kwargs)

def apply_by_multiprocessing(df, func, **kwargs):
    """Apply ``func`` to ``df['statement__text']`` using a pool of processes.

    The frame is cut into ``workers`` contiguous row chunks, each chunk is
    handed to ``_apply_df`` in a worker process, and the processed chunks are
    concatenated back in their original order.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``statement__text`` column.
    func : callable
        Applied to every ``statement__text`` value.  It is sent to the worker
        processes, so it has to be picklable (e.g. a module-level function).
    workers : int, optional (keyword only)
        Number of processes and chunks.  Defaults to the CPU count.
    **kwargs
        Remaining keyword arguments are forwarded to ``Series.apply``.

    Returns
    -------
    pandas.DataFrame
        The processed chunks concatenated into one frame.
    """
    workers = kwargs.pop('workers', None)
    if workers is None:
        workers = multiprocessing.cpu_count()

    # Split row positions rather than handing the DataFrame itself to
    # np.array_split, which depends on the deprecated DataFrame.swapaxes.
    row_groups = np.array_split(np.arange(len(df)), workers)
    chunks = [df.iloc[rows] for rows in row_groups]

    # The context manager tears the pool down even if a worker raises;
    # map() is synchronous, so every result is collected before exit.
    with multiprocessing.Pool(processes=workers) as pool:
        result = pool.map(_apply_df, [(chunk, func, kwargs) for chunk in chunks])
    return pd.concat(result)


# One worker process per CPU core.
num_cores = multiprocessing.cpu_count()
print(num_cores)

# The TSV was written together with its index, so read the first column back
# as the index.  Otherwise it comes back as an 'Unnamed: 0' data column and
# one more such column is added on every run of this cell.
df = pd.read_csv('data/politifact.tsv', sep='\t', index_col=0)

# Replace each statement with its space-separated spaCy tokens, in parallel.
df = apply_by_multiprocessing(df, tokenizer,  workers=num_cores)

# NOTE: this overwrites the scraped file with the tokenised version.
df.to_csv('data/politifact.tsv', sep='\t')
df.tail()