In [1]:
# python3
import os
from urllib.request import urlopen
import webbrowser
import pandas as pd
from bs4 import BeautifulSoup

In [2]:
import re

## Scrape URLS 

### Snapshots of the politics section of news websites over the week 2/19-2/25

In [3]:
# Wayback Machine (web.archive.org) snapshots of each outlet's politics landing
# page, captured between 2017-02-19 and 2017-02-25.
# The order of each list is kept exactly as originally written (it is not always
# chronological): links are collected in this order and later resampled by
# position, so reordering would change which articles are kept.

fox_urls = [
    'https://web.archive.org/web/20170225100230/http://www.foxnews.com/politics.html',
    'https://web.archive.org/web/20170224185300/http://www.foxnews.com/politics.html',
    'https://web.archive.org/web/20170223134746/http://www.foxnews.com/politics.html',
    'https://web.archive.org/web/20170222171145/http://www.foxnews.com/politics.html',
    'https://web.archive.org/web/20170221224745/http://www.foxnews.com/politics.html',
    'https://web.archive.org/web/20170220224305/http://www.foxnews.com/politics.html',
    'https://web.archive.org/web/20170219182238/http://www.foxnews.com/politics.html',
]

breitbart_urls = [
    'https://web.archive.org/web/20170225191025/http://www.breitbart.com/big-government/',
    'https://web.archive.org/web/20170224212301/http://www.breitbart.com/big-government/',
    'https://web.archive.org/web/20170223234659/http://www.breitbart.com/big-government/',
    'https://web.archive.org/web/20170222233839/http://www.breitbart.com/big-government/',
    'https://web.archive.org/web/20170221220822/http://www.breitbart.com/big-government/',
]

huffpost_urls = [
    'https://web.archive.org/web/20170225191519/http://www.huffingtonpost.com/section/politics',
    'https://web.archive.org/web/20170224192029/http://www.huffingtonpost.com/section/politics',
    'https://web.archive.org/web/20170221221317/http://www.huffingtonpost.com/section/politics',
    'https://web.archive.org/web/20170222211843/http://www.huffingtonpost.com/section/politics',
    'https://web.archive.org/web/20170220231829/http://www.huffingtonpost.com/section/politics',
]

nyt_urls = [
    'https://web.archive.org/web/20170225040536/https://www.nytimes.com/pages/politics/index.html',
    'https://web.archive.org/web/20170224025710/https://www.nytimes.com/pages/politics/index.html',
    'https://web.archive.org/web/20170223125613/https://www.nytimes.com/pages/politics/index.html',
    'https://web.archive.org/web/20170222111155/https://www.nytimes.com/pages/politics/index.html',
    'https://web.archive.org/web/20170221093125/https://www.nytimes.com/pages/politics/index.html',
    'https://web.archive.org/web/20170220082925/https://www.nytimes.com/pages/politics/index.html',
    'https://web.archive.org/web/20170219093118/https://www.nytimes.com/pages/politics/index.html',
]

In [4]:
def extract_links(url):
    """
    Return the ``href`` of every ``<a>`` tag on the page at ``url``.

    The page is fetched with ``urlopen`` and parsed with BeautifulSoup's
    built-in "html.parser".

    Parameters
    ----------
    url : str
        Address of the page to fetch.

    Returns
    -------
    list
        One entry per ``<a>`` tag, in document order. An anchor without an
        ``href`` attribute contributes ``None``, so callers should skip
        falsy entries.
    """
    # Parse inside the ``with`` block so the HTTP response is closed as soon
    # as the document has been read, instead of leaking the connection.
    with urlopen(url) as response:
        soup = BeautifulSoup(response, "html.parser")

    # ``find_all`` is the current bs4 name; ``findAll`` is a deprecated alias.
    return [anchor.get('href') for anchor in soup.find_all('a')]

### Get Relevant Article URLS

In [5]:
def find_section_links(source_urls, link_list, pattern_match, pattern_badmatch=None):
    """
    Collect the links on each page in ``source_urls`` that match a pattern.

    Parameters
    ----------
    source_urls : iterable of str
        Pages whose links are extracted with ``extract_links``.
    link_list : list
        List that receives the results; it is appended to in place and
        entries already present are not added again.
    pattern_match : str
        Regular expression a link must match. The first ``re.findall``
        result for the link is what gets stored.
    pattern_badmatch : str, optional
        Regular expression that disqualifies a link when it matches.

    Returns
    -------
    list
        The same ``link_list`` object that was passed in.
    """
    for snapshot_url in source_urls:
        for href in extract_links(snapshot_url):
            # Anchors without an href come back as None; skip those and ''.
            if not href:
                continue

            hits = re.findall(pattern_match, href)
            excluded = bool(pattern_badmatch) and bool(re.findall(pattern_badmatch, href))
            if excluded or not hits:
                continue

            first_hit = hits[0]
            if first_hit not in link_list:
                link_list.append(first_hit)
    return link_list

In [247]:
# Fox article links under /politics/2017/02/2... found on the archived pages.
fox_pol_links = find_section_links(
    source_urls=fox_urls,
    link_list=[],
    pattern_match=r"http://www.foxnews.com/politics/2017/02/2.*",
)

In [9]:
# Breitbart article links under /big-government/2017/02/2...; links containing
# "#disqus_thread" are excluded.
breitbart_pol_links = find_section_links(
    source_urls=breitbart_urls,
    link_list=[],
    pattern_match=r"http://www.breitbart.com/big-government/2017/02/2.*",
    pattern_badmatch=r"#disqus_thread",
)

In [241]:
# NYT article links under /2017/02/2<d>/us/politics/ found on the archived pages.
# NOTE(review): the snapshot URLs in nyt_urls use https://www.nytimes.com while
# this pattern only matches http:// - confirm the archived hrefs really use http.
nyt_pol_links = find_section_links(
    source_urls=nyt_urls,
    link_list=[],
    pattern_match=r"http://www.nytimes.com/2017/02/2\d/us/politics/.*",
)

In [323]:
# HuffPost article links (anything under /entry/) found on the archived pages.
huffpost_pol_links = find_section_links(
    source_urls=huffpost_urls,
    link_list=[],
    pattern_match=r"http://www.huffingtonpost.com/entry/.*",
)

In [10]:
# Thin out the two larger link lists so fewer articles are downloaded:
# keep every 3rd HuffPost link and every 2nd Breitbart link, starting at index 1.
# NOTE: each name is reassigned from itself, so re-running this cell shrinks the
# lists again - run it only once after the link lists have been (re)built.
huffpost_pol_links = huffpost_pol_links[1::3]
breitbart_pol_links = breitbart_pol_links[1::2]

### Download, Parse & Combine Article Full Texts

In [11]:
from newspaper import Article

In [13]:
def write_full_articles(link_list,filename,file_use):
    """
    Download every article in ``link_list`` and write its text to ``filename``.

    Each link is fetched and parsed with ``newspaper.Article``; the extracted
    ``article.text`` values are written back-to-back, in list order, with no
    separator added between articles.

    Parameters
    ----------
    link_list : iterable of str
        Article URLs to download.
    filename : str
        Path of the output file. Its parent directory must already exist.
    file_use : str
        Mode string handed to ``open`` (e.g. 'w' to overwrite, 'a' to append).

    Returns
    -------
    None
    """
    # The ``with`` block guarantees the file is flushed and closed even when a
    # download or parse fails part-way through the list.
    # NOTE(review): no explicit encoding is passed, so the platform default
    # is used for the output file.
    with open(filename, file_use) as f:
        for link in link_list:
            article = Article(link)
            article.download()
            article.parse()
            f.write(article.text)

In [252]:
# Save the Fox article texts; 'w' overwrites any earlier output file.
write_full_articles(
    link_list=fox_pol_links,
    filename='data/fox_pol_feb_20_26.txt',
    file_use='w',
)

In [14]:
# Save the Breitbart article texts; 'w' overwrites any earlier output file.
write_full_articles(
    link_list=breitbart_pol_links,
    filename='data/breitbart_pol_feb_20_26.txt',
    file_use='w',
)

In [200]:
#write_full_articles(nyt_pol_links,'data/nyt_pol_feb_20_26.txt')

In [326]:
# Save the HuffPost article texts; 'w' overwrites any earlier output file.
write_full_articles(
    link_list=huffpost_pol_links,
    filename='data/huffpost_pol_feb_20_26.txt',
    file_use='w',
)

### Import More Sources for NYT 

In [230]:
import xml.etree.ElementTree as ET

In [231]:
# nyt_pol_2_25_RSS.xml is a saved copy of the NYT RSS feed
# (http://www.nytimes.com/services/xml/rss/index.html), section US > Politics,
# saved on 2/25. The path is relative, so the file has to sit in the notebook's
# working directory.
tree = ET.parse('nyt_pol_2_25_RSS.xml')
root = tree.getroot()
# List of the <channel> elements that are direct children of the root.
channel = root.findall("./channel")

In [243]:
# Add politics articles from the saved RSS feed to nyt_pol_links, skipping any
# link that is already in the list.
for item in channel[0].findall('item'):
    link_node = item.find('link')
    link = link_node.text if link_node is not None else None
    if not link:
        # An <item> with a missing or empty <link> has nothing to add.
        continue

    politics_link = re.findall(r"http://www.nytimes.com/2017/02/2\d/us/politics/.*", link)
    if not politics_link:
        continue

    # Drop the RSS tracking suffix. Raw string: "\?" is an invalid escape in a
    # normal string literal. A link without the suffix is kept as-is rather
    # than raising IndexError.
    tracked = re.search(r"(.*)\?partner=rss&emc=rss", link)
    clean_link = tracked.group(1) if tracked else link
    if clean_link not in nyt_pol_links:
        nyt_pol_links.append(clean_link)

In [245]:
# Save the NYT article texts; 'w' overwrites any earlier output file.
write_full_articles(
    link_list=nyt_pol_links,
    filename='data/nyt_pol_feb_20_26.txt',
    file_use='w',
)