## Blueprint: download and interprete robots.txt

In [1]:
import urllib.robotparser
rp = urllib.robotparser.RobotFileParser()
rp.set_url("https://www.reuters.com/robots.txt")
rp.read()
rp.can_fetch("*", "https://www.reuters.com/sitemap.xml")

True

## Blueprint: Finding URLs from sitemap.xml

In [8]:
import xmltodict
import requests

sitemap = xmltodict.parse(requests.get('https://www.reuters.com/sitemap_news_index1.xml')\
                          .text)
urls = [url["loc"] for url in sitemap["urlset"]["url"]]
print("\n".join(urls[:3]))

https://www.reuters.com/article/us-usa-hongkong-restrictions/us-curbs-visas-for-chinese-officials-over-hong-kong-autonomy-idUSKBN23X2DN
https://www.reuters.com/article/us-global-oil/oil-dips-on-rise-in-us-coronavirus-cases-set-for-weekly-fall-idUSKBN23X01R
https://www.reuters.com/article/usa-lgbt-entertainment/refile-new-yorks-stonewall-inn-seeks-help-to-fight-closure-after-lockdown-idUSL8N2E33Z1


## Blueprint: Finding URLs from RSS

In [33]:
import feedparser
feed = feedparser.parse("http://rss.cnn.com/rss/cnn_topstories.rss")

In [38]:
[(e.title, e.link) for e in feed.entries]

[("The vice president's defense shows the wide gap between health officials' guidelines and what Trump himself is actually doing",
  'http://rss.cnn.com/~r/rss/cnn_topstories/~3/F27151RPv4c/index.html'),
 ("Pence trumpets 'encouraging news' as some states see surge in cases",
  'http://rss.cnn.com/~r/rss/cnn_topstories/~3/8ndgU1YUQio/h_5218fd02b7ef18940277beb6a18bde02'),
 ('Don Lemon: This is what American carnage looks like',
  'http://rss.cnn.com/~r/rss/cnn_topstories/~3/bpeaGbaW2Ks/dons-take-trump-coronavirus-testing-lemon-ctn-vpx.cnn'),
 ('Pressure mounts on DeSantis as Covid-19 cases spike in Florida',
  'http://rss.cnn.com/~r/rss/cnn_topstories/~3/F46pkWHZFbM/index.html'),
 ("Dr. Sanjay Gupta: I can't believe we're in the position we're in",
  'http://rss.cnn.com/~r/rss/cnn_topstories/~3/I0-7CB_tQDI/coronavirus-frustration-gupta-newday-vpx.cnn'),
 ("The region that's leading the country in mask-wearing",
  'http://rss.cnn.com/~r/rss/cnn_topstories/~3/mLph_FQWJfs/index.html'),
 ('

## Example: Downloading HTML with Python

In [39]:
%%time
s = requests.Session()

for url in urls[0:10]:
    file = url.split("/")[-1]
    
    r = s.get(url)
    with open(file, "w+b") as f:
        f.write(r.text.encode("utf-8"))

CPU times: user 47.8 ms, sys: 10.6 ms, total: 58.4 ms
Wall time: 2.26 s


## Blueprint: Downloading HTML pages with wget

In [40]:
with open("urls.txt", "w+b") as f:
    f.write("\n".join(urls).encode("utf-8"))

## Blueprint: Extraction with Regex

In [43]:
url = 'https://www.reuters.com/article/us-health-vaping-marijuana-idUSKBN1WG4KT'

# use the part after the last / as filename
file = url.split("/")[-1]
import re

with open(file, "r") as f:
    html = f.read()
    g = re.search(r'<title>(.*)</title>', html, re.MULTILINE|re.DOTALL)
    if g:
        print(g.groups()[0])


                Banned in Boston: Without vaping, medical marijuana patients must adapt - Reuters


## Blueprint: Extracting the Title/Headline

In [44]:
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, "html.parser")
soup.select("h1.ArticleHeader_headline")

[<h1 class="ArticleHeader_headline">Banned in Boston: Without vaping, medical marijuana patients must adapt</h1>]

In [45]:
soup.h1

<h1 class="ArticleHeader_headline">Banned in Boston: Without vaping, medical marijuana patients must adapt</h1>

In [46]:
soup.h1.text

'Banned in Boston: Without vaping, medical marijuana patients must adapt'

In [47]:
soup.title.text

'\n                Banned in Boston: Without vaping, medical marijuana patients must adapt - Reuters'

## Blueprint: Extracting the Article Text

In [48]:
soup.select_one("div.StandardArticleBody_body").text

'BOSTON (Reuters) - In the first few days of the four-month ban on all vaping products in Massachusetts, Laura Lee Medeiros, a medical marijuana patient, began to worry.\xa0 FILE PHOTO: An employee puts down an eighth of an ounce marijuana after letting a customer smell it outside the Magnolia cannabis lounge in Oakland, California, U.S. April 20, 2018. REUTERS/Elijah NouvelageThe 32-year-old massage therapist has a diagnosis of post-traumatic stress disorder (PTSD) from childhood trauma. To temper her unpredictable panic attacks, she relied on a vape pen and cartridges filled with the marijuana derivatives THC and CBD from state dispensaries. There are other ways to get the desired effect from  marijuana, and patients have filled dispensaries across the state in recent days to ask about edible or smokeable forms. But Medeiros has come to depend on her battery-powered pen, and wondered how she would cope without her usual supply of cartridges.  “In the midst of something where I’m on t

## Blueprint: Extracting Image Captions

In [49]:
soup.select("div.StandardArticleBody_body figure img")

[<img aria-label="FILE PHOTO: An employee puts down an eighth of an ounce marijuana after letting a customer smell it outside the Magnolia cannabis lounge in Oakland, California, U.S. April 20, 2018. REUTERS/Elijah Nouvelage" src="//s3.reutersmedia.net/resources/r/?m=02&amp;d=20191001&amp;t=2&amp;i=1435991144&amp;r=LYNXMPEF9039L&amp;w=20"/>,
 <img src="//s3.reutersmedia.net/resources/r/?m=02&amp;d=20191001&amp;t=2&amp;i=1435991145&amp;r=LYNXMPEF9039M"/>]

In [50]:
soup.select("div.StandardArticleBody_body figcaption")

[<figcaption><div class="Image_caption"><span>FILE PHOTO: An employee puts down an eighth of an ounce marijuana after letting a customer smell it outside the Magnolia cannabis lounge in Oakland, California, U.S. April 20, 2018. REUTERS/Elijah Nouvelage</span></div></figcaption>,
 <figcaption class="Slideshow_caption">Slideshow<span class="Slideshow_count"> (2 Images)</span></figcaption>]

## Blueprint: Extracting the URL

In [51]:
soup.find("link", {"rel": "canonical"})["href"]

'https://www.reuters.com/article/us-health-vaping-marijuana-idUSKBN1WG4KT'

## Blueprint: Extracting List Information (Authors)

In [52]:
soup.find("meta", {"name": "Author"})["content"]

'Jacqueline Tempera'

In [53]:
sel = "div.BylineBar_first-container.ArticleHeader_byline-bar div.BylineBar_byline span"
soup.select(sel)

[<span><a href="/journalists/jacqueline-tempera" target="_blank">Jacqueline Tempera</a>, </span>,
 <span><a href="/journalists/jonathan-allen" target="_blank">Jonathan Allen</a></span>]

In [54]:
[a.text for a in soup.select(sel)]

['Jacqueline Tempera, ', 'Jonathan Allen']

## Blueprint: Extracting Text of Links (Section)

In [55]:
soup.select_one("div.ArticleHeader_channel a").text

'Health News'

## Blueprint: Extracting Reading Time

In [56]:
soup.select_one("p.BylineBar_reading-time").text

'6 Min Read'

## Blueprint: Extracting Attributes (ID)

In [57]:
soup.select_one("div.StandardArticle_inner-container")["id"]

'USKBN1WG4KT'

## Blueprint: Extracting Attribution

In [58]:
soup.select_one("p.Attribution_content").text

'Reporting Jacqueline Tempera in Brookline and Boston, Massachusetts, and Jonathan Allen in New York; Editing by Frank McGurty and Bill Berkrot'

## Blueprint: Extracting Timestamp

In [59]:
p_time = soup.find("meta", {"property": "og:article:published_time"})["content"]
print(p_time)

2019-10-01T19:23:16+0000


In [60]:
from dateutil import parser
parser.parse(p_time)

datetime.datetime(2019, 10, 1, 19, 23, 16, tzinfo=tzutc())

## Blueprint: Spidering

In [61]:
import requests
from bs4 import BeautifulSoup
import os.path
from dateutil import parser

REUT_URL = "https://www.reuters.com"
ARCH_URL = REUT_URL + "/news/archive/"
AUTHORS_ITEM = "div.BylineBar_first-container.ArticleHeader_byline-bar div.BylineBar_byline span"


def download_archive_page(page):
    filename = "page-%06d.html" % page
    if not os.path.isfile(filename):
        url = ARCH_URL + "?view=page&page=%d&pageSize=10" % page
        r = requests.get(url)
        with open(filename, "w+") as f:
            f.write(r.text)
            
def parse_archive_page(page_file):
    with open(page_file, "r") as f:
        html = f.read()
        
    soup = BeautifulSoup(html, "html.parser")
    hrefs = [REUT_URL + a["href"]
             for a in soup.select("article.story div.story-content a")]
    return hrefs

def download_article(url):
    # Check if it's there
    filename = url.split("/")[-1] + ".html"
    if not os.path.isfile(filename):
        r = requests.get(url)
        with open(filename, "w+") as f:
            f.write(r.text)
            
def parse_article(article_file):
    with open(article_file, "r") as f:
        html = f.read()
    r = {}
    soup = BeautifulSoup(html, "html.parser")
    r["id"] = soup.select_one("div.StandardArticle_inner-container")["id"]
    r["url"] = soup.find("link", {"rel": "canonical"})["href"]
    r["headline"] = soup.h1.text
    r["section"] = soup.select_one("div.ArticleHeader_channel a").text
    r["text"] = soup.select_one("div.StandardArticleBody_body").text
    r["authors"] = [a.text for a in soup.select(AUTHORS_ITEM)]
    r["time"] = soup.find("meta", {"property": "og:article:published_time"})["content"]
    return r

Download 10 pages of archive

In [67]:
for p in range(1, 10):
    download_archive_page(p)

Parse archive and add to article_urls

In [64]:
import glob

In [65]:
article_urls = []
for page_file in glob.glob("page-*.html"):
    article_urls += parse_archive_page(page_file)

In [66]:
print(len(article_urls))
article_urls[:4]

117


['https://www.reuters.com/article/us-nike-layoffs/nike-plans-to-cut-jobs-in-digital-push-idUSKBN23X2AV',
 'https://www.reuters.com/article/us-health-coronavirus-usa-florida/florida-stops-bars-serving-alcohol-onsite-amid-coronavirus-surge-idUSKBN23X29E',
 'https://www.reuters.com/article/us-wirecard-accounts-softbank/softbank-plans-to-sue-ey-over-wirecard-scandal-der-spiegel-idUSKBN23X1A8',
 'https://www.reuters.com/article/us-asean-summit/amid-pandemic-se-asian-nations-warn-of-alarming-south-china-sea-incidents-idUSKBN23X1F8']

Download Articles

In [68]:
for url in article_urls:
    download_article(url)

Arrange in a DataFrame

In [69]:
import pandas as pd
df = pd.DataFrame()
for article_file in glob.glob("*-id???????????.html"):
    df = df.append(parse_article(article_file), ignore_index=True)
    
df["time"] = pd.to_datetime(df["time"])
df.head()

Unnamed: 0,authors,headline,id,section,text,time,url
0,[Brad Brooks],"New U.S. coronavirus cases hit 45,242 for bigg...",USKBN23X29G,Health News,"LUBBOCK, Texas (Reuters) - The United States r...",2020-06-27 01:49:48+00:00,https://www.reuters.com/article/us-health-coro...
1,[Patricia Zengerle],"Senate advances $740 billion defense bill, sta...",USKBN23W38I,Politics,A man walks past the U.S. Capitol building in ...,2020-06-25 20:15:07+00:00,https://www.reuters.com/article/us-usa-defense...
2,[],Democrats say U.S. withdrawal from Open Skies ...,USKBN23T373,Politics,FILE PHOTO - U.S. Senate Minority Leader Chuck...,2020-06-23 15:55:09+00:00,https://www.reuters.com/article/us-usa-russia-...
3,[],New Jersey public schools to reopen 'in some c...,USKBN23X2HZ,U.S.,FILE PHOTO: New Jersey Governor Phil Murphy ta...,2020-06-26 17:54:51+00:00,https://www.reuters.com/article/us-health-corn...
4,[Simon Lewis],University of Michigan pulls out of Trump-Bide...,USKBN23U25Y,Politics,FILE PHOTO: Democratic U.S. presidential candi...,2020-06-23 13:47:24+00:00,https://www.reuters.com/article/us-usa-electio...


# Density-based Text Extraction

In [70]:
from readability import Document

doc = Document(html)
doc.title()

'Banned in Boston: Without vaping, medical marijuana patients must adapt - Reuters'

In [71]:
doc.short_title()

'Banned in Boston: Without vaping, medical marijuana patients must adapt'

In [72]:
doc.summary()

'<html><body><div><div class="StandardArticleBody_body"><p>BOSTON (Reuters) - In the first few days of the four-month ban on all vaping products in Massachusetts, Laura Lee Medeiros, a medical marijuana patient, began to worry.\xa0 </p><div class="PrimaryAsset_container"><div class="Image_container" tabindex="-1"><figure class="Image_zoom"/><figcaption><p class="Image_caption"><span>FILE PHOTO: An employee puts down an eighth of an ounce marijuana after letting a customer smell it outside the Magnolia cannabis lounge in Oakland, California, U.S. April 20, 2018. REUTERS/Elijah Nouvelage</span></p></figcaption></div></div><p>The 32-year-old massage therapist has a diagnosis of post-traumatic stress disorder (PTSD) from childhood trauma. To temper her unpredictable panic attacks, she relied on a vape pen and cartridges filled with the marijuana derivatives THC and CBD from state dispensaries. </p><p>There are other ways to get the desired effect from  marijuana, and patients have filled d

In [73]:
density_soup = BeautifulSoup(doc.summary(), "html.parser")
density_soup.body.text

'BOSTON (Reuters) - In the first few days of the four-month ban on all vaping products in Massachusetts, Laura Lee Medeiros, a medical marijuana patient, began to worry.\xa0 FILE PHOTO: An employee puts down an eighth of an ounce marijuana after letting a customer smell it outside the Magnolia cannabis lounge in Oakland, California, U.S. April 20, 2018. REUTERS/Elijah NouvelageThe 32-year-old massage therapist has a diagnosis of post-traumatic stress disorder (PTSD) from childhood trauma. To temper her unpredictable panic attacks, she relied on a vape pen and cartridges filled with the marijuana derivatives THC and CBD from state dispensaries. There are other ways to get the desired effect from  marijuana, and patients have filled dispensaries across the state in recent days to ask about edible or smokeable forms. But Medeiros has come to depend on her battery-powered pen, and wondered how she would cope without her usual supply of cartridges.  “In the midst of something where I’m on t

# All-in-one Approach

(Using Scrapy)

In [75]:
import scrapy
import logging

class ReutersArchiveSpider(scrapy.Spider):
    name = "reuters-archive"
    
    custom_settings = {
        "LOG_LEVEL": logging.WARNING,
        "FEED_FORMAT": "json",
        "FEED_URI": "reuters-archive.json"
    }
    
    start_urls = [ARCH_URL]
    
    def parse(self, response):
        for article in response.css("article.story div.story-content a"):
            yield response.follow(article.css("a::attr(href)").extract_first(),
                                  self.parse_article)
        next_page_url = response.css("a.control-nav-next::attr(href)").extract_first()
        if (next_page_url is not None) & ("page=2" not in next_page_url):
            yield response.follow(next_page_url, self.parse)
                
    def parse_article(self, response):
        yield {
            "title": response.css("h1::text").extract_first().strip(),
            "section": response.css("div.ArticleHeader_channel a::text").extract_first().strip(),
            "text": "\n".join(response.css("div.StandardArticleBody_body p::text").extract())
        }

In [None]:
if False:  # We're not actually going to do this
    from scrapy.crawler import CrawlerProcess
    process = CrawlerProcess()

    process.crawl(ReutersArchiveSpider)
    process.start()