# Scraping Websites and Extracting Data

## Remark

The example provided is adapted from Albrecht, Ramachandran, and Winkler. The Reuters website has changed significantly and is now more complicated to search. For the examples below we will use the Internet Archive version. We could use alternative libraries with utility functions to handle the RSS obfuscation, but this would go beyond the scope of our illustration.

Several layout and formatting commands, like `figsize` to control figure size or subplot commands, are not necessary but reflect formatting preferences.

## Setup: Load Python Settings

Common imports, defaults for formatting in Matplotlib, Pandas etc.

In [None]:
# suppress warnings
import warnings;
warnings.filterwarnings('ignore');

# common imports
import pandas as pd
import numpy as np
import math
import re
import glob
import os
import sys
import json
import random
import pprint as pp
import textwrap
import sqlite3
import logging

import spacy
import nltk

from tqdm.auto import tqdm
# register `pandas.progress_apply` and `pandas.Series.map_apply` with `tqdm`
tqdm.pandas()

# pandas display options
# https://pandas.pydata.org/pandas-docs/stable/user_guide/options.html#available-options
pd.options.display.max_columns = 30 # default 20
pd.options.display.max_rows = 60 # default 60
pd.options.display.float_format = '{:.2f}'.format
# pd.options.display.precision = 2
pd.options.display.max_colwidth = 200 # default 50; -1 = all
# otherwise text between $ signs will be interpreted as formula and printed in italic
pd.set_option('display.html.use_mathjax', False)

# np.set_printoptions(edgeitems=3) # default 3

import matplotlib
from matplotlib import pyplot as plt

plot_params = {'figure.figsize': (8, 4), 
               'axes.labelsize': 'large',
               'axes.titlesize': 'large',
               'xtick.labelsize': 'large',
               'ytick.labelsize':'large',
               'figure.dpi': 100}
# adjust matplotlib defaults
matplotlib.rcParams.update(plot_params)

import seaborn as sns
sns.set_style("darkgrid")


# Download and interpret robots.txt

In [None]:
import urllib.robotparser
rp = urllib.robotparser.RobotFileParser()
rp.set_url("https://www.reuters.com/robots.txt")
rp.read()

In [None]:
rp.can_fetch("*", "https://www.reuters.com/arc/outboundfeeds/news-sitemap/?outputType=xml")

In [None]:
rp.can_fetch("*", "https://www.reuters.com/finance/stocks/option")

# Finding URLs from sitemap.xml

In [None]:
# might need to install xmltodict
%conda install xmltodict


In [None]:
import xmltodict
import requests

sitemap = xmltodict.parse(requests.get('https://www.reuters.com/arc/outboundfeeds/news-sitemap/?outputType=xml').text)

In [None]:
# just see some of the URLs
urls = [url["loc"] for url in sitemap["urlset"]["url"]]
print("\n".join(urls[0:3]))

# Finding URLs from RSS

Reuters removed its RSS feed. However, we can use a saved copy from the Internet Archive.

In [None]:
# might need to install feedparser
%conda install feedparser

In [None]:
import feedparser
feed = feedparser.parse('http://web.archive.org/web/20200613003232if_/http://feeds.reuters.com/Reuters/worldNews')

In [None]:
[(e.title, e.link) for e in feed.entries]

In [None]:
[e.id for e in feed.entries]

## Downloading HTML pages with Python

In [None]:
%%time
# Reuse one HTTP session for all downloads (and for later cells that use `s`).
s = requests.Session()
for url in urls[0:10]:
    # Use the last non-empty path segment of the URL as filename. Stripping a
    # trailing "/" first avoids an empty filename (and a failing open()) for
    # URLs that end with a slash.
    file = url.rstrip("/").split("/")[-1]

    r = s.get(url)
    # store the page UTF-8 encoded, independent of the platform default
    with open(file, "wb") as f:
        f.write(r.text.encode('utf-8'))

In [None]:
with open("urls.txt", "w+b") as f:
    f.write("\n".join(urls).encode('utf-8'))

# Extraction with regular expressions

We first have to download a single article

In [None]:
# article that the following cells read back via `file` and `url`
url = 'https://www.reuters.com/article/us-health-vaping-marijuana-idUSKBN1WG4KT'

# local filename: the part of the URL after the last "/" plus ".html"
file = url.split("/")[-1] + ".html"

r = requests.get(url)

# NOTE(review): no encoding is passed to open(), so the platform default is
# used here — consider encoding="utf-8" (together with the cell that reads
# the file back).
with open(file, "w+") as f:
    f.write(r.text)

In [None]:
import re
# read the previously saved article
with open(file, "r") as f:
    html = f.read()

# Non-greedy group: with re.DOTALL a greedy `.*` would run from the first
# <title> to the *last* </title> in the document (e.g. titles of inline SVG
# graphics) instead of stopping at the end of the first title element.
g = re.search(r'<title>(.*?)</title>', html, re.DOTALL)
if g:
    print(g.group(1))

# Using an HTML parser for extraction

We download the articles from the Internet Archive, which still has the old HTML structure.

In [None]:
WA_PREFIX = "http://web.archive.org/web/20200118131624/"
html = s.get(WA_PREFIX + url).text

In [None]:
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, 'html.parser')
soup.select("h1.ArticleHeader_headline")

## Extracting the title/headline

In [None]:
soup.h1

In [None]:
soup.h1.text

In [None]:
soup.title.text

In [None]:
soup.title.text.strip()

## Extracting the article text

In [None]:
soup.select_one("div.StandardArticleBody_body").text

## Extracting image captions

In [None]:
soup.select("div.StandardArticleBody_body figure")

In [None]:
soup.select("div.StandardArticleBody_body figure img")

In [None]:
soup.select("div.StandardArticleBody_body figcaption")

## Extracting the URL

In [None]:
soup.find("link", {'rel': 'canonical'})['href']

In [None]:
soup.select_one("link[rel=canonical]")['href']

## Extracting list information (authors)

In [None]:
soup.find("meta", {'name': 'Author'})['content']

In [None]:
sel = "div.BylineBar_first-container.ArticleHeader_byline-bar div.BylineBar_byline span"
soup.select(sel)

In [None]:
[a.text for a in soup.select(sel)]

## Extracting text of links (section)


In [None]:
soup.select_one("div.ArticleHeader_channel a").text

## Extracting reading time

In [None]:
soup.select_one("p.BylineBar_reading-time").text

## Extracting attributes (id)

In [None]:
soup.select_one("div.StandardArticle_inner-container")['id']

## Extracting Attribution

In [None]:
soup.select_one("p.Attribution_content").text

## Extracting Timestamp

In [None]:
ptime = soup.find("meta", { 'property': "og:article:published_time"})['content']
print(ptime)

In [None]:
from dateutil import parser
parser.parse(ptime)

In [None]:
parser.parse(soup.find("meta", { 'property': "og:article:modified_time"})['content'])

## Spidering

In [None]:
import requests
from bs4 import BeautifulSoup
import os.path
from dateutil import parser

def download_archive_page(page):
    """Download one page of the Reuters news archive unless it is cached.

    The HTML is stored in the current working directory as
    ``page-<page, zero-padded to 6 digits>.html``. If that file already
    exists, no request is made.

    Args:
        page: Number of the archive listing page; used for the ``page`` query
            parameter and for the cache filename.

    Raises:
        requests.HTTPError: If the server answers with an error status. No
            file is written in that case, so a later call can retry.
    """
    filename = "page-%06d.html" % page
    if os.path.isfile(filename):
        return

    url = "https://www.reuters.com/news/archive/" + \
          "?view=page&page=%d&pageSize=10" % page
    # a timeout keeps a stalled connection from blocking the notebook forever
    r = requests.get(url, timeout=30)
    # never cache an error page: the isfile() check above would keep it forever
    r.raise_for_status()
    # explicit UTF-8, like the other files written in this notebook
    with open(filename, "w", encoding="utf-8") as f:
        f.write(r.text)

def parse_archive_page(page_file):
    """Extract the article URLs linked from a saved archive listing page.

    Args:
        page_file: Path of an HTML file containing an archive listing page.

    Returns:
        A list of absolute URLs, one per anchor matched by
        ``article.story div.story-content a`` that has an ``href``, in
        document order.
    """
    from urllib.parse import urljoin

    # Pass the raw bytes to BeautifulSoup so that it detects the document
    # encoding itself instead of relying on the platform default encoding.
    with open(page_file, "rb") as f:
        soup = BeautifulSoup(f.read(), 'html.parser')

    # `a[href]` skips anchors without a link target (a['href'] would raise).
    anchors = soup.select("article.story div.story-content a[href]")
    # urljoin resolves site-relative links against the Reuters host and leaves
    # links that are already absolute untouched.
    return [urljoin("https://www.reuters.com", a['href']) for a in anchors]

def download_article(url):
    """Download an article page unless a local copy already exists.

    The page is stored in the current working directory as
    ``<last path segment of url>.html``.

    Args:
        url: Absolute URL of the article.

    Raises:
        requests.HTTPError: If the server answers with an error status. No
            file is written in that case, so a later call can retry.
    """
    # Strip a trailing "/" first; otherwise every URL ending in a slash would
    # map to the same file name ".html".
    filename = url.rstrip("/").split("/")[-1] + ".html"
    # check if article already there
    if os.path.isfile(filename):
        return

    # a timeout keeps a stalled connection from blocking the notebook forever
    r = requests.get(url, timeout=30)
    # never cache an error page: the isfile() check above would keep it forever
    r.raise_for_status()
    # explicit UTF-8, like the other files written in this notebook
    with open(filename, "w", encoding="utf-8") as f:
        f.write(r.text)

def parse_article(article_file):
    """Parse a saved article page into a flat dictionary.

    Args:
        article_file: Path of an HTML file containing one article.

    Returns:
        A dict with the keys

        * ``url``      - ``href`` of the ``<link rel="canonical">`` element
        * ``id``       - the part of ``url`` after its last ``-``
        * ``headline`` - text of the first ``<h1>``
        * ``section``  - text of the first element whose class attribute
          contains ``ArticleHeader-channel``
        * ``text``     - texts of all elements whose class attribute contains
          ``Paragraph-paragraph``, joined by newlines
        * ``authors``  - text of the first element whose class attribute
          contains ``Attribution-attribution``
        * ``time``     - ``content`` of the ``og:article:published_time``
          meta element (unparsed string)

    Raises:
        IndexError: If no element matches the section or attribution class.
        TypeError, AttributeError: If the canonical link, the ``<h1>`` or the
            published-time meta element is missing (the lookup yields
            ``None``).
    """
    def find_obfuscated_class(soup, klass):
        # Match on a substring of the class attribute rather than on an exact
        # class name, so that prefixes/suffixes added to the name do not matter.
        return soup.find_all(lambda tag: tag.has_attr("class") and (klass in " ".join(tag["class"])))

    # Pass the raw bytes to BeautifulSoup so that it detects the document
    # encoding itself instead of relying on the platform default encoding.
    with open(article_file, "rb") as f:
        soup = BeautifulSoup(f.read(), 'html.parser')

    r = {}
    r['url'] = soup.find("link", {'rel': 'canonical'})['href']
    r['id'] = r['url'].split("-")[-1]
    r['headline'] = soup.h1.text
    r['section'] = find_obfuscated_class(soup, "ArticleHeader-channel")[0].text

    r['text'] = "\n".join([t.text for t in find_obfuscated_class(soup, "Paragraph-paragraph")])
    r['authors'] = find_obfuscated_class(soup, "Attribution-attribution")[0].text
    r['time'] = soup.find("meta", { 'property': "og:article:published_time"})['content']
    return r

In [None]:
# download archive pages; note that range(1, 2) covers only page 1
# (use range(1, 3) to fetch two pages)
for p in range(1, 2):
    download_archive_page(p)

In [None]:
# parse archive and add to article_urls
import glob

# gather the article links of every archive page found on disk
article_urls = []
for page_file in glob.glob("page-*.html"):
    article_urls.extend(parse_archive_page(page_file))

In [None]:
# download articles
for url in article_urls:
    download_article(url)

In [None]:
# arrange in pandas DataFrame
import pandas as pd

# Parse all downloaded articles first and build the DataFrame in one step:
# DataFrame.append was removed in pandas 2.0, and appending row by row copies
# the whole frame on every iteration.
records = [parse_article(article_file)
           for article_file in glob.glob("*-id???????????.html")]
df = pd.DataFrame(records)
# convert the timestamp strings into proper datetime values
df['time'] = pd.to_datetime(df['time'])

In [None]:
df

In [None]:
df.sort_values("time")

# Scrapy

Unfortunately, the code for `scrapy` cannot be changed easily. One more argument for using *up to date* separate libraries. In this version, it still collects the titles of the articles but nothing more.

In [None]:
# might need to install scrapy
%conda install scrapy

In [None]:
import scrapy
import logging


class ReutersArchiveSpider(scrapy.Spider):
    """Spider that follows the article links of the Reuters news archive.

    Starting from the archive listing, every linked article is requested and
    its title is emitted as an item ``{'title': ...}``. Pagination is followed
    through the "next" link as long as that link does not contain ``page=2``.
    """
    name = 'reuters-archive'

    # Items are exported as JSON to `reuters-archive.json`.
    # NOTE(review): newer Scrapy releases prefer the single FEEDS setting over
    # FEED_FORMAT/FEED_URI — confirm against the installed version.
    custom_settings = {
        'LOG_LEVEL': logging.WARNING,
        'FEED_FORMAT': 'json',
        'FEED_URI': 'reuters-archive.json'
    }

    start_urls = [
        'https://www.reuters.com/news/archive/',
    ]

    def parse(self, response):
        """Yield a request per article link and, if allowed, for the next page."""
        # Selecting the href attribute directly skips anchors without a link
        # target, for which response.follow() would be handed None.
        for href in response.css("article.story div.story-content a::attr(href)").extract():
            yield response.follow(href, self.parse_article)

        next_page_url = response.css('a.control-nav-next::attr(href)').extract_first()
        # `and` short-circuits; the bitwise `&` evaluated both operands and
        # raised a TypeError on `'page=2' not in None` when no next link exists.
        if next_page_url is not None and 'page=2' not in next_page_url:
            yield response.follow(next_page_url, self.parse)

    def parse_article(self, response):
        """Yield the stripped text of the first ``<h1>`` as the article title."""
        title = response.css('h1::text').extract_first()
        if title is None:
            # report the page instead of failing on None.strip()
            self.logger.warning("no <h1> text found on %s", response.url)
            return
        yield {
          'title': title.strip(),
        }

In [None]:
# this can be run only once from a Jupyter notebook due to Twisted
from scrapy.crawler import CrawlerProcess
process = CrawlerProcess()

process.crawl(ReutersArchiveSpider)
process.start()

In [None]:
glob.glob("*.json")

In [None]:
!cat 'reuters-archive.json'