In [1]:
import os
import requests
from lxml import html
import lxml
import requests_cache
import time
from typing import Optional

#  Download data from gov.uk

In [2]:
url = 'https://www.gov.uk/search/policy-papers-and-consultations?content_store_document_type%5B%5D=policy_papers&order=updated-newest'

In [3]:
requests_cache.install_cache(cache_name='gov_cache', backend='sqlite', expire_after=180)
session = requests.Session()

In [4]:
def extract_links(url: str, pages: int = 1, user_agent: Optional[str] = None) -> list:
    results = []
    page_number = 1

    # Parse through pages (maximum defined in function)
    while True and page_number <= pages:
        if user_agent != None:
            session.headers.update({'User-Agent': user_agent})
        
        response = session.get(f"{url}&page={page_number}")
        if response.status_code != 200:
            break
        now = time.ctime(int(time.time()))
        
        # Extract links/href from webpage into list
        print("Time: {0} / Used Cache: {1}".format(now, response.from_cache)) 
        tree = html.fromstring(response.content)
        body = tree.xpath("""//div[contains(@class, 'finder-results')]
        //li[contains(@class, 'gem-c-document-list__item')]/a/@href""")
        results.extend(body)

        page_number = page_number + 1
        time.sleep(1) # for delay
        
    return results


def absolute_link(link: str, head_url: str = 'https://www.gov.uk') -> str:
    # Return absolute links (convert from relative)
    link = link.strip()
    if link.startswith('/'):
        link = head_url + link
    return link

In [5]:
# Absolute links of 1st page
list(map(absolute_link, extract_links(url)))

Time: Mon Jan 23 18:53:28 2023 / Used Cache: False


['https://www.gov.uk/government/publications/monitoring-noise-and-vibration-on-the-hs2-phase-one-and-2a-route-november-2022',
 'https://www.gov.uk/government/publications/monitoring-air-quality-and-dust-on-the-hs2-phase-one-and-2a-route-november-2022',
 'https://www.gov.uk/government/publications/statement-of-reasons-related-to-the-use-of-section-35-of-the-scotland-act-1998',
 'https://www.gov.uk/government/publications/cruiser-sb-emergency-registration-report',
 'https://www.gov.uk/government/publications/hmrc-approach-to-working-with-agents',
 'https://www.gov.uk/government/publications/defence-science-and-technology-programmes-and-projects',
 'https://www.gov.uk/government/publications/plastic-packaging-tax',
 'https://www.gov.uk/government/publications/monitoring-noise-and-vibration-on-the-hs2-phase-one-and-2a-route-october-2022',
 'https://www.gov.uk/government/publications/monitoring-air-quality-and-dust-on-the-hs2-phase-one-and-2a-route-october-2022',
 'https://www.gov.uk/govern

In [6]:
# Absolute links of 3 pages
list(map(absolute_link, extract_links(url, 3)))

Time: Mon Jan 23 18:53:32 2023 / Used Cache: True
Time: Mon Jan 23 18:53:34 2023 / Used Cache: False
Time: Mon Jan 23 18:53:35 2023 / Used Cache: False


['https://www.gov.uk/government/publications/monitoring-noise-and-vibration-on-the-hs2-phase-one-and-2a-route-november-2022',
 'https://www.gov.uk/government/publications/monitoring-air-quality-and-dust-on-the-hs2-phase-one-and-2a-route-november-2022',
 'https://www.gov.uk/government/publications/statement-of-reasons-related-to-the-use-of-section-35-of-the-scotland-act-1998',
 'https://www.gov.uk/government/publications/cruiser-sb-emergency-registration-report',
 'https://www.gov.uk/government/publications/hmrc-approach-to-working-with-agents',
 'https://www.gov.uk/government/publications/defence-science-and-technology-programmes-and-projects',
 'https://www.gov.uk/government/publications/plastic-packaging-tax',
 'https://www.gov.uk/government/publications/monitoring-noise-and-vibration-on-the-hs2-phase-one-and-2a-route-october-2022',
 'https://www.gov.uk/government/publications/monitoring-air-quality-and-dust-on-the-hs2-phase-one-and-2a-route-october-2022',
 'https://www.gov.uk/govern

# Processing each link

In [7]:
def extract_metadata(url: str, user_agent: Optional[str] = None) -> dict:
    results = {url: {}}

    if user_agent != None:
        session.headers.update({'User-Agent': user_agent})

    response = session.get(f"{url}")
    now = time.ctime(int(time.time()))

    print("Time: {0} / Used Cache: {1}".format(now, response.from_cache)) 
    tree = html.fromstring(response.content)
    
    # Extract header
    body = tree.xpath("""//meta[@property='og:title']/@content""")
    results[url].update({'title': body[0]})
    
    #Extract author(s)
    body = tree.xpath("""//div[contains(@class, 'gem-c-metadata')]
    //a[contains(@class, 'govuk-link')]/text()""")
    results[url].update({'authors': body})

    time.sleep(1) # for delay

    return results


def authors_test():
    results = []
    expected_results = [1, 2, 6] # number of authors from each file
    path = os.getcwd().replace('/code', '/fixtures')
    html_files = ['one_author.html', 
                 'two_authors.html', 
                 'multiple_authors_accented_title.html']
    
    for f in html_files:
        with open(f'{path}/{f}', "r") as f:
            page = f.read()
        tree = html.fromstring(page)

        #Extract author(s)
        body = tree.xpath("""//div[contains(@class, 'gem-c-metadata')]
        //a[contains(@class, 'govuk-link')]/text()""")
        results.append(len(body))

    assert results == expected_results


In [8]:
links50 = list(map(absolute_link, extract_links(url, 3)))[:50]

Time: Mon Jan 23 18:53:42 2023 / Used Cache: True
Time: Mon Jan 23 18:53:43 2023 / Used Cache: True
Time: Mon Jan 23 18:53:44 2023 / Used Cache: True


## Run test

In [9]:
# Passed
authors_test()

## Extract title & authors for first 50 links

In [10]:
[extract_metadata(l) for l in links50]

Time: Mon Jan 23 18:54:12 2023 / Used Cache: False
Time: Mon Jan 23 18:54:13 2023 / Used Cache: False
Time: Mon Jan 23 18:54:14 2023 / Used Cache: False
Time: Mon Jan 23 18:54:15 2023 / Used Cache: False
Time: Mon Jan 23 18:54:16 2023 / Used Cache: False
Time: Mon Jan 23 18:54:17 2023 / Used Cache: False
Time: Mon Jan 23 18:54:18 2023 / Used Cache: False
Time: Mon Jan 23 18:54:20 2023 / Used Cache: False
Time: Mon Jan 23 18:54:21 2023 / Used Cache: False
Time: Mon Jan 23 18:54:22 2023 / Used Cache: False
Time: Mon Jan 23 18:54:23 2023 / Used Cache: False
Time: Mon Jan 23 18:54:24 2023 / Used Cache: False
Time: Mon Jan 23 18:54:25 2023 / Used Cache: False
Time: Mon Jan 23 18:54:26 2023 / Used Cache: False
Time: Mon Jan 23 18:54:28 2023 / Used Cache: False
Time: Mon Jan 23 18:54:29 2023 / Used Cache: False
Time: Mon Jan 23 18:54:30 2023 / Used Cache: False
Time: Mon Jan 23 18:54:31 2023 / Used Cache: False
Time: Mon Jan 23 18:54:32 2023 / Used Cache: False
Time: Mon Jan 23 18:54:33 2023 

[{'https://www.gov.uk/government/publications/monitoring-noise-and-vibration-on-the-hs2-phase-one-and-2a-route-november-2022': {'title': 'Monitoring Noise and Vibration on the HS2 Phase One and 2a route (November 2022)',
   'authors': ['High Speed Two (HS2) Limited']}},
 {'https://www.gov.uk/government/publications/monitoring-air-quality-and-dust-on-the-hs2-phase-one-and-2a-route-november-2022': {'title': 'Monitoring air quality and dust on the HS2 Phase One and 2a route (November 2022)',
   'authors': ['High Speed Two (HS2) Limited']}},
 {'https://www.gov.uk/government/publications/statement-of-reasons-related-to-the-use-of-section-35-of-the-scotland-act-1998': {'title': 'Statement of reasons related to the use of section 35 of the Scotland Act 1998',
   'authors': ['Equality Hub',
    'Office of the Secretary of State for Scotland',
    'Government Equalities Office']}},
 {'https://www.gov.uk/government/publications/cruiser-sb-emergency-registration-report': {'title': 'Cruiser SB eme

## Other approaches
- Could explore using Scrapy python library which allows for more high-level web crawling and web scraping
- Could run scraper based on json file with all information needed for scraping defined

# Scaling Up

## Large Scale-Architecture
**Main points:**
- Multiple json files and Python tasks minimizing the cost of adding new scrapers (scalability)
- A set of scraping functions allowing to handle the most complex cases by customizing the json files (flexibility)
- An alerting system and a monitoring dashboard to quickly identify defective scrapers or decrease of data quality (robustness)

**Source:** https://heka-ai.medium.com/setting-up-a-large-scale-scraping-architecture-with-python-3b26cb6571a6 

## Speeding Up Scraping with Asynchronous Programming
**What is asynchronous programming?**
At it's core it's essentially just pausable functions (called coroutines) which can pause when inactive and give way to active ones. This often has huge benefits in IO (input/output) bound programs that rely on waiting for some sort of external input or output.

**Source:** https://scrapecrow.com/asynchronous-web-scraping.html

## Make Proxy Management a Priority
**Main points:**
- Defining traffic profile: websites, volume, and geo-locations
- Understanding the available proxy pool: location, type (data center or residential)
- Proxy management, takes into account:
    - proxy rotation
    - automatic header management
    - geolocation based on needs
    - maintaining sessions
    
**Source:** https://www.zyte.com/learn/how-to-scale-up-web-scraping/