### Script to scrape published data from NHSD webpages and output a compiled and processed CSV of FTE days available and FTE days lost by staff group and organisation

In [1]:
import requests, os
from bs4 import BeautifulSoup
import re
from datetime import datetime
from urllib.parse import urlparse
import os

In [2]:
# Host that the page's links are later prefixed with, and the landing page
# that lists every publication in the sickness-absence series.
file_source_url = 'https://digital.nhs.uk'
check_url = 'https://digital.nhs.uk/data-and-information/publications/statistical/nhs-sickness-absence-rates/'

response = requests.get(check_url)
soup = BeautifulSoup(response.content, 'html.parser')

# Only anchors pointing into this publication series are of interest; the
# same pattern is applied to both sections of the page.
series_href = re.compile("publications/statistical/nhs-sickness-absence-rates/")
past_section = soup.find(id="past-publications")
latest_section = soup.find(id="latest-statistics")

past_links = past_section.find_all(href=series_href)
latest_link = latest_section.find_all(href=series_href)

In [3]:
# Fold the first link of the "latest statistics" section into the list of
# past publications so that every release is processed in one pass.
newest_publication = latest_link[0]
past_links.append(newest_publication)

In [4]:
def retrieve_sub_url(thesoup, data_type):
    """Return the data-file links on a page whose target mentions ``data_type``.

    CSV links are preferred; ``.xlsx`` links are only considered when no CSV
    link matches.

    Args:
        thesoup: Parsed page exposing ``find_all(href=<compiled pattern>)``
            whose results support ``element['href']`` (a ``BeautifulSoup``
            document in this script).
        data_type: Lower-case substring to look for in the link target,
            e.g. ``"rate"``.

    Returns:
        A list of matching ``href`` strings; empty when nothing matches.
    """

    def matching_hrefs(extension_pattern):
        # The link target is lower-cased for the substring test only; the
        # extension pattern itself is matched case-sensitively.
        return [
            tag['href']
            for tag in thesoup.find_all(href=re.compile(extension_pattern))
            if data_type in tag['href'].lower()
        ]

    # Raw strings keep "\." a regex escape instead of an invalid string escape.
    return matching_hrefs(r"\.csv$") or matching_hrefs(r"\.xlsx$")

In [5]:
def retrieve_stats_urls(url_string, file_source_url, data_types):
    """Collect the data-file links published on one publication page.

    The publication month is read from the last ``<month>-<year>`` token in
    the final path segment of ``url_string``, so a slug that contains a date
    range resolves to the end of that range.

    Args:
        url_string: Link to the publication page, appended to
            ``file_source_url`` to form the URL that is fetched.
        file_source_url: Prefix prepended to ``url_string``.
        data_types: Iterable of substrings; each one is passed to
            ``retrieve_sub_url`` to select the matching file links.

    Returns:
        A dict mapping every entry of ``data_types`` to its list of links,
        plus ``'the_date'`` holding the publication month as a ``datetime``.
        ``None`` when the slug holds no parsable date or the page cannot be
        fetched.
    """
    slug = url_string.split('/')[-1]
    month_tokens = [
        m.group()
        for m in re.finditer(
            r"((january|february|march|april|may|june|july|august|september|october|november|december)-\d{4})",
            slug,
        )
    ]

    # Only the date handling is guarded here, so that a network problem is
    # not misreported as a date-formatting failure.
    try:
        formatted_date = datetime.strptime(month_tokens[-1], "%B-%Y")
    except (IndexError, ValueError):
        print('Could not format date')
        return None

    full_url = file_source_url + url_string
    try:
        # A timeout stops one unresponsive page from stalling the whole run.
        response = requests.get(full_url, timeout=30)
    except requests.RequestException as err:
        # Stay best-effort: report the failure and let the caller skip it.
        print(f'Could not fetch {full_url}: {err}')
        return None

    soup = BeautifulSoup(response.content, 'html.parser')

    dictionary = {
        data_type: retrieve_sub_url(soup, data_type)
        for data_type in data_types
    }
    dictionary['the_date'] = formatted_date
    return dictionary

In [None]:
# Display the collected publication links (notebook inspection only).
past_links


In [None]:
# Testing 
# reason_stats_urls_list = []
# test_links_href  = "/data-and-information/publications/statistical/nhs-sickness-absence-rates/nhs-sickness-absence-rates-february-2017"

# get_url = retrieve_stats_urls(test_links_href, file_source_url, ["reason", "rate", "covid"])
# print(get_url)

In [6]:
# NOTE: Absence by reason not available before April 2019
wanted_data_types = ["reason", "rate", "covid", "benchmark"]

# Visit every publication page in turn; pages for which no link dictionary
# could be built come back as None and are dropped.
fetched_entries = (
    retrieve_stats_urls(link['href'], file_source_url, wanted_data_types)
    for link in past_links
)
stats_urls_list = [entry for entry in fetched_entries if entry is not None]


In [None]:
# After refactoring, this step takes 41.6 s, down from about 2 minutes.

In [7]:
# Number of publication pages for which a link dictionary was built.
len(stats_urls_list)

111

In [8]:
# Display the collected link dictionaries (notebook inspection only).
stats_urls_list

[{'reason': ['https://files.digital.nhs.uk/B5/529AA8/NHS%20Sickness%20Absence%20by%20reason%20and%20staff%20group%20CSV%2C%20May%202023.csv'],
  'rate': ['https://files.digital.nhs.uk/7F/F8E444/NHS%20Sickness%20Absence%20rates%20CSV%2C%20May%202023.csv'],
  'covid': ['https://files.digital.nhs.uk/F6/C179AA/NHS%20Sickness%20Absence%2C%20COVID-19%20related%20absence%20CSV%2C%20May%202023.csv'],
  'benchmark': ['https://files.digital.nhs.uk/E3/5AEB41/NHS%20Sickness%20Absence%20benchmarking%20tool%2C%20May%202023.csv'],
  'the_date': datetime.datetime(2023, 5, 1, 0, 0)},
 {'reason': ['https://files.digital.nhs.uk/D2/EB1D77/NHS%20Sickness%20Absence%20by%20reason%20and%20staff%20group%20CSV%2C%20April%202023.csv'],
  'rate': ['https://files.digital.nhs.uk/D0/DD8DBA/NHS%20Sickness%20Absence%20rates%20CSV%2C%20April%202023.csv'],
  'covid': ['https://files.digital.nhs.uk/9E/7E5E1B/NHS%20Sickness%20Absence%2C%20COVID-19%20related%20absence%20CSV%2C%20April%202023.csv'],
  'benchmark': ['https:/

In [9]:
def download_url(url_to_file, thedate, folder):
    """Download one published data file into ``tempdir/<folder>/``.

    The file is saved as ``<YYYY-MM-DD>-<folder><qualifier>.<suffix>``:

    * ``suffix`` is ``xlsx`` when the URL mentions "xlsx", otherwise ``csv``.
    * ``qualifier`` is ``-annual``, ``-quarterly`` or ``-monthly`` for the
      ``rate`` folder (chosen from the URL text, monthly by default),
      ``-mds`` for ``reason`` files whose URL mentions "mds", and empty
      otherwise.

    Args:
        url_to_file: Absolute URL of the file to download.
        thedate: ``datetime`` used for the date prefix of the file name.
        folder: Data type (``"reason"``, ``"rate"``, ``"covid"``,
            ``"benchmark"``); also the sub-directory the file is written to.

    Returns:
        None. Nothing is written when the server answers with an error
        status; a message is printed instead.
    """
    url_lower = url_to_file.lower()

    # Set suffix for downloaded file
    suffix = "xlsx" if "xlsx" in url_lower else "csv"

    # Work out the qualifier that distinguishes files of the same data type
    additional_type = ""
    if folder == 'rate':
        if "annual" in url_lower:
            additional_type = "-annual"
        elif "quarterly" in url_lower:
            additional_type = "-quarterly"
        else:
            additional_type = "-monthly"
    elif folder == "reason" and "mds" in url_lower:
        additional_type = "-mds"

    # Set filename
    target_dir = os.path.join("tempdir", folder)
    date_stamp = thedate.strftime("%Y-%m-%d")
    file_name = os.path.join(
        target_dir, f"{date_stamp}-{folder}{additional_type}.{suffix}"
    )

    # A timeout stops one unresponsive download from stalling the whole run.
    response = requests.get(url_to_file, timeout=60)
    if not response.ok:
        # Do not store an HTML error page under a .csv/.xlsx name.
        print(f"Skipping {url_to_file}: HTTP {response.status_code}")
        return

    # The output directory may not exist yet on a fresh checkout.
    os.makedirs(target_dir, exist_ok=True)
    with open(file_name, 'wb') as file:
        file.write(response.content)

In [10]:
# Fetch every collected file, one data type after another for each
# publication. An empty link list simply contributes no downloads.
# NOTE: Absence by reason not available before April 2019
for entry in stats_urls_list:
    for folder in ("reason", "rate", "covid", "benchmark"):
        for file_url in entry[folder]:
            download_url(file_url, entry['the_date'], folder)

# About a minute to download all files