In [317]:
import requests
import json
import re
import numpy as np
import pandas as pd

from bs4 import BeautifulSoup
from lxml import etree


In [89]:
API_URL = 'https://api.beta.ons.gov.uk/v1/datasets'
RSS_URL = 'https://backfeed.strangecode.com/DxbyswYRHTXxwBFbsk/http://www.ons.gov.uk/releasecalendar%3Frss'
RSS_URL2 = 'https://www.ons.gov.uk/releasecalendar?rss'

In [341]:
def get_publications_and_datasets(link):
    log = requests.get(link)
    soup = BeautifulSoup(log.content, features= 'html')
    
    h3_source = soup.findAll('h3') # Publications under an h3 header
    publications = soup.find_all('a', {'data-gtm-type': 'related-publications'})
    datasets = soup.find_all('a', {'data-gtm-type': 'related-datasets'})

    related_publications = {x.getText(): 'https://www.ons.gov.uk/'+x['href'] for x in publications}
    related_datasets = {x.getText(): 'https://www.ons.gov.uk/'+x['href'] for x in datasets}
    
    return related_publications, related_datasets

def read_publication(
    link, 
    min_text_len = 3, return_ptags = True, return_litags = False,
    remove_duplicates = True, preprocess = True):

    '''
    link: url of ONS publication
    min_text_len: minimum number of words in each sentence
    return_ptags: return any text that was in <p> tags
    return_litags: return any text that was in <li> tags
    remove_duplicates: checks and removes duplicate text
    preprocess: 
        removes text with links,

    '''

    log = requests.get(link)
    soup = BeautifulSoup(log.content, features = 'html')

    # the reason I am checking for no additional styling attributes
    # is because the ONS data uses the default for the content of interest
    if return_ptags:
        ptags = soup.find_all(lambda tag: tag.name == 'p' and not tag.attrs)
        
    if return_litags:
        litags = soup.find_all(lambda tag: tag.name == 'li' and not tag.attrs)
    
    # concat together
    if return_ptags and return_litags:
        all_text = [x.get_text() for x in (ptags + litags) if len(x.get_text().split()) > min_text_len]
    elif return_ptags and not return_litags:
        all_text = [x.get_text() for x in ptags if len(x.get_text().split()) > min_text_len]
    elif return_litags and not return_ptags:
        all_text = [x.get_text() for x in litags if len(x.get_text().split()) > min_text_len]
    else:
        raise ValueError('Did you set both return_ptag and return_litag to False?')
        
    # remove duplicates
    if remove_duplicates:
        all_text = np.unique(all_text)
    
    # preprocess
    if preprocess:
        preprocessed = []
        for string in all_text:

            # remove | because its only present in related_links
            # remove all references to twitter as these are only for following X account
            # remove all self references to ONS, as these are contact details
            if not any(bad_terms in string for bad_terms in [
                '|', 'twitter', 'Twitter', 'Office for National Statistics', 'Totals may not sum due to rounding'
                ]):

                # remove trailing whitespace
                string = string.strip()
                # remove urls
                string = re.sub(r'http\S+', '', string)
                # remove \" symbol 
                string = re.sub('\"', '', string)
                preprocessed.append(string)

        return preprocessed

    return all_text

In [343]:
log = requests.get(RSS_URL)
soup = BeautifulSoup(log.content, features = 'xml')

articles = soup.findAll('item')
result_list = []
for article in articles:
    title = article.find('title').text
    link = article.find('link').text
    date = article.find('pubDate').text

    publications, datasets = get_publications_and_datasets(link)

    entry = {
        'title': title,
        'link': link,
        'publications': publications,
        'datasets': datasets,
        'date': date
    }

    result_list.append(entry)

In [344]:
result_list

[{'title': 'Sexual orientation, UK: 2019',
  'link': 'https://www.ons.gov.uk/releases/sexualorientationuk2019',
  'publications': {'Sexual orientation, UK: 2019': 'https://www.ons.gov.uk//peoplepopulationandcommunity/culturalidentity/sexuality/bulletins/sexualidentityuk/2019'},
  'datasets': {'Sexual orientation, UK': 'https://www.ons.gov.uk//peoplepopulationandcommunity/culturalidentity/sexuality/datasets/sexualidentityuk'},
  'date': 'Thu, 27 May 2021 08:30:00 GMT'},
 {'title': 'Economic activity and social change in the UK, real-time indicators: 27 May 2021',
  'link': 'https://www.ons.gov.uk/releases/economicactivityandsocialchangeintheukrealtimeindicators27may2021',
  'publications': {'Economic activity and social change in the UK, real-time indicators: 27 May 2021': 'https://www.ons.gov.uk//economy/economicoutputandproductivity/output/bulletins/economicactivityandsocialchangeintheukrealtimeindicators/27may2021'},
  'datasets': {'Weekly shipping indicators': 'https://www.ons.gov.u

In [353]:
result_list[3]['publications']

{'COVID-19 Schools Infection Survey Round 4, England: antibody data, March 2021': 'https://www.ons.gov.uk//peoplepopulationandcommunity/healthandsocialcare/conditionsanddiseases/bulletins/covid19schoolsinfectionsurveyround4england/antibodydatamarch2021'}

In [354]:
test = read_publication('https://www.ons.gov.uk//peoplepopulationandcommunity/healthandsocialcare/conditionsanddiseases/bulletins/covid19schoolsinfectionsurveyround4england/antibodydatamarch2021')

In [356]:
test

['A confidence interval gives an indication of the degree of uncertainty of an estimate, showing the precision of a sample estimate. The 95% confidence intervals are calculated so that if we repeated the study many times, 95% of the time the true unknown value would lie between the lower and upper confidence limits. A wider interval indicates more uncertainty in the estimate. Overlapping confidence intervals indicate that there may not be a true difference between two estimates. For more information, see our methodology page on statistical uncertainty.',
 'A result is said to be statistically significant if it is likely not caused by chance or the variable nature of the samples. For more information, see our methodology page on statistical uncertainty.',
 'As at 16 April 2021, in Round 4 of testing, 4,154 staff participated in at least one current COVID-19 infection or COVID-19 antibody test. This is around 35% of eligible staff in the sampled schools.',
 'As our vaccination rates rela