# June 15th

In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
# Endpoints on the ONS site referenced by this notebook.
API_URL = "https://api.beta.ons.gov.uk/v1/datasets"  # dataset listing of the beta API
RSS_URL = "https://www.ons.gov.uk/releasecalendar?rss"  # release calendar, RSS variant

In [3]:
# Ask the dataset listing for up to 1000 entries in a single request.
# NOTE: DATASETS holds the raw requests.Response; call .json() for the payload.
# A timeout is set because requests otherwise waits indefinitely on a stalled server.
DATASETS = requests.get(API_URL, params={'limit': 1000}, timeout=30)
DATASETS.raise_for_status()  # fail here rather than on a confusing .json() error later

In [4]:
# Peek at the first record of the listing to learn the item schema.
# NOTE(review): in the recorded output the contact's 'email' value looks like a
# phone number and 'telephone' holds an e-mail address — appears swapped upstream.
DATASETS.json()['items'][0]

{'contacts': [{'email': '+44 (0)1329 444661',
   'name': 'Population Statistics Division',
   'telephone': 'pop.info@ons.gov.uk'}],
 'description': 'Indicators included have been derived from the published 2019 mid-year population estimates for the UK, England, Wales, Scotland and Northern Ireland. These are the number of persons and percentage of the population aged 65 years and over, 85 years and over, 0 to 15 years, 16 to 64 years, 16 years to State Pension age, State Pension age and over, median age and the Old Age Dependency Ratio (the number of people of State Pension age per 1000 of those aged 16 years to below State Pension age).\n\nThis dataset has been produced by the Ageing Analysis Team for inclusion in a subnational ageing tool, which was published in July 2020. The tool enables users to compare latest and projected measures of ageing for up to four different areas through selection on a map or from a drop-down menu.',
 'id': 'ageing-population-estimates',
 'keywords': ['a

Here I collect the URL paths of all the subsections. I will use these to scrape the publications and datasets.

In [5]:
# Download the ONS home page and parse it; `soup` is queried for the
# navigation links in the next cell.
log = requests.get('https://www.ons.gov.uk', timeout=30)
log.raise_for_status()  # stop early instead of parsing an error page
# Name the parser explicitly: 'html' only states the markup type, so Beautiful
# Soup would guess among the installed parsers and results could differ
# between machines.
soup = BeautifulSoup(log.content, features='html.parser')

In [6]:
# Every anchor with the 'primary-nav__child-link' class points at one subsection;
# keep its href, a site-relative path such as '/economy/nationalaccounts'.
ALL_SECTIONS = [
    anchor['href']
    for anchor in soup.find_all('a', class_='primary-nav__child-link')
]
ALL_SECTIONS

['/businessindustryandtrade/business',
 '/businessindustryandtrade/changestobusiness',
 '/businessindustryandtrade/constructionindustry',
 '/businessindustryandtrade/itandinternetindustry',
 '/businessindustryandtrade/internationaltrade',
 '/businessindustryandtrade/manufacturingandproductionindustry',
 '/businessindustryandtrade/retailindustry',
 '/businessindustryandtrade/tourismindustry',
 '/economy/economicoutputandproductivity',
 '/economy/environmentalaccounts',
 '/economy/governmentpublicsectorandtaxes',
 '/economy/grossdomesticproductgdp',
 '/economy/grossvalueaddedgva',
 '/economy/inflationandpriceindices',
 '/economy/investmentspensionsandtrusts',
 '/economy/nationalaccounts',
 '/economy/regionalaccounts',
 '/employmentandlabourmarket/peopleinwork',
 '/employmentandlabourmarket/peoplenotinwork',
 '/peoplepopulationandcommunity/birthsdeathsandmarriages',
 '/peoplepopulationandcommunity/crimeandjustice',
 '/peoplepopulationandcommunity/culturalidentity',
 '/peoplepopulationandc

In [32]:
# Each path starts with '/', so splitting yields ['', theme, subsection];
# index 2 is therefore the subsection slug.
[section_path.split('/')[2] for section_path in ALL_SECTIONS]

['business',
 'changestobusiness',
 'constructionindustry',
 'itandinternetindustry',
 'internationaltrade',
 'manufacturingandproductionindustry',
 'retailindustry',
 'tourismindustry',
 'economicoutputandproductivity',
 'environmentalaccounts',
 'governmentpublicsectorandtaxes',
 'grossdomesticproductgdp',
 'grossvalueaddedgva',
 'inflationandpriceindices',
 'investmentspensionsandtrusts',
 'nationalaccounts',
 'regionalaccounts',
 'peopleinwork',
 'peoplenotinwork',
 'birthsdeathsandmarriages',
 'crimeandjustice',
 'culturalidentity',
 'educationandchildcare',
 'elections',
 'healthandsocialcare',
 'householdcharacteristics',
 'housing',
 'leisureandtourism',
 'personalandhouseholdfinances',
 'populationandmigration',
 'wellbeing']

In [11]:
# Persist the subsection paths, one per line.
subsections_file = Path('secondary/subsection_urls.txt')
# A fresh checkout may not have the 'secondary' folder yet; create it instead
# of failing with FileNotFoundError.
subsections_file.parent.mkdir(parents=True, exist_ok=True)
# Explicit encoding so the file reads back identically on every platform.
subsections_file.write_text("\n".join(ALL_SECTIONS), encoding='utf-8')

In [7]:
# Build the listing URLs for a single subsection (the first one) by hand.
path = ALL_SECTIONS[0]
url = f'https://www.ons.gov.uk{path}'
# Query strings that go straight to the dataset list and the bulletin list.
datasets_url = f'{url}/datalist?filter=datasets'
bulletins_url = f'{url}/publications?sortBy=release_date&query=&filter=bulletin&size=100'

In [8]:
# Fetch and parse the dataset listing for the chosen subsection.
# NOTE: this rebinds `soup`, replacing the home-page parse that ALL_SECTIONS
# was built from; re-running that earlier cell afterwards would read this page.
datasets_log = requests.get(datasets_url, timeout=30)
datasets_log.raise_for_status()  # stop early instead of parsing an error page
# Explicit parser: 'html' would let Beautiful Soup guess among installed parsers.
soup = BeautifulSoup(datasets_log.content, features='html.parser')
# Result titles on the listing page (not used by the cells that follow here).
h3s = soup.find_all('h3', {'class':'search-results__title'})

In [9]:
# Anchors carrying a data-gtm-uri attribute expose a site-relative path in it;
# check the first match before collecting them all.
soup.find_all('a', {'data-gtm-uri':True})[0]['data-gtm-uri']

'/businessindustryandtrade/business/businessservices/datasets/servicesturnoverintheuk'

In [19]:
# Same lookup for every matching anchor: one site-relative path per result.
[anchor['data-gtm-uri'] for anchor in soup.find_all('a', attrs={'data-gtm-uri': True})]

['/businessindustryandtrade/business/businessservices/datasets/servicesturnoverintheuk',
 '/businessindustryandtrade/business/activitysizeandlocation/datasets/businessformsmanagementpracticesandenterpriselifecyclesadissectionoftheukbusinesspopulation1999to2020',
 '/businessindustryandtrade/business/businessservices/datasets/businessinsightsanalysisovertimeuk',
 '/businessindustryandtrade/business/activitysizeandlocation/datasets/businessdemographyquarterlyexperimentalstatisticsuk',
 '/businessindustryandtrade/business/activitysizeandlocation/datasets/industriesandfirmswhereturnoverwasresilientduringthecoronaviruscovid19pandemicfirmlevelregressiontables',
 '/businessindustryandtrade/business/businessinnovation/datasets/foreigndirectinvestmentinvolvingukcompanies2013inwardtables',
 '/businessindustryandtrade/business/businessinnovation/datasets/foreigndirectinvestmentinvolvingukcompaniesoutwardtables',
 '/businessindustryandtrade/business/activitysizeandlocation/datasets/publichousesandb

In [23]:
# Dump the prettified markup of the current `soup` to test.txt
# (presumably for manual inspection of the page structure).
# prettify() returns Unicode text, so write it as UTF-8 explicitly; the
# platform default encoding may be unable to represent some characters.
with open('test.txt', 'w', encoding='utf-8') as f:
    f.write(soup.prettify())

In [23]:
def get_datasets_and_publications(path, timeout=30):
    """Scrape the dataset and bulletin links listed under one ONS subsection.

    Parameters
    ----------
    path : str
        Site-relative subsection path, e.g. '/businessindustryandtrade/business'.
        A missing leading slash or a trailing slash is tolerated.
    timeout : float, optional
        Seconds to wait on each HTTP request before giving up.

    Returns
    -------
    tuple of (list of str, list of str)
        ``(datasets, bulletins)``: the ``data-gtm-uri`` value of every anchor
        carrying that attribute on the datasets page and on the bulletins
        page, respectively.

    Raises
    ------
    requests.HTTPError
        If either page answers with a 4xx/5xx status.
    """
    # Normalise the path so both 'a/b' and '/a/b/' build a valid URL; plain
    # concatenation of a slash-less path would glue it onto the host name.
    slug = path.strip('/')
    url = 'https://www.ons.gov.uk' + ('/' + slug if slug else '')

    # The subsection has a landing page in front of the actual listings, but
    # appending these query strings goes straight to the data / bulletins.
    # NOTE(review): size=100 presumably caps the bulletin list at 100 entries
    # and the dataset list is requested without a size — pagination is not
    # handled here; confirm whether longer sections get truncated.
    datasets_url = url + '/datalist?filter=datasets'
    bulletins_url = url + '/publications?sortBy=release_date&query=&filter=bulletin&size=100'

    datasets = _scrape_gtm_uris(datasets_url, timeout)
    bulletins = _scrape_gtm_uris(bulletins_url, timeout)

    return datasets, bulletins


def _scrape_gtm_uris(page_url, timeout):
    """Fetch one page and return the data-gtm-uri of each anchor that has one."""
    response = requests.get(page_url, timeout=timeout)
    # Without this check an error page would be parsed as if it were a listing.
    response.raise_for_status()
    # Explicit parser: 'html' would let Beautiful Soup guess among installed parsers.
    page = BeautifulSoup(response.content, features='html.parser')
    return [anchor['data-gtm-uri'] for anchor in page.find_all('a', {'data-gtm-uri': True})]

In [25]:
# Try the helper on one subsection; it returns a (datasets, bulletins) pair
# of link lists, unpacked here into two names.
datasets, bulletins= get_datasets_and_publications('/peoplepopulationandcommunity/wellbeing')

# I am re-writing the above function to only return bulletins

I think that's a better approach. Previously I was reading the whole list of available datasets/bulletins, but there was no way to actually know how to link the two.

I need to go into each bulletin and utilise the "View all data used in this statistical bulletin" to see exactly what tables are needed to answer the questions.