Script to scrape local unemployment data by Region from ONS website

Outputs CSV file with unemployment by month and region in thousands and %

In [1]:
import requests
import pandas as pd
import urllib.parse
import io

In [2]:

def download_data(url):
    decoded_url = urllib.parse.unquote(url)
    response = requests.get(decoded_url)
    response.raise_for_status()  # Check if the request was successful

    data = response.content.decode('utf-8')
    df = pd.read_csv(io.StringIO(data))

    return df


In [3]:
url_codes = pd.read_csv('REF_localunemployment_codes.csv')

suffixes = url_codes['Suffix']

base_url = "https://www.ons.gov.uk/generator?format=csv&uri=/employmentandlabourmarket/peoplenotinwork/unemployment/timeseries/"

#dfs = []
combined_df = pd.DataFrame()


suffixes.head()

0    ycnf/lms
1    ycnh/lms
2    ycni/lms
3    ycnc/lms
4    ycnd/lms
Name: Suffix, dtype: object

In [4]:
for suffix in suffixes:
    url = f"{base_url}{suffix}"
#    requests.get(url, headers = {'User-agent': 'your bot 0.1'})
#    print(url)
    df = download_data(url)
    # Exclude first 8 rows
    df = df.iloc[8:, :]
    current_columns = df.columns
    # Rename the first column
    new_columns = {current_columns[0]: 'Date',current_columns[1]:"Value"}
    df = df.rename(columns=new_columns)
    # Add DataFrame source column
    df['Source'] = suffix
    combined_df = pd.concat([combined_df, df], ignore_index=True)

HTTPError: 429 Client Error: Too Many Requests for url: https://www.ons.gov.uk/generator?format=csv&uri=/employmentandlabourmarket/peoplenotinwork/unemployment/timeseries/ycmx/lms

In [None]:
combined_df

Unnamed: 0,Date,Value,Source
0,1993,8.8,ycnf/lms
1,1994,8.2,ycnf/lms
2,1995,7.2,ycnf/lms
3,1996,7.1,ycnf/lms
4,1997,5.6,ycnf/lms
...,...,...,...
9481,2022 DEC,86,ycmr/lms
9482,2023 JAN,96,ycmr/lms
9483,2023 FEB,107,ycmr/lms
9484,2023 MAR,104,ycmr/lms


In [None]:
url_codes = url_codes.rename(columns={'Suffix': 'Source'})
url_codes

Unnamed: 0,Region,NHSE region name,Source,Type
0,East Midlands,Midlands,ycnf/lms,%
1,East of England,East of England,ycnh/lms,%
2,London,London,ycni/lms,%
3,North East,North East,ycnc/lms,%
4,North West,North West,ycnd/lms,%
5,South East,South East,ycnj/lms,%
6,South West,South West,ycnk/lms,%
7,West Midlands,Midlands,ycng/lms,%
8,York & the Humber,North East and Yorkshire,ycne/lms,%
9,East Midlands,Midlands,ycms/lms,thousands


In [None]:
merged_df = combined_df.merge(url_codes, on='Source')
merged_df

Unnamed: 0,Date,Value,Source,Region,NHSE region name,Type
0,1993,8.8,ycnf/lms,East Midlands,Midlands,%
1,1994,8.2,ycnf/lms,East Midlands,Midlands,%
2,1995,7.2,ycnf/lms,East Midlands,Midlands,%
3,1996,7.1,ycnf/lms,East Midlands,Midlands,%
4,1997,5.6,ycnf/lms,East Midlands,Midlands,%
...,...,...,...,...,...,...
9481,2022 DEC,86,ycmr/lms,York & the Humber,North East and Yorkshire,thousands
9482,2023 JAN,96,ycmr/lms,York & the Humber,North East and Yorkshire,thousands
9483,2023 FEB,107,ycmr/lms,York & the Humber,North East and Yorkshire,thousands
9484,2023 MAR,104,ycmr/lms,York & the Humber,North East and Yorkshire,thousands


In [None]:
# Define the regex pattern to extract the monthly values
pattern = r'(\d{4} [A-Z]{3})'  # Matches YYYY MMM format

# Extract the monthly values using regex
merged_df['Date'] = merged_df['Date'].str.extract(pattern)

# Drop rows with missing values (non-monthly values)
merged_df.dropna(subset=['Date'], inplace=True)

merged_df['Date'] = pd.to_datetime(merged_df['Date'], format='%Y %b')
merged_df.head()

Unnamed: 0,Date,Value,Source,Region,NHSE region name,Type
154,1992-04-01,8.7,ycnf/lms,East Midlands,Midlands,%
155,1992-05-01,8.4,ycnf/lms,East Midlands,Midlands,%
156,1992-06-01,8.6,ycnf/lms,East Midlands,Midlands,%
157,1992-07-01,8.4,ycnf/lms,East Midlands,Midlands,%
158,1992-08-01,8.4,ycnf/lms,East Midlands,Midlands,%


In [None]:

# # Make Type thousands and Type % columns instead of rows
# Pivot the dataframe
pivoted_df = merged_df.pivot(index=['Date', 'Region','NHSE region name'], columns='Type', values='Value').reset_index()

pivoted_df['%'] = pivoted_df['%'].astype(float)

pivoted_df['%'] = pivoted_df['%']/100


In [None]:
pivoted_df.head()

Type,Date,Region,NHSE region name,%,thousands
0,1992-04-01,East Midlands,Midlands,0.087,177
1,1992-04-01,East of England,East of England,0.079,210
2,1992-04-01,London,London,0.123,424
3,1992-04-01,North East,North East,0.122,146
4,1992-04-01,North West,North West,0.101,331


In [None]:
pivoted_df.to_csv('../ONS_localunemployment_monthly.csv', index=False)
