Script to scrape local unemployment data by Region from ONS website

Outputs CSV file with unemployment by month and region in thousands and %

In [1]:
import requests
import pandas as pd
import urllib.parse
import io
import time

In [2]:
def download_data(url):
    decoded_url = urllib.parse.unquote(url)
    response = requests.get(decoded_url)
    response.raise_for_status()  # Check if the request was successful

    data = response.content.decode('utf-8')
    df = pd.read_csv(io.StringIO(data))

    return df


In [3]:
url_codes = pd.read_csv('REF_localunemployment_codes.csv')

suffixes = url_codes['Suffix']

base_url = "https://www.ons.gov.uk/generator?format=csv&uri=/employmentandlabourmarket/peoplenotinwork/unemployment/timeseries/"

#dfs = []
combined_df = pd.DataFrame()

suffixes.head()

0    ycnf/lms
1    ycnh/lms
2    ycni/lms
3    ycnc/lms
4    ycnd/lms
Name: Suffix, dtype: object

In [4]:
for suffix in suffixes:
    url = f"{base_url}{suffix}"
#    requests.get(url, headers = {'User-agent': 'your bot 0.1'})
#    print(url)
    # delay 1 second to avoid making too many requests too quickly
    time.sleep(1)

    response = requests.get(url, headers={'User-agent': 'your bot 0.1'})
    
    # Check if the request was successful
    if response.status_code == 200:
        df = download_data(url)
        # Exclude first 8 rows
        df = df.iloc[8:, :]
        current_columns = df.columns
        # Rename the first column
        new_columns = {current_columns[0]: 'Date', current_columns[1]: "Value"}
        df = df.rename(columns=new_columns)
        # Add DataFrame source column
        df['Source'] = suffix
        combined_df = pd.concat([combined_df, df], ignore_index=True)
    else:
        print(f"Failed to retrieve data from {url}. Status code: {response.status_code}")

    # df = download_data(url)
    # # Exclude first 8 rows
    # df = df.iloc[8:, :]
    # current_columns = df.columns
    # # Rename the first column
    # new_columns = {current_columns[0]: 'Date',current_columns[1]:"Value"}
    # df = df.rename(columns=new_columns)
    # # Add DataFrame source column
    # df['Source'] = suffix
    # combined_df = pd.concat([combined_df, df], ignore_index=True)

HTTPError: 429 Client Error: Too Many Requests for url: https://www.ons.gov.uk/generator?format=csv&uri=/employmentandlabourmarket/peoplenotinwork/unemployment/timeseries/ycne/lms

In [5]:
combined_df

Unnamed: 0,Date,Value,Source
0,1993,8.8,ycnf/lms
1,1994,8.2,ycnf/lms
2,1995,7.2,ycnf/lms
3,1996,7.1,ycnf/lms
4,1997,5.6,ycnf/lms
...,...,...,...
4235,2023 FEB,5.1,ycng/lms
4236,2023 MAR,4.8,ycng/lms
4237,2023 APR,5.0,ycng/lms
4238,2023 MAY,5.2,ycng/lms


In [6]:
url_codes = url_codes.rename(columns={'Suffix': 'Source'})
url_codes

Unnamed: 0,Region,NHSE region name,Source,Type
0,East Midlands,Midlands,ycnf/lms,%
1,East of England,East of England,ycnh/lms,%
2,London,London,ycni/lms,%
3,North East,North East,ycnc/lms,%
4,North West,North West,ycnd/lms,%
5,South East,South East,ycnj/lms,%
6,South West,South West,ycnk/lms,%
7,West Midlands,Midlands,ycng/lms,%
8,York & the Humber,North East and Yorkshire,ycne/lms,%
9,East Midlands,Midlands,ycms/lms,thousands


In [14]:
merged_df = combined_df.merge(url_codes, on='Source')
merged_df

Unnamed: 0,Date,Value,Source,Region,NHSE region name,Type
0,1993,8.8,ycnf/lms,East Midlands,Midlands,%
1,1994,8.2,ycnf/lms,East Midlands,Midlands,%
2,1995,7.2,ycnf/lms,East Midlands,Midlands,%
3,1996,7.1,ycnf/lms,East Midlands,Midlands,%
4,1997,5.6,ycnf/lms,East Midlands,Midlands,%
...,...,...,...,...,...,...
4235,2023 FEB,5.1,ycng/lms,West Midlands,Midlands,%
4236,2023 MAR,4.8,ycng/lms,West Midlands,Midlands,%
4237,2023 APR,5.0,ycng/lms,West Midlands,Midlands,%
4238,2023 MAY,5.2,ycng/lms,West Midlands,Midlands,%


In [16]:
# Convert 'Value' column to numeric
merged_df['Value'] = pd.to_numeric(merged_df['Value'], errors='coerce')

In [18]:
# Our mapped regions only have Midlands whereas the original data split 
# between East and West Mids. We need to average across for a total Mids value.

# Group by date and region, then calculate the average value
merged_df2 = merged_df.groupby(['Date','Type', 'NHSE region name'], as_index=False)['Value'].mean()

# Now df_aggregated contains the aggregated DataFrame with one row per date in the Midlands cate
merged_df2

Unnamed: 0,Date,Type,NHSE region name,Value
0,1992 APR,%,East of England,7.90
1,1992 APR,%,London,12.30
2,1992 APR,%,Midlands,9.70
3,1992 APR,%,North East,12.20
4,1992 APR,%,North West,10.10
...,...,...,...,...
3705,2023 Q2,%,Midlands,4.45
3706,2023 Q2,%,North East,4.20
3707,2023 Q2,%,North West,4.30
3708,2023 Q2,%,South East,3.70


In [19]:
# Define the regex pattern to extract the monthly values
pattern = r'(\d{4} [A-Z]{3})'  # Matches YYYY MMM format

# Extract the monthly values using regex
merged_df2['Date'] = merged_df2['Date'].str.extract(pattern)

# Drop rows with missing values (non-monthly values)
merged_df2.dropna(subset=['Date'], inplace=True)

merged_df2['Date'] = pd.to_datetime(merged_df2['Date'], format='%Y %b')
merged_df2.tail()

Unnamed: 0,Date,Type,NHSE region name,Value
3691,2023-05-01,%,Midlands,4.45
3692,2023-05-01,%,North East,4.2
3693,2023-05-01,%,North West,4.3
3694,2023-05-01,%,South East,3.7
3695,2023-05-01,%,South West,3.3


In [27]:

# # Make Type thousands and Type % columns instead of rows
# Pivot the dataframe
pivoted_df = merged_df2.pivot(index=['Date', 'NHSE region name'], columns='Type', values='Value').reset_index()

pivoted_df['%'] = pivoted_df['%'].astype(float)

pivoted_df['%'] = pivoted_df['%']/100


In [28]:
pivoted_df.tail()

Type,Date,NHSE region name,%
2620,2023-06-01,Midlands,0.0455
2621,2023-06-01,North East,0.052
2622,2023-06-01,North West,0.053
2623,2023-06-01,South East,0.039
2624,2023-06-01,South West,0.036


In [29]:
pivoted_df.to_csv('../ONS_localunemployment_monthly.csv', index=False)
