# Employment Rate Data Prep

## Import Libraries

In [25]:
# import libraries
import pandas as pd
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import datetime as dt
import warnings
warnings.filterwarnings('ignore')

## Prepare Data

### Import

In [26]:
# Location of the raw employment-rate extract, relative to the notebook
employment_data_path = 'raw_data/employment_data.csv'
raw_employment_data = pd.read_csv(employment_data_path)

# Preview the first five rows of the dataset
raw_employment_data.head(5)

Unnamed: 0,Area code,Area name,Time period,Value (%),Confidence interval lower,Confidence interval upper
0,E06000001,Hartlepool,2004,63.0,60.3,65.7
1,E06000001,Hartlepool,2005,64.7,62.1,67.3
2,E06000001,Hartlepool,2006,64.6,61.9,67.3
3,E06000001,Hartlepool,2007,63.9,61.0,66.8
4,E06000001,Hartlepool,2008,65.2,62.4,68.0


In [27]:
# Summarise each column's dtype and non-null count for the raw frame
raw_employment_data.info(show_counts=True, verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7260 entries, 0 to 7259
Data columns (total 6 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Area code                  7260 non-null   object 
 1   Area name                  7260 non-null   object 
 2   Time period                7260 non-null   int64  
 3   Value (%)                  7222 non-null   float64
 4   Confidence interval lower  7222 non-null   float64
 5   Confidence interval upper  7222 non-null   float64
dtypes: float64(3), int64(1), object(2)
memory usage: 340.4+ KB


In [28]:
# Display the shape of the raw data as a (rows, columns) tuple
raw_employment_data.shape

(7260, 6)

In [29]:
# Fraction of missing entries in each column (0.0 = none missing).
# No threshold is applied here; the values are only displayed for inspection.
raw_employment_data.isna().mean()

Area code                    0.000000
Area name                    0.000000
Time period                  0.000000
Value (%)                    0.005234
Confidence interval lower    0.005234
Confidence interval upper    0.005234
dtype: float64

In [30]:
# Explicit mapping from the raw CSV headers to snake_case names. Renaming by
# label instead of by position means a change in the file's column order
# cannot silently attach the wrong name to a column.
employment_column_renames = {
    'Area code': 'area_code',
    'Area name': 'area_name',
    'Time period': 'year',
    'Value (%)': 'employment_rate',
    'Confidence interval lower': 'confidence_interval_lower',
    'Confidence interval upper': 'confidence_interval_upper',
}
# list of new column names (same names, same order as before)
employment_data_columns = list(employment_column_renames.values())
# change column names; rename() ignores headers that are absent, so verify
# afterwards that every expected column is actually present
raw_employment_data = raw_employment_data.rename(
    columns=employment_column_renames)
missing_columns = set(employment_data_columns) - set(raw_employment_data.columns)
assert not missing_columns, f"columns missing after rename: {sorted(missing_columns)}"

In [31]:
# View the column names after renaming to confirm the change took effect
raw_employment_data.columns

Index(['area_code', 'area_name', 'year', 'employment_rate',
       'confidence_interval_lower', 'confidence_interval_upper'],
      dtype='object')

In [32]:
# Inclusive range of years to keep; edit these two values to move the window
start_year = 2013
end_year = 2023

# filter for employment rate in desired date range (both bounds inclusive).
# The bounds come from the variables above so they are defined in one place;
# previously the literals were repeated and the variables were never used.
raw_employment_data = raw_employment_data[
    raw_employment_data['year'].between(start_year, end_year)]

In [33]:
# Columns kept for the rest of the notebook; the two confidence-interval
# columns are dropped at this point
relevant_columns = ['area_code', 'area_name', 'year', 'employment_rate']
# Keep every row, but only the columns listed above
raw_employment_data = raw_employment_data.loc[:, relevant_columns]

In [34]:
# Area codes to keep
area_codes = ['E08000011', 'E08000012', 'E11000002', 'E08000014', 'E08000013',
              'E08000007', 'E06000007', 'E08000010', 'E08000015']

# Area names to keep.
# NOTE(review): 'Wirral' is listed twice (harmless for isin), and the list
# appears to reach beyond Merseyside - confirm the intended scope.
regions = ['Prenton', 'Newton-Le-Willows', 'Birkenhead',
           'Wirral', 'Bootle', 'St Helens', 'Wallasey', 'Southport',
           'Prescot', 'Wigan', 'Widnes', 'Neston', 'Warrington',
           'Ellesmere Port', 'Wilmslow', 'Coniston', 'Stockport', 'Northwood',
           'Crewe', 'Winsford', 'Merseyside', 'Sefton', 'Wirral', 'Liverpool', 'Knowsley']

# A row is kept when EITHER its name or its code is in the lists above;
# the surviving rows are then ordered by year.
name_matches = raw_employment_data['area_name'].isin(regions)
code_matches = raw_employment_data['area_code'].isin(area_codes)
raw_employment_data = raw_employment_data[
    name_matches | code_matches].sort_values('year')

In [35]:
# Discard every row that has a missing value in at least one column
clean_employment_data = raw_employment_data.dropna(axis=0, how='any')

In [36]:
# Summarise dtypes and non-null counts of the cleaned frame
clean_employment_data.info(show_counts=True, verbose=True)

<class 'pandas.core.frame.DataFrame'>
Index: 88 entries, 129 to 5099
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   area_code        88 non-null     object 
 1   area_name        88 non-null     object 
 2   year             88 non-null     int64  
 3   employment_rate  88 non-null     float64
dtypes: float64(1), int64(1), object(2)
memory usage: 3.4+ KB


In [37]:
# Round-trip the year through a string and a '%Y' datetime parse, then keep
# only the calendar-year component.
# NOTE(review): the column is already an integer year at this point, so this
# looks like a validation step rather than a transformation - confirm it is
# still needed.
clean_employment_data = clean_employment_data.assign(
    year=lambda frame: pd.to_datetime(
        frame['year'].astype(str), format='%Y').dt.year
)

In [38]:
# Rescale employment rate from a percentage to a proportion (divide by 100).
# NOTE: not idempotent - re-running this cell divides by 100 a second time.
clean_employment_data = clean_employment_data.assign(
    employment_rate=clean_employment_data['employment_rate'].div(100))

In [39]:
# Collapse to one row per (area_code, area_name, year), averaging the
# employment rate wherever a key combination occurs more than once
group_keys = ['area_code', 'area_name', 'year']
clean_employment_data = (
    clean_employment_data
    .groupby(group_keys)['employment_rate']
    .mean()
    .reset_index()
)

In [40]:
# Display the aggregated frame (one row per area_code / area_name / year)
clean_employment_data

Unnamed: 0,area_code,area_name,year,employment_rate
0,E06000007,Warrington,2013,0.782
1,E06000007,Warrington,2014,0.779
2,E06000007,Warrington,2015,0.779
3,E06000007,Warrington,2016,0.762
4,E06000007,Warrington,2017,0.774
...,...,...,...,...
83,E08000015,Wirral,2019,0.768
84,E08000015,Wirral,2020,0.705
85,E08000015,Wirral,2021,0.711
86,E08000015,Wirral,2022,0.769


### Export

In [41]:
# Write the cleaned, aggregated frame to disk without the index column
clean_output_path = 'clean_data/clean_employment_data.csv'
clean_employment_data.to_csv(clean_output_path, index=False)