# Income Data Prep

## Import Libraries

In [39]:
# import libraries
import pandas as pd
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import datetime as dt
import warnings
warnings.filterwarnings('ignore')

## Prepare Data

### Import

In [40]:
# Load the raw income extract (path is relative to the notebook's working directory)
income_data_path = 'raw_data/income_data.csv'
raw_income_data = pd.read_csv(income_data_path)

# Preview the first rows to sanity-check the load
raw_income_data.head()

Unnamed: 0,Area code,Area name,Time period,Value (£),Confidence interval lower,Confidence interval upper
0,E06000001,Hartlepool,2008,370.5,310.95324,430.04676
1,E06000001,Hartlepool,2009,363.1,299.760836,426.439164
2,E06000001,Hartlepool,2010,383.7,326.544048,440.855952
3,E06000001,Hartlepool,2011,368.5,309.27468,427.72532
4,E06000001,Hartlepool,2012,391.4,323.124184,459.675816


In [41]:
# Summarise the raw frame: one line per column with its dtype and,
# because show_counts=True, the number of non-null entries
raw_income_data.info(verbose=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5603 entries, 0 to 5602
Data columns (total 6 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Area code                  5603 non-null   object 
 1   Area name                  5603 non-null   object 
 2   Time period                5603 non-null   int64  
 3   Value (£)                  5559 non-null   float64
 4   Confidence interval lower  5471 non-null   float64
 5   Confidence interval upper  5471 non-null   float64
dtypes: float64(3), int64(1), object(2)
memory usage: 262.8+ KB


In [42]:
# Size of the raw frame as a (rows, columns) tuple
raw_income_data.shape

(5603, 6)

In [43]:
# Fraction of missing values in each column (0.0 = none missing, 1.0 = all missing).
# No threshold is applied here; the result is only displayed for inspection.
raw_income_data.isna().mean()

Area code                    0.000000
Area name                    0.000000
Time period                  0.000000
Value (£)                    0.007853
Confidence interval lower    0.023559
Confidence interval upper    0.023559
dtype: float64

In [44]:
# Map each source column header to its snake_case name.
# Renaming by name (rather than assigning a positional list) means a change in
# the CSV's column order or count cannot silently attach the wrong label to a column.
column_renames = {
    'Area code': 'area_code',
    'Area name': 'area_name',
    'Time period': 'year',
    'Value (£)': 'gross_median_weekly_pay',
    'Confidence interval lower': 'confidence_interval_lower',
    'Confidence interval upper': 'confidence_interval_upper',
}

# list of new column names (kept for any later cell that refers to it)
income_data_columns = list(column_renames.values())

# change column names; headers that are already renamed are left untouched,
# so re-running this cell is safe
raw_income_data = raw_income_data.rename(columns=column_renames)

# Fail loudly if an expected column is absent (e.g. the source header changed)
missing_columns = set(income_data_columns) - set(raw_income_data.columns)
assert not missing_columns, f'missing expected columns: {sorted(missing_columns)}'

In [45]:
# View changed column names
raw_income_data.columns

Index(['area_code', 'area_name', 'year', 'gross_median_weekly_pay',
       'confidence_interval_lower', 'confidence_interval_upper'],
      dtype='object')

In [46]:
# Columns kept for the rest of the preparation; the two confidence-interval
# columns are dropped from here on
relevant_columns = [
    'area_code',
    'area_name',
    'year',
    'gross_median_weekly_pay',
]

# Narrow the frame to those columns, in the order listed above
raw_income_data = raw_income_data.loc[:, relevant_columns]

In [47]:
# Area codes to keep (matched against the area_code column)
area_codes = ['E08000011', 'E08000012', 'E11000002', 'E08000014', 'E08000013',
              'E08000007', 'E06000007', 'E08000010', 'E08000015']

# Place names to keep (matched against the area_name column).
# 'Wirral' is listed twice; the repeat has no effect on an isin() lookup.
regions = ['Prenton', 'Newton-Le-Willows', 'Birkenhead',
           'Wirral', 'Bootle', 'St Helens', 'Wallasey', 'Southport',
           'Prescot', 'Wigan', 'Widnes', 'Neston', 'Warrington',
           'Ellesmere Port', 'Wilmslow', 'Coniston', 'Stockport', 'Northwood',
           'Crewe', 'Winsford', 'Merseyside', 'Sefton', 'Wirral', 'Liverpool', 'Knowsley']

# A row is kept when EITHER its name or its code is in the lists above
name_match = raw_income_data['area_name'].isin(regions)
code_match = raw_income_data['area_code'].isin(area_codes)

# Apply the combined mask, then order the surviving rows by year
raw_income_data = raw_income_data[name_match | code_match].sort_values('year')

In [48]:
# Remove every row that has a missing value in at least one column
# (axis=0 and how='any' are the dropna defaults, spelled out for clarity)
clean_income_data = raw_income_data.dropna(axis=0, how='any')

In [49]:
# view cleaned data
clean_income_data.info(verbose=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
Index: 128 entries, 96 to 3783
Data columns (total 4 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   area_code                128 non-null    object 
 1   area_name                128 non-null    object 
 2   year                     128 non-null    int64  
 3   gross_median_weekly_pay  128 non-null    float64
dtypes: float64(1), int64(1), object(2)
memory usage: 5.0+ KB


In [50]:
# Round-trip the year through a datetime parse so any value that is not a
# valid four-digit year raises here, then keep only the integer year part.
parsed_year = pd.to_datetime(
    clean_income_data['year'].astype(str), format='%Y')

# Build a new frame with .assign instead of writing the column in place:
# clean_income_data descends from a filtered slice of raw_income_data, and
# in-place column writes on such a frame raise SettingWithCopyWarning (which the
# notebook-wide warnings filter would otherwise hide).
clean_income_data = clean_income_data.assign(year=parsed_year.dt.year)

In [51]:
# Collapse to one row per (area_code, area_name, year), averaging the weekly
# pay if a combination appears more than once. as_index=False keeps the
# grouping keys as ordinary columns.
group_keys = ['area_code', 'area_name', 'year']
clean_income_data = (
    clean_income_data
    .groupby(group_keys, as_index=False)['gross_median_weekly_pay']
    .mean()
)

### Export

In [52]:
# Write the prepared frame to CSV; index=False leaves the row index out of the file
clean_income_output_path = 'clean_data/clean_income_data.csv'
clean_income_data.to_csv(clean_income_output_path, index=False)

In [53]:
# Show the final prepared frame (the notebook renders a truncated view of long frames)
clean_income_data

Unnamed: 0,area_code,area_name,year,gross_median_weekly_pay
0,E06000007,Warrington,2008,410.2
1,E06000007,Warrington,2009,424.2
2,E06000007,Warrington,2010,428.5
3,E06000007,Warrington,2011,402.7
4,E06000007,Warrington,2012,411.8
...,...,...,...,...
123,E08000015,Wirral,2019,465.3
124,E08000015,Wirral,2020,475.3
125,E08000015,Wirral,2021,469.7
126,E08000015,Wirral,2022,522.0
