# UK HPI Data Prep

## Import Libraries

In [1]:
# import libraries
import pandas as pd
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import datetime as dt
import warnings
warnings.filterwarnings('ignore')

## Prepare Data

### Import

In [2]:
# import the data
hpi_data = pd.read_csv('raw_data/uk_hpi.csv')
# display the data set
hpi_data.head()

Unnamed: 0,Date,RegionName,AreaCode,AveragePrice,Index,IndexSA,1m%Change,12m%Change,AveragePriceSA,SalesVolume,...,NewPrice,NewIndex,New1m%Change,New12m%Change,NewSalesVolume,OldPrice,OldIndex,Old1m%Change,Old12m%Change,OldSalesVolume
0,01/04/1968,East Midlands,E12000004,3025.670615,1.968954,,0.0,,,,...,,,,,,,,,,
1,01/04/1968,England,E92000001,3408.108064,1.680067,,0.0,,,,...,,,,,,,,,,
2,01/04/1968,London,E12000007,4418.489911,1.096815,,0.0,,,,...,,,,,,,,,,
3,01/04/1968,Northern Ireland,N92000001,3661.4855,3.30042,,0.0,,,,...,,,,,,,,,,
4,01/04/1968,Scotland,S92000003,2844.980688,2.108087,,0.0,,,,...,,,,,,,,,,


In [4]:
# print information about hpi data
hpi_data.info(verbose=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 139365 entries, 0 to 139364
Data columns (total 54 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   Date                    139365 non-null  object 
 1   RegionName              139365 non-null  object 
 2   AreaCode                139365 non-null  object 
 3   AveragePrice            139365 non-null  float64
 4   Index                   139365 non-null  float64
 5   IndexSA                 4884 non-null    float64
 6   1m%Change               138941 non-null  float64
 7   12m%Change              134553 non-null  float64
 8   AveragePriceSA          4884 non-null    float64
 9   SalesVolume             134898 non-null  float64
 10  DetachedPrice           132852 non-null  float64
 11  DetachedIndex           132852 non-null  float64
 12  Detached1m%Change       132462 non-null  float64
 13  Detached12m%Change      128196 non-null  float64
 14  SemiDetachedPrice   

In [5]:
# get dimensions of the dataset
hpi_data.shape

(139365, 54)

In [6]:
# check for columns that have null values for 50% of the entries
hpi_data.isnull().sum()/len(hpi_data)

Date                      0.000000
RegionName                0.000000
AreaCode                  0.000000
AveragePrice              0.000000
Index                     0.000000
IndexSA                   0.964955
1m%Change                 0.003042
12m%Change                0.034528
AveragePriceSA            0.964955
SalesVolume               0.032053
DetachedPrice             0.046733
DetachedIndex             0.046733
Detached1m%Change         0.049532
Detached12m%Change        0.080142
SemiDetachedPrice         0.046733
SemiDetachedIndex         0.046733
SemiDetached1m%Change     0.049532
SemiDetached12m%Change    0.080142
TerracedPrice             0.046540
TerracedIndex             0.046540
Terraced1m%Change         0.049338
Terraced12m%Change        0.079948
FlatPrice                 0.044236
FlatIndex                 0.044236
Flat1m%Change             0.047042
Flat12m%Change            0.077731
CashPrice                 0.594963
CashIndex                 0.594963
Cash1m%Change       

In [7]:
# change date to dattime instead of object
hpi_data['Date'] = pd.to_datetime(hpi_data['Date'])
# assert statement making sure of conversion to datetime
assert hpi_data['Date'].dtype == 'datetime64[ns]'

In [8]:
# Define the date range
start_date = '2013-01-01'
end_date = '2023-12-31'

# Filter for the date range
hpi_data_filtered = hpi_data[(hpi_data['Date'] >= start_date) & (
    hpi_data['Date'] <= end_date)]

In [9]:
# Select relevant columns
relevant_columns = [
    'Date', 'RegionName', 'AreaCode', 'AveragePrice', 'Index', '1m%Change', '12m%Change', 'SalesVolume'
]
hpi_data_filtered = hpi_data_filtered[relevant_columns]

In [10]:
# Drop complete duplicates from ride_sharing
hpi_data_filtered = hpi_data_filtered.drop_duplicates()

In [11]:
hpi_data_filtered.isnull().sum()/len(hpi_data)

Date            0.000000
RegionName      0.000000
AreaCode        0.000000
AveragePrice    0.000000
Index           0.000000
1m%Change       0.000000
12m%Change      0.000000
SalesVolume     0.005812
dtype: float64

In [12]:
# Handle missing values by dropping rows with any missing values
hpi_data_cleaned = hpi_data_filtered.dropna()

In [21]:
# Regions in Merseyside
regions = ['Liverpool', 'Prenton', 'Newton-Le-Willows', 'Birkenhead',
        'Wirral', 'Bootle', 'St Helens', 'Wallasey', 'Southport',
        'Prescot', 'Wigan', 'Widnes', 'Neston', 'Warrington',
        'Ellesmere Port', 'Wilmslow', 'Coniston', 'Stockport', 'Northwood',
        'Crewe', 'Winsford', 'Merseyside', 'Sefton', 'Wirral', 'St Helens', 'Liverpool', 'Knowsley']
hpi_data_cleaned = hpi_data_cleaned[hpi_data_cleaned['RegionName'].isin(regions)]

In [24]:
# declare column names for standardisation
column_names = ['date', 'region_name', 'area_code', 'average_price', 'index', '1m%_change', '12m%_change', 'sales_volume']
# change column names
hpi_data_cleaned.columns = column_names

In [26]:
hpi_data_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1170 entries, 86092 to 138542
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   date           1170 non-null   datetime64[ns]
 1   region_name    1170 non-null   object        
 2   area_code      1170 non-null   object        
 3   average_price  1170 non-null   float64       
 4   index          1170 non-null   float64       
 5   1m%_change     1170 non-null   float64       
 6   12m%_change    1170 non-null   float64       
 7   sales_volume   1170 non-null   float64       
dtypes: datetime64[ns](1), float64(5), object(2)
memory usage: 82.3+ KB


### Export

In [27]:
# Save the filtered and cleaned dataset
hpi_data_cleaned.to_csv(
    'clean_data/clean_hpi_data.csv', index=False)