# Urban and Spatial Data Analytics: Property Price Prediction


## Importing Libraries


In [33]:
# import libraries
import pandas as pd
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
# import pgeocode
# import missingno as msno
import datetime as dt
import warnings
warnings.filterwarnings('ignore')

## Prepare Data


### Import


In [2]:
# import the data
house_data = pd.read_csv(
    'raw_data/prices_paid.csv')
# display the data set to check it has imported correctly
house_data.head()

Unnamed: 0,{109BBF80-1E51-4910-8E2C-B124E1117A7A},34950,1995-06-09 00:00,BA3 3AH,F,N,L,COOMBEND HOUSE,7,COOMBEND,RADSTOCK,RADSTOCK.1,WANSDYKE,AVON,A,A.1
0,{D5233D67-2975-426B-B98D-B124E39331D9},133000,1995-06-19 00:00,SW18 5AG,T,N,F,142,,ASTONVILLE STREET,LONDON,LONDON,WANDSWORTH,GREATER LONDON,A,A
1,{718BD35C-25E1-431C-8AF7-B124E63ED4E1},83000,1995-08-21 00:00,IP11 7PU,D,N,F,3,,GARFIELD ROAD,FELIXSTOWE,FELIXSTOWE,SUFFOLK COASTAL,SUFFOLK,A,A
2,{E127F626-6247-4D95-A392-B124F4C0A558},59000,1995-07-27 00:00,PE21 0SF,D,Y,F,BEAUCADRE,,GAYSFIELD ROAD,FISHTOFT,BOSTON,BOSTON,LINCOLNSHIRE,A,A
3,{B10762A0-2C9C-4C82-A885-B1250B55FFF9},51000,1995-02-28 00:00,ME7 4DF,T,N,F,41,,FRANKLIN ROAD,GILLINGHAM,GILLINGHAM,GILLINGHAM,KENT,A,A
4,{BBAB55E8-216F-4FFA-B9EC-B4AD71F833EB},37000,1995-06-02 00:00,TN1 2ET,F,N,L,48,TOP FLOOR FLAT,UPPER GROSVENOR ROAD,TUNBRIDGE WELLS,TUNBRIDGE WELLS,TUNBRIDGE WELLS,KENT,A,A


In [3]:
# add column names to house_data
column_names = ['transaction_id', 'price', 'transfer_date', 'postcode', 'property_type', 'is_old_or_new', 'property_tenure',
                'house_number_or_name', 'unit_number', 'street', 'locality', 'town', 'district', 'county', 'ppd_transaction_category', 'record_status_monthly_file_only']
house_data.columns = column_names

In [4]:
# print information about house data
house_data.info(verbose=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29212383 entries, 0 to 29212382
Data columns (total 16 columns):
 #   Column                           Non-Null Count     Dtype 
---  ------                           --------------     ----- 
 0   transaction_id                   29212383 non-null  object
 1   price                            29212383 non-null  int64 
 2   transfer_date                    29212383 non-null  object
 3   postcode                         29165098 non-null  object
 4   property_type                    29212383 non-null  object
 5   is_old_or_new                    29212383 non-null  object
 6   property_tenure                  29212383 non-null  object
 7   house_number_or_name             29208198 non-null  object
 8   unit_number                      3463419 non-null   object
 9   street                           28748147 non-null  object
 10  locality                         18547494 non-null  object
 11  town                             29212383 non-nu

In [5]:
# get dimensions of the dataset
house_data.shape

(29212383, 16)

In [6]:
# check for columns that have null values for 50% of the entries
house_data.isnull().sum()/len(house_data)

transaction_id                     0.000000
price                              0.000000
transfer_date                      0.000000
postcode                           0.001619
property_type                      0.000000
is_old_or_new                      0.000000
property_tenure                    0.000000
house_number_or_name               0.000143
unit_number                        0.881440
street                             0.015892
locality                           0.365081
town                               0.000000
district                           0.000000
county                             0.000000
ppd_transaction_category           0.000000
record_status_monthly_file_only    0.000000
dtype: float64

In [7]:
# convert price to float type
house_data['price'] = house_data['price'].astype(float)

In [8]:
# convert transfer date to datetime
house_data['transfer_date'] = pd.to_datetime(house_data['transfer_date'])
# assert statement making sure of conversion to datetime
assert house_data['transfer_date'].dtype == 'datetime64[ns]'

In [9]:
# make a list of redundant columns
redundant_columns = ['house_number_or_name', 'unit_number', 'locality',
                    'street', 'record_status_monthly_file_only']
# drop redundant columns
house_data = house_data.drop(redundant_columns, axis=1)
# Assert redundant columns have been dropped
assert len(house_data.columns) != len(column_names)

In [10]:
# Find duplicates
duplicates = house_data.duplicated(subset='transaction_id', keep=False)
# Drop complete duplicates from house_data
unique_house_data = house_data.drop_duplicates()
duplicated_sales = unique_house_data[duplicates == True]
# Assert duplicates are processed
assert duplicated_sales.shape[0] == 0

In [11]:
# print unique values in columns
unique_house_data['property_type'].unique()
# Create mappings and replace
property_type_mapping = {'T': 'Terraced', 'D': 'Detached', 'F': 'Flats/Maisonettes',
                        'S': 'Semi-Detached', 'O': 'Other'}
# replace old values and change data type to categorical
unique_house_data['property_type'] = unique_house_data['property_type'].replace(
    property_type_mapping).astype('category')
# print new unique values in columns
unique_house_data['property_type'].unique()

['Terraced', 'Detached', 'Flats/Maisonettes', 'Semi-Detached', 'Other']
Categories (5, object): ['Detached', 'Flats/Maisonettes', 'Other', 'Semi-Detached', 'Terraced']

In [12]:
# print unique values in columns
unique_house_data['is_old_or_new'].unique()
# create mapping tp replace
old_or_new_mapping = {'N':'Old', 'Y':'New'}
unique_house_data['is_old_or_new'] = unique_house_data['is_old_or_new'].replace(
    old_or_new_mapping).astype('category')
# print new unique values in columns
unique_house_data['is_old_or_new'].unique()

['Old', 'New']
Categories (2, object): ['New', 'Old']

In [13]:
# print unique values in columns
unique_house_data['property_tenure'].unique()
# Remove rows with 'U' in property_tenure
unique_house_data = unique_house_data[unique_house_data['property_tenure'] != 'U']
# create mapping tp replace
property_tenure_mapping = {'F': 'Freehold', 'L': 'Leasehold'}
unique_house_data['property_tenure'] = unique_house_data['property_tenure'].replace(
    property_tenure_mapping).astype('category')
unique_house_data['property_tenure'].unique()

['Freehold', 'Leasehold']
Categories (2, object): ['Freehold', 'Leasehold']

In [14]:
# change ppd_transaction_category to category data type
unique_house_data['ppd_transaction_category'] = unique_house_data['ppd_transaction_category'].astype(
    'category')

In [15]:
# filter data for freehold transactions in merseyside from 2013 to 2023
merseyside_house_data = unique_house_data[(unique_house_data['county'] == 'MERSEYSIDE') & (
    unique_house_data['transfer_date'].dt.year >= 2013) & (unique_house_data['transfer_date'].dt.year <= 2023)].copy()

In [17]:
# Sort data by teansaction date
merseyside_house_data = merseyside_house_data.sort_values('transfer_date', ascending=True)
merseyside_house_data.head()

Unnamed: 0,transaction_id,price,transfer_date,postcode,property_type,is_old_or_new,property_tenure,town,district,county,ppd_transaction_category
18251630,{CD1FD346-02E2-40B9-AD20-AF02A78999D1},113000.0,2013-01-02,L31 2HS,Semi-Detached,Old,Freehold,LIVERPOOL,SEFTON,MERSEYSIDE,A
18351985,{EF89E3A8-2BD1-4347-8B9B-F2CEEB2E62DC},75000.0,2013-01-02,CH43 5RF,Flats/Maisonettes,Old,Leasehold,PRENTON,WIRRAL,MERSEYSIDE,A
18512520,{554C7E6D-FB60-4BF3-AEF0-F18802D4C110},385000.0,2013-01-02,WA12 0JF,Detached,Old,Freehold,NEWTON-LE-WILLOWS,ST HELENS,MERSEYSIDE,A
18275462,{3A27DE8C-0D42-41CC-8501-7367F7E98993},115000.0,2013-01-02,L13 5UP,Semi-Detached,Old,Freehold,LIVERPOOL,LIVERPOOL,MERSEYSIDE,A
18627036,{B5756393-923E-437D-B3BC-D21F5BAB0189},110000.0,2013-01-02,L3 5XY,Flats/Maisonettes,Old,Leasehold,LIVERPOOL,LIVERPOOL,MERSEYSIDE,A


In [18]:
merseyside_house_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 247523 entries, 18251630 to 28794487
Data columns (total 11 columns):
 #   Column                    Non-Null Count   Dtype         
---  ------                    --------------   -----         
 0   transaction_id            247523 non-null  object        
 1   price                     247523 non-null  float64       
 2   transfer_date             247523 non-null  datetime64[ns]
 3   postcode                  246787 non-null  object        
 4   property_type             247523 non-null  category      
 5   is_old_or_new             247523 non-null  category      
 6   property_tenure           247523 non-null  category      
 7   town                      247523 non-null  object        
 8   district                  247523 non-null  object        
 9   county                    247523 non-null  object        
 10  ppd_transaction_category  247523 non-null  category      
dtypes: category(4), datetime64[ns](1), float64(1), object(5)
memo

In [27]:
# calculate the number of null values in postcode colum
merseyside_house_data['postcode'].isnull().sum()

736

In [28]:
# exclude rows with null postcode values
merseyside_house_data = merseyside_house_data[~merseyside_house_data['postcode'].isnull(
)]

### Export

In [34]:
# Save the filtered dataset
merseyside_house_data.to_csv(
    'clean_data/clean_property_price_data.csv', index=False)