# Importing libraries & data

In [1]:
import pandas as pd
import numpy as np
from pandasql import sqldf
from geopy.geocoders import Nominatim
from MySQLConnect import mysql_engine

In [2]:
uk_hpi = pd.read_csv("http://publicdata.landregistry.gov.uk/market-trend-data/house-price-index-data/UK-HPI-full-file-2023-04.csv?utm_medium=GOV.UK&utm_source=datadownload&utm_campaign=full_fil&utm_term=9.30_21_06_23")

In [3]:
uk_hpi.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 136125 entries, 0 to 136124
Data columns (total 54 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   Date                    136125 non-null  object 
 1   RegionName              136125 non-null  object 
 2   AreaCode                136125 non-null  object 
 3   AveragePrice            136125 non-null  float64
 4   Index                   136125 non-null  float64
 5   IndexSA                 4764 non-null    float64
 6   1m%Change               135702 non-null  float64
 7   12m%Change              131325 non-null  float64
 8   AveragePriceSA          4764 non-null    float64
 9   SalesVolume             131658 non-null  float64
 10  DetachedPrice           129708 non-null  float64
 11  DetachedIndex           129708 non-null  float64
 12  Detached1m%Change       129319 non-null  float64
 13  Detached12m%Change      125064 non-null  float64
 14  SemiDetachedPrice   

# Extracting region data

In [4]:
uk_hpi["Country"]= "United Kingdom" #creating a "country" column and filling every entry with "United Kingdom"

In [5]:
uk_hpi["Region"]=uk_hpi["RegionName"] + ", " + uk_hpi["Country"] #combining region name and country for accurate geocoding

In [6]:
uk_hpi.head()

Unnamed: 0,Date,RegionName,AreaCode,AveragePrice,Index,IndexSA,1m%Change,12m%Change,AveragePriceSA,SalesVolume,...,New1m%Change,New12m%Change,NewSalesVolume,OldPrice,OldIndex,Old1m%Change,Old12m%Change,OldSalesVolume,Country,Region
0,01/01/2004,Aberdeenshire,S12000034,81693.66964,40.864214,,,,,388.0,...,,,103.0,81043.95084,40.883367,,,285.0,United Kingdom,"Aberdeenshire, United Kingdom"
1,01/02/2004,Aberdeenshire,S12000034,81678.76231,40.856757,,-0.018248,,,326.0,...,0.192576,,107.0,80965.29542,40.843688,-0.097053,,219.0,United Kingdom,"Aberdeenshire, United Kingdom"
2,01/03/2004,Aberdeenshire,S12000034,83525.09702,41.780317,,2.260483,,,453.0,...,1.907838,,140.0,82903.23948,41.821302,2.393549,,313.0,United Kingdom,"Aberdeenshire, United Kingdom"
3,01/04/2004,Aberdeenshire,S12000034,84333.679,42.18478,,0.968071,,,571.0,...,0.025432,,180.0,84003.99161,42.376586,1.327755,,391.0,United Kingdom,"Aberdeenshire, United Kingdom"
4,01/05/2004,Aberdeenshire,S12000034,86379.95396,43.208353,,2.426403,,,502.0,...,1.848208,,167.0,86222.73484,43.495852,2.641235,,335.0,United Kingdom,"Aberdeenshire, United Kingdom"


In [7]:
# uk_hpi.drop(["RegionName","Country"], axis=1, inplace=True) #dropping region name and country columns.

In [8]:
len(uk_hpi["Region"].unique())

405

In [9]:
regions = uk_hpi[["Region"]].drop_duplicates(keep="last") #creating a new dataframe with only unique region values.
regions.reset_index(inplace=True, drop=True)
regions

Unnamed: 0,Region
0,"Aberdeenshire, United Kingdom"
1,"Adur, United Kingdom"
2,"Amber Valley, United Kingdom"
3,"Angus, United Kingdom"
4,"Antrim and Newtownabbey, United Kingdom"
...,...
400,"Wychavon, United Kingdom"
401,"Wyre, United Kingdom"
402,"Wyre Forest, United Kingdom"
403,"York, United Kingdom"


In [10]:
regions.duplicated().sum()

0

In [11]:
regions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 405 entries, 0 to 404
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Region  405 non-null    object
dtypes: object(1)
memory usage: 3.3+ KB


In [12]:
geolocator = Nominatim(user_agent="uk_regions_list") #creating a geocoder instance for future use.

coordinates = [] #creating an empty list to store coordinates.

In [13]:
for region in regions["Region"]:
    location = geolocator.geocode(region)
    if location is not None:
        coordinates.append((region, location.latitude, location.longitude))
#geocoding the "Region" column using a  to retrieve coordinates, and storing the coordinates and region names in empty list.

In [14]:
# regions[regions["Region"].str.contains("angus", case=False)]

In [15]:
coordinates

[('Aberdeenshire, United Kingdom', 57.166667, -2.666667),
 ('Adur, United Kingdom', 50.8453169, -0.2939886608407643),
 ('Amber Valley, United Kingdom', 53.029038400000005, -1.4625031096700565),
 ('Angus, United Kingdom', 56.666667, -2.916667),
 ('Antrim and Newtownabbey, United Kingdom', 54.6951611, -5.946965601134577),
 ('Ards and North Down, United Kingdom', 54.64896425, -5.562785215049392),
 ('Argyll and Bute, United Kingdom', 56.4603494, -5.8618419478791015),
 ('Arun, United Kingdom', 50.8312375, -0.5667058666407475),
 ('Ashfield, United Kingdom', 53.089773750000006, -1.2518767461557148),
 ('Ashford, United Kingdom', 51.148555, 0.8722566),
 ('Babergh, United Kingdom', 52.06297535, 0.9122241568513769),
 ('Barking and Dagenham, United Kingdom', 51.5540907, 0.15048888801039415),
 ('Barnet, United Kingdom', 51.65309, -0.2002261),
 ('Barnsley, United Kingdom', 53.5527719, -1.4827755),
 ('Basildon, United Kingdom', 51.5754602, 0.4757363),
 ('Basingstoke and Deane, United Kingdom', 51.258

In [16]:
len(coordinates) #checking number of successfully geocoded regions. 402/405 were successful.

402

In [17]:
# lsti = []
# lsti.append((geolocator.reverse(f"{coordinates[20][1]},{coordinates[20][2]}").raw["address"], coordinates[20][0]))
# lsti

In [18]:
coord_list = [] #creating empty list to store results of reverse geocoding.

for i in coordinates:
    coords = f"{i[1]}, {i[2]}"
    address = geolocator.reverse(coords).raw["address"]
    if address is not None:
            coord_list.append(address)
#reverse geocoding the coordinates to find the full address, including state. storing addresses in empty list.

In [19]:
coord_list

[{'town': 'Alford',
  'county': 'Aberdeenshire',
  'ISO3166-2-lvl6': 'GB-ABD',
  'state': 'Alba / Scotland',
  'ISO3166-2-lvl4': 'GB-SCT',
  'postcode': 'AB33 8PX',
  'country': 'United Kingdom',
  'country_code': 'gb'},
 {'road': 'Ricardo Noise Test Track',
  'village': 'Lancing',
  'city': 'Adur',
  'county': 'West Sussex',
  'ISO3166-2-lvl6': 'GB-WSX',
  'state': 'England',
  'ISO3166-2-lvl4': 'GB-ENG',
  'postcode': 'BN15 0RJ',
  'country': 'United Kingdom',
  'country_code': 'gb'},
 {'road': 'Marlborough Drive',
  'town': 'Belper CP',
  'village': 'Openwoodgate',
  'city': 'Amber Valley',
  'county': 'Derbyshire',
  'ISO3166-2-lvl6': 'GB-DBY',
  'state': 'England',
  'ISO3166-2-lvl4': 'GB-ENG',
  'postcode': 'DE56 1LB',
  'country': 'United Kingdom',
  'country_code': 'gb'},
 {'isolated_dwelling': 'Cranshade',
  'town': 'Royal Burgh of Forfar',
  'county': 'Angus',
  'ISO3166-2-lvl6': 'GB-ANS',
  'state': 'Alba / Scotland',
  'ISO3166-2-lvl4': 'GB-SCT',
  'postcode': 'DD8 5QL',
  

In [20]:
address_df = pd.DataFrame(coord_list) #converting list of addresses to a dataframe.
address_df.head()

Unnamed: 0,town,county,ISO3166-2-lvl6,state,ISO3166-2-lvl4,postcode,country,country_code,road,village,...,leisure,building,farm,locality,industrial,club,place,commercial,borough,farmyard
0,Alford,Aberdeenshire,GB-ABD,Alba / Scotland,GB-SCT,AB33 8PX,United Kingdom,gb,,,...,,,,,,,,,,
1,,West Sussex,GB-WSX,England,GB-ENG,BN15 0RJ,United Kingdom,gb,Ricardo Noise Test Track,Lancing,...,,,,,,,,,,
2,Belper CP,Derbyshire,GB-DBY,England,GB-ENG,DE56 1LB,United Kingdom,gb,Marlborough Drive,Openwoodgate,...,,,,,,,,,,
3,Royal Burgh of Forfar,Angus,GB-ANS,Alba / Scotland,GB-SCT,DD8 5QL,United Kingdom,gb,,,...,,,,,,,,,,
4,Newtownabbey,County Antrim,,Northern Ireland / Tuaisceart Éireann,GB-NIR,BT36 5QA,United Kingdom,gb,Carnmoney Road North,,...,,,,,,,,,,


In [21]:
address_df.info() #the state values of all 402 coordinates were successfully reverse geocoded.

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 402 entries, 0 to 401
Data columns (total 44 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   town               146 non-null    object
 1   county             316 non-null    object
 2   ISO3166-2-lvl6     315 non-null    object
 3   state              402 non-null    object
 4   ISO3166-2-lvl4     402 non-null    object
 5   postcode           381 non-null    object
 6   country            402 non-null    object
 7   country_code       402 non-null    object
 8   road               323 non-null    object
 9   village            160 non-null    object
 10  city               222 non-null    object
 11  isolated_dwelling  8 non-null      object
 12  office             1 non-null      object
 13  suburb             265 non-null    object
 14  region             9 non-null      object
 15  ISO3166-2-lvl5     9 non-null      object
 16  man_made           5 non-null      object
 1

In [22]:
coord_df = pd.DataFrame(coordinates, columns=["Region","Latitude","Longitude"])
coord_df.head() #creating a dataframe of the region and coordinate values.

Unnamed: 0,Region,Latitude,Longitude
0,"Aberdeenshire, United Kingdom",57.166667,-2.666667
1,"Adur, United Kingdom",50.845317,-0.293989
2,"Amber Valley, United Kingdom",53.029038,-1.462503
3,"Angus, United Kingdom",56.666667,-2.916667
4,"Antrim and Newtownabbey, United Kingdom",54.695161,-5.946966


In [23]:
region_df = pd.concat([coord_df, address_df], axis=1)[["Region","county","state"]]
region_df.head() #concatenating the address and region dataframes column-wise, and extracting the region and state
                 #columns of the resulting dataframe into a new dataframe.

Unnamed: 0,Region,county,state
0,"Aberdeenshire, United Kingdom",Aberdeenshire,Alba / Scotland
1,"Adur, United Kingdom",West Sussex,England
2,"Amber Valley, United Kingdom",Derbyshire,England
3,"Angus, United Kingdom",Angus,Alba / Scotland
4,"Antrim and Newtownabbey, United Kingdom",County Antrim,Northern Ireland / Tuaisceart Éireann


In [24]:
region_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 402 entries, 0 to 401
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Region  402 non-null    object
 1   county  316 non-null    object
 2   state   402 non-null    object
dtypes: object(3)
memory usage: 9.5+ KB


In [25]:
pd.unique(region_df["state"]) #checking the unique values of the state column.

array(['Alba / Scotland', 'England',
       'Northern Ireland / Tuaisceart Éireann', 'Cymru / Wales'],
      dtype=object)

In [26]:
region_df['state'] = region_df['state'].replace(['Alba / Scotland',
       'Northern Ireland / Tuaisceart Éireann', 'Cymru / Wales'], ['Scotland',
       'Northern Ireland', 'Wales']) #keeping only the english version of the state names.

In [27]:
pd.unique(region_df["state"])

array(['Scotland', 'England', 'Northern Ireland', 'Wales'], dtype=object)

In [28]:
regions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 405 entries, 0 to 404
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Region  405 non-null    object
dtypes: object(1)
memory usage: 3.3+ KB


In [29]:
final_reg = sqldf("select a.Region, b.state, b.county from regions a full join region_df b on (a.Region = b.Region)")
final_reg.head() #merging the new dataframe to the original regions dataframe using SQL full join, in order to retain the
                 #three locations which were not successfully geocoded.

Unnamed: 0,Region,state,county
0,"Aberdeenshire, United Kingdom",Scotland,Aberdeenshire
1,"Adur, United Kingdom",England,West Sussex
2,"Amber Valley, United Kingdom",England,Derbyshire
3,"Angus, United Kingdom",Scotland,Angus
4,"Antrim and Newtownabbey, United Kingdom",Northern Ireland,County Antrim


In [30]:
final_reg.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 405 entries, 0 to 404
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Region  405 non-null    object
 1   state   402 non-null    object
 2   county  316 non-null    object
dtypes: object(3)
memory usage: 9.6+ KB


In [31]:
sqldf("select * from final_reg where state is null")
#checking unsuccessfully geocoded regions. from google searches, all 3 locations were found to be in Northern Ireland.

Unnamed: 0,Region,state,county
0,"Armagh City Banbridge and Craigavon, United Ki...",,
1,"Lisburn and Castlereagh, United Kingdom",,
2,"Mid and East Antrim, United Kingdom",,


In [32]:
final_reg["state"].fillna("Northern Ireland", inplace=True) #replacing NA values in "state" with "Northern Ireland".

In [33]:
uk_hp_index = sqldf("select a.*, b.state, b.county from uk_hpi a join final_reg b on (a.Region = b.Region)") 
#adding the state column to the original dataset using SQL join.

In [34]:
uk_hp_index.drop(["Region"], axis=1, inplace=True) #dropping region column.

## Exporting the modified dataset to a MySQL server

In [None]:
engine = mysql_engine() #creating an instance of the SQL connection engine from the central module.

In [50]:
uk_hp_index.to_sql("uk_house_price_index", con=engine, if_exists="replace", index=None)
#saving the modified dataset as a SQL table.

136125

In [36]:
# geolocator.reverse(coordinates[40]).raw["address"]

In [37]:
# coordinates[40]

In [38]:
# test = geolocator.reverse("53.480837, -2.244914").raw["address"]

In [39]:
# test["state"]

In [40]:
# coordinates

https://jharding.co.uk/list-of-uk-counties/

In [41]:
#counties = pd.read_csv("https://jharding.co.uk/download/433/?tmstv=1687349276")

In [42]:
# counties = counties[0:112] #taking only the ones that have actual info
# counties.isna().sum()

In [43]:
#sum(counties["County"].isin(regions["RegionName"]))

In [44]:
#sum(regions["RegionName"].isin(counties["County"]))

In [45]:
# counties

In [46]:
# sqldf("select a.*, b.* from counties a join regions b on (a.County = b.RegionName)")

In [47]:
# all_regions = pd.merge(counties, regions, left_on='County', right_on='RegionName', how='right')
# all_regions.to_csv("regions.csv")

In [48]:
# sqldf("select * from all_regions where RegionName is not null and County is null")

In [49]:
# sum(pd.notnull(all_regions["County"]))