In [7]:
import numpy as np
import pandas as pd
import zipfile # para descomprimir archivos zip
import urllib.request # para descargar de URL
import os
# Local path of the register archive: folder "res", file "PPR-ALL.zip".
carpeta_datos, nombre_zip = "res", "PPR-ALL.zip"
ruta = os.path.join(carpeta_datos, nombre_zip)

In [8]:
import shutil
import ssl

# Download the dataset (Property Price Register, full archive).
url = 'https://www.propertypriceregister.ie/website/npsra/ppr/npsra-ppr.nsf/Downloads/PPR-ALL.zip/$FILE/PPR-ALL.zip'  

# SECURITY NOTE: certificate verification is disabled for THIS request only.
# The previous version also replaced ssl._create_default_https_context, which
# silently turned verification off for every later HTTPS call made from this
# kernel; that global override was redundant (the context is passed explicitly
# below) and has been removed.
# TODO(review): switch to ssl.create_default_context() if the server's
# certificate chain validates in your environment.
unverified_context = ssl._create_unverified_context()

# Make sure the destination folder exists so open() does not fail on a fresh checkout.
os.makedirs(os.path.dirname(ruta) or ".", exist_ok=True)

# Stream the response to disk in chunks instead of loading the whole archive in memory.
with urllib.request.urlopen(url, context=unverified_context) as response, open(ruta, 'wb') as out_file:
    shutil.copyfileobj(response, out_file)


In [None]:
# Unpack the downloaded zip archive into the "res" folder.
ruta_ext = os.path.join("res")
with zipfile.ZipFile(ruta, mode='r') as archivo_zip:
    print('Extracting all files...')
    archivo_zip.extractall(path=ruta_ext)  # destination folder

Extracting all files...
Done!


In [11]:
# Load the extracted register into a DataFrame; the first row is the header
# and the bytes are decoded as Latin-1.
ruta_csv = os.path.join("res", "PPR-ALL.csv")
opciones_lectura = dict(sep=',', header=0, engine='python', encoding='latin1')
properties_dataset = pd.read_csv(ruta_csv, **opciones_lectura)


In [None]:
# Dataset preprocessing
#
# This cell is written so it can be re-run safely: every step is a no-op (or
# produces the same result) when the columns were already renamed/converted.

# Locate the price column by its stable prefix. Its header contains a currency
# symbol whose decoded form is not reliable, so it cannot be matched by its
# full name; fall back to its position (5th column) if no header matches.
# NOTE: the previous approach, `columns.values[4] = 'Price'`, mutated the
# Index's backing array in place, which bypasses the Index's cached lookup
# table and can make `properties_dataset['Price']` raise KeyError.
price_candidates = [col for col in properties_dataset.columns if str(col).startswith('Price')]
price_column = price_candidates[0] if price_candidates else properties_dataset.columns[4]

# Rename all columns in one pass (labels that are absent are simply ignored).
properties_dataset = properties_dataset.rename(columns={
    'Date of Sale (dd/mm/yyyy)': 'Date',
    price_column: 'Price',
    'Description of Property': 'Description',
    'Property Size Description': 'Size',
})

# Format Date - parse dd/mm/yyyy; unparseable values become NaT, time part is zeroed.
properties_dataset['Date'] = pd.to_datetime(
    properties_dataset['Date'], format='%d/%m/%Y', errors='coerce'
).dt.normalize()

# Format Price - drop everything except digits, '.' and '-' (currency symbol,
# thousands separators), then convert to a number. The string cleanup is
# skipped when the column is already numeric (e.g. when the cell is re-run).
if not pd.api.types.is_numeric_dtype(properties_dataset['Price']):
    properties_dataset['Price'] = properties_dataset['Price'].str.replace(r'[^0-9\.\-]', '', regex=True)
properties_dataset['Price'] = pd.to_numeric(properties_dataset['Price'], errors='coerce')

# Format the free-text columns: trim surrounding whitespace and force str.
# NOTE: astype(str) turns missing values into the literal string 'nan'; the
# later Eircode cells test for that string, so this behaviour is kept as-is.
for text_column in ('Address', 'County', 'Eircode', 'Description', 'Size'):
    properties_dataset[text_column] = properties_dataset[text_column].str.strip().astype(str)


In [None]:
# Split the address into exactly three components: Street, Neighbourhood, Area.
# - n=2 caps the split at three parts, so an address with more than two commas
#   keeps its remainder in 'Area' instead of producing extra columns (which made
#   the three-column assignment fail).
# - The multi-character pattern is interpreted as a regular expression by pandas
#   and also swallows the whitespace around each comma.
# - reindex guarantees three columns even if no address has that many parts.
address_parts = (
    properties_dataset['Address']
    .str.split(r'\s*,\s*', n=2, expand=True)
    .reindex(columns=range(3))
)
properties_dataset[['Street', 'Neighbourhood', 'Area']] = address_parts

# Split the street on its first space: leading token (usually the house number)
# and the rest. reindex again guarantees both target columns exist.
street_parts = (
    properties_dataset['Street']
    .str.split(' ', n=1, expand=True)
    .reindex(columns=range(2))
)
properties_dataset[['SNumber', 'SRoad']] = street_parts

# Split the Eircode after its 3rd character: routing key + unique identifier.
# Only rows with a real Eircode are sliced; missing ones (NaN, empty string or
# the literal 'nan' left by the earlier astype(str)) get NaN in both columns.
eircode = properties_dataset['Eircode']
mask = eircode.notna() & (eircode.str.strip() != '') & (eircode.str.lower() != 'nan')
properties_dataset['EIRRouteKey'] = eircode.str.slice(0, 3).where(mask)
# strip() drops the separator space of Eircodes written as "A41 X2Y3".
properties_dataset['EIRUID'] = eircode.str.slice(3).str.strip().where(mask)




In [21]:
# Persist the cleaned dataset as a new CSV file (UTF-8 with BOM, no index column).
output_csv = os.path.join("res", "PPR-ALL-cleaned.csv")
properties_dataset.to_csv(path_or_buf=output_csv, encoding='utf-8-sig', index=False)
print(f"Dataset cleaned and saved to {output_csv}")

Dataset cleaned and saved to res\PPR-ALL-cleaned.csv


In [None]:
# Get missing EIRCODEs -- https://github.com/ireland/eircodes/blob/master/EircodeRoutingKeys
# Select rows where the Eircode is missing: a real NaN, an empty/blank string,
# or the literal string 'nan' (any case). Only the first 3 are kept as a sample.
eircode_text = properties_dataset['Eircode'].astype(str).str.strip().str.lower()
eircode_missing = properties_dataset['Eircode'].isna() | eircode_text.isin(['', 'nan'])
missing_eircodes = properties_dataset[eircode_missing].head(3)

# Reference table read from res/eircodes.csv.
eircodes_ruta = os.path.join("res", "eircodes.csv")
eircodes_csv = pd.read_csv(eircodes_ruta, sep=',', header=0, engine='python')


for idx, row in missing_eircodes.iterrows():
    # Join address and county with a separator so the two values do not run together.
    addstr = f"{row['Address']}, {row['County']}"
    print (f"Processing address: {addstr}")

    # TODO: find a way to get the Eircode from the address

display(eircodes_csv)

Unnamed: 0,ROUTING_KEY,REGION
0,A41,BALLYBOUGHAL
1,A42,GARRISTOWN
2,A45,OLDTOWN
3,A63,GREYSTONES
4,A67,WICKLOW
...,...,...
134,Y14,ARKLOW
135,Y21,ENNISCORTHY
136,Y25,GOREY
137,Y34,NEW ROSS


In [20]:

# Quick inspection of the cleaned dataset: first rows, schema, summary statistics.
display(properties_dataset.head(5))
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in display() only added a stray "None" to the output.
properties_dataset.info()
display(properties_dataset.describe(include='all'))

Unnamed: 0,Date,Address,County,Eircode,Price,Not Full Market Price,VAT Exclusive,Description,Size,Street,Neighbourhood,Area,SNumber,SRoad,EIRRouteKey,EIRUID
0,2010-01-01,"5 Braemor Drive, Churchtown, Co.Dublin",Dublin,,343000.0,No,No,Second-Hand Dwelling house /Apartment,,5 Braemor Drive,Churchtown,Co.Dublin,5,Braemor Drive,,
1,2010-01-03,"134 Ashewood Walk, Summerhill Lane, Portlaoise",Laois,,185000.0,No,Yes,New Dwelling house /Apartment,greater than or equal to 38 sq metres and less...,134 Ashewood Walk,Summerhill Lane,Portlaoise,134,Ashewood Walk,,
2,2010-01-04,"1 Meadow Avenue, Dundrum, Dublin 14",Dublin,,438500.0,No,No,Second-Hand Dwelling house /Apartment,,1 Meadow Avenue,Dundrum,Dublin 14,1,Meadow Avenue,,
3,2010-01-04,"1 The Haven, Mornington",Meath,,400000.0,No,No,Second-Hand Dwelling house /Apartment,,1 The Haven,Mornington,,1,The Haven,,
4,2010-01-04,"11 Melville Heights, Kilkenny",Kilkenny,,160000.0,No,No,Second-Hand Dwelling house /Apartment,,11 Melville Heights,Kilkenny,,11,Melville Heights,,


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 726373 entries, 0 to 726372
Data columns (total 16 columns):
 #   Column                 Non-Null Count   Dtype         
---  ------                 --------------   -----         
 0   Date                   726373 non-null  datetime64[ns]
 1   Address                726373 non-null  object        
 2   County                 726373 non-null  object        
 3   Eircode                726373 non-null  object        
 4   Price                  726373 non-null  float64       
 5   Not Full Market Price  726373 non-null  object        
 6   VAT Exclusive          726373 non-null  object        
 7   Description            726373 non-null  object        
 8   Size                   726373 non-null  object        
 9   Street                 726373 non-null  object        
 10  Neighbourhood          726373 non-null  object        
 11  Area                   643630 non-null  object        
 12  SNumber                726373 non-null  obje

None

Unnamed: 0,Date,Address,County,Eircode,Price,Not Full Market Price,VAT Exclusive,Description,Size,Street,Neighbourhood,Area,SNumber,SRoad,EIRRouteKey,EIRUID
count,726373,726373,726373,726373.0,726373.0,726373,726373,726373,726373.0,726373,726373,643630,726373,627599,189877,189877
unique,,647378,26,182872.0,,2,2,5,7.0,533742,70110,20150,56333,127651,276,51206
top,,"Broomfield, Midleton",Dublin,,,No,No,Second-Hand Dwelling house /Apartment,,APT 1,CLONDALKIN,DUBLIN,APT,HOUSE,V94,W9NN
freq,,21,227727,536496.0,,689468,603873,601601,673542.0,1594,2542,26836,21222,4989,6889,34
mean,2019-01-03 17:08:41.439535360,,,,304649.2,,,,,,,,,,,
min,2010-01-01 00:00:00,,,,5001.0,,,,,,,,,,,
25%,2015-11-27 00:00:00,,,,138000.0,,,,,,,,,,,
50%,2019-04-08 00:00:00,,,,231277.5,,,,,,,,,,,
75%,2022-06-23 00:00:00,,,,348500.0,,,,,,,,,,,
max,2025-06-06 00:00:00,,,,387665200.0,,,,,,,,,,,
