In [None]:
import numpy as np
import pandas as pd
import zipfile # to decompress zip archives
import urllib.request # to download from a URL
import os
import re
# Path of the PPR-ALL.zip archive inside the "res" folder
ruta = os.path.join("res", "PPR-ALL.zip")

In [14]:
import ssl

# Download the dataset
url = 'https://www.propertypriceregister.ie/website/npsra/ppr/npsra-ppr.nsf/Downloads/PPR-ALL.zip/$FILE/PPR-ALL.zip'  

# Make sure the target folder exists so that open() below does not fail on a fresh checkout.
os.makedirs(os.path.dirname(ruta) or '.', exist_ok=True)

# SECURITY NOTE: certificate verification is disabled for this single request only.
# The context is passed explicitly to urlopen() rather than replacing the default
# HTTPS context of the ssl module, which would silently disable verification for
# every other HTTPS request made by this kernel. The downloaded archive is
# therefore NOT authenticated; treat its contents accordingly.
unverified_context = ssl._create_unverified_context()

# Stream the response to disk in 1 MiB chunks instead of holding the whole archive in memory.
with urllib.request.urlopen(url, context=unverified_context) as response, open(ruta, 'wb') as out_file:
    for chunk in iter(lambda: response.read(1024 * 1024), b''):
        out_file.write(chunk)


In [None]:
# Unpack every member of the downloaded archive into the "res" folder
ruta_ext = os.path.join("res")
with zipfile.ZipFile(ruta, mode='r') as archive:
    print('Extracting all files...')
    archive.extractall(path=ruta_ext)

Extracting all files...


In [350]:
# Read the extracted CSV into a DataFrame (first row is the header)
ruta_csv = os.path.join("res", "PPR-ALL.csv")
properties_dataset = pd.read_csv(
    ruta_csv,
    encoding='latin1',
    engine='python',
    header=0,
    sep=',',
)


In [351]:
# Process dataset: rename the raw columns and normalise each field in turn.

# Date - raw header 'Date of Sale (dd/mm/yyyy)'; values that do not parse become NaT
properties_dataset = properties_dataset.rename(columns={'Date of Sale (dd/mm/yyyy)': 'Date'})
properties_dataset['Date'] = pd.to_datetime(
    properties_dataset['Date'], format='%d/%m/%Y', errors='coerce'
).dt.normalize()

# Price - the character between the parentheses of the raw header is not stable
# from one read of the file to the next, so the column is matched by its 'Price'
# prefix instead of by its exact name.
properties_dataset = properties_dataset.rename(
    columns=lambda name: 'Price' if str(name).startswith('Price') else name
)
# Only strip formatting while the column is still text; this keeps the cell re-runnable
# once the column has already been converted to numbers.
if not pd.api.types.is_numeric_dtype(properties_dataset['Price']):
    # Keep digits, '.' and '-' only; anything that still fails to parse becomes NaN
    price_text = properties_dataset['Price'].str.replace(r'[^0-9\.\-]', '', regex=True)
    properties_dataset['Price'] = pd.to_numeric(price_text, errors='coerce')

# Address, County, Eircode - trim surrounding whitespace and lower-case
for column in ('Address', 'County', 'Eircode'):
    properties_dataset[column] = properties_dataset[column].str.strip().str.lower()

# Description - raw header 'Description of Property'
properties_dataset = properties_dataset.rename(columns={'Description of Property': 'Description'})
properties_dataset['Description'] = properties_dataset['Description'].str.strip().str.lower()

# Size - raw header 'Property Size Description'
properties_dataset = properties_dataset.rename(columns={'Property Size Description': 'Size'})
properties_dataset['Size'] = properties_dataset['Size'].str.strip().str.lower()

# Split the address into components
# TODO: Some streets are written "number, street name" (e.g., "47, Main Street, Dublin") but it is very rare
# n=2 caps the split at three parts (any further commas stay inside 'Area') and
# reindex() guarantees exactly three columns, so the assignment below cannot fail
# with "Columns must be same length as key" whatever the addresses look like.
address_parts = (
    properties_dataset['Address']
    .str.split(',', n=2, expand=True)
    .reindex(columns=range(3))
)
properties_dataset[['Street', 'Neighbourhood', 'Area']] = address_parts

In [352]:
# Function to format street names
def format_street_name(street):
    """Rewrite a street string as "<number> <rest>".

    Rules, applied in order to the whitespace-separated tokens of ``street``:

    * no tokens at all (empty or blank string): return ``''``;
    * first token is all digits: return the tokens joined by single spaces;
    * a single non-numeric token: return it prefixed with ``"0 "``;
    * otherwise the first token containing any ASCII digit supplies the number
      (only its digits are kept), that token is removed, the filler words
      ``'apt'`` and ``'and'`` are dropped, and the rest follows the number;
    * no token contains a digit: return the tokens prefixed with ``"0 "``.

    Args:
        street: The street text. Non-string values (e.g. NaN/None) are
            returned unchanged.

    Returns:
        The reformatted string, or ``street`` itself when it is not a string.
    """
    if not isinstance(street, str):
        return street

    # split() with no argument splits on runs of whitespace, so a blank input
    # yields an empty list and repeated spaces cannot leak into the result.
    elements = street.split()
    if not elements:
        return ''
    if elements[0].isdigit():
        return ' '.join(elements)  # Already starts with a number
    if len(elements) == 1:
        return "0 " + elements[0]  # Single word and no number: use a leading zero

    # The number may sit in the middle or at the end of the street text
    for position, token in enumerate(elements):
        digits = re.sub(r'[^0-9]', '', token)
        if digits:
            remaining = elements[:position] + elements[position + 1:]
            remaining = [el for el in remaining if el not in ('apt', 'and')]
            return digits + " " + ' '.join(remaining)

    # Street does not contain a number at all: use a leading zero
    return "0 " + ' '.join(elements)

# Normalise every street, then cut it at the first space:
# the leading token goes to SNumber and the remainder to SRoad
formatted_streets = properties_dataset['Street'].apply(format_street_name)
properties_dataset['Street'] = formatted_streets
properties_dataset[['SNumber', 'SRoad']] = formatted_streets.str.split(' ', n=1, expand=True)



In [353]:
# Function to format area
def format_area(area):
    """Normalise the area component of an address.

    Dots are treated as separators, a fixed set of filler tokens (county
    prefixes, road-type words, compass letters, ...) is removed, and when more
    than two tokens remain and they end in ``dublin <number>`` only those two
    tokens are kept.

    Args:
        area: The area text. Non-string values (e.g. NaN/None) are returned
            unchanged.

    Returns:
        The remaining tokens joined by single spaces (``''`` when nothing is
        left), or ``area`` itself when it is not a string.
    """
    if not isinstance(area, str):
        return area

    unwanted = {'co', 'county', 'road', 'st', 'rd', 'upper', 'lower', 'w', 'e', 'n', 's', 'fw'}
    # Replacing '.' with ' ' can create runs of spaces; split() with no argument
    # discards the resulting empty tokens, which would otherwise show up as
    # double spaces and hide a trailing "dublin <number>" pair.
    elements = [el for el in area.replace('.', ' ').split() if el not in unwanted]

    if len(elements) > 2 and elements[-1].isdigit() and elements[-2] == 'dublin':
        # The text ends in dublin followed by a number: keep only those two tokens
        elements = elements[-2:]

    return ' '.join(elements)

# Replace the Area column with its normalised form
normalised_area = properties_dataset['Area'].apply(format_area)
properties_dataset['Area'] = normalised_area

In [299]:
# Take an independent copy of the dataset to run formatting checks on
properties_dataset_test = properties_dataset.copy(deep=True)

In [None]:
# Spot-check the formatted address columns on ten randomly chosen rows
preview_columns = ['SNumber', 'SRoad','Street', 'Neighbourhood', 'Area', 'Address', 'Eircode']
display(properties_dataset_test[preview_columns].sample(n=10))


Unnamed: 0,SNumber,SRoad,Street,Neighbourhood,Area,Address,Eircode
594119,5,newtown avenue,5 newtown avenue,the meadows,kill,"5 newtown avenue, the meadows, kill",
76019,1,no castlebawn,1 no castlebawn,ballintubber,castlerea,"no 1 castlebawn, ballintubber, castlerea",
60517,14,st peters drive,14 st peters drive,walkinstown,,"14 st peters drive, walkinstown",
710014,1,apartment,1 apartment,2 careys lane,mullingar,"apartment 1, 2 careys lane, mullingar",n91h016
413198,26,cois glaisin green,26 cois glaisin green,johnstown,navan,"26 cois glaisin green, johnstown, navan",
457480,68,derham park,68 derham park,balbriggan,dublin,"68 derham park, balbriggan, dublin",
464912,10,glenbarrow,10 glenbarrow,portlaoise,laois,"10 glenbarrow, portlaoise, laois",
652313,0,monascriebe,0 monascriebe,mountpleasant,louth,"monascriebe, mountpleasant, county louth",a91c642
241733,11,scarbh leathan,11 scarbh leathan,greenane,kanturk,"11 scarbh leathan, greenane, kanturk",
15974,10,whitehall,10 whitehall,paulstown,,"10 whitehall, paulstown",


In [None]:

# Cut each Eircode after its third character: the first three characters go to
# EIRRouteKey and the remainder to EIRUID.
# Rows whose Eircode is NaN, blank, or the literal text 'nan' get NaN in both columns.
eircode_col = properties_dataset['Eircode']
mask = eircode_col.notna() & (eircode_col.str.strip() != '') & (eircode_col.str.lower() != 'nan')
properties_dataset['EIRRouteKey'] = np.where(mask, eircode_col.str[:3], np.nan)
properties_dataset['EIRUID'] = np.where(mask, eircode_col.str[3:], np.nan)



In [355]:
# Write the cleaned dataset to its own CSV file (no index column)
output_csv = os.path.join("res", "PPR-ALL-cleaned.csv")
properties_dataset.to_csv(
    output_csv,
    encoding='utf-8-sig',
    index=False,
)
print(f"Dataset cleaned and saved to {output_csv}")

Dataset cleaned and saved to res\PPR-ALL-cleaned.csv


In [None]:
# Reload the cleaned dataset from its CSV file
input_csv = os.path.join("res", "PPR-ALL-cleaned.csv")
properties_dataset = pd.read_csv(
    input_csv,
    encoding='utf-8-sig',
    engine='python',
    header=0,
    sep=',',
)

In [None]:
# Function to find the Routing Key using neighbourhoods
# TODO: continue from here - first complete the Routing Key for Dublin 1 - 24,
# then try to find the Routing Key for other neighbourhoods using known ones

def get_neighbourhoods(df):
    """Return the distinct non-null values of the 'Neighbourhood' column of ``df``."""
    return df['Neighbourhood'].dropna().unique()

In [None]:
# Read the Eircode reference CSV from the "res" folder
eircodes_ruta = os.path.join("res", "eircodes.csv")
eircodes_csv = pd.read_csv(
    eircodes_ruta,
    engine='python',
    header=0,
    sep=',',
)

In [None]:
# Get missing EIRCODEs -- access to the EIRCODE DB from the Irish government costs 3000 EUR,
# so the Eircode CSV file (Routing Key and neighbourhoods) will be used instead
# to find the possible EIRRouteKey.

# An Eircode counts as present when, after trimming, it is neither empty nor the
# literal text 'nan'. fillna('') makes real NaN values (what missing cells look
# like after the CSV reload) fall into the "missing" group as well.
eircode_text = properties_dataset['Eircode'].fillna('').astype(str).str.strip()
has_eircode = (eircode_text != '') & (eircode_text.str.lower() != 'nan')

# Only the first three missing rows are kept while the lookup is being developed
missing_eircodes = properties_dataset[~has_eircode].head(3)
known_eircodes = properties_dataset[has_eircode]
#.drop_duplicates(subset=['EIRRouteKey', 'SRoad', 'Neighbourhood', 'County'])

display(eircodes_csv)
display(known_eircodes)

In [None]:

for idx, row in missing_eircodes.iterrows():
    # Join address and county with an explicit separator so the two fields do
    # not run together; formatting also tolerates a non-string (NaN) field.
    addstr = f"{row['Address']}, {row['County']}"
    print (f"Processing address: {addstr}")

    # TODO: find a way to get the Eircode from the address

In [32]:

display(properties_dataset.head(5))
# DataFrame.info() prints its report itself and returns None, so wrapping it in
# display() only adds a stray "None" to the cell output.
properties_dataset.info()
display(properties_dataset.describe(include='all'))

Unnamed: 0,Date,Address,County,Eircode,Price,Not Full Market Price,VAT Exclusive,Description,Size
0,2010-01-01,"5 Braemor Drive, Churchtown, Co.Dublin",Dublin,,343000.0,No,No,Second-Hand Dwelling house /Apartment,
1,2010-01-03,"134 Ashewood Walk, Summerhill Lane, Portlaoise",Laois,,185000.0,No,Yes,New Dwelling house /Apartment,greater than or equal to 38 sq metres and less...
2,2010-01-04,"1 Meadow Avenue, Dundrum, Dublin 14",Dublin,,438500.0,No,No,Second-Hand Dwelling house /Apartment,
3,2010-01-04,"1 The Haven, Mornington",Meath,,400000.0,No,No,Second-Hand Dwelling house /Apartment,
4,2010-01-04,"11 Melville Heights, Kilkenny",Kilkenny,,160000.0,No,No,Second-Hand Dwelling house /Apartment,


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 726373 entries, 0 to 726372
Data columns (total 9 columns):
 #   Column                 Non-Null Count   Dtype         
---  ------                 --------------   -----         
 0   Date                   726373 non-null  datetime64[ns]
 1   Address                726373 non-null  object        
 2   County                 726373 non-null  object        
 3   Eircode                189877 non-null  object        
 4   Price                  726373 non-null  float64       
 5   Not Full Market Price  726373 non-null  object        
 6   VAT Exclusive          726373 non-null  object        
 7   Description            726373 non-null  object        
 8   Size                   52831 non-null   object        
dtypes: datetime64[ns](1), float64(1), object(7)
memory usage: 49.9+ MB


None

Unnamed: 0,Date,Address,County,Eircode,Price,Not Full Market Price,VAT Exclusive,Description,Size
count,726373,726373,726373,189877,726373.0,726373,726373,726373,52831
unique,,647378,26,182871,,2,2,5,6
top,,"Broomfield, Midleton",Dublin,D24W9NN,,No,No,Second-Hand Dwelling house /Apartment,greater than or equal to 38 sq metres and less...
freq,,21,227727,34,,689468,603873,601601,38097
mean,2019-01-03 17:08:41.439535360,,,,304649.2,,,,
min,2010-01-01 00:00:00,,,,5001.0,,,,
25%,2015-11-27 00:00:00,,,,138000.0,,,,
50%,2019-04-08 00:00:00,,,,231277.5,,,,
75%,2022-06-23 00:00:00,,,,348500.0,,,,
max,2025-06-06 00:00:00,,,,387665200.0,,,,
