In [1]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
import re

In [3]:
# Display a previously saved listings CSV; the stored "Unnamed: 0" column is
# used as the DataFrame index. The result is only displayed, not assigned.
pd.read_csv("data/myHome_from_page_1_till_page_790_by_20.csv", index_col=["Unnamed: 0"])

Unnamed: 0,beds,baths,floor_area,Price,Address,Property_type,New_build,Page
0,3.0,3.0,,PoA,"3 Bedroom Apartment - 55 Lad Lane, Dublin 2",Apartment,,1.0
1,3.0,1.0,105m 2,550000,"38 The Avenue, Boden Park, Rathfarnham, Dubli...",Semi-Detached House,,1.0
2,4.0,3.0,110m 2,795000,"130 Carysfort Park, Blackrock, County Dublin",Semi-Detached House,,1.0
3,2.0,1.0,68m 2,295000,"5 Woodbrook Hall, Carpenterstown, Dublin 15",Apartment,,1.0
4,4.0,2.0,170.07m 2,895000,"70 Kincora Grove, Clontarf, Dublin 3",Semi-Detached House,,1.0
...,...,...,...,...,...,...,...,...
10005,5.0,5.0,200m 2,365000,"No. 1 Gleann Na Boirne, Bellharbour, Clare",Detached House,,500.0
10006,3.0,1.0,107m 2,419000,"Knockdilly Cottage, Knockdilly, Kilmuckridge,...",Detached House,,500.0
10007,4.0,3.0,2675ft 2,435000,"Dooneen, Carrigtwohill, Cork",Detached House,,500.0
10008,2.0,3.0,75m 2,197500,"14 Blossom Court, Esker Hills, Portlaoise, Laois",Detached House,,500.0


In [130]:
# Base URL of the for-sale search results; the page number is appended to it.
url = "https://www.myhome.ie/residential/ireland/property-for-sale?page="

# Column-oriented accumulator: one list per output column, one entry per listing.
property_dict = {"beds":[],
                  "baths":[],
                  "floor_area":[],
                  "Price":[],
                  "Address":[],
                  "Property_type":[],
                  "New_build":[],
                  "Page":[],
                    }
# NOTE(review): extract_property_details and imput_null_values are defined in a
# later cell of this notebook; on a fresh kernel that cell has to run first.
# Pages 1 and 2 only (range stop is exclusive).
for i in range(1,3,1):
    current_url = f"{url}{i}"
    response = requests.get(current_url)
    soup = BeautifulSoup(response.text, 'html.parser')
    
    # Each matching <div> is treated as one property listing.
    for property_listings in soup.find_all("div", "mb-3 ng-star-inserted"):
        # Details include beds, bath, m2 and property_type
        property_details = property_listings.find_all("span", "PropertyInfoStrip__Detail PropertyInfoStrip__Detail--dark ng-star-inserted")
        extract_property_details(property_details, property_dict)
        #Price of the property
        price = property_listings.find_all("div", re.compile("^ng-tns.*ng-star-inserted$"))
        
        # Address link of the listing card.
        address = property_listings.find_all("a",re.compile(".*PropertyListingCard"))
        
        # Only the first match is used; an empty result raises IndexError here.
        property_dict["Price"].append(price[0].text)
        property_dict["Address"].append(address[0].text)
        property_dict["Page"].append(i)  
        # New_build is not extracted in this cell; it is always recorded as NaN.
        property_dict["New_build"].append(np.nan)
        # Pad any column that received no value for this listing.
        imput_null_values(property_dict)

In [158]:
class MyHome_web_scraper:

    """
    Scrapes property listings from the Irish property site MyHome.ie.

    Every page in the half-open range [start_page, end_page) is downloaded with
    ``requests`` and parsed with BeautifulSoup. The extracted fields are
    accumulated column-wise in ``self.property_dict`` (one list per column, one
    entry per listing).

    The class does no parallel work itself; start_page / end_page let a caller
    split a large scrape into independent page ranges.

    The parameters for the class are:
    url: The base url; the page number is appended directly to it, so it must
         end where the page number goes (e.g. ".../page-" or "...?page=").
    start_page: first page to scrape (inclusive).
    end_page: page to stop at (exclusive, it is passed straight to ``range``).
    """

    def __init__(self, url, start_page, end_page):
        self.url = url
        self.start_page = start_page
        self.end_page = end_page
        # Page currently being scraped; set by start_scraping() and used for the
        # "Page" column and for diagnostics.
        self.i = None
        self.property_dict = {"beds": [],
                              "baths": [],
                              "floor_area": [],
                              "Price": [],
                              "Address": [],
                              "Property_type": [],
                              "New_build": [],
                              "Page": [],
                              }

    def start_scraping(self):
        """
        Download and parse every page in [start_page, end_page).

        Each page's soup is handed to extract_data(), which appends the
        listings found on it to ``self.property_dict``.

        Returns:
            dict: ``self.property_dict`` (the same object, not a copy).
        """

        for i in range(self.start_page, self.end_page):
            self.i = i
            current_url = f"{self.url}{self.i}"
            response = requests.get(current_url)
            soup = BeautifulSoup(response.text, 'html.parser')

            self.extract_data(soup)

        return self.property_dict

    def extract_data(self, soup):

        """
        Append one row to ``self.property_dict`` for every listing in ``soup``.

        A listing is any <div> matching the class string "mb-3 ng-star-inserted".
        For each one the detail strip (beds, baths, floor area, property type),
        the price and the address are extracted. "New_build" is always recorded
        as NaN, and any column that received no value is padded with NaN so all
        columns keep the same length.
        """

        for property_listings in soup.find_all("div", "mb-3 ng-star-inserted"):

            # Details include beds, bath, m2 and property_type
            property_details = property_listings.find_all("span", "PropertyInfoStrip__Detail PropertyInfoStrip__Detail--dark ng-star-inserted")
            self.extract_property_details(property_details)

            # Price and address of the property
            price_tags = property_listings.find_all("div", re.compile("^ng-tns.*ng-star-inserted$"))
            address_tags = property_listings.find_all("a", re.compile(".*PropertyListingCard"))

            # A container without a price/address element is recorded as NaN
            # instead of aborting the whole scrape with an IndexError.
            price = self.convert_price_to_int(price_tags[0].text) if price_tags else np.nan
            address = address_tags[0].text if address_tags else np.nan

            # add values to dict
            self.property_dict["Price"].append(price)
            self.property_dict["Address"].append(address)
            self.property_dict["Page"].append(self.i)
            self.property_dict["New_build"].append(np.nan)

            # add null if any values are missing
            self.imput_null_values()

    def extract_property_details(self, property_details):

        """
        Sort the entries of a listing's detail strip into the right columns.

        ``property_details`` is an iterable of elements supporting ``len()`` and
        ``.text`` (assumed to be bs4 Tags, whose ``len()`` is their number of
        direct children). Elements with two children are read as beds, baths or
        property type; elements with three children are read as floor area.
        Anything else is ignored. Values are stored as the scraped strings.
        """

        for detail in property_details:
            # This will be either bed, bath or property_type
            if len(detail) == 2:
                detail_list = detail.text.strip().split(" ")

                if len(detail_list) == 2:
                    if detail_list[1] in ["beds", "baths"]:
                        self.property_dict[detail_list[1]].append(detail_list[0])
                    elif detail_list[1] in ["bed", "bath"]:
                        # Singular label ("1 bed") goes to the plural column.
                        self.property_dict[f"{detail_list[1]}s"].append(detail_list[0])
                    else:
                        # Two-word property type, e.g. "Detached House".
                        self.property_dict["Property_type"].append(" ".join(detail_list))

                elif len(detail_list) == 1 or len(detail_list) == 4:
                    # Only the first word is kept, also for four-word entries.
                    self.property_dict["Property_type"].append(detail_list[0])

                else:
                    print(f"Property detail didn't fit into bed, bath or property_type: {detail_list}")

            # This will be floor_area
            elif len(detail) == 3:
                floor_area = detail.text
                if floor_area[-3:] == "m 2" or floor_area[-4:] == "ft 2":
                    self.property_dict["floor_area"].append(floor_area)
                else:
                    print(f"floor area not in ft or m but in type {floor_area}")

    def imput_null_values(self):

        """
        Pad every column of ``self.property_dict`` with NaN up to the length of
        the longest column, so all columns end up the same length.

        A message is printed when some column was short by more than one entry,
        since a single listing should add at most one value per column.
        """
        columns = self.property_dict
        maxLen = max(len(values) for values in columns.values())

        lagging = False
        for values in columns.values():
            missing = maxLen - len(values)
            if missing > 1:
                lagging = True
            values.extend([np.nan] * missing)

        if lagging:
            print("Error dictionary not of uniform length")

    def convert_price_to_int(self, price):

        """
        Convert a scraped price string to an int where possible.

        "€311,000" -> 311000. The known "price on application" spellings are
        normalised to "PoA". Any other text is printed together with the current
        page number and returned stripped of surrounding whitespace.
        """
        try:
            # Text after the first euro sign, with thousands separators removed.
            return int(price.split("€")[1].replace(",", ""))
        except (IndexError, ValueError):
            # IndexError: no euro sign; ValueError: not a plain number.
            stripped = price.strip()
            if stripped in ["Price on Application", "POA", "AMV: Price on Application"]:
                return "PoA"
            print(price, self.i)
            return stripped


In [None]:
# Base URL ending in "page-"; other cells in this notebook append a page number to `url`.
url = "https://www.myhome.ie/pricechanges/page-"

In [156]:
# Run the class-based scraper over a page range and collect the result.
url = "https://www.myhome.ie/pricechanges/page-"
start_page = 1
# start_scraping() iterates range(start_page, end_page), so page 350 itself is not fetched.
end_page = 350

s1 = MyHome_web_scraper(url, start_page, end_page)

# Returns the scraper's own property_dict (column name -> list of values).
property_dict = s1.start_scraping()

properties = pd.DataFrame(property_dict)

In [25]:
def convert_price_to_int(price):
    """
    Convert a scraped price string to an int where possible.

    "€311,000" -> 311000. The known "price on application" spellings are
    normalised to "PoA". Any other text is printed and returned stripped of
    surrounding whitespace.

    Args:
        price (str): raw price text.

    Returns:
        int | str: the numeric price, "PoA", or the stripped original text.
    """
    try:
        # Text after the first euro sign, with thousands separators removed.
        return int(price.split("€")[1].replace(",", ""))
    except (IndexError, ValueError):
        # IndexError: no euro sign; ValueError: not a plain number.
        stripped = price.strip()
        if stripped in ["Price on Application", "POA", "AMV: Price on Application"]:
            return "PoA"
        # This is a plain function, so there is no `self.i` page counter to report.
        print(price)
        return stripped

In [42]:
# Exploration of the price-register pages: print address, date and price text
# for every sold-property item on a few sample pages.
url = "https://www.myhome.ie/priceregister/page-"
# NOTE(review): sold_dict is created here but nothing in this cell appends to it.
sold_dict = {"Date":[],
              "Address":[],
              "Price":[],
              "Page":[],
                    }

# Sample pages 10, 15, 20 and 25.
for i in range(10,26,5):
    current_url = f"{url}{i}"
    response = requests.get(current_url)
    soup = BeautifulSoup(response.text, 'html.parser')
    
    for property_listings in soup.find_all("div", "PriceRegisterListItem SoldPropertyListItem"):
        
        # First match only; an item without these elements raises IndexError.
        address = property_listings.find_all("a", "SoldPropertyListItem__Address")[0].text.strip()
        date = pd.to_datetime(property_listings.find_all("span", "PriceRegisterListItem__Date")[0].text.strip())
        
        # The second "ng-star-inserted" span is taken as the price text.
        price_data = property_listings.find_all("span", "ng-star-inserted")[1].text
        
        print(address)
        print(date)        
        print(price_data)                

722 Ryder Cup Village, Kclub, Straffan, Co. Kildare, W23dp49
2022-11-03 00:00:00
€311,000
Aghamore, Annaduff, Co. Leitrim, N41ya99
2022-11-03 00:00:00
€140,000
Eslin Bridge, Carrick-on-Shannon, Co. Leitrim, N41e7t1
2022-11-03 00:00:00
€140,000
162 Dundoogan, Haynestown, Dundalk, Co. Louth
2022-11-03 00:00:00
€220,220
11 Spire View, Johnstown, Navan, Co. Meath, C15k50a
2022-11-03 00:00:00
€255,000
15 Cloran Court, Athboy, Co. Meath, C15e7w7
2022-11-03 00:00:00
€160,000
11 Rathdrum, Drumalee, Co. Cavan, H12ay90
2022-11-03 00:00:00
€181,000
10 LANA CROKE, DUN CHORMAIC, DUALLA RD CASHEL, Co. Tipperary, E25F803
2022-11-03 00:00:00
€240,000
60 Ashfield, Mullingar, Co. Westmeath, N91h2c4
2022-11-03 00:00:00
€251,000
71 Derrygreenagh Park, Rochfortbridge, Mullingar, Co. Westmeath, N91a9x9
2022-11-03 00:00:00
€180,000
2 Lakeview Crescent, Wicklow, A67fh42, Co. Wicklow
2022-11-03 00:00:00
€285,000
21 Ferguson Rd, Dublin City, Drumcondra, Dublin 3, D09c2p2
2022-11-03 00:00:00
€551,500
23 The Cres

In [38]:
# NOTE(review): the fields parsed below (old/new price, change in € and %) look
# like price *changes*, while this URL and the CSS classes are the price-register
# ones - confirm which page this cell is meant to scrape.
url = "https://www.myhome.ie/priceregister/page-"
# Kept as in the original cell; nothing below appends to it.
sold_dict = {"Date":[],
              "Address":[],
              "Price":[],
              "Page":[],
                    }

# Accumulator actually filled by the loop. It is created here so the cell runs on
# a fresh kernel and does not duplicate rows when it is re-run.
price_change_dict = {"Date":[],
                     "Address":[],
                     "New_price":[],
                     "Old_price":[],
                     "Change(€)":[],
                     "Change(%)":[],
                     "Page":[],
                     }

# Sample pages 10, 15, 20 and 25.
for i in range(10,26,5):
    current_url = f"{url}{i}"
    response = requests.get(current_url)
    soup = BeautifulSoup(response.text, 'html.parser')

    for property_listings in soup.find_all("div", "PriceRegisterListItem SoldPropertyListItem"):

        address = property_listings.find_all("a", "SoldPropertyListItem__Address")[0].text
        date = pd.to_datetime(property_listings.find_all("span", "PriceRegisterListItem__Date")[0].text.strip())

        # First span group: change in euro and in percent; second: old and new price.
        price_data = property_listings.find_all("span", "ng-star-inserted")
        p1 = price_data[0]
        p2 = price_data[1]

        change_euro = convert_price_to_int(p1.find_all("span")[0].text)
        # Drop the first character and the last two around the number.
        change_precentage = float(p1.find_all("span")[1].text[1:-2])
        # The euro amount is scraped unsigned; give it the sign of the percentage.
        # Skip this when the amount could not be parsed (convert_price_to_int then
        # returns a string, and str * -1 would silently become "").
        if change_precentage < 0 and isinstance(change_euro, int):
            change_euro = -change_euro
        old_price = convert_price_to_int(p2.find_all("span")[0].text)
        new_price = convert_price_to_int(p2.find_all("span")[1].text)

        price_change_dict["Date"].append(date)
        price_change_dict["Address"].append(address)
        price_change_dict["New_price"].append(new_price)
        price_change_dict["Old_price"].append(old_price)
        price_change_dict["Change(€)"].append(change_euro)
        price_change_dict["Change(%)"].append(change_precentage)
        price_change_dict["Page"].append(i)

In [39]:
# Display the collected price changes as a table (one row per listing).
pd.DataFrame(price_change_dict)

Unnamed: 0,Date,Address,New_price,Old_price,Change(€),Change(%),Page
0,2022-11-17,"73, Block 5A, The Casino, Malahide, Dublin",545000,595000,-50000,-8.4,10
1,2022-11-17,"308 Sundays Well, Naas, Kildare",375000,379000,-4000,-1.06,10
2,2022-11-17,"Tulligee, Clonakilty, West Cork",350000,360000,-10000,-2.78,10
3,2022-11-17,"71 Carton House, The Oaks, Ridgewood, Swords,...",270000,280000,-10000,-3.57,10
4,2022-11-17,"19 Fernhill , Arklow, Wicklow",289000,325000,-36000,-11.08,10
5,2022-11-17,"Old Creagh School, Creagh, Baltimore, Cork",100000,80000,20000,25.0,10
6,2022-11-17,"Gormanston Road, Stamullen, Meath",315000,325000,-10000,-3.08,10
7,2022-11-17,"17 Monastery Heath Court, Clondalkin, Dublin 22",225000,245000,-20000,-8.16,10
8,2022-11-17,"21 Pebble Drive, Pebble Beach, Tramore, Water...",155000,165000,-10000,-6.06,10
9,2022-11-17,"Old Labour Exchange, Quay Road, Dungloe, Done...",130000,165000,-35000,-21.21,10


In [131]:
def extract_property_details(property_details, propery_dict):
    """
    Sort the entries of one listing's detail strip into `propery_dict`.

    Each element must support len() and expose `.text`. Elements of length 2
    are read as beds, baths or property type; elements of length 3 are read as
    floor area; every other length is skipped. Values are appended as strings.
    """
    for tag in property_details:
        child_count = len(tag)

        # Three children: floor area in square metres or square feet.
        if child_count == 3:
            area_text = tag.text
            if area_text.endswith("m 2") or area_text.endswith("ft 2"):
                propery_dict["floor_area"].append(area_text)
            else:
                print(f"floor area not in ft or m but in type {area_text}")
            continue

        if child_count != 2:
            continue

        # Two children: "<n> beds", "<n> baths" or a property type.
        words = tag.text.strip().split(" ")

        if len(words) == 1:
            propery_dict["Property_type"].append(words[0])
        elif len(words) == 2:
            value, label = words
            if label in ("beds", "baths"):
                propery_dict[label].append(value)
            elif label in ("bed", "bath"):
                # Singular label is stored in the plural column.
                propery_dict[f"{label}s"].append(value)
            else:
                propery_dict["Property_type"].append(" ".join(words))
        else:
            print(f"Property detail didn't fit into bed, bath or property_type: {words}")
                
def imput_null_values(property_dict):
    """
    Pad the shorter lists in `property_dict` with NaN until all lists are as
    long as the longest one.

    Each pass adds at most one NaN per list; if the lists are still uneven
    after a pass, a message is printed and another pass is made.
    """
    while True:
        target = max(len(column) for column in property_dict.values())

        for column in property_dict.values():
            if len(column) < target:
                column.append(np.nan)

        if all(len(column) == target for column in property_dict.values()):
            return

        print("Error dictionary not of uniform length")

In [None]:
def extract_property_details(self, property_details):
    """
    Draft extractor that files "<value> <type>" details into self.property_dict.

    The first two whitespace-separated tokens of each element's text are taken
    as value and type. Types "Bed", "Bath" and "m²" are appended under the key
    of the same name (as int when the value parses, otherwise as the raw
    string). Type "ac" is converted with self.convert_ac_m2 and stored under
    "m²". Elements that do not have two tokens are printed and skipped.

    NOTE(review): this draft has the same name as the module-level
    extract_property_details(property_details, propery_dict); running this cell
    rebinds that name. The keys used here ("Bed", "Bath", "m²") must exist in
    self.property_dict, and convert_ac_m2 is not defined in this notebook view.
    """

    for detail in property_details:
        try:
            tokens = detail.text.split()
            detail_value, detail_type = tokens[0], tokens[1]
        except (AttributeError, IndexError):
            # Skip the element: falling through would reuse the previous
            # element's value/type (or hit unbound names on the first element).
            print(detail)
            continue

        if detail_type in ["Bed", "Bath", "m²"]:
            try:
                self.property_dict[detail_type].append(int(detail_value))
            except ValueError:
                # Non-integer value (e.g. "85.5"): keep the scraped text.
                self.property_dict[detail_type].append(detail_value)
        elif detail_type == "ac":
            size = self.convert_ac_m2(detail_value)
            self.property_dict["m²"].append(size)
        else:
            print(f"Found {detail}, while trying to extract bed, bath and floor-area")


In [132]:
# Sanity check: print the length of every column; all values should be equal
# before the dict is turned into a DataFrame.
for key in property_dict:
    print(len(property_dict[key]))

40
40
40
40
40
40
40
40


In [133]:
# Display the scraped listings as a table (one row per listing).
pd.DataFrame(property_dict)

Unnamed: 0,beds,baths,floor_area,Price,Address,Property_type,New_build,Page
0,4.0,2.0,143m 2,"€400,000","4 Highfield, Dublin road, Arklow, Wicklow",Detached House,,1
1,4.0,3.0,143.35m 2,"€925,000","20 Brighton Avenue, Foxrock, Dublin 18",Detached House,,1
2,1.0,1.0,56m 2,"€129,000","9 Dun Aoibhinn, Dungarvan, Co. Waterford",Apartment,,1
3,4.0,3.0,134.1m 2,"€350,000","56 The Meadows, Bullock Park, Green Road, Car...",Detached House,,1
4,3.0,3.0,103m 2,"€200,000","36 Norbury Woods Green, Norbury Woods, Tullam...",House,,1
5,4.0,2.0,131m 2,POA,"Pearsons Brook, Gorey, Wexford",Semi-Detached House,,1
6,4.0,2.0,131m 2,"€149,000","39 Beechwood Park , Granard, Longford",Semi-Detached House,,1
7,3.0,1.0,95m 2,"€340,000","43 Roseville, Naas, Co. Kildare",Semi-Detached House,,1
8,5.0,3.0,155m 2,"€695,000","42 Cypress Grove Road, Templeogue, Dublin 6W",Semi-Detached House,,1
9,4.0,2.0,,"€450,000","Harbour View, Maulicurrane, Union Hall, Wes...",Bungalow,,1


In [None]:
    def extract_property_details(self, property_details):

        """
        Draft extractor that files "<value> <type>" details into self.property_dict.

        The first two whitespace-separated tokens of each element's text are
        taken as value and type. Types "Bed", "Bath" and "m²" are appended under
        the key of the same name (as int when the value parses, otherwise as the
        raw string). Type "ac" is converted with self.convert_ac_m2 and stored
        under "m²". Elements that do not have two tokens are printed and skipped.

        NOTE(review): the keys used here ("Bed", "Bath", "m²") must exist in
        self.property_dict, and convert_ac_m2 is not defined in this notebook view.
        """

        for detail in property_details:
            try:
                tokens = detail.text.split()
                detail_value, detail_type = tokens[0], tokens[1]
            except (AttributeError, IndexError):
                # Skip the element: falling through would reuse the previous
                # element's value/type (or hit unbound names on the first element).
                print(detail)
                continue

            if detail_type in ["Bed", "Bath", "m²"]:
                try:
                    self.property_dict[detail_type].append(int(detail_value))
                except ValueError:
                    # Non-integer value (e.g. "85.5"): keep the scraped text.
                    self.property_dict[detail_type].append(detail_value)
            elif detail_type == "ac":
                size = self.convert_ac_m2(detail_value)
                self.property_dict["m²"].append(size)
            else:
                print(f"Found {detail}, while trying to extract bed, bath and floor-area")

In [11]:
# Exploration: print how many detail spans each listing on the current page has.
# Relies on `soup` left behind by a previously executed cell.
# NOTE(review): the saved output below also shows the detail texts, which this
# code does not print - the output appears to come from an earlier version.
for property_listings in soup.find_all("div", "mb-3 ng-star-inserted"):
    property_details = property_listings.find_all("span", "PropertyInfoStrip__Detail PropertyInfoStrip__Detail--dark ng-star-inserted")
    print(len(property_details))

4
 5 beds 
 4 baths 
250.09m 2
Detached House 
5
 4 beds 
 5 baths 
250m 2

Detached House 
5
 6 beds 
 3 baths 
256m 2

Terraced House 
5
 3 beds 
 4 baths 
180m 2

Semi-Detached House 
5
 3 beds 
 3 baths 
111m 2

Semi-Detached House 
5
 3 beds 
 1 bath 
76m 2

Semi-Detached House 
4
 3 beds 
 1 bath 

Terraced House 
2

Investment 
5
 4 beds 
 3 baths 
176m 2

Detached House 
5
 3 beds 
 1 bath 
91.66m 2

Semi-Detached House 
4
 2 beds 
68m 2

Detached House 
5
 5 beds 
 3 baths 
276m 2

Detached House 
5
 3 beds 
 2 baths 
88m 2

Semi-Detached House 
5
 2 beds 
 2 baths 
75m 2

Semi-Detached House 
5
 6 beds 
 4 baths 
248m 2

Detached House 
5
 3 beds 
 1 bath 
125.6m 2

Semi-Detached House 
5
 3 beds 
 1 bath 
100m 2

Semi-Detached House 
5
 4 beds 
 3 baths 
278m 2

Detached House 
5
 4 beds 
 4 baths 
195m 2

Semi-Detached House 
4
 4 beds 
 6 baths 

Detached House 


In [33]:
# Exploration: fetch page 3 of whatever `url` currently holds and print the
# address text of every listing on it.
current_url = f"{url}{3}"
response = requests.get(current_url)
soup = BeautifulSoup(response.text, 'html.parser')
for property_listings in soup.find_all("div", "mb-3 ng-star-inserted"):
    addy = property_listings.find_all("a",re.compile(".*PropertyListingCard"))
    # NOTE(review): `price` is looked up but not used in this cell.
    price = property_listings.find_all("div", re.compile("^ng-tns.*ng-star-inserted$"))
    # First address match only; an empty result raises IndexError.
    print(addy[0].text)

 25 Philipsburgh Avenue, Fairview, Dublin 3
 Danescastle, Carrig On Bannow, Bannow, Wexford
 28 Grove Park, Rathmines, Dublin 6
 Cloonkeely, Tuam, Co. Galway
 5 St Barnabas Gardens, East Wall, Dublin 3
 12 Oranhill Road, Oranhill, Oranmore, Galway
 Leamanaghan, Ballycumber, Offaly
 39 Milford Park, Ballinabranna, Carlow
 65 Ranelagh Road, Ranelagh, Dublin 6
 155 Rosary Terrace, Irishtown, Dublin 4
 29 Gate Lodge, Castle Road, Blackrock, Cork
 59 Castleknock Laurels , Castleknock, Dublin 15
 106 Castle Park, Tallaght, Dublin 24
 20 Castle Elms, Coolock, Dublin 17
 7 Mayeston Court, Finglas, Dublin 11
 25A Meadow Park Avenue, Churchtown, Dublin 14
 Carstown, Ballymakenny, Drogheda, Co. Louth
 8 The Court, Newtown Manor, Kill, Co. Kildare
 27 Arconagh, Naas, Co. Kildare
 6 Brackinrainey Wood, Longwood, Co. Meath
