In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import requests
from bs4 import BeautifulSoup
import re

from pyspark.sql.functions import pandas_udf

In [3]:
def scrape_daft(url, start_page, end_page, page_size, timeout=30):
    """Scrape consecutive search-result pages and collect the listings found.

    Parameters
    ----------
    url : str
        Base search URL; the ``from`` and ``pageSize`` query parameters are
        appended to it for every page.
    start_page : int
        Index of the first page to fetch.
    end_page : int
        Index one past the last page to fetch (exclusive upper bound).
    page_size : int
        Number of results requested per page; also used to compute the
        ``from`` offset of each page (``page_index * page_size``).
    timeout : float or tuple, optional
        Timeout in seconds handed to ``requests.get`` so a stalled connection
        cannot block the scrape forever.

    Returns
    -------
    dict
        Column name -> list of values, filled in by ``extract_data``.

    Raises
    ------
    requests.exceptions.RequestException
        If a page cannot be fetched (including when the timeout expires).
    """
    column_names = ["Bed", "Bath", "m²", "Price", "Address",
                    "Property_type", "New_build", "Page"]
    # One independent list per column; insertion order fixes the column order
    # of any DataFrame built from the result.
    property_dict = {name: [] for name in column_names}

    for i in range(start_page, end_page):

        current_url = f"{url}?from={i*page_size}&pageSize={page_size}"
        # Without an explicit timeout requests waits indefinitely on a
        # server that stops responding.
        response = requests.get(current_url, timeout=timeout)
        soup = BeautifulSoup(response.text, 'html.parser')

        extract_data(soup, property_dict, i)

    return property_dict

In [4]:
def extract_data(soup, property_dict, i):
    """Extract every listing card on one result page into ``property_dict``.

    A card is treated as a single listing when it contains the ``dzihyY``
    address paragraph and as a multi-unit listing when it contains the
    ``hCMmam`` address paragraph. Cards with neither are skipped with a
    message instead of raising ``IndexError``.

    Parameters
    ----------
    soup : bs4.BeautifulSoup
        Parsed HTML of one search-result page.
    property_dict : dict
        Column name -> list of values; mutated in place.
    i : int
        Index of the page being processed (stored in the ``Page`` column).

    Returns
    -------
    dict
        The same ``property_dict`` object that was passed in.
    """
    for property_listings in soup.find_all("li", "SearchPage__Result-gg133s-2 djuMQD"):

        # Decide the card type by checking for the address element directly,
        # so an IndexError raised further down is never mistaken for
        # "this card is a sub-listing".
        if property_listings.find_all("p", "TitleBlock__Address-sc-1avkvav-8 dzihyY"):
            extract_single_listing(property_listings, property_dict, i)
        elif property_listings.find_all("p", "TitleBlock__Address-sc-1avkvav-8 hCMmam"):
            extract_sub_listing(property_listings, property_dict, i)
        else:
            print(f"Skipping a listing on page {i}: no recognised address element found")
            continue

        # Pad any column the listing did not fill so all columns stay aligned.
        imput_null_values(property_dict)

    return property_dict

def extract_single_listing(property_listings, property_dict, i):
    """Append one single-property listing card to ``property_dict``.

    Appends one value each to ``Price``, ``New_build`` (always ``"no"``),
    ``Address``, ``Page`` and ``Property_type``, then hands the bed / bath /
    floor-area items to ``extract_property_details``.

    Parameters
    ----------
    property_listings : bs4.element.Tag
        The listing card element.
    property_dict : dict
        Column name -> list of values; mutated in place.
    i : int
        Index of the page the card was found on.

    Raises
    ------
    IndexError
        If the card has no ``dzihyY`` address element. This is raised before
        anything is appended.
    """
    price_tags = property_listings.find_all("span", "TitleBlock__StyledSpan-sc-1avkvav-5 fKAzIL")
    property_address = property_listings.find_all("p", "TitleBlock__Address-sc-1avkvav-8 dzihyY")[0].text
    property_type = property_listings.find_all("p", "TitleBlock__CardInfoItem-sc-1avkvav-9 cKZZql")

    # A card without a price element is recorded with a missing price rather
    # than raising IndexError, which the caller would misread as
    # "this card is a sub-listing".
    if price_tags:
        property_price = convert_price_to_int(price_tags[0].text, i)
    else:
        property_price = np.nan

    property_dict["Price"].append(property_price)
    property_dict["New_build"].append("no")
    property_dict["Address"].append(property_address)
    property_dict["Page"].append(i)

    if property_type:
        property_dict["Property_type"].append(property_type[0].text)
    else:
        property_dict["Property_type"].append(np.nan)

    property_details = property_listings.find_all("p", "TitleBlock__CardInfoItem-sc-1avkvav-9 iLMdur")
    extract_property_details(property_details, property_dict)

    

def extract_sub_listing(property_listings, property_dict, i):
    """Append every priced unit of a multi-unit listing card to ``property_dict``.

    All units share the card's address and are flagged ``New_build = "yes"``.
    Units without a price element are ignored. After each unit, every column
    that did not receive a value for it is padded with NaN, so a unit missing
    e.g. its bath count cannot shift the values of later units into the
    wrong row.

    Parameters
    ----------
    property_listings : bs4.element.Tag
        The listing card element.
    property_dict : dict
        Column name -> list of values; mutated in place.
    i : int
        Index of the page the card was found on.

    Raises
    ------
    IndexError
        If the card has no ``hCMmam`` address element, or a priced unit has
        no details element.
    """
    property_address = property_listings.find_all("p", "TitleBlock__Address-sc-1avkvav-8 hCMmam")[0].text
    sub_listings = property_listings.find_all("div", "SubUnit__StyledCol-sc-10x486s-4 bIjqYp")

    for listing in sub_listings:
        price_tags = listing.find_all("p", "SubUnit__Title-sc-10x486s-5 feGTKf")
        if not price_tags:
            continue

        property_price = convert_price_to_int(price_tags[0].text, i)
        details = listing.find_all("div", "SubUnit__CardInfoItem-sc-10x486s-7 YYbRy")
        property_details = details[0].text.split(" · ")
        extract_sub_listing_property_details(property_details, property_dict)

        property_dict["Price"].append(property_price)
        property_dict["Address"].append(property_address)
        property_dict["New_build"].append("yes")
        property_dict["Page"].append(i)

        # Exactly one price is appended per unit, so the Price column gives
        # the row count every other column has to reach. This also supplies
        # the NaN for "m²" when the unit's details did not provide an area.
        row_count = len(property_dict["Price"])
        for column in property_dict.values():
            while len(column) < row_count:
                column.append(np.nan)

In [5]:
def convert_price_to_int(price, i):
    """Convert a listing's price text to an integer number of euro.

    Parameters
    ----------
    price : str
        Price text as shown on the listing, e.g. ``"€225,000"``.
    i : int
        Index of the page the listing was found on. Currently unused; kept
        so existing callers keep working.

    Returns
    -------
    int, str or float
        The integer amount following the first ``€`` sign, the string
        ``"Price on Application"`` when that is what the listing shows, or
        ``np.nan`` for any other text that cannot be parsed.
    """
    try:
        return int(price.split("€")[1].replace(",", ""))
    except (IndexError, ValueError):
        # IndexError: no "€" in the text.
        # ValueError: text after "€" is not a plain number (e.g. a range).
        stripped = price.strip()
        if stripped == "Price on Application":
            return stripped
        return np.nan

def extract_property_details(property_details, property_dict):
    """Store the bed, bath and floor-area items of a single listing.

    Each item's text is expected to look like ``"<value> <type>"``. ``Bed``,
    ``Bath`` and ``m²`` values are appended to the column of the same name
    (as ``int`` when possible, otherwise as the raw text); ``ac`` values are
    converted to square metres and appended to ``m²``. Items that cannot be
    split into a value and a type are printed and skipped.

    Parameters
    ----------
    property_details : iterable
        Elements exposing a ``.text`` attribute.
    property_dict : dict
        Column name -> list of values; mutated in place.
    """
    for detail in property_details:
        try:
            parts = detail.text.split()
            detail_value, detail_type = parts[0], parts[1]
        except (AttributeError, IndexError):
            # Skip malformed items; falling through would reuse the previous
            # item's type/value or hit unbound names on the first item.
            print(detail)
            continue

        if detail_type in ["Bed", "Bath", "m²"]:
            try:
                property_dict[detail_type].append(int(detail_value))
            except ValueError:
                property_dict[detail_type].append(detail_value)
        elif detail_type == "ac":
            size = convert_ac_m2(detail_value)
            property_dict["m²"].append(size)
        else:
            print(f"Found {detail}, while trying to extract bed, bath and floor-area")

def extract_sub_listing_property_details(property_details, property_dict):
    """Store the details of one unit of a multi-unit listing.

    Every entry except the last is expected to look like ``"<value> <type>"``;
    the last entry is stored as the unit's property type. ``Bed``, ``Bath``
    and ``m²`` values are appended to the column of the same name, as ``int``
    when possible and otherwise as the raw text, matching
    ``extract_property_details``. Malformed entries and unknown detail types
    are printed and skipped.

    Parameters
    ----------
    property_details : list of str
        Detail strings for one unit, property type last.
    property_dict : dict
        Column name -> list of values; mutated in place.
    """
    for detail in property_details[:-1]:
        try:
            parts = detail.split()
            detail_value, detail_type = parts[0], parts[1]
        except (AttributeError, IndexError):
            # Skip malformed entries; falling through would reuse the
            # previous entry's type/value or hit unbound names.
            print(detail)
            continue

        if detail_type not in ["Bed", "Bath", "m²"]:
            # Unknown types would raise KeyError or land in an unrelated column.
            print(f"Found {detail}, while trying to extract bed, bath and floor-area")
            continue

        try:
            property_dict[detail_type].append(int(detail_value))
        except ValueError:
            property_dict[detail_type].append(detail_value)

    property_dict["Property_type"].append(property_details[-1])
    
def convert_ac_m2(value):
    """Convert an area given in acres to square metres.

    ``value`` may be anything ``float()`` accepts (a number or numeric text).
    """
    acres_per_square_metre = 0.00024711
    acres = float(value)
    return acres / acres_per_square_metre

def imput_null_values(property_dict):
    """Pad the shorter columns of ``property_dict`` with NaN until all match.

    Each pass appends a single NaN to every column shorter than the longest
    one. If the columns still differ after a pass, a diagnostic with the
    current column lengths is printed and another pass is made.
    """
    while True:
        target = max(len(column) for column in property_dict.values())

        for column in property_dict.values():
            if len(column) < target:
                column.append(np.nan)

        lengths = [(name, len(column)) for name, column in property_dict.items()]
        if all(length == target for _, length in lengths):
            return

        print("Error dictionary not of uniform length")
        print(lengths)

In [8]:
# Base search URL; scrape_daft appends the `from` and `pageSize` query parameters to it.
url = "https://www.daft.ie/property-for-sale/ireland"

# Page indices are zero-based and scrape_daft iterates range(start_page, end_page),
# so end_page is exclusive: this fetches pages 0 through 19.
start_page = 0
end_page = 20
# NOTE(review): loop_size is not referenced by any cell visible here — confirm it is still needed.
loop_size = 200
# Results requested per page; also multiplied by the page index to get the `from` offset.
page_size = 20


# Fetches one page per index over the network and returns a dict of column lists.
property_dict = scrape_daft(url, start_page, end_page, page_size)

In [9]:
# Build a DataFrame with one column per key of the scraped dict.
properties = pd.DataFrame(property_dict)

In [10]:
# Show the scraped listings as the cell's output.
properties

Unnamed: 0,Bed,Bath,m²,Price,Address,Property_type,New_build,Page
0,3.0,1.0,,225000,"179 Kilnap Place, Farranree, Farranree, Co. Cork",Terrace,no,0
1,2.0,1.0,52.0,295000,"8 Parknasilla, Vevay Road, Bray, Co. Wicklow",Apartment,no,0
2,3.0,3.0,93.0,230000,"7 Meadow Avenue, The Meadows, Hollyhill, Co. Cork",Semi-D,no,0
3,4.0,3.0,86.0,265000,"5 Friar Street, Cork City, Co. Cork",Terrace,no,0
4,3.0,1.0,82.0,375000,"12 Barrack Street, Kinsale, Co. Cork",End of Terrace,no,0
...,...,...,...,...,...,...,...,...
395,4.0,2.0,153.0,975000,"120 Ard Na Mara, Malahide, Co. Dublin",Detached,no,19
396,5.0,3.0,200.0,1600000,"37 The Old Golf Links, Malahide, Co. Dublin",Detached,no,19
397,4.0,3.0,161.0,725000,"75 Furry Park Road, Killester, Dublin 5",End of Terrace,no,19
398,5.0,4.0,279.0,645000,"Taylors Cross, Banagher, Co. Offaly",Detached,no,19
