In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import requests
from bs4 import BeautifulSoup
import re

In [343]:
def scrape_daft(url, start_page, end_page, page_size, timeout=30):
    """Scrape a range of result pages into a dict of column lists.

    Args:
        url: Base search URL; the paging query string is appended to it.
        start_page: Index of the first result page to fetch (inclusive).
        end_page: Page index at which to stop (exclusive, as in ``range``).
        page_size: Number of listings requested per page; also used to
            compute each page's ``from`` offset (``page * page_size``).
        timeout: Seconds to wait for each HTTP response before giving up.

    Returns:
        dict: Keys "Bed", "Bath", "m²", "Price", "Address" and
        "Property_type", each mapping to a list filled by ``extract_data``.

    Raises:
        requests.exceptions.RequestException: If a request times out, the
            connection fails, or the server answers with an error status.
    """
    property_dict = {
        "Bed": [],
        "Bath": [],
        "m²": [],
        "Price": [],
        "Address": [],
        "Property_type": [],
    }

    for i in range(start_page, end_page):
        current_url = f"{url}?from={i*page_size}&pageSize={page_size}"
        # Without a timeout requests waits indefinitely on a stalled server.
        response = requests.get(current_url, timeout=timeout)
        # Fail loudly instead of parsing an error page as "no listings".
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        extract_data(soup, property_dict, i)

    return property_dict

def extract_data(soup, property_dict, i):
    """Append the listings found on one parsed results page to ``property_dict``.

    Every ``<a>`` element with more than one child node is inspected for the
    price, address and property-type elements of a listing card. When at
    least one of them is present, exactly one value is appended to each of
    the "Price", "Address" and "Property_type" columns (NaN for whichever
    piece is missing) and the bed/bath/floor-area items are handed to
    ``extract_property_details``. ``imput_null_values`` then pads any column
    the listing did not fill.

    Args:
        soup: Parsed page (BeautifulSoup object) to search.
        property_dict: Dict of column lists to append to; modified in place.
        i: Page index, forwarded to ``convert_price_to_int``.

    Returns:
        dict: The same ``property_dict`` object that was passed in.
    """
    price_class = "TitleBlock__StyledSpan-sc-1avkvav-5 fKAzIL"
    address_class = "TitleBlock__Address-sc-1avkvav-8 dzihyY"
    type_class = "TitleBlock__CardInfoItem-sc-1avkvav-9 cKZZql"
    details_class = "TitleBlock__CardInfoItem-sc-1avkvav-9 iLMdur"

    for property_listings in soup.find_all("a"):
        # len() of a tag is its number of child nodes; anchors with a single
        # child cannot be listing cards, so skip them.
        if len(property_listings) <= 1:
            continue

        property_price = property_listings.find_all("span", price_class)
        property_address = property_listings.find_all("p", address_class)
        property_type = property_listings.find_all("p", type_class)

        if property_price or property_address or property_type:
            # A card may lack a price element; indexing the empty result
            # would raise IndexError, so record NaN instead.
            if property_price:
                price_value = convert_price_to_int(property_price, i)
            else:
                price_value = np.nan
            property_dict["Price"].append(price_value)

            # Resolve each field independently so that exactly one value is
            # appended per column even when only one of the two is missing.
            property_dict["Address"].append(
                property_address[0].text if property_address else np.nan
            )
            property_dict["Property_type"].append(
                property_type[0].text if property_type else np.nan
            )

            property_details = property_listings.find_all("p", details_class)
            extract_property_details(property_details, property_dict)

        imput_null_values(property_dict)

    return property_dict

def convert_price_to_int(price, i):
    """Turn the text of a listing's price element into an integer amount.

    Args:
        price: Sequence of elements exposing ``.text``; only the first one
            is read.
        i: Page index of the listing. Currently unused; kept so existing
            callers (and ad-hoc debugging output) keep working.

    Returns:
        int: The amount following the "€" sign with thousands separators
        removed, when it parses as an integer.
        str: ``"Price on Application"`` when the text is exactly that
        (ignoring surrounding whitespace).
        float: ``np.nan`` when ``price`` is empty or the text is anything
        else that cannot be parsed.
    """
    if not price:
        # Nothing to read: previously this raised IndexError on price[0].
        return np.nan

    price_text = price[0].text.strip()
    try:
        return int(price_text.split("€")[1].replace(",", ""))
    except (IndexError, ValueError):
        # IndexError: no "€" in the text. ValueError: something other than
        # digits follows the "€" (e.g. a suffix or a price range).
        if price_text == "Price on Application":
            return price_text
        return np.nan

def extract_property_details(property_details, property_dict):
    """Append a listing's bed, bath and floor-area values to ``property_dict``.

    Each detail's text is read as ``"<value> <unit> ..."``. Units "Bed",
    "Bath" and "m²" are appended to the column of the same name, as ``int``
    when the value parses and as the raw string otherwise. Unit "ac" is
    converted with ``convert_ac_m2`` and appended to the "m²" column. Any
    other item, including text with fewer than two tokens, is reported with
    ``print`` and skipped.

    Args:
        property_details: Iterable of elements exposing ``.text``.
        property_dict: Dict of column lists with at least the keys "Bed",
            "Bath" and "m²"; modified in place.

    Returns:
        None
    """
    for detail in property_details:
        tokens = detail.text.split()
        if len(tokens) < 2:
            # No "<value> <unit>" pair to read; indexing tokens[1] would
            # raise IndexError, so report the item like any unknown detail.
            print(f"Found {detail}, while trying to extract bed, bath and floor-area")
            continue

        detail_value, detail_type = tokens[0], tokens[1]
        if detail_type in ("Bed", "Bath", "m²"):
            try:
                property_dict[detail_type].append(int(detail_value))
            except ValueError:
                # Keep the raw text rather than dropping the value.
                property_dict[detail_type].append(detail_value)
        elif detail_type == "ac":
            property_dict["m²"].append(convert_ac_m2(detail_value))
        else:
            print(f"Found {detail}, while trying to extract bed, bath and floor-area")
    
def convert_ac_m2(value):
    """Convert an area given in acres to square metres.

    Args:
        value: Area in acres; anything ``float()`` accepts (a number or a
            numeric string).

    Returns:
        float: ``value`` divided by 0.00024711, the number of acres in one
        square metre.
    """
    acres_per_square_metre = 0.00024711
    acres = float(value)
    return acres / acres_per_square_metre

def imput_null_values(property_dict):
    """Pad every column list with NaN up to the length of the longest column.

    Padding is done in a single pass rather than one element per recursive
    call, so a large gap between columns cannot exhaust the recursion limit.
    A warning is printed once when some column is short by more than one
    entry, since a single listing is expected to leave a gap of at most one.

    Args:
        property_dict: Dict mapping column names to lists; modified in place.
            An empty dict is left untouched.

    Returns:
        None
    """
    if not property_dict:
        # max() of no columns would raise ValueError; nothing to pad.
        return

    column_lengths = [len(column) for column in property_dict.values()]
    target_length = max(column_lengths)

    if target_length - min(column_lengths) > 1:
        print("Error dictionary not of uniform length")

    for column in property_dict.values():
        column.extend([np.nan] * (target_length - len(column)))

In [None]:
# Search to scrape and the slice of result pages to request.
url = "https://www.daft.ie/property-for-sale/ireland"
# end_page is exclusive: pages 0..49 are fetched, 20 listings per request.
start_page, end_page = 0, 50
page_size = 20

property_dict = scrape_daft(
    url=url,
    start_page=start_page,
    end_page=end_page,
    page_size=page_size,
)

In [336]:
# One column per key of property_dict; the lists must all have equal length.
properties = pd.DataFrame(data=property_dict)

In [324]:
# Sanity check: every column should report the same number of entries.
for key, column in property_dict.items():
    print(f"{key}: {len(column)}")

Bed: 40
Bath: 40
m²: 40
Price: 40
Address: 40
Property_type: 40
