# Dublin House Price Scraper

In [1]:
import os
import re
from datetime import datetime

import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver

### Retrieve all Dublin 4 and Dublin 6 houses for sale

In [8]:
# Location of the chromedriver binary. Set the CHROMEDRIVER_PATH environment
# variable to run the notebook on another machine; when it is unset the
# original hard-coded location is used.
CHROMEDRIVER_PATH = os.environ.get(
    'CHROMEDRIVER_PATH', '/Users/OliverM/Code/Scraping/chromedriver'
)

# For scraping JS content: pages are loaded in a Selenium-driven Chrome and
# the HTML is read back from the live document.
browser = webdriver.Chrome(CHROMEDRIVER_PATH)

# Search-result pages for houses for sale in Dublin 6 and Dublin 4. The query
# string asks for sorting by price (sort_type=d).
url_d6 = 'http://www.daft.ie/dublin/houses-for-sale/dublin-6/?s%5Bsort_by%5D=price&s%5Bsort_type%5D=d'
url_d4 = 'http://www.daft.ie/dublin/houses-for-sale/dublin-4/?s%5Bsort_by%5D=price&s%5Bsort_type%5D=d'

In [9]:
# Open the Dublin 6 results page and parse the body HTML read back from the
# live browser document.
browser.get(url_d6)
innerHTML = browser.execute_script("return document.body.innerHTML")
soup6 = BeautifulSoup(markup=innerHTML, features='lxml')

In [10]:
# Open the Dublin 4 results page and parse the body HTML read back from the
# live browser document.
browser.get(url_d4)
innerHTML = browser.execute_script("return document.body.innerHTML")
soup4 = BeautifulSoup(markup=innerHTML, features='lxml')

### Extract price of each property

In [30]:
# Price elements from both result pages: Dublin 4 first, then Dublin 6.
price4 = soup4.find_all("strong", {"class": "price"})
price6 = soup6.find_all("strong", {"class": "price"})
price = price4 + price6

# Keep only the first run of digits (thousands separators included) from each
# price label. Blindly dropping the first character only works when the label
# is exactly "<symbol><number>"; anything else left non-numeric text behind
# that broke the numeric conversion further down. Labels with no digits at
# all become '' so they convert to NaN instead of raising.
clean_prices = []
for price_tag in price:
    number = re.search(r'\d[\d,]*', price_tag.text)
    clean_prices.append(number.group(0) if number else '')

### Extract address and link of each property

In [29]:
# Listing title anchors from both result pages: Dublin 4 first, then Dublin 6.
address4 = soup4.select('div div h2 a')
address6 = soup6.select('div div h2 a')
address = address4 + address6

# Anchor text is the full address; its href is the link to the listing page.
clean_address = [anchor.text.strip() for anchor in address]
clean_links = [anchor['href'] for anchor in address]

# Short address: everything before the first ', Dublin'.
clean_address2 = [
    full_address.split(', Dublin', 1)[0].strip()
    for full_address in clean_address
]

### For each property, extract floor area and description

In [18]:
clean_floor_area = []
clean_desc = []

# Visit every listing page with the already-open `browser`. Starting a new
# Chrome instance per listing (and never quitting it) left one browser
# process behind for every property scraped.
for link in clean_links:
    # The scraped hrefs start with '/', so trim it to avoid a '//' in the URL.
    prop_url = 'http://www.daft.ie/' + link.lstrip('/')
    browser.get(prop_url)  # navigate to the page
    innerHTML_desc = browser.execute_script("return document.body.innerHTML")
    soup_desc = BeautifulSoup(innerHTML_desc, 'lxml')

    # The second 'description_block' is kept as the floor-area candidate;
    # pages with fewer than two such blocks yield '' instead of an IndexError.
    space = soup_desc.find_all('div', {'class': 'description_block'})
    sqm = space[1].text.strip() if len(space) > 1 else ''
    clean_floor_area.append(sqm)

    # Header fragments of the listing, kept as a list of stripped strings
    # (one list per property).
    desc = [
        header.text.strip()
        for header in soup_desc.find_all('span', {'class': 'header_text'})
    ]
    clean_desc.append(desc)

In [20]:
# Blank out, in place, every scraped block that does not carry the
# 'Overall Floor Area:' label.
for idx, block_text in enumerate(clean_floor_area):
    if 'Overall Floor Area:' in block_text:
        continue
    clean_floor_area[idx] = ''

In [28]:
# Reduce each floor-area string to the number between the
# 'Overall Floor Area: ' label and 'Sq. Metres'. Thousands separators and
# surrounding whitespace are removed so that a value such as '1,021.5 '
# still parses as a number; blank entries stay ''.
clean_floor_area_strip = []

for area_text in clean_floor_area:
    metres = area_text.split('Sq. Metres', 1)[0]
    metres = metres.split('Overall Floor Area: ', 1)[-1]
    clean_floor_area_strip.append(metres.replace(',', '').strip())

### Create DataFrame

In [22]:
# One row per listing, built from the parallel lists produced above.
# NOTE(review): assumes the price list lines up position-by-position with the
# address/link lists - confirm, since they come from separate selectors.
df = pd.DataFrame(
    data={
        'Address': clean_address2,
        'Description': clean_desc,
        'Price': clean_prices,
        'Link': clean_links,
        'Floor Area': clean_floor_area,
        'Sqm': clean_floor_area_strip,
    }
)

In [23]:
# Drop thousands separators from the price strings. astype(str) keeps this
# cell re-runnable: on a second run Price is already numeric and has no .str
# accessor.
df['Price'] = df['Price'].astype(str).str.replace(',', '')

# Convert to numbers; values that cannot be parsed (blank floor areas,
# non-numeric price labels) become NaN instead of aborting the conversion.
df[['Price', 'Sqm']] = df[['Price', 'Sqm']].apply(pd.to_numeric, errors='coerce')

# Join each property's header fragments into one string. Values that are
# already strings (cell re-run) are left alone - joining a string would
# insert ', ' between every character.
df['Description'] = df['Description'].apply(
    lambda parts: parts if isinstance(parts, str) else ', '.join(parts)
)

In [24]:
# Asking price per square metre; rows without a floor area come out as NaN.
df['Price / Sqm'] = df['Price'].div(df['Sqm'])

In [31]:
# Write the table to a CSV whose name starts with today's date (YYMMDD),
# leaving out the index column.
file_date = datetime.now().strftime("%y%m%d")
df.to_csv(
    path_or_buf=file_date + ' - Dublin Property Prices.csv',
    index=False,
)

In [235]:
# Disabled scratch code: would open a single listing page (`test_url`) in a
# new Chrome session and parse it into `soup_test`. Nothing in this cell runs.
# test_url = 'http://www.daft.ie/dublin/apartments-for-sale/ranelagh/44-cullenswood-house-northbrook-avenue-ranelagh-dublin-1519797/'
# browser = webdriver.Chrome('/Users/OliverM/Code/Scraping/chromedriver')
# browser.get(test_url) #navigate to the page
# innerHTML_test = browser.execute_script("return document.body.innerHTML")
# soup_test = BeautifulSoup(innerHTML_test, 'lxml')


In [25]:
# Display the assembled listings table.
df

Unnamed: 0,Address,Description,Floor Area,Link,Price,Sqm,Price / Sqm
0,"81 Park Avenue, Sandymount","Detached House, 5 Beds, 3 Baths","Overall Floor Area: 316.98 Sq. Metres (3,412 S...",/dublin/houses-for-sale/sandymount/81-park-ave...,4600000,316.98,14511.95659
1,"25 Raglan Road, Ballsbridge","Terraced House, 5 Beds, 4 Baths","Overall Floor Area: 404 Sq. Metres (4,349 Sq. ...",/dublin/houses-for-sale/ballsbridge/25-raglan-...,4150000,404.0,10272.277228
2,"16 Wellington Road, Ballsbridge","End of Terrace House, 4 Beds, 4 Baths","Overall Floor Area: 303 Sq. Metres (3,261 Sq. ...",/dublin/houses-for-sale/ballsbridge/16-welling...,3750000,303.0,12376.237624
3,"10 Greenfield Crescent, Donnybrook","Detached House, 5 Beds, 5 Baths","Overall Floor Area: 373 Sq. Metres (4,015 Sq. ...",/dublin/houses-for-sale/donnybrook/10-greenfie...,3000000,373.0,8042.895442
4,"Ballsbridge, Ballsbridge","Detached House, 5 Beds, 4 Baths","Overall Floor Area: 185.8 Sq. Metres (2,000 Sq...",/dublin/houses-for-sale/ballsbridge/ballsbridg...,3000000,185.8,16146.393972
5,"57 Wellington Road, Ballsbridge","Terraced House, 4 Beds, 2 Baths",,/dublin/houses-for-sale/ballsbridge/57-welling...,3000000,,
6,"3 Seaview Terrace, Off Ailesbury Road","Semi-Detached House, 6 Beds, 6 Baths","Overall Floor Area: 463.9 Sq. Metres (4,993 Sq...",/dublin/houses-for-sale/dublin-4/3-seaview-ter...,2950000,463.9,6359.129123
7,"61 Wellington Road, Ballsbridge","Terraced House, 3 Beds, 3 Baths","Overall Floor Area: 285.95 Sq. Metres (3,078 S...",/dublin/houses-for-sale/ballsbridge/61-welling...,2950000,285.95,10316.488897
8,"40 Herbert Park, Ballsbridge","Detached House, 5 Beds, 3 Baths","Overall Floor Area: 300 Sq. Metres (3,229 Sq. ...",/dublin/houses-for-sale/ballsbridge/40-herbert...,2800000,300.0,9333.333333
9,"22 Elgin Road, Ballsbridge","Detached House, 10 Beds, 8 Baths","Overall Floor Area: 418.02 Sq. Metres (4,500 S...",/dublin/houses-for-sale/ballsbridge/22-elgin-r...,2750000,418.02,6578.632601


In [26]:
# Summary statistics for the numeric columns (Price, Sqm, Price / Sqm).
df.describe()

Unnamed: 0,Price,Sqm,Price / Sqm
count,40.0,38.0,38.0
mean,1943125.0,262.408684,7375.095902
std,970329.5,91.102738,2705.14539
min,995000.0,122.82,3549.382716
25%,1190000.0,195.8175,5894.052419
50%,1472500.0,231.5,6627.630611
75%,2762500.0,316.3675,8086.688687
max,4600000.0,480.0,16146.393972
