In [1]:
"""This is my 'data acquisition' notebook"""

"This is my 'data acquisition' notebook"

In [43]:
"""Imports"""
import requests
from bs4 import BeautifulSoup
import re
import json
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
import csv
import pandas as pd
import numpy as np

In [65]:
def get_property_data(property_url):
    """This function will receive the url of the property and parse its contents. It will then save the property's data in a dictionary."""
    property_headers = []   ##list of headers
    property_data = []   ##list of data/values
    counter = 0
    property = requests.get(property_url).text   ##request URL
    property_soup = BeautifulSoup(property, "html.parser")   ##parse URL to BeautifulSoup
    for code in property_soup.find("div", attrs={"class": "classified__header--immoweb-code"}): ##find unique code
        code_string = str(code)   ##convert to string
        code_strip = code_string.strip()  
        code_split = code_strip.split(" : ",)   ##split to list
        property_headers.append(code_split[0])   ##append first element as header
        property_data.append(code_split[1])   ##append second element as data/value
    for price in property_soup.find("p", attrs={"class": "classified__price"}):   ##find property price
        raw_price = str(price)   ##convert to string
        sub_price = re.sub(r"<.*?>", '', raw_price).strip()   ##remove html elements
        split_price = sub_price.split()   ##split to list
        for list_price in split_price:
            if counter == 0 and len(list_price) > 0:
                    counter = counter + 1
                    property_headers.append("Price_1")   ##append 'Price' header
                    property_data.append(list_price)   ##append price as data/value
            elif counter > 0 and len(list_price) > 0:
                    property_headers.append("Price_2")   ##append 'Price' header
                    property_data.append(list_price)   ##append price as data/value
    for address in property_soup.find("script", attrs={"type": "text/javascript"}): ##find property address
        string_address = str(address)   ##convert to string
        sub_address = re.search("\"country\"(.*?),\"distance\"", string_address).group() ##search address fields
        split_address = re.split(":|,", sub_address)   ##split to list
        split_address.remove('"distance"')   ##remove unneccessary item
        list_address = [item.replace('"', '') for item in split_address]   ##remove " and ,
        address_headers = list_address[0:len(list_address):2]   ##append even elements as headers
        for ah in address_headers:
            property_headers.append(ah)
        address_data = list_address[1:len(list_address):2]   ##append odd elements as data/values
        for ad in address_data:
            property_data.append(ad)
    for tr in property_soup.find_all("tr"):   ##find table of property characteristics
        for th in tr.find_all("th"):
            header = str(th.string)
            header_strip = header.strip()
            if header_strip != 'None':
                property_headers.append(header_strip)   ##append 'th' values as headers
        for td in tr.find_all("td"):
            raw_datum = str(td)
            sub_datum_1 = re.sub(r"<.*?>", '', raw_datum).strip()
            sub_datum_2 = re.sub(r"\n", '', sub_datum_1).strip()
            sub_datum_3 = re.sub(r"\s{2,}", '', sub_datum_2).strip()
            if sub_datum_3 != '':
                property_data.append(sub_datum_3)   ##append 'td' values as data/values
    property_dictionary = dict(zip(property_headers, property_data))   ##combine headers (as keys) and data (as values) into dictionary
    return (property_dictionary)

In [66]:
##Check
get_property_data("https://www.immoweb.be/en/classified/new-real-estate-project-houses/for-sale/wezembeek-oppem/1970/10154577")

{'Immoweb code': '10154577',
 'Price_1': '€670,000',
 'Price_2': '675000€',
 'country': 'Belgium',
 'region': 'null',
 'province': 'Brussels',
 'district': 'Brussels',
 'locality': 'Woluwe-St-Lambert',
 'postalCode': '1200',
 'street': 'Boulevard de la Woluwe 62 bo\\u00eete 3',
 'number': 'null',
 'box': 'null',
 'propertyName': 'null',
 'floor': 'null',
 'latitude': '50.84',
 'longitude': '4.44',
 'Available as of': 'At delivery',
 'Available date': 'June 30 2024 - 12:00 AM',
 'Neighbourhood or locality': 'Brabant Flamand',
 'Covered parking spaces': '1',
 'Outdoor parking spaces': '1',
 'Primary energy consumption': '115kWh/m²kilowatt hour per square meters',
 'Energy class': 'B',
 'Reference number of the EPC report': 'Not specified',
 'CO₂ emission': '45 kg CO₂/m²',
 'Yearly theoretical total energy consumption': 'Not specified',
 'Flood zone type': 'Non flood zone',
 'Tenement building': 'No',
 'Address': 'Boulevard de la Woluwe 62 boîte 31200- Woluwe-St-Lambert',
 'Website': 'htt

In [89]:
def get_property_links(page_url):
    """This function will get and save all the url links of properties in a page given the url of the page"""
    property_links = []   ##list of url links of properties 
    driver = webdriver.Chrome(ChromeDriverManager().install())   ##use selenium to open Chrome browser
    driver.get(page_url)   ##open url of page with properties
    driver.implicitly_wait(5)
    keep_browsing = driver.find_element("id", "uc-btn-accept-banner")   ##find 'Keep browsing' button
    driver.implicitly_wait(5)
    keep_browsing.click()   ##click on 'Keep browsing' button
    page_soup = BeautifulSoup(driver.page_source, "html.parser")   ##parse URL to BeautifulSoup
    for property_link in page_soup.find_all("a", attrs={"class": "card__title-link"}):   ##find all property links in page
        property_links.append(property_link.get("href"))
    driver.close()   ##close browser
    return property_links

In [67]:
#Check
get_property_links("https://www.immoweb.be/en/search/apartment/for-sale?countries=BE")

  driver = webdriver.Chrome(ChromeDriverManager().install())


['https://www.immoweb.be/en/classified/apartment/for-sale/gavere/9890/10146915',
 'https://www.immoweb.be/en/classified/apartment/for-sale/evere/1140/10156372',
 'https://www.immoweb.be/en/classified/apartment/for-sale/woluwe-saint-pierre/1150/10154682',
 'https://www.immoweb.be/en/classified/apartment/for-sale/vilvoorde/1800/10157770',
 'https://www.immoweb.be/en/classified/apartment/for-sale/brugge-zeebrugge/8380/10157545',
 'https://www.immoweb.be/en/classified/duplex/for-sale/gent/9000/10157660',
 'https://www.immoweb.be/en/classified/new-real-estate-project-apartments/for-sale/ixelles/1050/10157528',
 'https://www.immoweb.be/en/classified/penthouse/for-sale/etterbeek/1040/10157565',
 'https://www.immoweb.be/en/classified/duplex/for-sale/etterbeek/1040/10157563',
 'https://www.immoweb.be/en/classified/new-real-estate-project-apartments/for-sale/ixelles/1050/10157831',
 'https://www.immoweb.be/en/classified/new-real-estate-project-apartments/for-sale/saint-gilles/1060/10157787',
 'h

In [79]:
def get_property_dataFrame(list_prop_dict):
    df = pd.DataFrame.from_dict(list_prop_dict)
    df.to_csv(r"properties.csv", index = False, header = True)

In [87]:
def get_page_links(root_url):
    page_links = []
    driver = webdriver.Chrome(ChromeDriverManager().install())   ##use selenium to open Chrome browser
    driver.get(root_url)   ##open url of site
    driver.implicitly_wait(5)
    keep_browsing = driver.find_element("id", "uc-btn-accept-banner")   ##find 'Keep browsing' button
    driver.implicitly_wait(5)
    keep_browsing.click()   ##click on 'Keep browsing' button
    root_soup = BeautifulSoup(driver.page_source, "html.parser")   ##parse URL to BeautifulSoup
    for page_link in root_soup.find_all("a", attrs={"class": "top-navigation__link first-letter-uc"}):   ##find starting page links in site
        page_links.append(page_link.get("href"))
    sub_page_links = page_links[0:3]
    driver.close()   ##close browser
    return sub_page_links

In [88]:
#Check
get_page_links("https://www.immoweb.be/en")

  driver = webdriver.Chrome(ChromeDriverManager().install())   ##use selenium to open Chrome browser


['https://www.immoweb.be/en/search/apartment/for-sale?countries=BE',
 'https://www.immoweb.be/en/search/house/for-sale?countries=BE',
 'https://www.immoweb.be/en/search/house-and-apartment/for-sale?countries=BE']

In [90]:
"""This is the process to get dictionaries of property characteristics given a page url"""

list_page_links = get_page_links("https://www.immoweb.be/en")
for page_url in list_page_links:
    property_links = get_property_links(page_url)
    list_of_prop_dicts = []
    for link in property_links:
        list_of_prop_dicts.append(get_property_data(link))
        get_property_dataFrame(list_of_prop_dicts)

  driver = webdriver.Chrome(ChromeDriverManager().install())   ##use selenium to open Chrome browser
  driver = webdriver.Chrome(ChromeDriverManager().install())   ##use selenium to open Chrome browser
  driver = webdriver.Chrome(ChromeDriverManager().install())   ##use selenium to open Chrome browser
  driver = webdriver.Chrome(ChromeDriverManager().install())   ##use selenium to open Chrome browser
