In [35]:
"""This is my 'data acquisition' notebook"""

"This is my 'data acquisition' notebook"

In [5]:
"""Imports"""
import requests
from bs4 import BeautifulSoup
import re
import json
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
import csv
import pandas as pd
import numpy as np

In [6]:
def get_property_data(property_url):
    """This function will receive the url of the property and parse its contents. It will then save the property's data in a dictionary."""
    property_headers = []   ##list of headers
    property_data = []   ##list of data/values
    counter = 0
    property = requests.get(property_url).text   ##request URL
    property_soup = BeautifulSoup(property, "html.parser")   ##parse URL to BeautifulSoup
    for code in property_soup.find("div", attrs={"class": "classified__header--immoweb-code"}): ##find unique code
        code_string = str(code)   ##convert to string
        code_strip = code_string.strip()  
        code_split = code_strip.split(" : ",)   ##split to list
        property_headers.append(code_split[0])   ##append first element as header
        property_data.append(code_split[1])   ##append second element as data/value
    for price in property_soup.find("p", attrs={"class": "classified__price"}):   ##find property price
        raw_price = str(price)   ##convert to string
        sub_price = re.sub(r"<.*?>", '', raw_price).strip()   ##remove html elements
        split_price = sub_price.split()   ##split to list
        for list_price in split_price:
            if counter == 0 and len(list_price) > 0:  ##find price values
                    counter = counter + 1
                    property_headers.append("Price_1")   ##append 'Price' header
                    property_data.append(list_price)   ##append price as data/value
            elif counter > 0 and len(list_price) > 0:
                    property_headers.append("Price_2")   ##append 'Price' header
                    property_data.append(list_price)   ##append price as data/value
    for address in property_soup.find("script", attrs={"type": "text/javascript"}): ##find property address
        string_address = str(address)   ##convert to string
        sub_address = re.search("\"country\"(.*?),\"distance\"", string_address).group() ##search address fields
        split_address = re.split(":|,", sub_address)   ##split to list
        split_address.remove('"distance"')   ##remove unneccessary item
        list_address = [item.replace('"', '') for item in split_address]   ##remove " and ,
        address_headers = list_address[0:len(list_address):2]   ##append even elements as headers
        for ah in address_headers:
            property_headers.append(ah)
        address_data = list_address[1:len(list_address):2]   ##append odd elements as data/values
        for ad in address_data:
            property_data.append(ad)
    for tr in property_soup.find_all("tr"):   ##find table of property characteristics
        for th in tr.find_all("th"):
            header = str(th.string)
            header_strip = header.strip()
            if header_strip != 'None':
                property_headers.append(header_strip)   ##append 'th' values as headers
        for td in tr.find_all("td"):
            raw_datum = str(td)
            sub_datum_1 = re.sub(r"<.*?>", '', raw_datum).strip()
            sub_datum_2 = re.sub(r"\n", '', sub_datum_1).strip()
            sub_datum_3 = re.sub(r"\s{2,}", '', sub_datum_2).strip()
            if sub_datum_3 != '':
                property_data.append(sub_datum_3)   ##append 'td' values as data/values
    property_dictionary = dict(zip(property_headers, property_data))   ##combine headers (as keys) and data (as values) into dictionary
    return (property_dictionary)

In [38]:
##Check
get_property_data("https://www.immoweb.be/en/classified/apartment/for-sale/schaerbeek/1030/10155618")

{'Immoweb code': '10155618',
 'Price_1': '€137,500',
 'Price_2': '137500€',
 'country': 'Belgium',
 'region': 'null',
 'province': 'Flemish Brabant',
 'district': 'Leuven',
 'locality': 'Leuven',
 'postalCode': '3000',
 'street': 'Vismarkt 10 c',
 'number': 'null',
 'box': 'null',
 'propertyName': 'null',
 'floor': 'null',
 'latitude': '50.88',
 'longitude': '4.7',
 'Property name': 'Josephine',
 'Construction year': '2004',
 'Floor': '4',
 'Number of floors': '4',
 'Building condition': 'Good',
 'Number of frontages': '4',
 'Living area': '28m²square meters',
 'Kitchen type': 'Installed',
 'Bedrooms': '1',
 'Bedroom 1 surface': '7m²square meters',
 'Bathrooms': '1',
 'Toilets': '1',
 'Furnished': 'No',
 'Terrace surface': '2m²square meters',
 'Elevator': 'Yes',
 'Visio phone': 'Yes',
 'Primary energy consumption': '527kWh/m²kilowatt hour per square meters',
 'Energy class': 'G',
 'Reference number of the EPC report': 'Not specified',
 'CO₂ emission': 'Not specified',
 'Yearly theoreti

In [7]:
def get_property_links(page_url):
    """This function will get and save all the url links of properties in a page given the url of the page"""
    property_links = []   ##list of url links of properties 
    driver = webdriver.Chrome(ChromeDriverManager().install())   ##use selenium to open Chrome browser
    driver.get(page_url)   ##open url of page with properties
    driver.implicitly_wait(5)
    keep_browsing = driver.find_element("id", "uc-btn-accept-banner")   ##find 'Keep browsing' button
    driver.implicitly_wait(5)
    keep_browsing.click()   ##click on 'Keep browsing' button
    page_soup = BeautifulSoup(driver.page_source, "html.parser")   ##parse URL to BeautifulSoup
    for property_link in page_soup.find_all("a", attrs={"class": "card__title-link"}):   ##find all property links in page
        property_links.append(property_link.get("href"))
    driver.close()   ##close browser
    return property_links

In [40]:
#Check
get_property_links("https://www.immoweb.be/en/search/apartment/for-sale?countries=BE")

  driver = webdriver.Chrome(ChromeDriverManager().install())   ##use selenium to open Chrome browser


['https://www.immoweb.be/en/classified/apartment/for-sale/deinze/9800/10160241',
 'https://www.immoweb.be/en/classified/apartment/for-sale/gavere/9890/10146915',
 'https://www.immoweb.be/en/classified/new-real-estate-project-apartments/for-sale/brussels-city/1000/9881253',
 'https://www.immoweb.be/en/classified/apartment/for-sale/oudenaarde/9700/10160259',
 'https://www.immoweb.be/en/classified/new-real-estate-project-apartments/for-sale/deinze/9800/10160247',
 'https://www.immoweb.be/en/classified/apartment/for-sale/woluwe-saint-pierre/1150/10154682',
 'https://www.immoweb.be/en/classified/apartment/for-sale/deinze/9800/10160243',
 'https://www.immoweb.be/en/classified/apartment/for-sale/deinze/9800/10160244',
 'https://www.immoweb.be/en/classified/apartment/for-sale/deinze/9800/10160246',
 'https://www.immoweb.be/en/classified/apartment/for-sale/deinze/9800/10160242',
 'https://www.immoweb.be/en/classified/new-real-estate-project-apartments/for-sale/court-st.-etienne/1490/10157688',


In [8]:
def get_property_dataFrame(list_prop_dict):
    """This function will save all the properties and their respective data into a pandas data frame then write it to a csv file"""
    df = pd.DataFrame.from_dict(list_prop_dict)
    df.to_csv(r"properties.csv", index = False, header = True, escapechar = "\\")

In [12]:
def get_page_links(root_url):
    raw_links = []
    page_links = []
    driver = webdriver.Chrome(ChromeDriverManager().install())   ##use selenium to open Chrome browser
    driver.get(root_url)   ##open url of site
    driver.implicitly_wait(5)
    keep_browsing = driver.find_element("id", "uc-btn-accept-banner")   ##find 'Keep browsing' button
    driver.implicitly_wait(5)
    keep_browsing.click()   ##click on 'Keep browsing' button
    root_soup = BeautifulSoup(driver.page_source, "html.parser")   ##parse URL to BeautifulSoup
    for page_link in root_soup.find_all("a", attrs={"class": "top-navigation__link first-letter-uc"}):   ##find starting page links in site
        raw_links.append(page_link.get("href"))
    page_one_links = raw_links[0:2]  ##links to page one of "Apartment" and "House"
    for curr_page in page_one_links:
        page_links.append(curr_page)  ##add page one to page links
        curr_page_plus = f"{curr_page}&page="   ##format to locate next pages incrementally
        page_counter = 2
        while page_counter < 16:
            next_page = f"{curr_page_plus}{page_counter}"  ##locate next pages incrementally
            page_links.append(next_page)  ##add next pages to page links
            page_counter = page_counter + 1
    driver.close()   ##close browser
    return page_links

In [11]:
#Check
get_page_links("https://www.immoweb.be/en")

  driver = webdriver.Chrome(ChromeDriverManager().install())   ##use selenium to open Chrome browser


['https://www.immoweb.be/en/search/apartment/for-sale?countries=BE',
 'https://www.immoweb.be/en/search/apartment/for-sale?countries=BE&page=2',
 'https://www.immoweb.be/en/search/apartment/for-sale?countries=BE&page=3',
 'https://www.immoweb.be/en/search/apartment/for-sale?countries=BE&page=4',
 'https://www.immoweb.be/en/search/apartment/for-sale?countries=BE&page=5',
 'https://www.immoweb.be/en/search/apartment/for-sale?countries=BE&page=6',
 'https://www.immoweb.be/en/search/apartment/for-sale?countries=BE&page=7',
 'https://www.immoweb.be/en/search/apartment/for-sale?countries=BE&page=8',
 'https://www.immoweb.be/en/search/apartment/for-sale?countries=BE&page=9',
 'https://www.immoweb.be/en/search/apartment/for-sale?countries=BE&page=10',
 'https://www.immoweb.be/en/search/apartment/for-sale?countries=BE&page=11',
 'https://www.immoweb.be/en/search/apartment/for-sale?countries=BE&page=12',
 'https://www.immoweb.be/en/search/apartment/for-sale?countries=BE&page=13',
 'https://www.i

In [13]:
"""This is the process to get dictionaries of property characteristics given the root url"""

list_of_prop_dicts = []
list_page_links = get_page_links("https://www.immoweb.be/en")
for page_url in list_page_links:
    property_links = get_property_links(page_url)
    for link in property_links:
        list_of_prop_dicts.append(get_property_data(link))

  driver = webdriver.Chrome(ChromeDriverManager().install())   ##use selenium to open Chrome browser


In [None]:
"""This is the process to save dictionaries of property characteristics into a pandas data frame then write it to csv"""

get_property_dataFrame(list_of_prop_dicts)