In [1]:
# `IPython.core.display` is a deprecated import location for these helpers;
# `IPython.display` is the public module that exposes them.
from IPython.display import display, HTML

# Inject a CSS rule that sets `.container` elements to 95% width.
display(HTML("<style>.container { width:95% !important; }</style>"))

### Import necessary packages

In [32]:
#Essentials
import numpy as np
import pandas as pd
import pickle
import re
import datetime as dt
import time

# For scraping
import requests
import os
from dotenv import load_dotenv
from selenium import webdriver
from selenium.webdriver import ChromeOptions
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC


#SQL related - NEED TO DECIDE WHICH ONE I'LL BE USING AND DELETE THE REST
import psycopg2
import sqlite3
from sqlalchemy import create_engine
import pandas.io.sql as pd_sql

#Visualization
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns


%matplotlib inline
from matplotlib.pylab import rcParams
# Plot defaults for the whole notebook: 25 x 6 figures and font size 14.
rcParams.update({'figure.figsize': (25, 6), 'font.size': 14})

## Gather data

### Helper functions

In [121]:
def start_webdriver():
    """Start Chrome through chromedriver and open the configured start page.

    Settings are read from the environment after loading the ``.env`` file:

    * ``webdriver_loc``  - path to the chromedriver executable (required)
    * ``url``            - page to open once the browser is running (required)
    * ``google_profile`` - Chrome user-data directory to reuse (optional)

    Returns:
        The started Chrome driver, already navigated to ``url``.

    Raises:
        RuntimeError: if a required setting is missing. The check runs before
            the browser is launched so no stray Chrome window is left behind.
    """
    # Load .env file contents
    load_dotenv()

    chromedriver = os.environ.get('webdriver_loc')  # path to the chromedriver executable
    url = os.environ.get('url')
    profile = os.environ.get('google_profile')

    required = (('webdriver_loc', chromedriver), ('url', url))
    missing = [name for name, value in required if not value]
    if missing:
        raise RuntimeError(
            'Missing required environment variable(s): ' + ', '.join(missing)
        )

    os.environ["webdriver.chrome.driver"] = chromedriver

    # Define optional settings and go to the website
    chrome_options = ChromeOptions()
    if profile:
        # Only reuse a profile when one is configured; formatting a missing
        # value would hand Chrome the literal directory name "None".
        chrome_options.add_argument(f"--user-data-dir={profile}")

    driver = webdriver.Chrome(executable_path=chromedriver, chrome_options=chrome_options)
    driver.get(url)
    return driver
    
def is_for_sale_active():
    """Checks if 'For Sale' tab is selected, to make sure we're not looking at rentals
    Returns a Boolean (True for For Sale is active, False - if it isn't active)

    NOTE: not implemented yet - the body contains only this docstring, so every call
    currently returns None instead of the Boolean described above."""

    
def apply_basic_filters(filter_dict):
    """Fill in the basic search form and start the search.

    `filter_dict` must provide the keys 'location', 'radius', 'min_price' and
    'max_price'; the last three are passed to the dropdowns as option values.
    Works on the module-level `driver`.
    """
    wait = WebDriverWait(driver, 5)

    # Type the location once the input is visible
    wait.until(EC.visibility_of_element_located((By.NAME, "filter_location")))
    location_box = driver.find_element_by_name('filter_location')
    location_box.send_keys(filter_dict['location'])

    # Pick the first suggestion as soon as the autocomplete list shows up
    wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, ".autocomplete-list")))
    for key in (Keys.ARROW_DOWN, Keys.ENTER):
        location_box.send_keys(key)

    # (how to locate the dropdown, its locator, key in filter_dict)
    dropdowns = (
        (driver.find_element_by_id, 'Straal', 'radius'),
        (driver.find_element_by_name, 'filter_KoopprijsVan', 'min_price'),
        (driver.find_element_by_name, 'filter_KoopprijsTot', 'max_price'),
    )
    for locate, locator, filter_key in dropdowns:
        Select(locate(locator)).select_by_value(filter_dict[filter_key])

    # Wait for the primary button to be visible, then press "Search"
    wait.until(EC.visibility_of_element_located((By.XPATH, "//button[@class='button-primary-alternative']")))
    driver.find_element_by_xpath("//button[contains(text(), 'Search')]").click()
    
def get_id(html):
    """Extract the numeric property id from a property link.

    The id is the first run of digits enclosed by hyphens, e.g.
    '.../appartement-41528373-boterdiepstraat-27-h/' -> 41528373.

    Args:
        html: the property link (URL string).

    Returns:
        The property id as an int.

    Raises:
        ValueError: if the link contains no '-<digits>-' segment.
    """
    # Require at least one digit: with `\d*` a plain '--' earlier in the link
    # matched first and int('') blew up.
    match = re.search(r'-(\d+)-', html)
    if match is None:
        raise ValueError(f'No property id found in link: {html!r}')
    return int(match.group(1))



# Get urls for all the pages and put them into a list
def get_url_list(pause_seconds=10, max_failures=3):
    """Collect the URL of every result page by clicking "Next" until it stops working.

    Starts from the page the module-level `driver` is currently on.

    Args:
        pause_seconds: seconds to wait after each click and after each failed attempt.
        max_failures: total number of failed attempts (counted over the whole run)
            after which the collected list is returned.

    Returns:
        List of page URLs, starting with the current page; meant to be used by
        scrape_data.
    """
    # The page we start on is always part of the result
    url_list = [driver.current_url]

    curr_page_num = 1
    count_exceptions = 0
    while True:
        try:
            # Find "Next" button and click it
            driver.find_element_by_xpath("//a[@rel='next']").click()
            # Give the navigation time to finish BEFORE reading the address,
            # otherwise the previous page's URL can be recorded again.
            time.sleep(pause_seconds)
            url_list.append(driver.current_url)
            curr_page_num += 1
        except Exception:
            # Any failure counts: a missing "Next" button (last page reached)
            # as well as a transient error. Ctrl-C is no longer swallowed.
            count_exceptions += 1
            if count_exceptions < max_failures:
                time.sleep(pause_seconds)
            else:
                # "Next" button isn't there anymore or errors keep occurring
                print(f'Finished with an error. Last page number was {curr_page_num}.')
                return url_list


#Get all the html files for each property ad and put it into a list
def get_htmls():
    """Collect the property-ad links present on the page the driver is currently on.

    Takes no arguments; reads the anchors from the module-level `driver`.

    Returns:
        List of unique links that contain 'appartement-<digits>' or
        'huis-<digits>', in the order they first appear on the page.
    """
    # One raw-string pattern for both ad types (the original 'huis-\d+' relied
    # on an invalid string escape).
    ad_pattern = re.compile(r'(?:appartement|huis)-\d+')

    html_list = []
    seen = set()  # constant-time duplicate check; html_list keeps the order
    for elem in driver.find_elements_by_xpath("//a[@href]"):
        html = elem.get_attribute("href")
        # Skip anchors whose href could not be read instead of crashing in re.search
        if html and html not in seen and ad_pattern.search(html):
            seen.add(html)
            html_list.append(html)
    return html_list


def _parse_feature_lines(lines, categories, special_categories):
    """Convert the flat list of feature lines into a dictionary.

    Rules, applied line by line:
      * a category heading becomes a key with '' as value and is remembered;
      * while the remembered category is a special one, every line is appended
        to that category's value as 'line; ';
      * otherwise two consecutive lines form a key/value pair. A key that is
        already present is stored as '<key>_<category>'. A key on the very last
        line (no value left) is stored with '' as value.
    """
    feat_dict = {}
    current_category = None
    index = 0
    while index < len(lines):
        line = lines[index]
        if line in categories:
            feat_dict[line] = ''
            current_category = line
            index += 1
        elif current_category in special_categories:
            feat_dict[current_category] += str(line) + '; '
            index += 1
        else:
            key = f'{line}_{current_category}' if line in feat_dict else line
            has_value = index + 1 < len(lines)
            feat_dict[key] = lines[index + 1] if has_value else ''
            index += 2
    return feat_dict


def get_feat_dict(html):
    """Open a property ad and return all of its listed features as a dictionary.

    Args:
        html: link of the property ad; it is opened with the module-level `driver`.

    Returns:
        Dictionary of features. Category headings are included as keys with ''
        as value, except the special categories ('Cadastral data',
        'Commercial property'), whose lines are concatenated into one string.
        An ad without any feature text yields an empty dictionary.
    """
    # Open the html
    driver.get(html)

    # Click to see all the features available
    WebDriverWait(driver, 7).until(EC.visibility_of_element_located((By.CSS_SELECTOR, ".object-kenmerken-open-button")))
    driver.find_element_by_class_name('object-kenmerken-open-button').click()

    # Retrieve all the other features now
    time.sleep(5)
    feature_string = ''
    for elem in driver.find_elements_by_class_name('object-kenmerken-body'):
        if feature_string == '':
            feature_string = elem.text
        else:
            feature_string += '\n' + str(elem.text)

    if feature_string == '':
        return {}

    categories = ['Auction', 'Transfer of ownership', 'Construction', 'Surface areas and volume', 'Areas', 'Layout', 'Energy', 'What does this mean?', 'Cadastral data', 'Exterior space', 'Storage space', 'Parking', 'VVE (Owners Association) checklist', 'Garage', 'Commercial property']
    special_categories = ['Cadastral data', 'Commercial property']
    to_delete = ['Cadastral map']

    feat_list = [feat for feat in feature_string.split('\n') if feat not in to_delete]
    return _parse_feature_lines(feat_list, categories, special_categories)
                


# def go_to_next_page():
#     """Finds the 'Next' button and clicks it"""
#     count_exceptions = 0
#     while True:
#         try:
#             #Find "Next" button and click it
#             find_next = driver.find_element_by_xpath("//a[@rel='next']")
#             find_next.click()
#             time.sleep(7)
#         except:
#             #Count the exceptions
#             count_exceptions += 1
#             #If the there haven't been 3 exceptions yet, sleep for a bit and then continue
#             if count_exceptions < 3:
#                 time.sleep(10)
#             else:
#                 break
    
    

def log_in():
    """Sign in with the credentials from the .env file, then go back to the start page.

    Reads 'login_url', 'url', 'username' and 'secret_f' from the environment
    and drives the module-level `driver`.
    """
    # Make the .env values available through os.environ
    load_dotenv()
    env = os.environ

    login_page = env.get('login_url')
    start_page = env.get('url')

    driver.get(login_page)
    WebDriverWait(driver, 5).until(EC.visibility_of_element_located((By.ID, "Username")))

    # NOTE(review): 'username' may clash with an OS-provided variable of the same
    # name on some platforms - confirm the .env value is the one picked up.
    credentials = (('Username', 'username'), ('Password', 'secret_f'))
    for field_id, env_key in credentials:
        driver.find_element_by_id(field_id).send_keys(env.get(env_key))

    submit_button = driver.find_element_by_xpath("//button[contains(text(), 'Log in')]")
    submit_button.click()
    driver.get(start_page)
    
def log_out():
    """Sign out by opening the address stored in the 'logout_url' environment variable."""
    # Make the .env values available through os.environ
    load_dotenv()
    driver.get(os.environ.get('logout_url'))
    
def save_to_favourites(html):
    """Open the property link `html` and click its save ('user-save-object') control once."""
    driver.get(html)
    save_control = driver.find_element_by_class_name('user-save-object')
    save_control.click()
    

def scrape_data(ads_list=None, url_index=0):
    """Scrape every property ad reachable from the result pages.

    Args:
        ads_list: ads collected in an earlier run. A non-empty list is extended
            in place and returned; None or an empty list starts a fresh list.
        url_index: position in the page-URL list to start from (used to resume
            after an interrupted run).

    Returns:
        List of dictionaries, one per ad. Scraping stops at the first error and
        returns what has been collected so far, after printing the error and
        the page to resume from.
    """
    # Continue an earlier run when a non-empty list is supplied, otherwise start fresh
    feat_dict_list = ads_list if ads_list else []
    ad_count = 0

    try:
        # NOTE: pickle.load can execute arbitrary code; only acceptable here
        # because the cache is a local file written by this notebook.
        with open('./Cellar/full_url_list.pkl', 'rb') as full_url_pickle:
            url_list = pickle.load(full_url_pickle)
    except Exception:
        # No usable cache: walk the result pages and store what was collected
        url_list = get_url_list()
        # Create the cache folder if needed so the collected list is not lost
        os.makedirs('./Cellar', exist_ok=True)
        # Lists with at least 220 pages are stored as the full list
        if len(url_list) >= 220:
            cache_path = './Cellar/full_url_list.pkl'
        else:
            cache_path = './Cellar/partial_url_list.pkl'
        with open(cache_path, 'wb') as url_pickle:
            pickle.dump(url_list, url_pickle)

    # Page (and its position in the full list) of the last completely scraped ad
    last_url = None
    last_index = None

    for index, url in enumerate(url_list[url_index:], start=url_index):
        try:
            driver.get(url)
            html_list = get_htmls()

            for html in html_list:
                ad_count += 1

                # To open the property ad
                driver.get(html)

                # To scrape initial data points
                property_dict = {
                    'property_link': html,
                    'property_id': get_id(html),
                    'title': driver.find_element_by_class_name('object-header__title').text,
                    'address': driver.find_element_by_class_name('object-header__subtitle').text,
                    'price': driver.find_element_by_class_name('object-header__price').text,
                    'neighbourhood': driver.find_element_by_class_name('object-buurt__name').text,
                }

                # Add the remaining features without overwriting the fields above
                for key, value in get_feat_dict(html).items():
                    property_dict.setdefault(key, value)

                if property_dict not in feat_dict_list:
                    feat_dict_list.append(property_dict)
                last_url, last_index = url, index
        except Exception as error:
            # Nothing finished yet in this run: report the page that failed
            if last_url is None:
                last_url, last_index = url, index
            total_ad_count = len(feat_dict_list)
            print(f'Error while scraping: {error!r}')
            print(last_url, f'This URL is number {last_index} in the original url_list.')
            print(f'Finished with an error. Number of ads scraped {ad_count}, total number of ads in the list is {total_ad_count}.')
            return feat_dict_list

    total_ad_count = len(feat_dict_list)
    print(f'Finished without errors. Number of ads scraped {ad_count}, total number of ads in the list is {total_ad_count}.')
    return feat_dict_list
    

def get_todays_ads():
    """Retrieves only adverts posted today

    NOTE: not implemented yet - the body contains only this docstring, so every call
    currently returns None."""
    

In [164]:
# Reload the ads stored in './Cellar/ads_so_far.pkl'; the list is passed to
# scrape_data further down.
# NOTE: pickle.load can execute arbitrary code - only load pickle files you created yourself.
with open('./Cellar/ads_so_far.pkl', 'rb') as ads_list_pickle:
            ads_list = pickle.load(ads_list_pickle)

len(ads_list)

3563

In [138]:
# with open('./Cellar/full_url_list.pkl', 'rb') as full_url_list:
#             full_url_list = pickle.load(full_url_list)

# full_url_list

In [161]:
# Start the browser and sign in. `driver` is the module-level handle that the
# helper functions above read as a global.
driver = start_webdriver()
log_in()

In [162]:
# filters = {'location': 'Amsterdam', 'radius': '0', 'min_price': '0', 'max_price': 'ignore_filter'}
# apply_basic_filters(filters)
# Resume scraping at position 216 of the page-URL list, extending the ads loaded above.
scraped_data_list = scrape_data(ads_list, 216)
len(scraped_data_list)

https://www.funda.nl/en/koop/amsterdam/p216/ This URL is number 216 in the original url_list.
Finished with an error. Number of ads scraped 504, total number of ads in the list is 3563.


3563

In [140]:
# Link of the last ad in the scraped list.
last_html = scraped_data_list[-1]['property_link']

In [163]:
# Write the scraped ads to './Cellar/ads_so_far.pkl', replacing the previous file contents.
with open('./Cellar/ads_so_far.pkl', 'wb') as ads_so_far_pickle:
             pickle.dump(scraped_data_list, ads_so_far_pickle)

In [153]:
# with open('./Cellar/full_url_list.pkl', 'rb') as full_url_pickle:
#             url_list = pickle.load(full_url_pickle)
# len(url_list)

In [159]:
# Close the browser window.
# NOTE(review): driver.quit() presumably also ends the chromedriver session - consider using it; confirm.
driver.close()

In [175]:
# THIS CELL STARTS UP THE WEB-DRIVER AND GOES TO THE WEBSITE

#Load .env file contents
# load_dotenv()

# chromedriver = os.environ.get('webdriver_loc')           #  path to the chromedriver executable
# os.environ["webdriver.chrome.driver"] = chromedriver
# url = os.environ.get('url')

# # Go to the website
# driver = webdriver.Chrome(chromedriver)
# driver.get(url)

In [8]:

# except:
#     driver.find_element_by_class_name('recaptcha-checkbox-border').click()
# except:
#     WebElement iFrame_checkbox = driver.findElement(By.xpath("xpath_of_reCaptcha_checkbox"));
#     iFrame_checkbox.click();



In [91]:
# url_list = get_url_list()
# with open('./Cellar/pages_url_list.pkl', 'wb') as url_pickle:
#     pickle.dump(url_list, url_pickle)

In [101]:
# Save the data into a pickle file
# with open('./Cellar/property_ad_data_list.pkl', 'wb') as scraped_pickle:
#     pickle.dump(scrape_data, scraped_pickle)

In [130]:
# with open('./Cellar/pages_url_list.pkl', 'rb') as url_pickle:
#     url_list = pickle.load(url_pickle)

# print(f'Imported {len(url_list)} URLs.')

In [150]:
# count = 0

# html_list = []
# url_list_limited = url_list[126:160]

# for url in url_list_limited:
#     driver.get(url)
#     elems = driver.find_elements_by_xpath("//a[@href]")
#     for elem in elems:
#         html = elem.get_attribute("href")
#         if html == last_html:
#             print(url)
        
        
        
#         if (bool(re.search(r'appartement-\d+', html)) or bool(re.search('huis-\d+', html))) and html not in html_list: 
#             html_list.append(html)

https://www.funda.nl/en/koop/amsterdam/p148/
https://www.funda.nl/en/koop/amsterdam/p148/
https://www.funda.nl/en/koop/amsterdam/p148/


In [21]:
# test_case = 'https://www.funda.nl/en/koop/amsterdam/huis-41401267-lijnbaansgracht-92/'

# # To open the property ad
# driver.get(test_case)

# title = driver.find_element_by_class_name('object-header__title').text
# address = driver.find_element_by_class_name('object-header__subtitle').text
# price = driver.find_element_by_class_name('object-header__price').text

# # Click to see all the features available
# WebDriverWait(driver, 7).until(EC.visibility_of_element_located((By.CSS_SELECTOR, ".object-kenmerken-open-button")))
# driver.find_element_by_class_name('object-kenmerken-open-button').click()

# # Retrieve all the other features now
# all_features = driver.find_element_by_class_name('object-kenmerken-body').text
# all_feat_dict = get_each_feature(all_features)

# neighbourhood = all_features = driver.find_element_by_class_name('object-buurt__name').text



In [114]:
# Example listing opened by the feature-parsing cell below.
test_case_html = 'https://www.funda.nl/en/koop/amsterdam/appartement-41528373-boterdiepstraat-27-h/'

In [120]:
# Open the html
driver.get(test_case_html)

# Click to see all the features available
WebDriverWait(driver, 7).until(EC.visibility_of_element_located((By.CSS_SELECTOR, ".object-kenmerken-open-button")))
driver.find_element_by_class_name('object-kenmerken-open-button').click()

# Retrieve all the other features now
time.sleep(5)
feature_elems = driver.find_elements_by_class_name('object-kenmerken-body')
feature_string = ''
for elem in feature_elems:
    if feature_string == '':
        feature_string = elem.text
    else:
        feature_string += '\n' + str(elem.text)

categories = ['Auction', 'Transfer of ownership', 'Construction', 'Surface areas and volume', 'Areas', 'Layout', 'Energy', 'What does this mean?', 'Cadastral data', 'Exterior space', 'Storage space', 'Parking', 'VVE (Owners Association) checklist', 'Garage', 'Commercial property']
special_categories = ['Cadastral data', 'Commercial property']
to_delete = ['Cadastral map']
features = feature_string.split('\n')
feat_list = [feat for feat in features if feat not in to_delete]

feat_dict = {}   
current_index = 0

for feat in feat_list: 
#     print(feat)
    feat = feat_list[current_index]
    if feat in categories:
        feat_dict[feat] = ''
        current_category = feat
        current_index += 1        
    else:
        if current_category in special_categories:
            feat_dict[current_category] += str(feat) + '; '
            if current_index+2 < len(feat_list):
                current_index += 1
            else:
                break
        elif feat in feat_dict.keys():
            feat_dict[feat+'_'+current_category] = feat_list[current_index+1]
            if current_index+2 < len(feat_list):
                current_index += 2
            else:
                break
        else:
            feat_dict[feat] = feat_list[current_index+1]
            if current_index+2 < len(feat_list):
                current_index += 2
            else:
                break
            
feat_dict

Auction
Price
Price on request (auction: <[veilingsdatum]>)
Auction date
Monday, December 16, 2019
Transfer of ownership
Listed since
November 27, 2019
Status
Available
Acceptance
Available in consultation
VVE (Owners Association) contribution
€ 75 per month
Construction
Type apartment
Ground-floor apartment (apartment)
Building type
Resale property
Year of construction
1934
Type of roof
Flat roof covered with asphalt roofing
Surface areas and volume
Areas
Living area
57 m²
External storage space
6 m²
Volume in cubic meters
171 m³
Layout
Number of rooms
3 rooms (2 bedrooms)
Number of bath rooms
1 bathroom
Bathroom facilities
Shower and toilet
Number of residential layers (stories)
1 residential layer (story)
Located at
Ground floor
Facilities
TV via cable
Energy
Provisional energy label
G What does this mean?
Insulation
No insulation
Heating
Gas heater
Hot water


{'Auction': '',
 'Price': 'Price on request (auction: <[veilingsdatum]>)',
 'Auction date': 'Monday, December 16, 2019',
 'Transfer of ownership': '',
 'Listed since': 'November 27, 2019',
 'Status': 'Available',
 'Acceptance': 'Available in consultation',
 'VVE (Owners Association) contribution': '€ 75 per month',
 'Construction': '',
 'Type apartment': 'Ground-floor apartment (apartment)',
 'Building type': 'Resale property',
 'Year of construction': '1934',
 'Type of roof': 'Flat roof covered with asphalt roofing',
 'Surface areas and volume': '',
 'Areas': '',
 'Living area': '57 m²',
 'External storage space': '6 m²',
 'Volume in cubic meters': '171 m³',
 'Layout': '',
 'Number of rooms': '3 rooms (2 bedrooms)',
 'Number of bath rooms': '1 bathroom',
 'Bathroom facilities': 'Shower and toilet',
 'Number of residential layers (stories)': '1 residential layer (story)',
 'Located at': 'Ground floor',
 'Facilities': 'TV via cable',
 'Energy': '',
 'Provisional energy label': 'G What d