In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))

### Import necessary packages

In [3]:
#Essentials
import numpy as np
import pandas as pd
import pickle
import re
import datetime as dt
import time

# For scraping
import requests
import os
from dotenv import load_dotenv
from selenium import webdriver
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC

#SQL related - NEED TO DECIDE WHICH ONE I'LL BE USING AND DELETE THE REST
import psycopg2
import sqlite3
from sqlalchemy import create_engine
import pandas.io.sql as pd_sql

#Visualization
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns


%matplotlib inline
from matplotlib.pylab import rcParams
rcParams['figure.figsize'] = 25, 6
rcParams.update({'font.size': 14})

## Gather data

### Helper functions

In [10]:
def is_for_sale_active(url):
    """Checks if 'For Sale' tab is selected, to make sure we're not looking at rentals
    Returns a Boolean (True for For Sale is active, False - if it isn't active)"""
    
def get_todays_ads():
    """Retrieves only adverts posted today"""
    
#Get urls for all the pages and put them into a list
def get_url_list():
    """Records the current url and goes through the website, clicking Next as many times as there are pages.
    Returns a list of urls to be used in the get_htmls function."""
    #Creates a list of urls for all pages
    url_list = []
    
    #Reads the url of the page the driver is currently in and adds it into the list
    current_page_url = driver.current_url
    url_list.append(current_page_url)
    
    curr_page_num = 0
    count_exceptions = 0
    while True:
        try:
            curr_page_num += 1
            #Find "Next" button and click it
            find_next = driver.find_element_by_xpath("//a[@rel='next']")
            find_next.click()
            current_page_url = driver.current_url
            url_list.append(current_page_url)
            time.sleep(10)
        except:
            #Count the exceptions
            count_exceptions += 1
            curr_page_num -= 1 #since we are coming back to the same page
            #If the there haven't been 3 exceptions yet, sleep for a bit and then continue
            if count_exceptions < 3:
                time.sleep(10)
            else:
                #If "Next" button isn't there anymore or an error occurs, return the list
                #driver.close()
                print(f'Last page number was {curr_page_num}.')
                return url_list
    print(f'Last page number was {curr_page_num}.')
    #driver.close()
    return url_list   
    
def go_to_next_page():
    """Finds the 'Next' button and clicks it"""
    count_exceptions = 0
    while True:
        try:
            #Find "Next" button and click it
            find_next = driver.find_element_by_xpath("//a[@rel='next']")
            find_next.click()
            time.sleep(5)
        except:
            #Count the exceptions
            count_exceptions += 1
            #If the there haven't been 3 exceptions yet, sleep for a bit and then continue
            if count_exceptions < 3:
                time.sleep(10)
            else:
                break
    
    
#Get all the html files for each property ad and put it into a list
def get_htmls(html_list):
    """Takes current html list.
    Returns updated html list with all the property ad htmls available on the page."""
    #Find all property ad htmls
    elems = driver.find_elements_by_xpath("//a[@href]")
    for elem in elems:
        html = elem.get_attribute("href")
        if (bool(re.search(r'appartement-\d+', html)) or bool(re.search('huis-\d+', html))) and html not in html_list: 
            html_list.append(html)
    return html_list

def scrape_data():
    """Reads in the data about each property from a given html files.
    Returns a dictionary (MIGHT HAVE TO CHANGE THIS IN THE FUTURE)"""
    #Reads the url of the page the driver is currently in
    
    feature_dict = {}
    
    while true:
        try:
            html_list = []
            html_list.append(get_htmls(html_list))
            get_each_feature()
            go_to_next_page()
        except:
            return feature_dict
    return feature_dict
    
    
    
    return current_html

def get_id(html):
    """Given an html, this function retrieves the property id from the html link.
    Returns the link"""
    property_id = ''
    
    

def get_feat_list(html):
    """Given an html, it retrieves a list of all the features that is further processed in get_each_feature function.
    Returns a list of all feature names and features themselves"""
    # To open the property ad
    driver.get(html)

    property_id = 
    title = driver.find_element_by_class_name('object-header__title').text
    address = driver.find_element_by_class_name('object-header__subtitle').text
    price = driver.find_element_by_class_name('object-header__price').text

    # Click to see all the features available
    WebDriverWait(driver, 5).until(EC.visibility_of_element_located((By.CSS_SELECTOR, ".object-kenmerken-open-button")))
    driver.find_element_by_class_name('object-kenmerken-open-button').click()

    # Retrieve all the other features now
    all_features = driver.find_element_by_class_name('object-kenmerken-body').text
    all_feat_dict = get_each_feature(all_features)

    neighbourhood = all_features = driver.find_element_by_class_name('object-buurt__name').text
    

def get_each_feature(feat_list):
    """Takes a string with all highlighted features and puts them into a dictionary.
    Returns the dictionary"""
    categories = ['Transfer of ownership', 'Construction', 'Surface areas and volume', 'Areas', 'Layout', 'Energy', 'What does this mean?', 'Cadastral map', 'Exterior space', 'Storage space', 'Parking', 'VVE (Owners Association) checklist']
    feature_list = feat_list.split('\n')[1:]
    trim_feat_list = [feat for feat in feature_list if feat not in categories]
    
    counter = 0
    feat_dict = {}
    
    for feat in trim_feat_list[::2]:
        feat_index = trim_feat_list.index(feat)
        if feat in feat_dict.keys():
            repeat_tag = str(list(feat_dict.keys()).count(feat) + 1)
            feat_dict[feat+'_'+repeat_tag] = trim_feat_list[feat_index+1]
        else:
            feat_dict[feat] = trim_feat_list[feat_index+1]
        del trim_feat_list[feat_index:feat_index+2]
    return feat_dict

# It could be an idea to log the categories in the dictionary and leave the value empty, that way the table will have separators indicating which feature the next columns are about

def log_in():
    """Uses the credentials defined in the .env file to log in and saves properties matching desired criteria to the favourites"""
    # This is to load environment variables
    load_dotenv()
    
#     acct_url = 'https://www.funda.nl/en/navigation/account/'
    login_url = 'https://www.funda.nl/en/mijn/login/?ReturnUrl=%2Fen%2F'
    driver.get(login_url)
#     driver.find_element_by_id('appheader-inloggen-link').click()
    WebDriverWait(driver, 5).until(EC.visibility_of_element_located((By.ID, "Username")))
    
    username_field = driver.find_element_by_id('Username')
    username_field.send_keys(os.environ.get('username'))
    password_field = driver.find_element_by_id('Password')
    password_field.send_keys(os.environ.get('secret_f'))
    driver.find_element_by_xpath("//button[contains(text(), 'Log in')]").click()
    
def log_out():
    """Logs out once the actions are done"""
    logout_url = 'https://www.funda.nl/en/mijn/logout'
    driver.get(logout_url)
    
def save_to_favourites(html):
    """Saves the given html of the property to favourites"""
    # Goes to the provided html (link)
    driver.get(html)
    
    # Find the "heart" icon and clicks it once
    driver.find_element_by_class_name('user-save-object').click()

SyntaxError: invalid syntax (<ipython-input-10-7562fb86807e>, line 105)

In [8]:
scrape_data()

'https://www.funda.nl/en/mijn/logout'

In [49]:
print(html_list[4])
match = re.search(r'-\d*-', html_list[4])
match.group()

https://www.funda.nl/en/koop/amsterdam/huis-41564793-willem-gertenbachstraat-84/?navigateSource=resultlist


'-41564793-'

In [11]:
# At the moment only getting the data from Funda, could have a list of websites in the future
general_url = 'https://www.funda.nl/en/'

chromedriver = "/Applications/chromedriver"           #  path to the chromedriver executable
os.environ["webdriver.chrome.driver"] = chromedriver

In [12]:
# Go to the website
driver = webdriver.Chrome(chromedriver)
driver.get(general_url)

In [13]:


# Now apply desired filters (e.g. set location to Amsterdam)
# try:
filter_loc = driver.find_element_by_name('filter_location')
filter_loc.send_keys('Amsterdam')

# Wait for the first dropdown option to appear and select the first option
WebDriverWait(driver, 5).until(EC.visibility_of_element_located((By.CSS_SELECTOR, ".autocomplete-list")))
filter_loc.send_keys(Keys.ARROW_DOWN)
filter_loc.send_keys(Keys.ENTER)

filter_radius = Select(driver.find_element_by_id('Straal'))
filter_radius.select_by_value('0')

filter_price_from = Select(driver.find_element_by_name('filter_KoopprijsVan'))
filter_price_from.select_by_value('200000')

filter_price_upto = Select(driver.find_element_by_name('filter_KoopprijsTot'))
filter_price_upto.select_by_value('400000')

find_search = driver.find_element_by_xpath("//button[contains(text(), 'Search')]")
find_search.click()

# Now that we applied basic filters, let's narrow the search down more

# except:
#     driver.find_element_by_class_name('recaptcha-checkbox-border').click()
# except:
#     WebElement iFrame_checkbox = driver.findElement(By.xpath("xpath_of_reCaptcha_checkbox"));
#     iFrame_checkbox.click();


# car_type_form.send_keys(Keys.RETURN)

In [91]:
# url_list = get_url_list()
# with open('./Cellar/pages_url_list.pkl', 'wb') as url_pickle:
#     pickle.dump(url_list, url_pickle)

In [96]:
with open('./Cellar/pages_url_list.pkl', 'rb') as url_pickle:
    url_list = pickle.load(url_pickle)

print(f'Imported {len(url_list)} URLs.')

Imported 77 URLs.


In [14]:
count = 0

html_list = []

elems = driver.find_elements_by_xpath("//a[@href]")
for elem in elems:
    html = elem.get_attribute("href")
    if (bool(re.search(r'appartement-\d+', html)) or bool(re.search('huis-\d+', html))) and html not in html_list: 
        html_list.append(html)

In [18]:
test_case = html_list[4]

# To open the property ad
driver.get(test_case)

title = driver.find_element_by_class_name('object-header__title').text
address = driver.find_element_by_class_name('object-header__subtitle').text
price = driver.find_element_by_class_name('object-header__price').text

# Click to see all the features available
WebDriverWait(driver, 7).until(EC.visibility_of_element_located((By.CSS_SELECTOR, ".object-kenmerken-open-button")))
driver.find_element_by_class_name('object-kenmerken-open-button').click()

# Retrieve all the other features now
all_features = driver.find_element_by_class_name('object-kenmerken-body').text
all_feat_dict = get_each_feature(all_features)

neighbourhood = all_features = driver.find_element_by_class_name('object-buurt__name').text



In [21]:
html_list[4]

'https://www.funda.nl/en/koop/amsterdam/huis-41564793-willem-gertenbachstraat-84/?navigateSource=resultlist'

In [76]:
driver.close()