In [9]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))

### Import necessary packages

In [10]:
#Essentials
import numpy as np
import pandas as pd
import pickle
import re
import datetime as dt
import time

# For scraping
import requests
import os
from dotenv import load_dotenv
from selenium import webdriver
from selenium.webdriver import ChromeOptions
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains

#Visualization
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns


%matplotlib inline
from matplotlib.pylab import rcParams
rcParams['figure.figsize'] = 25, 6
rcParams.update({'font.size': 14})

## Gather data

### Helper functions

In [67]:
def get_webdriver():
    """Simply starts a new Chrome browser instance"""
    #Load .env file contents
    load_dotenv()
    
    chromedriver = os.environ.get('webdriver_loc')           #  path to the chromedriver executable
    os.environ["webdriver.chrome.driver"] = chromedriver
#   Define optional settings and go to the website
    chrome_options = ChromeOptions()
    
#     options.add_argument("--headless")
#     options.add_argument("--disable-notifications")
#     driver = webdriver.Chrome(executable_path=chromedriver, chrome_options=chrome_options, options=options)
    
    driver = webdriver.Chrome(executable_path=chromedriver, chrome_options=chrome_options)
    return driver
    

def login_google(driver):
    """In order to avoid reCAPTCHA's I need to first log in to my google account.
    This function does just that"""
    #Load .env file contents
    load_dotenv()
    google_url = os.environ.get('google_url')
    
    driver.get(google_url)
    time.sleep(1)
    
    WebDriverWait(driver, 5).until(EC.visibility_of_element_located((By.ID, "identifierId")))
    
    username_field = driver.find_element_by_id('identifierId')
    username_field.send_keys(os.environ.get('username'))
    username_field.send_keys(Keys.RETURN)
    time.sleep(1)
    password_field = driver.find_element_by_name('password')
    password_field.send_keys(os.environ.get('secret_g'))
    password_field.send_keys(Keys.RETURN)
    return driver


def solve_recaptcha():
    """Click reCAPTCHA checkbox. If the test with images comes up, I have no solution."""
    frame = driver.find_element_by_xpath('//iframe[contains(@src, "recaptcha")]')
    driver.switch_to.frame(frame)
    driver.find_element_by_xpath("//*[@id='recaptcha-anchor']").click()
    time.sleep(3)
    driver.switch_to.default_content()
    try:
        frame = driver.find_element_by_xpath('//iframe[contains(@src, "recaptcha")]')
        driver.switch_to.frame(frame)
        print('Human intervention needed')
        return 'Human intervention needed'
    except:
        print('ReCAPTHA solved successfully')
        return 'ReCAPTHA solved successfully'
    
def set_cookies():
    """This function sets the minimal cookie settings and takes us to the filters page"""
    
    
def open_funda(driver):
    """This function simply starts the chrome driver (from a defined location on the machine)."""
#     Load .env file contents
    load_dotenv()
    funda_url = os.environ.get('funda_url')
    
    time.sleep(2)
    driver.get(funda_url)

    try:
        current_selection = driver.find_element_by_class_name('is-active').text
        # Makes sure that "For Sale" category is selected
        if current_selection == 'For Sale':
            return driver
        else:
            driver.get(funda_url+'koop/')
            return driver
    except:
        if solve_recaptcha() == 'ReCAPTHA solved successfully':
            current_selection = driver.find_element_by_class_name('is-active').text
            # Makes sure that "For Sale" category is selected
            if current_selection == 'For Sale':
                return driver
            else:
                driver.get(funda_url+'koop/')
                return driver
        else:
            print('Could not solve reCAPTCHA :(')
            return driver

def log_in():
    """Uses the credentials defined in the .env file to log in and saves properties matching desired criteria to the favourites"""
    # This is to load environment variables
    load_dotenv()
    
#     acct_url = 'https://www.funda.nl/en/navigation/account/'
    login_url = os.environ.get('login_url')
    url = os.environ.get('funda_url')
    
    driver.get(login_url)
#     driver.find_element_by_id('appheader-inloggen-link').click()
    WebDriverWait(driver, 5).until(EC.visibility_of_element_located((By.ID, "Username")))
    
    username_field = driver.find_element_by_id('Username')
    username_field.send_keys(os.environ.get('username'))
    password_field = driver.find_element_by_id('Password')
    password_field.send_keys(os.environ.get('secret_f'))
    driver.find_element_by_xpath("//button[contains(text(), 'Log in')]").click()
    driver.get(url)
    
def log_out():
    """Logs out once the actions are done"""
    # This is to load environment variables
    load_dotenv()
    
    logout_url = os.environ.get('logout_url')
    driver.get(logout_url)
    
def apply_basic_filters(filter_dict={'location': 'Amsterdam', 'radius': '0', 'min_price': '0', 'max_price': 'ignore_filter'}):
    """Takes a dictionary of basic filters, applies them and searches for the properties"""
    # Now apply desired filters (e.g. set location to Amsterdam)
    WebDriverWait(driver, 5).until(EC.visibility_of_element_located((By.NAME, "filter_location")))
    filter_loc = driver.find_element_by_name('filter_location')
    filter_loc.send_keys(filter_dict['location'])

    # Wait for the first dropdown option to appear and select the first option
    WebDriverWait(driver, 5).until(EC.visibility_of_element_located((By.CSS_SELECTOR, ".autocomplete-list")))
    filter_loc.send_keys(Keys.ARROW_DOWN)
    filter_loc.send_keys(Keys.ENTER)

    filter_radius = Select(driver.find_element_by_id('Straal'))
    filter_radius.select_by_value(filter_dict['radius'])

    filter_price_from = Select(driver.find_element_by_name('filter_KoopprijsVan'))
    filter_price_from.select_by_value(filter_dict['min_price'])

    filter_price_upto = Select(driver.find_element_by_name('filter_KoopprijsTot'))
    filter_price_upto.select_by_value(filter_dict['max_price'])

    WebDriverWait(driver, 5).until(EC.visibility_of_element_located((By.XPATH, "//button[@class='button-primary-alternative']")))
    find_search = driver.find_element_by_xpath("//button[contains(text(), 'Search')]")
    find_search.click()
    
def get_id(html):
    """Given an html this function retrieves the property id from the html link.
    Returns the id"""
    return int(re.findall(r'-\d*-', html)[0].strip('-'))



# Get urls for all the pages and put them into a list
def get_url_list():
    """Records the current url and goes through the website, clicking Next as many times as there are pages.
    Returns a list of urls to be used in the get_htmls function."""
    #Creates a list of urls for all pages
    url_list = []
    
    #Reads the url of the page the driver is currently in and adds it into the list
    current_page_url = driver.current_url
    url_list.append(current_page_url)
    
    curr_page_num = 1
    count_exceptions = 0
    while True:
        try:
            curr_page_num += 1
            #Find "Next" button and click it
            find_next = driver.find_element_by_xpath("//a[@rel='next']")
            find_next.click()
            current_page_url = driver.current_url
            url_list.append(current_page_url)
            time.sleep(10)
        except:
            #Count the exceptions
            count_exceptions += 1
            curr_page_num -= 1 #since we are coming back to the same page
            #If the there haven't been 3 exceptions yet, sleep for a bit and then continue
            if count_exceptions < 3:
                time.sleep(10)
            else:
                #If "Next" button isn't there anymore or an error occurs, return the list
                #driver.close()
                print(f'Url list collection - complete. Last page number was {curr_page_num}.')
                return url_list    
    
    print(f'Url list collection - complete. Last page number was {curr_page_num}.')
    return url_list   


#Get all the html files for each property ad and put it into a list
def get_htmls():
    """Takes current html list.
    Returns updated html list with all the property ad htmls available on the page."""    
    #Find all property ad htmls
    html_list = []
    elems = driver.find_elements_by_xpath("//a[@href]")
    for elem in elems:
        html = elem.get_attribute("href")
        if (bool(re.search(r'appartement-\d+', html)) or bool(re.search('huis-\d+', html))) and html not in html_list: 
            html_list.append(html)
    return html_list


def get_feat_dict(html):
    """Takes a string with all highlighted features and puts them into a dictionary.
    Returns the dictionary"""
    
    # Open the html
    driver.get(html)

    # Click to see all the features available (if button doesn't appear, continue)
    try:
        WebDriverWait(driver, 7).until(EC.visibility_of_element_located((By.CSS_SELECTOR, ".object-kenmerken-open-button")))
        driver.find_element_by_class_name('object-kenmerken-open-button').click()
    except:
        pass
    
    # Retrieve all the other features now
    time.sleep(5)
    feature_elems = driver.find_elements_by_class_name('object-kenmerken-body')
    feature_string = ''
    for elem in feature_elems:
        if feature_string == '':
            feature_string = elem.text
        else:
            feature_string += '\n' + str(elem.text)

    categories = ['Auction', 'Transfer of ownership', 'Construction', 'Surface areas and volume', 'Areas', 'Layout', 'Energy', 'What does this mean?', 'Cadastral data', 'Exterior space', 'Storage space', 'Parking', 'VVE (Owners Association) checklist', 'Garage', 'Commercial property']
    special_categories = ['Cadastral data', 'Commercial property']
    to_delete = ['Cadastral map']
    features = feature_string.split('\n')
    feat_list = [feat for feat in features if feat not in to_delete]

    feat_dict = {}   
    current_index = 0
    current_category = ''

    for feat in feat_list: 
        feat = feat_list[current_index]
    #     print(feat, current_index)
        if feat in categories:
            feat_dict[feat] = ''
            current_category = feat
            current_index += 1        
        else:
            if current_category in special_categories:
                feat_dict[current_category] += str(feat) + '; '
                if current_index+2 < len(feat_list):
                    current_index += 1
                else:
                    break
            elif feat in feat_dict.keys():
                feat_dict[feat+'_'+current_category] = feat_list[current_index+1]
                if current_index+2 < len(feat_list):
                    current_index += 2
                else:
                    break
            else:
                feat_dict[feat] = feat_list[current_index+1]
                if current_index+2 < len(feat_list):
                    current_index += 2
                else:
                    break
    return feat_dict    
    

def scrape_data(ads_list=[], url_list_name='', url_index=0):
    """Reads in the data about each property from a given html files.
    Returns a list of dictionaries of all scraped ads (one dictionary per add)."""

    today_timestamp = str(dt.date.today().year) + str(dt.date.today().month) + str(dt.date.today().day)
    
    if ads_list != []:
        feat_dict_list = ads_list
    else:
        feat_dict_list = []
        
    new_ad_count = 0
    
    if url_list_name != '':
        with open(f'./Cellar/{url_list_name}.pkl', 'rb') as url_pickle:
            url_list = pickle.load(url_pickle)
    else:
        url_list = get_url_list()
        page_count = len(url_list)
        with open(f'./Cellar/url_list_{today_timestamp}.pkl', 'wb') as url_pickle:
            pickle.dump(url_list, url_pickle)

    if url_index != 0:
        original_url_list = url_list
        url_list = url_list[url_index:]
    
    for url in url_list:
        try:
            driver.get(url)
            html_list = get_htmls()

            for html in html_list:
                new_ad_count += 1
    #             print(html) USED FOR TESTING ONLY
                property_dict = {}

                # To open the property ad
                driver.get(html)

                # To scrape initial data points
                property_dict['property_link'] = html
                property_dict['property_id'] = get_id(html)
                property_dict['title'] = driver.find_element_by_class_name('object-header__title').text
                property_dict['address'] = driver.find_element_by_class_name('object-header__subtitle').text
                property_dict['price'] = driver.find_element_by_class_name('object-header__price').text
                property_dict['neighbourhood'] = driver.find_element_by_class_name('object-buurt__name').text
                property_dict['scraped_date'] = dt.date.today()

                other_features = get_feat_dict(html)
                for key, value in other_features.items():
                    if key not in property_dict.keys():
                        property_dict[key] = value
#                 print(property_dict) USED FOR TESTING ONLY
                if property_dict not in feat_dict_list:
                    feat_dict_list.append(property_dict)
#                 print(feat_dict_list) USED FOR TESTING ONLY
                last_url = url
        except:
            total_ad_count = len(feat_dict_list)
            if url_index != 0:
                print(last_url, f'This URL is number {original_url_list.index(last_url)} in the original url_list.')
            else:
                print(last_url, f'This URL is number {url_list.index(last_url)} in the supplied url_list.')
            print(f'Finished with an error. Number of ads scraped {new_ad_count}, total number of ads in the list is {total_ad_count}.')
            return feat_dict_list
    total_ad_count = len(feat_dict_list)
    print(f'Finished without errors. Number of ads scraped {new_ad_count}, total number of ads in the list is {total_ad_count}.')
    return feat_dict_list



def get_recent_ads(days='1', ads_list=[], url_list_name='', url_index=0):
    """Retrieves only property adverts posted passed in number of days from today (one being today).
    The only options for days are 1, 3, 5, 10 and 30.
    The default is today (1)."""
    apply_ad_filters({'filter_type': 'days', 'filter': days})
    return scrape_data(ads_list=ads_list, url_list_name=url_list_name, url_index=url_index)
     

def apply_ad_filters(filters_dict={'filter_type': 'days', 'filter': '10'}):
    """Applies specific filters like the size of the property, facilities, etc.
    Available filter types: days, status"""
    type_css_dict = {'days': 'PublicatieDatum-'}
    
    filters_button_css = ".search-content-header-button-filters.button-tertiary"
    days_on_funda_css = f"label[class='radio-group-item-label label-text'][for='{type_css_dict[filter]}{days}']"
    remove_filter_css = "button[class='mobile-filter-reset-button button-tertiary is-enhanced']"
    
    filters_button = driver.find_element_by_css_selector(filters_button_css)
    filter_dof = driver.find_element_by_css_selector(days_on_funda_css)
    remove_dof_filter = driver.find_element_by_css_selector(remove_filter_css)
    close_filters_pane = driver.find_element_by_css_selector('.button-tertiary.close-search-sidebar')    
    
    # Click on filters button
    try:
        time.sleep(3)
        filters_button.click()
        print('Filters button clicked successfully')
    except:
        print('No filters button available, trying to filter directly.')
        pass
        
    try:
        time.sleep(4)
        filter_dof.click()
        print('Days on funda set to "days" successfully')
    except:
        print('No "Days on funda" section available, trying to remove existing filter.')
        pass
        
    try:
        WebDriverWait(driver, 3).until(EC.visibility_of_element_located((By.CSS_SELECTOR, remove_filter_css)))
        remove_dof_filter.click()
        print('Original filter removed')
    except:
        print('Not able to remove filters.')
        pass

    try:
        WebDriverWait(driver, 3).until(EC.visibility_of_element_located((By.CSS_SELECTOR, days_on_funda_css)))
        next_section = driver.find_element_by_xpath("//legend[contains(text(), 'Number of rooms')]")
        ActionChains(driver).move_to_element(next_section).perform()
        filter_dof.click()
        print('New filter applied successfully')
        # This is to close the filters pane
        time.sleep(2)
        close_filters_pane.click()
        print('Filters pane closed successfully')
    except:
        print('Could not set the filter.')
    
    
def save_to_favourites(html):
    """Saves the given html of the property to favourites"""
    # Goes to the provided html (link)
    driver.get(html)
    
    # Find the "heart" icon and clicks it once
    driver.find_element_by_class_name('user-save-object').click()

SyntaxError: invalid syntax (<ipython-input-67-516ad860dd14>, line 64)

In [183]:
with open('./Cellar/ads_so_far_2020328.pkl', 'rb') as ads_list_pickle:
            current_ads_list = pickle.load(ads_list_pickle)

len(current_ads_list)

8306

In [175]:
# First start the web driver, log into google and open funda
# If reCAPTCHA appears, solve it manually
driver = get_webdriver()
login_google(driver)
open_funda(driver);

NoSuchElementException: Message: no such element: Unable to locate element: {"method":"xpath","selector":"//iframe[contains(@src, "recaptcha")]"}
  (Session info: chrome=80.0.3987.149)
  (Driver info: chromedriver=2.40.565386 (45a059dc425e08165f9a10324bd1380cc13ca363),platform=Mac OS X 10.13.6 x86_64)


In [160]:
# load_dotenv()
# funda_url = os.environ.get('funda_url')

# time.sleep(2)
# driver.get(funda_url)

In [161]:

# cookie_url = funda_url + '/cookiebeleid/?ReturnUrl=%2f'
# driver.get(cookie_url)

# current_tab = driver.find_element_by_class_name('active').text

# if current_tab == 'Standaard':
#     element = WebDriverWait(driver, 5).until(EC.visibility_of_element_located((By.XPATH, "//a[contains(text(), 'Aangepast')]")))
#     element.click()
#     find_save_cookies_button = WebDriverWait(driver, 5).until(EC.visibility_of_element_located((By.XPATH, "//div[@class='cookie-preference-aangepast]'//button[@type='submit')]")))
#     find_save_cookies_button.click()


In [176]:
# Now that funda is not angry at us anymore
log_in()
filters = {'location': 'Amsterdam', 'radius': '0', 'min_price': '0', 'max_price': 'ignore_filter'}
apply_basic_filters(filters)

In [179]:
all_data_df = scrape_data(ads_list=current_ads_list, url_list_name='url_list_2020328', url_index=172)

Finished without errors. Number of ads scraped 104, total number of ads in the list is 8306.


In [34]:
# days = '3'
# apply_ad_filters({'filter_type': 'days', 'filter': days})

In [None]:
# updated_data_df = get_recent_ads(days='10', ads_list=ads_list)

In [181]:
driver.close()

In [182]:
with open('./Cellar/ads_so_far_2020328.pkl', 'wb') as ads_so_far_pickle:
             pickle.dump(all_data_df, ads_so_far_pickle)

In [47]:
# with open('./Cellar/ads_so_far_2020223.pkl', 'wb') as ads_so_far_pickle:
#              pickle.dump(all_data_df, ads_so_far_pickle)

In [138]:
# with open('./Cellar/full_url_list.pkl', 'rb') as full_url_list:
#             full_url_list = pickle.load(full_url_list)

# full_url_list

In [6]:
# driver = start_webdriver()
# log_in()

In [54]:
last_html = scraped_data_list[-1]['property_link']
last_html

NameError: name 'scraped_data_list' is not defined

In [153]:
# with open('./Cellar/full_url_list.pkl', 'rb') as full_url_pickle:
#             url_list = pickle.load(full_url_pickle)
# len(url_list)

In [175]:
# THIS CELL STARTS UP THE WEB-DRIVER AND GOES TO THE WEBSITE

#Load .env file contents
# load_dotenv()

# chromedriver = os.environ.get('webdriver_loc')           #  path to the chromedriver executable
# os.environ["webdriver.chrome.driver"] = chromedriver
# url = os.environ.get('url')

# # Go to the website
# driver = webdriver.Chrome(chromedriver)
# driver.get(url)

In [8]:

# except:
#     driver.find_element_by_class_name('recaptcha-checkbox-border').click()
# except:
#     WebElement iFrame_checkbox = driver.findElement(By.xpath("xpath_of_reCaptcha_checkbox"));
#     iFrame_checkbox.click();



In [91]:
# url_list = get_url_list()
# with open('./Cellar/pages_url_list.pkl', 'wb') as url_pickle:
#     pickle.dump(url_list, url_pickle)

In [101]:
# Save the data into a pickle file
# with open('./Cellar/property_ad_data_list.pkl', 'wb') as scraped_pickle:
#     pickle.dump(scrape_data, scraped_pickle)

In [130]:
# with open('./Cellar/pages_url_list.pkl', 'rb') as url_pickle:
#     url_list = pickle.load(url_pickle)

# print(f'Imported {len(url_list)} URLs.')

In [150]:
# count = 0

# html_list = []
# url_list_limited = url_list[126:160]

# for url in url_list_limited:
#     driver.get(url)
#     elems = driver.find_elements_by_xpath("//a[@href]")
#     for elem in elems:
#         html = elem.get_attribute("href")
#         if html == last_html:
#             print(url)
        
        
        
#         if (bool(re.search(r'appartement-\d+', html)) or bool(re.search('huis-\d+', html))) and html not in html_list: 
#             html_list.append(html)

https://www.funda.nl/en/koop/amsterdam/p148/
https://www.funda.nl/en/koop/amsterdam/p148/
https://www.funda.nl/en/koop/amsterdam/p148/


In [21]:
# test_case = 'https://www.funda.nl/en/koop/amsterdam/huis-41401267-lijnbaansgracht-92/'

# # To open the property ad
# driver.get(test_case)

# title = driver.find_element_by_class_name('object-header__title').text
# address = driver.find_element_by_class_name('object-header__subtitle').text
# price = driver.find_element_by_class_name('object-header__price').text

# # Click to see all the features available
# WebDriverWait(driver, 7).until(EC.visibility_of_element_located((By.CSS_SELECTOR, ".object-kenmerken-open-button")))
# driver.find_element_by_class_name('object-kenmerken-open-button').click()

# # Retrieve all the other features now
# all_features = driver.find_element_by_class_name('object-kenmerken-body').text
# all_feat_dict = get_each_feature(all_features)

# neighbourhood = all_features = driver.find_element_by_class_name('object-buurt__name').text



In [176]:
test_case_html = 'https://www.funda.nl/en/koop/amsterdam/appartement-87991120-vleutenstraat-44/'

In [200]:
# # Open the html
# driver.get(test_case_html)

# # Click to see all the features available
# try:
#     WebDriverWait(driver, 7).until(EC.visibility_of_element_located((By.CSS_SELECTOR, ".object-kenmerken-open-button")))
#     driver.find_element_by_class_name('object-kenmerken-open-button').click()
# except:
#     pass
    
# # Retrieve all the other features now
# time.sleep(5)
# feature_elems = driver.find_elements_by_class_name('object-kenmerken-body')
# feature_string = ''
# for elem in feature_elems:
#     if feature_string == '':
#         feature_string = elem.text
#     else:
#         feature_string += '\n' + str(elem.text)

# categories = ['Auction', 'Transfer of ownership', 'Construction', 'Surface areas and volume', 'Areas', 'Layout', 'Energy', 'What does this mean?', 'Cadastral data', 'Exterior space', 'Storage space', 'Parking', 'VVE (Owners Association) checklist', 'Garage', 'Commercial property']
# special_categories = ['Cadastral data', 'Commercial property']
# to_delete = ['Cadastral map']
# features = feature_string.split('\n')
# feat_list = [feat for feat in features if feat not in to_delete]

# feat_dict = {}   
# current_index = 0

# for feat in feat_list: 
# #     print(feat)
#     feat = feat_list[current_index]
#     if feat in categories:
#         feat_dict[feat] = ''

#         current_category = feat
#         current_index += 1        
#     else:
#         if current_category in special_categories:
#             feat_dict[current_category] += str(feat) + '; '
#             if current_index+2 < len(feat_list):
#                 current_index += 1
#             else:
#                 break
#         elif feat in feat_dict.keys():
#             feat_dict[feat+'_'+current_category] = feat_list[current_index+1]
#             if current_index+2 < len(feat_list):
#                 current_index += 2
#             else:
#                 break
#         else:
#             feat_dict[feat] = feat_list[current_index+1]
#             if current_index+2 < len(feat_list):
#                 current_index += 2
#             else:
#                 break
                
# feat_dict['property_link'] = test_case_html
# feat_dict['property_id'] = get_id(test_case_html)
# feat_dict['title'] = driver.find_element_by_class_name('object-header__title').text
# feat_dict['address'] = driver.find_element_by_class_name('object-header__subtitle').text
# feat_dict['price'] = driver.find_element_by_class_name('object-header__price').text
# feat_dict['neighbourhood'] = driver.find_element_by_class_name('object-buurt__name').text
# # ads_list.append()
# feat_dict