In [73]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options
import bs4
import requests
import time

In [188]:
# Paths to the browser driver binaries, relative to the notebook's working directory.
chrome_driver_path = "../bin/chromedriver"
phantom_driver_path = "../bin/phantomjs"
# Sample Fotocasa search-results URL (homes for sale, Madrid capital / Puerta del Angel).
url = "https://www.fotocasa.es/es/comprar/viviendas/madrid-capital/puerta-del-angel/l?latitude=40.4096&longitude=-3.7322"

# Disabled manual check, left commented out:
#browser = webdriver.Chrome(driver_path)
#browser.get(url)

In [202]:
def initialize_browser(driver_path, headless=False):
    """
    Create a Selenium Chrome WebDriver.

    Parameters:
        driver_path: String. Path to the driver executable, e.g. "../bin/chromedriver".
        headless: Boolean. When truthy the browser runs without a visible window.

    Returns:
        Selenium WebDriver object.
    """

    chrome_options = Options()
    # Any truthy/falsy value is collapsed into a strict True/False flag.
    chrome_options.headless = bool(headless)

    return webdriver.Chrome(driver_path, options=chrome_options)

In [190]:
def load_url(brw, url, secs=1, scrolls=30):
    """
    Load a URL, dismiss the cookie prompt if present and scroll down so the
    page loads its content.

    Parameters:
        brw: WebDriver object.
        url: String. URL to load.
        secs: Integer. Pause (in seconds) applied around each step so the page can settle. Default 1 second.
        scrolls: Integer. Number of scrolls (hitting the page down key). Default 30.

    Returns:
        WebDriver object. The same `brw` that was passed in, positioned on the loaded page.
    """

    brw.get(url)

    # Getting rid of the cookies prompt. This is best effort: the banner is not
    # always shown, so any lookup/click failure is ignored. `Exception` (rather
    # than a bare `except`) lets KeyboardInterrupt / SystemExit propagate.
    cookie_button_selector = "#App > div.re-SharedCmp > div > div > div > footer > div > button.sui-AtomButton.sui-AtomButton--primary.sui-AtomButton--solid.sui-AtomButton--center"
    try:
        time.sleep(secs)
        brw.find_element_by_css_selector(cookie_button_selector).click()
    except Exception:
        pass
    finally:
        time.sleep(secs)

    # Use the browser that was passed in, not a notebook-level global, so the
    # function works with any WebDriver instance and on a fresh kernel.
    body = brw.find_element_by_css_selector('body')

    for _ in range(scrolls):
        body.send_keys(Keys.PAGE_DOWN)

    time.sleep(secs)

    return brw

In [192]:
def retrieve_property_links(brw) -> list:
    """
    Given a loaded search-results page, retrieve all property links. Max 30 per page.

    Parameters:
        brw: WebDriver Object. Browser whose current page source is parsed.

    Returns:
        List. Absolute urls of the properties found in the page; empty when the
        page has no results section.
    """

    base_url = "https://www.fotocasa.es"

    # Parse the page of the browser that was passed in (not a notebook global)
    # and name the parser explicitly so results do not depend on which optional
    # parsers happen to be installed.
    soup = bs4.BeautifulSoup(brw.page_source, "html.parser")

    # `attrs` must be a dict mapping attribute name to value.
    search_grid = soup.find("section", attrs={"class": "re-Searchresult"})
    if search_grid is None:
        return []

    cards = search_grid.find_all("a", attrs={"class": "re-CardPackMinimal-info-container"})

    # Anchors without an href cannot be turned into a url, so they are skipped.
    return [base_url + card.get("href") for card in cards if card.get("href")]

In [193]:
def latitude_lomngitude(url: str):
    """
    Return clean latitude and longitude values.

    NOTE: not implemented yet. The body holds only this docstring, so calling
    the function currently returns None.
    NOTE(review): the function name looks like a typo of "latitude_longitude";
    confirm before anything starts calling it.

    Parameters:
        url: String. Fotocasa url.

    Returns:
        Intended: list with the latitude and the longitude. Whether the values
        should be numbers or strings is still undecided (TODO).
    """
    

In [194]:
def retrieve_pagination(brw) -> list:
    """
    Compute the result pages of a search and return their urls.

    Parameters:
        brw: WebDriver Object. Browser positioned on the first results page.

    Returns:
        List of urls, one per results page. When the page shows no usable
        pagination widget, the list contains only the browser's current url.
    """

    base_url = "https://www.fotocasa.es"

    # Parse the page of the browser that was passed in (not a notebook global)
    # with an explicitly named parser.
    soup = bs4.BeautifulSoup(brw.page_source, "html.parser")
    pages_raw = soup.find("ul", attrs={"class": "sui-MoleculePagination"})

    # No pagination widget at all: the search fits in the page already loaded.
    if pages_raw is None:
        return [brw.current_url]

    pages = pages_raw.find_all("li", attrs={"class": "sui-MoleculePagination-item"})

    # Only items whose label is a number are real pages; other items (arrows,
    # ellipsis, ...) are ignored instead of relying on their position.
    page_numbers = [
        int(page.text) for page in pages if page.text.strip().isdecimal()
    ]

    if not page_numbers or pages[0].a is None:
        return [brw.current_url]

    # Initialize list to contain all pages
    all_pages = [base_url + pages[0].a.get("href")]

    # Number of pages
    final_page_number = max(page_numbers)
    if final_page_number < 2 or len(pages) < 2 or pages[1].a is None:
        return all_pages

    # The link of page 2 is used as a template for every following page.
    next_pages = pages[1].a.get("href")

    # Add pages to list
    for page_number in range(2, final_page_number + 1):
        all_pages.append(base_url + next_pages.replace("/l/2?", f"/l/{page_number}?"))

    return all_pages

In [195]:
# Search-results urls to scrape, one entry per zone.
zones_urls = [
    "https://www.fotocasa.es/es/comprar/viviendas/madrid-capital/puerta-del-angel/l?latitude=40.4096&longitude=-3.7322"
]

In [203]:
browser = initialize_browser(driver_path=chrome_driver_path, headless=False)
t1 = time.time()

try:
    # 1- Loop each zone (Puerta del Angel, Bellas Vistas, etc.)
    for zone in zones_urls:

        # 1.1- Pagination
        browser = load_url(brw=browser, url=zone, secs=1)
        # TODO: make sure zones with no pagination or only one or two pages are handled.
        pagination_list = retrieve_pagination(browser)
        # Navigate to a blank page before reloading the result pages one by one.
        browser.get("data:,")

        # 1.2- Loop each page
        for page_number, page in enumerate(pagination_list, start=1):
            browser = load_url(brw=browser, url=page, secs=1)
            property_links = retrieve_property_links(browser)
            print(len(property_links))

            # 1.3 Loop each property
            for prop in property_links:
                print("Page", page_number, prop)
finally:
    # Always release the browser/driver process, even when scraping fails midway.
    browser.quit()

print("Fetching time:", round(time.time() - t1, 2), "seconds")

30
Page 1 https://www.fotocasa.es/es/comprar/vivienda/madrid-capital/trastero-ascensor-no-amueblado/159544360/d
Page 1 https://www.fotocasa.es/es/comprar/vivienda/madrid-capital/aire-acondicionado-amueblado/161607841/d
Page 1 https://www.fotocasa.es/es/comprar/vivienda/madrid-capital/aire-acondicionado-parking-terraza-trastero-ascensor-parking/155652404/d
Page 1 https://www.fotocasa.es/es/comprar/vivienda/madrid-capital/aire-acondicionado-calefaccion-parking-terraza-amueblado/161595316/d
Page 1 https://www.fotocasa.es/es/comprar/vivienda/madrid-capital/calefaccion-terraza-ascensor/161496165/d
Page 1 https://www.fotocasa.es/es/comprar/vivienda/madrid-capital/calefaccion-terraza/161416432/d
Page 1 https://www.fotocasa.es/es/comprar/vivienda/madrid-capital/calefaccion-terraza/161304039/d
Page 1 https://www.fotocasa.es/es/comprar/vivienda/madrid-capital/calefaccion/160322894/d
Page 1 https://www.fotocasa.es/es/comprar/vivienda/madrid-capital/aire-acondicionado-calefaccion-terraza/161258529

30
Page 4 https://www.fotocasa.es/es/comprar/vivienda/madrid-capital/puerta-del-angel/158870711/d
Page 4 https://www.fotocasa.es/es/comprar/vivienda/madrid-capital/puerta-del-angel/160122976/d
Page 4 https://www.fotocasa.es/es/comprar/vivienda/madrid-capital/puerta-del-angel/160123114/d
Page 4 https://www.fotocasa.es/es/comprar/vivienda/madrid-capital/aire-acondicionado-calefaccion-ascensor/161431517/d
Page 4 https://www.fotocasa.es/es/comprar/vivienda/madrid-capital/aire-acondicionado-calefaccion-terraza-ascensor-parking/161425898/d
Page 4 https://www.fotocasa.es/es/comprar/vivienda/madrid-capital/aire-acondicionado-calefaccion-terraza-trastero-ascensor-no-amueblado/161418057/d
Page 4 https://www.fotocasa.es/es/comprar/vivienda/madrid-capital/aire-acondicionado-calefaccion/161416679/d
Page 4 https://www.fotocasa.es/es/comprar/vivienda/madrid-capital/aire-acondicionado-calefaccion-amueblado/161350179/d
Page 4 https://www.fotocasa.es/es/comprar/vivienda/madrid-capital/terraza-amueblado/

In [None]:
def property_features(url):
    """
    Given a single property url, return its features, i.e.: price, etc.

    NOTE: not implemented yet. The body holds only this docstring, so calling
    the function currently returns None.

    Parameters:
        url: Url of a single property page (presumably a String, as in the
            other functions of this notebook -- TODO confirm).

    Returns:
        Not defined yet (TODO).
    """

In [76]:
# Download and parse a single property detail page.
url = "https://www.fotocasa.es/es/comprar/vivienda/madrid-capital/trastero-ascensor-no-amueblado/159544360/d"
# A timeout keeps the cell from hanging forever on a stalled connection.
r = requests.get(url, timeout=30)
# Fail here on an HTTP error instead of silently parsing an error page.
r.raise_for_status()
# Name the parser explicitly so the result does not depend on which optional
# parsers are installed.
soup = bs4.BeautifulSoup(r.content, "html.parser")

In [187]:
# Price shown in the header of the detail page.
# `attrs` must be a dict mapping attribute name to value.
price = soup.find("span", attrs={"class": "re-DetailHeader-price"}).text

# Header features, such as bedroom, bathroom, square footage
header_features_list = soup.find_all("ul", attrs={"class": "re-DetailHeader-features"})[0]
header_features_raw = header_features_list.find_all("li", attrs={"class": "re-DetailHeader-featuresItem"})
# The text of each feature is taken from the second-to-last <span> of its item.
header_features = [
    [f"header_feat_{position}", feature.find_all("span")[-2].text]
    for position, feature in enumerate(header_features_raw, start=1)
]

print(header_features)

# Below features such as lift, age, type, etc.
below_features_raw = soup.find_all("div", attrs={"class": "re-DetailFeaturesList-featureContent"})
# One list per feature holding the text of each direct child of its container.
below_features = [[item.text for item in below] for below in below_features_raw]

print(below_features)

[['header_feat_1', '3 habs.'], ['header_feat_2', '2 baños'], ['header_feat_3', '164 m²']]
[['Tipo de inmueble', 'Piso'], ['Antigüedad', '50 a 70 años'], ['Ascensor', 'Sí'], ['Amueblado', 'No'], ['Consumo energía', '', 'G999 kW h m² / año'], ['Emisiones', '', 'G999 kg CO₂ m² / año']]


In [201]:
# TODO: Test the feature extraction thoroughly, tidy it up and check what happens when a listing has different features.
# TODO: Work out how to run this in headless mode.