In [2]:
!pip install selenium
!apt-get update
!apt install chromium-chromedriver
import sys
sys.path.insert(0,'/usr/lib/chromium-browser/chromedriver')

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting selenium
  Downloading selenium-4.3.0-py3-none-any.whl (981 kB)
[K     |████████████████████████████████| 981 kB 7.6 MB/s 
[?25hCollecting trio~=0.17
  Downloading trio-0.21.0-py3-none-any.whl (358 kB)
[K     |████████████████████████████████| 358 kB 66.6 MB/s 
[?25hCollecting trio-websocket~=0.9
  Downloading trio_websocket-0.9.2-py3-none-any.whl (16 kB)
Collecting urllib3[secure,socks]~=1.26
  Downloading urllib3-1.26.10-py2.py3-none-any.whl (139 kB)
[K     |████████████████████████████████| 139 kB 55.4 MB/s 
[?25hCollecting outcome
  Downloading outcome-1.2.0-py2.py3-none-any.whl (9.7 kB)
Collecting sniffio
  Downloading sniffio-1.2.0-py3-none-any.whl (10 kB)
Collecting async-generator>=1.9
  Downloading async_generator-1.10-py3-none-any.whl (18 kB)
Collecting wsproto>=0.14
  Downloading wsproto-1.1.0-py3-none-any.whl (24 kB)
Collecting pyOpenSSL>=0.14
  Downloading py

In [3]:
import math
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from selenium import webdriver
# TimeoutException lives in selenium.common.exceptions; support.ui only exposes Select and WebDriverWait.
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

In [4]:
# Configure a headless Chrome session suitable for a hosted notebook / container.
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')               # no display is available
chrome_options.add_argument('--no-sandbox')             # commonly required when running as root in a container
chrome_options.add_argument('--disable-dev-shm-usage')  # avoid relying on a small /dev/shm

# Let Selenium locate the `chromedriver` binary itself instead of passing the path
# positionally: the positional executable path is deprecated in Selenium 4 and is no longer
# accepted by later 4.x releases, while the default lookup works on both.
driver = webdriver.Chrome(options=chrome_options)

In [5]:
# Search-results page used as the starting point of the crawl
# (idealista "venta-viviendas" listing for Madrid).
url_target = 'https://www.idealista.com/venta-viviendas/madrid-madrid/'
# Load the page so the next cells can query its DOM through `driver`.
driver.get(url_target)

In [None]:
# Read the total number of listings from the last breadcrumb info element.
# NOTE(review): assumes that element's text holds the result count, possibly with
# thousands separators or surrounding words — confirm against the live page.
breadcrumb_info = driver.find_elements(By.CSS_SELECTOR,'span.breadcrumb-navigation-element-info')
if not breadcrumb_info:
  raise RuntimeError('no breadcrumb element with the listing count was found on the page')

# int() cannot convert a WebElement; use its visible text and keep only the digits.
count_digits = ''.join(char for char in breadcrumb_info[-1].text if char.isdigit())
if not count_digits:
  raise ValueError(f'no number found in breadcrumb text {breadcrumb_info[-1].text!r}')

total_properties = int(count_digits)
total_properties

In [None]:
# Each results page shows at most this many listings (author's observation of the site).
PROPERTIES_PER_PAGE = 30

# Round up: a partially filled last page is still a page. The result must be an int
# because it is later used as the argument of range().
total_pages = math.ceil(total_properties / PROPERTIES_PER_PAGE)
total_pages

In [None]:
def get_properties_url_list(driver:webdriver,url:str) -> list:
  """Collect the listing URLs from a paginated search-results page.

  Starting at `url`, gathers the `href` of every `a.item-link` element on the page,
  then follows the "Siguiente" (next) link, for at most `total_pages` pages
  (`total_pages` is a notebook-level variable defined in an earlier cell).

  Args:
    driver: a Selenium WebDriver instance used to load and query the pages.
    url: URL of the first results page.

  Returns:
    A list of listing URLs (strings), in the order they were found. If a page times
    out or has no next link, crawling stops and the URLs gathered so far are returned.
  """
  driver.get(url)

  url_list = []
  # math.ceil tolerates a fractional page count; range() needs an int.
  for _ in range(math.ceil(total_pages)):
    try:
      links = WebDriverWait(driver, 10).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'a.item-link')))
    except TimeoutException:
      # Without links there is nothing to collect on this page; report the page that
      # actually failed and stop instead of reusing stale or undefined results.
      print(f'failed to retrieve data from {driver.current_url}')
      break

    # WebElement exposes HTML attributes through get_attribute(), not as Python attributes.
    hrefs = (link.get_attribute('href') for link in links)
    url_list.extend(href for href in hrefs if href)

    # find_elements returns an empty list (instead of raising) when there is no next link,
    # which is the expected situation on the last page.
    next_buttons = driver.find_elements(By.LINK_TEXT, 'Siguiente')
    if not next_buttons:
      break
    next_url = next_buttons[0].get_attribute('href')
    if not next_url:
      break
    driver.get(next_url)

  return url_list

In [None]:
def _extract_property_features(utag_data:dict) -> dict:
  """Map the page's `utag_data` dictionary onto the flat feature record used in this notebook."""
  features = {
      'price':int(utag_data['ad_price']),
      'size':int(utag_data['ad_characteristics_constructedArea']),
      # roomNumber -> bedrooms, bathNumber -> bathrooms (these two were swapped before).
      'n_bedrooms':int(utag_data['ad_characteristics_roomNumber']),
      'n_bathrooms':int(utag_data['ad_characteristics_bathNumber']),
      'has_garage' : int(utag_data['ad_characteristics_hasParking']),
      'good_condition': int(utag_data['ad_condition_isGoodCondition']),
      'needs_renovating': int(utag_data['ad_condition_isNeedsRenovating']),
      'new_development': int(utag_data['ad_condition_isNewDevelopment']),
      # The remaining keys are optional and fall back to a default when absent.
      'energy_cert':utag_data.get('ad_energyCertification_type','not_available'),
      'has_lift': int(utag_data.get('ad_characteristics_hasLift',0)),
      'has_garden': int(utag_data.get('ad_characteristics_hasGarden',0)),
      'has_terrace':int(utag_data.get('ad_characteristics_hasTerrace',0)),
      'has_swimming_pool': int(utag_data.get('ad_characteristics_hasSwimmingPool',0))
  }

  # Author's heuristic: the lift key being present at all marks the listing as an apartment.
  features['type'] = 'apartment' if 'ad_characteristics_hasLift' in utag_data else 'house'
  return features


def get_property_data(driver:webdriver, url:str, csv_path:str='idealista_real_estate_data.csv') -> pd.DataFrame:
  """Scrape one listing page and append its features as a row to a CSV file.

  Loads `url`, reads the page's `window.utag_data` object, converts it to a feature
  record, flags the listing as luxury when a `div.info-features-tags` element contains
  the word "lujo", and appends the record to `csv_path`.

  Args:
    driver: a Selenium WebDriver instance used to load and query the page.
    url: URL of the listing page.
    csv_path: CSV file to append to. The header row is written only when the file
      does not exist yet or is empty.

  Returns:
    A one-row DataFrame with the scraped features.

  Raises:
    ValueError: if the page does not expose `window.utag_data` as an object, or a
      numeric field cannot be converted.
    KeyError: if a required key is missing from `utag_data`.
  """
  driver.get(url)
  # execute_script returns a JavaScript object as a Python dict, so fields are read by key.
  # Fields used: price, size, n_bedrooms, type(indirectly), floor, elevator, garden, terrace, energetic_class, garage, status
  utag_data = driver.execute_script('return window.utag_data')
  if not isinstance(utag_data, dict):
    raise ValueError(f'window.utag_data is not available on {url}')

  data = _extract_property_features(utag_data)

  # find_elements returns [] when the tags block is absent, so no exception handling is
  # needed; the text is compared case-insensitively.
  tag_blocks = driver.find_elements(By.CSS_SELECTOR,'div.info-features-tags')
  data['luxury'] = any('lujo' in block.text.lower() for block in tag_blocks)

  # Wrap the record in a list: a dict of scalars alone cannot be turned into a DataFrame.
  property_df = pd.DataFrame([data])
  needs_header = not os.path.exists(csv_path) or os.path.getsize(csv_path) == 0
  property_df.to_csv(csv_path, mode='a', header=needs_header, index=False)
  return property_df

In [None]:
# Gather every listing URL first, then scrape the listings one by one.
url_list = get_properties_url_list(driver,url_target)

# A single listing with a missing or malformed field should not abort the whole crawl:
# record it and move on so the remaining listings are still scraped.
failed_urls = []
for url in url_list:
  try:
    get_property_data(driver,url)
  except (KeyError, ValueError, TypeError) as error:
    failed_urls.append(url)
    print(f'skipping {url}: {error!r}')

print(f'scraped {len(url_list) - len(failed_urls)} of {len(url_list)} listings')