In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
import time
import pandas as pd

def scrape_booking_madrid(checkin, checkout):
    """Scrape the Booking.com search results for Madrid and save them to CSV.

    Opens a Chrome session, loads the search-results page for the given
    dates, reads the name and price of every property card found and writes
    them to ``hoteles_madrid_2024.csv`` in the current working directory.

    Args:
        checkin: Check-in date string interpolated into the search URL
            (the example call uses ``YYYY-MM-DD``).
        checkout: Check-out date string, same format.

    Returns:
        None. The result is the CSV file plus a confirmation message.
    """

    def _text_or_na(card, selector):
        # Best-effort read of one field; a missing element yields "N/A".
        # ``Exception`` (not a bare except) so Ctrl-C still interrupts the run.
        try:
            return card.find_element(By.CSS_SELECTOR, selector).text
        except Exception:
            return "N/A"

    # Start the browser
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
    hotels = []

    # try/finally guarantees the browser is closed even if scraping fails.
    try:
        driver.maximize_window()

        # Search page with the dates embedded in the query string
        url = f"https://www.booking.com/searchresults.html?ss=Madrid&checkin_year_month_monthday={checkin}&checkout_year_month_monthday={checkout}"
        driver.get(url)

        time.sleep(5)  # fixed pause to let the page load

        # Collect the main result cards
        cards = driver.find_elements(By.CSS_SELECTOR, "div[data-testid='property-card']")

        for card in cards:
            hotels.append({
                "hotel": _text_or_na(card, "div[data-testid='title']"),
                "price": _text_or_na(card, "span[data-testid='price-and-discounted-price']"),
            })
    finally:
        driver.quit()

    # Save to CSV
    df = pd.DataFrame(hotels)
    df.to_csv("hoteles_madrid_2024.csv", index=False, encoding="utf-8")
    print("Datos guardados en hoteles_madrid_2024.csv")

# Ejemplo de uso:
# scrape_booking_madrid("2024-09-01", "2024-09-03")


In [2]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
import time
import pandas as pd
import os

def scrape_booking_madrid(checkin, checkout):
    """Scrape Booking.com results for Madrid and save them to a per-date CSV.

    Loads the search page for the given dates, waits a fixed 7 seconds,
    collects name and price from every property card and writes
    ``hoteles_madrid_<checkin>_to_<checkout>.csv`` in the current working
    directory. The CSV is written even when no cards are detected.

    Args:
        checkin: Check-in date string placed in the ``checkin`` URL parameter.
        checkout: Check-out date string placed in the ``checkout`` parameter.

    Returns:
        None. Progress and the output path are printed.
    """

    def _text_or_na(card, selector):
        # Best-effort read of one field; a missing element yields "N/A".
        # ``Exception`` (not a bare except) so Ctrl-C still interrupts the run.
        try:
            return card.find_element(By.CSS_SELECTOR, selector).text.strip()
        except Exception:
            return "N/A"

    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
    hotels = []

    # try/finally guarantees the browser is closed even if scraping fails.
    try:
        driver.maximize_window()

        url = f"https://www.booking.com/searchresults.html?ss=Madrid&checkin={checkin}&checkout={checkout}"
        driver.get(url)

        time.sleep(7)  # fixed pause to let the page load

        cards = driver.find_elements(By.CSS_SELECTOR, "div[data-testid='property-card']")
        print("Hoteles detectados:", len(cards))

        for card in cards:
            hotels.append({
                "checkin": checkin,
                "checkout": checkout,
                "hotel": _text_or_na(card, "div[data-testid='title']"),
                "price": _text_or_na(card, "span[data-testid='price-and-discounted-price']"),
            })
    finally:
        driver.quit()

    df = pd.DataFrame(hotels)

    filename = f"hoteles_madrid_{checkin}_to_{checkout}.csv"
    filepath = os.path.join(os.getcwd(), filename)

    df.to_csv(filepath, index=False, encoding="utf-8")
    print("✅ CSV guardado en:", filepath)

# Run the scraper defined in this cell for a sample two-night stay.
scrape_booking_madrid("2024-07-01", "2024-07-03")


Hoteles detectados: 0
✅ CSV guardado en: C:\Users\46mjn\ATD\hoteles_madrid_2024-07-01_to_2024-07-03.csv


In [3]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
import os
import time

def scrape_booking_madrid(checkin, checkout):
    """Scrape Booking.com results for Madrid using explicit waits.

    Steps: open the search page, accept the cookie banner if it shows up,
    wait for at least one property card, scroll to the bottom four times to
    trigger lazy loading, then write name and price of every card to
    ``hoteles_madrid_<checkin>_to_<checkout>.csv`` in the working directory.

    If no property card appears within the wait timeout, a screenshot is
    saved to ``booking_debug.png`` and the function returns without writing
    a CSV.

    Args:
        checkin: Check-in date string placed in the ``checkin`` URL parameter.
        checkout: Check-out date string placed in the ``checkout`` parameter.

    Returns:
        None.
    """

    def _text_or_na(card, selector):
        # Best-effort read of one field; a missing element yields "N/A".
        # ``Exception`` (not a bare except) so Ctrl-C still interrupts the run.
        try:
            return card.find_element(By.CSS_SELECTOR, selector).text.strip()
        except Exception:
            return "N/A"

    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
    hotels = []

    # try/finally closes the browser on every exit path (success, early
    # return or unexpected error), replacing the scattered quit() calls.
    try:
        driver.maximize_window()

        url = f"https://www.booking.com/searchresults.html?ss=Madrid&checkin={checkin}&checkout={checkout}"
        driver.get(url)

        wait = WebDriverWait(driver, 20)

        # 1) Try to accept cookies if the button shows up
        try:
            cookies_btn = wait.until(EC.element_to_be_clickable((By.ID, "onetrust-accept-btn-handler")))
            cookies_btn.click()
            time.sleep(2)
        except Exception:
            print("No apareció botón de cookies (o ya estaba aceptado).")

        # 2) Wait for the results container to load
        try:
            wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "div[data-testid='property-card']")))
        except Exception:
            print("⚠️ No se detectaron property-cards. Puede haber captcha o bloqueo.")
            driver.save_screenshot("booking_debug.png")
            print("📸 Captura guardada: booking_debug.png")
            return

        # 3) Scroll to load more hotels
        for _ in range(4):
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(2)

        # 4) Extract hotels
        cards = driver.find_elements(By.CSS_SELECTOR, "div[data-testid='property-card']")
        print("Hoteles detectados:", len(cards))

        for card in cards:
            hotels.append({
                "checkin": checkin,
                "checkout": checkout,
                "hotel": _text_or_na(card, "div[data-testid='title']"),
                "price": _text_or_na(card, "span[data-testid='price-and-discounted-price']"),
            })
    finally:
        driver.quit()

    df = pd.DataFrame(hotels)

    filename = f"hoteles_madrid_{checkin}_to_{checkout}.csv"
    filepath = os.path.join(os.getcwd(), filename)

    df.to_csv(filepath, index=False, encoding="utf-8")
    print("✅ CSV guardado en:", filepath)

# Run the explicit-wait version of the scraper for the same sample dates.
scrape_booking_madrid("2024-07-01", "2024-07-03")


⚠️ No se detectaron property-cards. Puede haber captcha o bloqueo.
📸 Captura guardada: booking_debug.png


In [4]:
import undetected_chromedriver as uc
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
import time
import os

def scrape_booking_madrid(checkin, checkout):
    """Scrape Booking.com results for Madrid through undetected_chromedriver.

    Same flow as the plain-Selenium version: load the search page, accept
    cookies if offered, wait for property cards, scroll four times and write
    name and price of every card to
    ``hoteles_madrid_<checkin>_to_<checkout>.csv``.

    If no property card appears within the wait timeout, a screenshot is
    saved to ``booking_debug.png`` and the function returns without writing
    a CSV.

    Args:
        checkin: Check-in date string placed in the ``checkin`` URL parameter.
        checkout: Check-out date string placed in the ``checkout`` parameter.

    Returns:
        None.
    """

    def _text_or_na(card, selector):
        # Best-effort read of one field; a missing element yields "N/A".
        # ``Exception`` (not a bare except) so Ctrl-C still interrupts the run.
        try:
            return card.find_element(By.CSS_SELECTOR, selector).text.strip()
        except Exception:
            return "N/A"

    options = uc.ChromeOptions()
    options.add_argument("--start-maximized")

    driver = uc.Chrome(options=options)
    hotels = []

    # try/finally closes the browser on every exit path.
    try:
        wait = WebDriverWait(driver, 25)

        url = f"https://www.booking.com/searchresults.html?ss=Madrid&checkin={checkin}&checkout={checkout}"
        driver.get(url)

        time.sleep(6)

        # Try to accept cookies; absence of the banner is not an error
        try:
            btn = wait.until(EC.element_to_be_clickable((By.ID, "onetrust-accept-btn-handler")))
            btn.click()
            time.sleep(2)
        except Exception:
            pass

        # Wait for results
        try:
            wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "div[data-testid='property-card']")))
        except Exception:
            driver.save_screenshot("booking_debug.png")
            print("Bloqueo/captcha. Captura guardada: booking_debug.png")
            return

        # Scroll to trigger lazy loading
        for _ in range(4):
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(2)

        cards = driver.find_elements(By.CSS_SELECTOR, "div[data-testid='property-card']")
        print("Hoteles detectados:", len(cards))

        for card in cards:
            hotels.append({
                "checkin": checkin,
                "checkout": checkout,
                "hotel": _text_or_na(card, "div[data-testid='title']"),
                "price": _text_or_na(card, "span[data-testid='price-and-discounted-price']"),
            })
    finally:
        driver.quit()

    df = pd.DataFrame(hotels)
    file = f"hoteles_madrid_{checkin}_to_{checkout}.csv"
    df.to_csv(file, index=False, encoding="utf-8")
    print("CSV guardado:", os.path.abspath(file))

# Run the undetected_chromedriver version for the same sample dates.
scrape_booking_madrid("2024-07-01", "2024-07-03")


ModuleNotFoundError: No module named 'undetected_chromedriver'

In [5]:
import time
import os
import pandas as pd

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager


def scrape_airbnb_madrid(checkin, checkout, scrolls=6):
    """Scrape Airbnb search results for Madrid and save them to CSV.

    Loads the airbnb.es search page for two adults and the given dates,
    accepts cookies if offered, waits for listing cards, scrolls to the
    bottom ``scrolls`` times and writes one row per card to
    ``airbnb_madrid_<checkin>_to_<checkout>.csv`` in the working directory.

    If no card appears within the wait timeout, a screenshot is saved to
    ``airbnb_debug.png`` and the function returns without writing a CSV.

    Args:
        checkin: Check-in date string placed in the ``checkin`` URL parameter.
        checkout: Check-out date string placed in the ``checkout`` parameter.
        scrolls: Number of scroll-to-bottom passes (3 s pause after each).

    Returns:
        None.
    """
    options = webdriver.ChromeOptions()
    options.add_argument("--start-maximized")
    options.add_argument("--disable-blink-features=AutomationControlled")

    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
    results = []

    # try/finally closes the browser on every exit path.
    try:
        wait = WebDriverWait(driver, 25)

        # Search URL
        url = (
            "https://www.airbnb.es/s/Madrid--Espa%C3%B1a/homes"
            f"?checkin={checkin}&checkout={checkout}"
            "&adults=2"
        )

        driver.get(url)
        time.sleep(6)

        # Accept cookies if the banner shows up; its absence is not an error
        try:
            btn_cookies = wait.until(
                EC.element_to_be_clickable((By.XPATH, "//button[contains(.,'Aceptar')]"))
            )
            btn_cookies.click()
            time.sleep(2)
        except Exception:
            pass

        # Wait for cards (if none appear the request may have been blocked)
        try:
            wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "div[data-testid='card-container']")))
        except Exception:
            driver.save_screenshot("airbnb_debug.png")
            print("⚠️ No se detectaron resultados. Captura guardada: airbnb_debug.png")
            return

        # Scroll to load more
        for _ in range(scrolls):
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(3)

        cards = driver.find_elements(By.CSS_SELECTOR, "div[data-testid='card-container']")
        print("Cards detectadas:", len(cards))

        for card in cards:
            # Each field is best-effort: a failure on one field must not drop
            # the card. ``Exception`` keeps Ctrl-C working, unlike bare except.

            # Name: first line of the card's visible text
            try:
                title = card.text.split("\n")[0].strip()
            except Exception:
                title = "N/A"

            # Price: first descendant whose own text contains the euro sign
            try:
                price = card.find_element(By.XPATH, ".//*[contains(text(),'€')]").text.strip()
            except Exception:
                price = "N/A"

            # Link: href of the first anchor in the card
            try:
                link = card.find_element(By.CSS_SELECTOR, "a").get_attribute("href")
            except Exception:
                link = "N/A"

            results.append({
                "checkin": checkin,
                "checkout": checkout,
                "title": title,
                "price_raw": price,
                "link": link
            })
    finally:
        driver.quit()

    df = pd.DataFrame(results)

    filename = f"airbnb_madrid_{checkin}_to_{checkout}.csv"
    filepath = os.path.join(os.getcwd(), filename)
    df.to_csv(filepath, index=False, encoding="utf-8")
    print("✅ CSV guardado en:", filepath)


# Example run (July 2024), scrolling 8 times instead of the default 6.
scrape_airbnb_madrid("2024-07-01", "2024-07-03", scrolls=8)


Cards detectadas: 18
✅ CSV guardado en: C:\Users\46mjn\ATD\airbnb_madrid_2024-07-01_to_2024-07-03.csv


In [6]:
import time
import os
import pandas as pd

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager

def scrape_airbnb_prices(checkin, checkout, scrolls=6):
    """Scrape Airbnb listing prices for Madrid and save them to CSV.

    Loads the airbnb.es search page for two adults and the given dates,
    accepts cookies if offered, waits for listing cards, scrolls to the
    bottom ``scrolls`` times and writes one row per card (title, raw price
    text, link) to ``airbnb_madrid_prices_<checkin>_to_<checkout>.csv`` in
    the working directory.

    If no card appears within the wait timeout, a screenshot is saved to
    ``airbnb_debug.png`` and the function returns without writing a CSV.

    Args:
        checkin: Check-in date string placed in the ``checkin`` URL parameter.
        checkout: Check-out date string placed in the ``checkout`` parameter.
        scrolls: Number of scroll-to-bottom passes (2 s pause after each).

    Returns:
        None.
    """
    options = webdriver.ChromeOptions()
    # Hide the automation switch (does not always work, but helps)
    options.add_experimental_option("excludeSwitches", ["enable-automation"])
    options.add_argument("--start-maximized")

    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
    data = []

    # try/finally closes the browser on every exit path.
    try:
        wait = WebDriverWait(driver, 20)

        # Airbnb search URL for Madrid. The "ñ" of "España" is percent-encoded
        # so the URL is pure ASCII and independent of the file's encoding.
        url = (
            "https://www.airbnb.es/s/Madrid--Espa%C3%B1a/homes?"
            f"checkin={checkin}&checkout={checkout}&adults=2"
        )
        driver.get(url)

        # Accept cookies if the banner shows up; its absence is not an error
        try:
            btn = wait.until(EC.element_to_be_clickable((By.XPATH, "//button[contains(., 'Aceptar')]")))
            btn.click()
            time.sleep(2)
        except Exception:
            pass

        # Wait for results to appear
        try:
            wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "div[data-testid='card-container']")))
        except Exception:
            driver.save_screenshot("airbnb_debug.png")
            print("⚠️ No se detectaron resultados o hay bloqueo. Captura en airbnb_debug.png")
            return

        # Scroll to load more results
        for _ in range(scrolls):
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(2)

        cards = driver.find_elements(By.CSS_SELECTOR, "div[data-testid='card-container']")
        print("Total de tarjetas encontradas:", len(cards))

        for card in cards:
            # Each field is best-effort: a failure on one field must not drop
            # the card. ``Exception`` keeps Ctrl-C working, unlike bare except.
            try:
                title = card.find_element(By.CSS_SELECTOR, "div[role='group']").text.split("\n")[0]
            except Exception:
                title = "N/A"

            # Price: text of the first <span> in the card that contains "€"
            price_text = "N/A"
            try:
                for span in card.find_elements(By.TAG_NAME, "span"):
                    text = span.text
                    if "€" in text:
                        price_text = text
                        break
            except Exception:
                price_text = "N/A"

            try:
                link = card.find_element(By.TAG_NAME, "a").get_attribute("href")
            except Exception:
                link = "N/A"

            data.append({
                "checkin": checkin,
                "checkout": checkout,
                "title": title,
                "price": price_text,
                "link": link
            })
    finally:
        driver.quit()

    df = pd.DataFrame(data)
    filename = f"airbnb_madrid_prices_{checkin}_to_{checkout}.csv"
    filepath = os.path.join(os.getcwd(), filename)
    df.to_csv(filepath, index=False, encoding="utf-8")

    print("✅ CSV guardado en:", filepath)


# Example run with 2024 dates, using the default number of scrolls.
scrape_airbnb_prices("2024-07-01", "2024-07-03")


Total de tarjetas encontradas: 18
✅ CSV guardado en: C:\Users\46mjn\ATD\airbnb_madrid_prices_2024-07-01_to_2024-07-03.csv


In [8]:
import pandas as pd

# Load the scraped prices. The path is relative, so the CSV must be in the
# kernel's current working directory.
df = pd.read_csv('airbnb_madrid_prices_2024-07-01_to_2024-07-03.csv')

# Parse the price column to extract only the final (total) price
def extract_final_price(price_text):
    """Return the total stay price found in a raw Airbnb price string.

    Looks for the first "<amount> € en total" fragment and returns the
    amount as an integer. Amounts written with "." as thousands separator
    (e.g. "12.345 €") are read as one number instead of being cut down to
    the digits after the last dot.

    Args:
        price_text: Raw price cell. Any value is accepted because it is
            converted with ``str()`` first, so NaN simply yields None.

    Returns:
        int or None: The amount, or None when no "... € en total" fragment
        is present.
    """
    import re

    # Either dot-grouped thousands ("12.345") or a plain run of digits.
    match = re.search(r'(\d{1,3}(?:\.\d{3})+|\d+)\s*€\s*en total', str(price_text))
    if match is None:
        return None
    return int(match.group(1).replace('.', ''))

# Parse every raw price string; rows with no "en total" amount get None from
# the parser.
df['final_price'] = df['price'].apply(extract_final_price)

# The original (pre-discount) price can also be extracted when present
def extract_original_price(price_text):
    """Return the amount that opens a raw Airbnb price string.

    Matches "<amount> €" only at the very start of the text. No check for
    "en total" is made, so when the text starts directly with the total
    (no crossed-out price before it) that total is what gets returned.
    Amounts written with "." as thousands separator (e.g. "12.345 €") are
    read as one number.

    Args:
        price_text: Raw price cell. Any value is accepted because it is
            converted with ``str()`` first, so NaN simply yields None.

    Returns:
        int or None: The leading amount, or None when the text does not
        start with "<amount> €".
    """
    import re

    # re.match anchors at the start of the string, like the "^" it replaces.
    match = re.match(r'(\d{1,3}(?:\.\d{3})+|\d+)\s*€', str(price_text))
    if match is None:
        return None
    return int(match.group(1).replace('.', ''))

# Parse the leading amount of every raw price string; rows that do not start
# with an amount get None from the parser.
df['original_price'] = df['price'].apply(extract_original_price)

In [9]:
# Show the first rows with the new columns
print(df[['title', 'original_price', 'final_price']].head())

# Basic statistics of the final price
print("\nEstadísticas del precio final:")
print(f"Media: {df['final_price'].mean():.2f} €")
print(f"Mínimo: {df['final_price'].min()} €")
print(f"Máximo: {df['final_price'].max()} €")
print(f"Mediana: {df['final_price'].median()} €")

# How many rows have an original price vs. a final price
print(f"\nTotal registros: {len(df)}")
print(f"Con precio final: {df['final_price'].notna().sum()}")
print(f"Con precio original: {df['original_price'].notna().sum()}")

                       title  original_price  final_price
0                        NaN             159          159
1                        NaN             497          422
2  Recomendación del viajero             358          358
3                        NaN             457          402
4             Superanfitrión             516          334

Estadísticas del precio final:
Media: 351.56 €
Mínimo: 159 €
Máximo: 564 €
Mediana: 352.0 €

Total registros: 18
Con precio final: 18
Con precio original: 18


In [10]:
import time
import os
import pandas as pd
from datetime import date, timedelta

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager


def scrape_airbnb_prices(checkin, checkout, scrolls=6):
    """Scrape Airbnb listing prices for Madrid and return them as rows.

    Loads the airbnb.es search page for two adults and the given dates,
    accepts cookies if offered, waits for listing cards, scrolls to the
    bottom ``scrolls`` times and collects title, raw price text and link
    from every card.

    If no card appears within the wait timeout, a screenshot is saved to
    ``airbnb_debug_<checkin>.png`` and an empty list is returned.

    Args:
        checkin: Check-in date string placed in the ``checkin`` URL parameter.
        checkout: Check-out date string placed in the ``checkout`` parameter.
        scrolls: Number of scroll-to-bottom passes (2 s pause after each).

    Returns:
        list[dict]: One dict per card with the keys ``checkin``,
        ``checkout``, ``title``, ``price`` and ``link``; empty when no
        results were detected.
    """
    options = webdriver.ChromeOptions()
    options.add_experimental_option("excludeSwitches", ["enable-automation"])
    options.add_argument("--start-maximized")

    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
    data = []

    # try/finally closes the browser on every exit path. The caller runs this
    # once per weekend, so a leaked Chrome per failure would add up quickly.
    try:
        wait = WebDriverWait(driver, 20)

        # The "ñ" of "España" is percent-encoded so the URL is pure ASCII and
        # independent of the file's encoding.
        url = f"https://www.airbnb.es/s/Madrid--Espa%C3%B1a/homes?checkin={checkin}&checkout={checkout}&adults=2"
        driver.get(url)

        # Accept cookies if the banner shows up; its absence is not an error
        try:
            btn = wait.until(EC.element_to_be_clickable((By.XPATH, "//button[contains(., 'Aceptar')]")))
            btn.click()
            time.sleep(2)
        except Exception:
            pass

        # Wait for results to appear
        try:
            wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "div[data-testid='card-container']")))
        except Exception:
            driver.save_screenshot(f"airbnb_debug_{checkin}.png")
            print(f"⚠️ No se detectaron resultados para {checkin}-{checkout}. Captura guardada.")
            return []

        # Scroll to load more results
        for _ in range(scrolls):
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(2)

        cards = driver.find_elements(By.CSS_SELECTOR, "div[data-testid='card-container']")
        print(f"Total de tarjetas encontradas para {checkin}-{checkout}:", len(cards))

        for card in cards:
            # Each field is best-effort: a failure on one field must not drop
            # the card. ``Exception`` keeps Ctrl-C working, unlike bare except.
            try:
                title = card.find_element(By.CSS_SELECTOR, "div[role='group']").text.split("\n")[0]
            except Exception:
                title = "N/A"

            # Price: text of the first <span> in the card that contains "€"
            price_text = "N/A"
            try:
                for span in card.find_elements(By.TAG_NAME, "span"):
                    text = span.text
                    if "€" in text:
                        price_text = text
                        break
            except Exception:
                price_text = "N/A"

            try:
                link = card.find_element(By.TAG_NAME, "a").get_attribute("href")
            except Exception:
                link = "N/A"

            data.append({
                "checkin": checkin,
                "checkout": checkout,
                "title": title,
                "price": price_text,
                "link": link
            })
    finally:
        driver.quit()

    return data


# ==============================================
# BUILD THE 2024 WEEKEND DATES (Friday -> Sunday)
# ==============================================
start_date = date(2024, 1, 5)  # first Friday of 2024
end_date = date(2024, 12, 31)
delta = timedelta(days=7)  # step from one Friday to the next

all_data = []

current = start_date
try:
    while current <= end_date:
        checkin = current.strftime("%Y-%m-%d")
        checkout = (current + timedelta(days=2)).strftime("%Y-%m-%d")  # Friday -> Sunday

        print(f"\n⏳ Extrayendo datos para {checkin} → {checkout}")
        week_data = scrape_airbnb_prices(checkin, checkout, scrolls=6)
        all_data.extend(week_data)

        current += delta
finally:
    # Save in ``finally`` so that an error or a manual interrupt part-way
    # through the run still writes the weekends scraped so far. The original
    # exception, if any, is re-raised after the file is written.
    if all_data:
        df = pd.DataFrame(all_data)
        filename = "airbnb_madrid_prices_2024.csv"
        filepath = os.path.join(os.getcwd(), filename)
        df.to_csv(filepath, index=False, encoding="utf-8")
        print("\n✅ CSV final guardado con todo 2024 en:", filepath)
    else:
        print("⚠️ No se extrajeron datos para ningún fin de semana de 2024.")



⏳ Extrayendo datos para 2024-01-05 → 2024-01-07
Total de tarjetas encontradas para 2024-01-05-2024-01-07: 18

⏳ Extrayendo datos para 2024-01-12 → 2024-01-14
Total de tarjetas encontradas para 2024-01-12-2024-01-14: 18

⏳ Extrayendo datos para 2024-01-19 → 2024-01-21
Total de tarjetas encontradas para 2024-01-19-2024-01-21: 18

⏳ Extrayendo datos para 2024-01-26 → 2024-01-28
Total de tarjetas encontradas para 2024-01-26-2024-01-28: 18

⏳ Extrayendo datos para 2024-02-02 → 2024-02-04
Total de tarjetas encontradas para 2024-02-02-2024-02-04: 18

⏳ Extrayendo datos para 2024-02-09 → 2024-02-11
Total de tarjetas encontradas para 2024-02-09-2024-02-11: 18

⏳ Extrayendo datos para 2024-02-16 → 2024-02-18
Total de tarjetas encontradas para 2024-02-16-2024-02-18: 18

⏳ Extrayendo datos para 2024-02-23 → 2024-02-25
Total de tarjetas encontradas para 2024-02-23-2024-02-25: 18

⏳ Extrayendo datos para 2024-03-01 → 2024-03-03
Total de tarjetas encontradas para 