### Scrapping

In [4]:
from concurrent.futures import ThreadPoolExecutor  
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.firefox.options import Options
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import os
import pandas as pd
import requests
from bs4 import BeautifulSoup
from datetime import datetime
import re
#  Function to Start Firefox Browser
def start_browser(link, dfolder, geko_path, window_size='1500,1080'):
    os.makedirs(dfolder, exist_ok=True)
    options = Options()
    options.add_argument('--start-maximized')
    options.set_preference('privacy.trackingprotection.enabled', True)
    service = Service(geko_path)
    browser = webdriver.Firefox(service=service, options=options)
    
    width, height = map(int, window_size.split(','))
    browser.set_window_size(width, height)
    
    browser.get(link)
    time.sleep(3)  # Let page load
    
    return browser

# Function to Search, Scroll & Scrape Hotels
def search_and_scrape(browser, city, year, month, month_text, arrival_day, departure_day, filename):
    try:
        print(f"Searching for {city} ({year}-{month}-{arrival_day} to {year}-{month}-{departure_day})")

        # Click search bar and enter city
        WebDriverWait(browser, 10).until(EC.element_to_be_clickable((By.XPATH, '//*[@id=":rh:"]'))).click()
        time.sleep(1)
        search_box = WebDriverWait(browser, 10).until(EC.presence_of_element_located((By.XPATH, '//*[@id=":rh:"]')))
        search_box.send_keys(city)
        time.sleep(1)
        browser.find_element(By.XPATH, '//*[@class="ebbedaf8ac ab26a5d2bd e33c97ff6b"]').click()
        time.sleep(2)

        # Select correct month and year
        target_date = month_text + ' ' + year
        while browser.find_element(By.XPATH, '//h3[contains(@class, "e1eebb6a1e ee7ec6b631")]').text.strip() != target_date:
            browser.find_element(By.XPATH, '//*[@class="a83ed08757 c21c56c305 f38b6daa18 d691166b09 f671049264 f4552b6561 dc72a8413c f073249358"]').click()
            time.sleep(1)

        # Select arrival and departure dates
        dates = browser.find_elements(By.XPATH, '//table[@class="eb03f3f27f"]//td[@class="b80d5adb18"]//span[@class="cf06f772fa ef091eb985"]')
        for date in dates:
            if date.get_attribute("data-date") == f"{year}-{month}-{arrival_day}":
                date.click()
            if date.get_attribute("data-date") == f"{year}-{month}-{departure_day}":
                date.click()
                break
        time.sleep(2)

        # Click search button
        browser.find_element(By.XPATH, '//button[@class="a83ed08757 c21c56c305 a4c1805887 f671049264 a2abacf76b c082d89982 cceeb8986b b9fd3c6b3c"]').click()
        time.sleep(2)

        #Scroll and load all results
        print(f"Scrolling and clicking 'See More' for {city}...")
        i = 0
        while True:
            try:
                total_height = browser.execute_script("return document.body.scrollHeight")
                browser.execute_script(f"window.scrollTo(0, {total_height});")
                time.sleep(2)  # Allow content to load

                load_more_button = WebDriverWait(browser, 5).until(
                    EC.element_to_be_clickable((By.XPATH, '//button[@class="a83ed08757 c21c56c305 bf0537ecb5 f671049264 af7297d90d c0e0affd09"]'))
                )
                load_more_button.click()
                i += 1
            except (NoSuchElementException, TimeoutException):
                break
        
        print(f' The "See More" button was clicked {i} times')

        # Scrape hotels
        print(f"Scraping hotels for {city}...")
        hotel_list_data = scrape_hotel_list_page_from_selenium(browser)

        # **Extract descriptions**
        descriptions = []
        for hotel in hotel_list_data:
            if hotel[3] != "NA":
                descriptions.append(scrape_hotel_detail_page(hotel[3]))  # Append description
            else:
                descriptions.append("NA")  # No link, add "NA" to description
        
        # Convert to DataFrame
        if hotel_list_data:
            df = pd.DataFrame(hotel_list_data, columns=['Name', 'Price', 'Rating', 'Detail Link','stars','center', 'num_coments'])
            df['Description'] = descriptions  # Add descriptions column
            df.to_csv(filename, index=False)
            print(f" Hotel data saved to {filename}")
        else:
            print(" No hotels found.")

    except Exception as e:
        print(f" Error during search and scrape: {e}")

    finally:
        print(f" Closing browser for {city}")
        browser.quit()

# Function to Scrape Hotel List
def scrape_hotel_list_page_from_selenium(browser):
    soup = BeautifulSoup(browser.page_source, 'html.parser')
    
    hotel_data = []
    hotels = soup.find_all('div', {'data-testid': 'property-card'})
    
    # for hotel in hotels:
    #     name = hotel.find('div', {'data-testid': 'title'}).get_text(strip=True) if hotel.find('div', {'data-testid': 'title'}) else "NA"
    #     price = hotel.find('span', {'class': 'f6431b446c fbfd7c1165 e84eb96b1f'}).get_text(strip=True) if hotel.find('span', {'class': 'f6431b446c fbfd7c1165 e84eb96b1f'}) else "NA"
    #     rating_container = hotel.find('div', {'data-testid': 'review-score'})
    #     rating = rating_container.find('div').get_text(strip=True) if rating_container else "NA"
    #     detail_link = hotel.find('a', {'data-testid': 'title-link'})['href'] if hotel.find('a', {'data-testid': 'title-link'}) else None
    #     detail_link = detail_link if detail_link else "NA"
    #     stars = hotel.find('div', {'class': 'b3f3c831be'}).get('aria-label', 'NA').split()[0] if hotel.find('div', {'class': 'b3f3c831be'}) else "NA"

    #     center_element = hotel.select_one('span[data-testid="distance"]')
    #     center = center_element.get_text(strip=True).replace("a ", "") if center_element else "NA"

       
        
    #     hotel_data.append([name, price, rating, detail_link,stars,center])

    for hotel in hotels:
        # Name extraction
        name_elem = hotel.find('div', {'data-testid': 'title'})
        name = name_elem.get_text(strip=True) if name_elem else "NA"
        
        # Price extraction
        price_elem = hotel.find('span', {'class': 'f6431b446c fbfd7c1165 e84eb96b1f'})
        price = price_elem.get_text(strip=True) if price_elem else "NA"
        
        # Rating extraction
        rating_container = hotel.find('div', {'data-testid': 'review-score'})
        rating = rating_container.find('div').get_text(strip=True) if rating_container else "NA"
        
        # Detail link extraction
        title_link = hotel.find('a', {'data-testid': 'title-link'})
        detail_link = title_link['href'] if title_link and title_link.has_attr('href') else "NA"
        
        # Stars extraction
        stars_elem = hotel.find('div', {'class': 'b3f3c831be'})
        stars = stars_elem.get('aria-label', 'NA').split()[0] if stars_elem else "NA"
        
        # Center (distance) extraction: try CSS selector first, then fallback
        center_element = hotel.select_one('span[data-testid="distance"]')
        if not center_element:
            center_element = hotel.find(lambda tag: tag.name == "span" and "km" in tag.get_text() and "centro" in tag.get_text())
        center = center_element.get_text(strip=True).replace("a ", "") if center_element else "NA"
        
        # Number of comments extraction: look for a <div> containing "comentarios"
        comments_div = hotel.find(lambda tag: tag.name == "div" and "comentarios" in tag.get_text())
        if comments_div:
            comments_text = comments_div.get_text(strip=True)
            match = re.search(r'(\d+\.?\d*) comentario?s', comments_text)
            num_comments = match.group(1) if match else "NA"
        else:
            num_comments = "NA"
        
        hotel_data.append([name, price, rating, detail_link, stars, center, num_comments])
    
    
    return hotel_data

#  Function to Scrape Hotel Description
def scrape_hotel_detail_page(detail_url):
    headers = {'User-Agent': 'Mozilla/5.0'}
    
    try:
        response = requests.get(detail_url, headers=headers, timeout=10)
        if response.status_code != 200:
            return "Failed to load page"

        soup = BeautifulSoup(response.text, 'html.parser')
        
        # **Try multiple possible description locations**
        description = soup.find('p', {'data-testid': 'property-description'})
        if not description:
            description = soup.find('div', {'class': 'a53cbfa6de b3efd73f69'})
        if not description:
            description = soup.find('span', {'class': 'e84eb96b1f'})
        
        return description.get_text(strip=True) if description else "NA"
    
    except Exception as e:
        print(f"Error scraping details from {detail_url}: {e}")
        return "Error"
def scrape_with_browser(browser_id):
    dfolder = './downloads'
    geko_path = ''  # Add the correct GeckoDriver path
    link = 'https://www.booking.com/index.es.html'
    
    browser = start_browser(link, dfolder, geko_path)

    locations = [
        ("Barcelona", "2025", "03", "marzo", "01", "07", "hotels_barcelona_MWC.csv"),
        ("Madrid", "2025", "03", "marzo", "01", "07", "hotels_madrid_MWC.csv"),
        ("Barcelona", "2025", "03", "marzo", "22", "28", "hotels_barcelona_after_MWC.csv"),
        ("Madrid", "2025", "03", "marzo", "22", "28", "hotels_madrid_after_MWC.csv")
    ]

    city, year, month, month_text, arrival_day, departure_day, filename = locations[browser_id]
    search_and_scrape(browser, city, year, month, month_text, arrival_day, departure_day, filename)
if __name__ == "__main__":
    with ThreadPoolExecutor(max_workers=4) as executor:
        executor.map(scrape_with_browser, range(4))

Searching for Barcelona (2025-03-22 to 2025-03-28)
 Error during search and scrape: Message: Element <input id=":rh:" class="eb46370fe1" name="ss"> is not clickable at point (409,365) because another element <div class="eb33ef7c47"> obscures it
Stacktrace:
RemoteError@chrome://remote/content/shared/RemoteError.sys.mjs:8:8
WebDriverError@chrome://remote/content/shared/webdriver/Errors.sys.mjs:193:5
ElementClickInterceptedError@chrome://remote/content/shared/webdriver/Errors.sys.mjs:337:5
webdriverClickElement@chrome://remote/content/marionette/interaction.sys.mjs:177:11
interaction.clickElement@chrome://remote/content/marionette/interaction.sys.mjs:136:11
clickElement@chrome://remote/content/marionette/actors/MarionetteCommandsChild.sys.mjs:344:29
receiveMessage@chrome://remote/content/marionette/actors/MarionetteCommandsChild.sys.mjs:220:31

 Closing browser for Barcelona
Searching for Madrid (2025-03-22 to 2025-03-28)
Searching for Madrid (2025-03-01 to 2025-03-07)
 Error during searc

In [None]:
stars = hotel.find('div', {'class': 'b3f3c831be'}).get('aria-label', 'NA').split()[0] if hotel.find('div', {'class': 'b3f3c831be'}) else "NA"
center = hotel.find('span', {'data-testid': 'distance'}).get_text(strip=True).replace("a ", "") if hotel.find('span', {'data-testid': 'distance'}) else "NA"

In [1]:
from concurrent.futures import ThreadPoolExecutor  
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.firefox.options import Options
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import os
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re

# Function to Start Firefox Browser
def start_browser(link, dfolder, geko_path, window_size='1500,1080'):
    os.makedirs(dfolder, exist_ok=True)
    options = Options()
    options.add_argument('--start-maximized')
    options.set_preference('privacy.trackingprotection.enabled', True)
    service = Service(geko_path)
    browser = webdriver.Firefox(service=service, options=options)
    
    width, height = map(int, window_size.split(','))
    browser.set_window_size(width, height)
    
    browser.get(link)
    time.sleep(3)  # Let page load
    return browser

# Function to Search, Scroll & Scrape Hotels
def search_and_scrape(browser, city, year, month, month_text, arrival_day, departure_day, filename):
    try:
        print(f"Searching for {city} ({year}-{month}-{arrival_day} to {year}-{month}-{departure_day})")

        # Click search bar and enter city
        WebDriverWait(browser, 10).until(EC.element_to_be_clickable((By.XPATH, '//*[@id=":rh:"]'))).click()
        time.sleep(1)
        search_box = WebDriverWait(browser, 10).until(EC.presence_of_element_located((By.XPATH, '//*[@id=":rh:"]')))
        search_box.send_keys(city)
        time.sleep(1)
        browser.find_element(By.XPATH, '//*[@class="ebbedaf8ac ab26a5d2bd e33c97ff6b"]').click()
        time.sleep(2)

        # Select correct month and year
        target_date = month_text + ' ' + year
        while browser.find_element(By.XPATH, '//h3[contains(@class, "e1eebb6a1e ee7ec6b631")]').text.strip() != target_date:
            browser.find_element(By.XPATH, '//*[@class="a83ed08757 c21c56c305 f38b6daa18 d691166b09 f671049264 f4552b6561 dc72a8413c f073249358"]').click()
            time.sleep(1)

        # Select arrival and departure dates
        dates = browser.find_elements(By.XPATH, '//table[@class="eb03f3f27f"]//td[@class="b80d5adb18"]//span[@class="cf06f772fa ef091eb985"]')
        for date in dates:
            if date.get_attribute("data-date") == f"{year}-{month}-{arrival_day}":
                date.click()
            if date.get_attribute("data-date") == f"{year}-{month}-{departure_day}":
                date.click()
                break
        time.sleep(2)

        # Click search button
        browser.find_element(By.XPATH, '//button[@class="a83ed08757 c21c56c305 a4c1805887 f671049264 a2abacf76b c082d89982 cceeb8986b b9fd3c6b3c"]').click()
        time.sleep(2)

        # Scroll and load all results
        print(f"Scrolling and clicking 'See More' for {city}...")
        i = 0
        while True:
            try:
                total_height = browser.execute_script("return document.body.scrollHeight")
                browser.execute_script(f"window.scrollTo(0, {total_height});")
                time.sleep(2)  # Allow content to load

                load_more_button = WebDriverWait(browser, 5).until(
                    EC.element_to_be_clickable((By.XPATH, '//button[@class="a83ed08757 c21c56c305 bf0537ecb5 f671049264 af7297d90d c0e0affd09"]'))
                )
                load_more_button.click()
                i += 1
            except (NoSuchElementException, TimeoutException):
                break
        
        print(f'The "See More" button was clicked {i} times')

        # Scrape hotels
        print(f"Scraping hotels for {city}...")
        hotel_list_data = scrape_hotel_list_page_from_selenium(browser)

        # Extract descriptions from detail pages
        descriptions = []
        for hotel in hotel_list_data:
            if hotel[3] != "NA":
                descriptions.append(scrape_hotel_detail_page(hotel[3]))
            else:
                descriptions.append("NA")
        
        # Build DataFrame with the updated 7 columns and add the descriptions
        if hotel_list_data:
            df = pd.DataFrame(hotel_list_data, columns=['Name', 'Price', 'Rating', 'Detail Link', 'Stars', 'Center', 'Num_Comments'])
            df['Description'] = descriptions
            df.to_csv(filename, index=False)
            print(f"Hotel data saved to {filename}")
        else:
            print("No hotels found.")

    except Exception as e:
        print(f"Error during search and scrape: {e}")

    finally:
        print(f"Closing browser for {city}")
        browser.quit()

# Function to Scrape Hotel List with Center and Number of Comments
def scrape_hotel_list_page_from_selenium(browser):
    soup = BeautifulSoup(browser.page_source, 'html.parser')
    hotel_data = []
    hotels = soup.find_all('div', {'data-testid': 'property-card'})
    
    for hotel in hotels:
        # Name extraction
        name_elem = hotel.find('div', {'data-testid': 'title'})
        name = name_elem.get_text(strip=True) if name_elem else "NA"
        
        # Price extraction
        price_elem = hotel.find('span', {'class': 'f6431b446c fbfd7c1165 e84eb96b1f'})
        price = price_elem.get_text(strip=True) if price_elem else "NA"
        
        # Rating extraction
        rating_container = hotel.find('div', {'data-testid': 'review-score'})
        rating = rating_container.find('div').get_text(strip=True) if rating_container else "NA"
        
        # Detail link extraction
        title_link = hotel.find('a', {'data-testid': 'title-link'})
        detail_link = title_link['href'] if title_link and title_link.has_attr('href') else "NA"
        
        # Stars extraction
        stars_elem = hotel.find('div', {'class': 'b3f3c831be'})
        stars = stars_elem.get('aria-label', 'NA').split()[0] if stars_elem else "NA"
        
        # Center (distance) extraction: try CSS selector first, then fallback
        center_element = hotel.select_one('span[data-testid="distance"]')
        if not center_element:
            center_element = hotel.find(lambda tag: tag.name == "span" and "km" in tag.get_text() and "centro" in tag.get_text())
        center = center_element.get_text(strip=True).replace("a ", "") if center_element else "NA"
        
        # Number of comments extraction: look for a <div> containing "comentarios"
        comments_div = hotel.find(lambda tag: tag.name == "div" and "comentarios" in tag.get_text())
        if comments_div:
            comments_text = comments_div.get_text(strip=True)
            match = re.search(r'([\d.,]+)', comments_text)
            num_comments = match.group(1) if match else "NA"
        else:
            num_comments = "NA"
        
        hotel_data.append([name, price, rating, detail_link, stars, center, num_comments])
    
    return hotel_data

# Function to Scrape Hotel Description from Detail Page
def scrape_hotel_detail_page(detail_url):
    headers = {'User-Agent': 'Mozilla/5.0'}
    try:
        response = requests.get(detail_url, headers=headers, timeout=10)
        if response.status_code != 200:
            return "Failed to load page"
        soup = BeautifulSoup(response.text, 'html.parser')
        
        # Try multiple possible description locations
        description = soup.find('p', {'data-testid': 'property-description'})
        if not description:
            description = soup.find('div', {'class': 'a53cbfa6de b3efd73f69'})
        if not description:
            description = soup.find('span', {'class': 'e84eb96b1f'})
        
        return description.get_text(strip=True) if description else "NA"
    
    except Exception as e:
        print(f"Error scraping details from {detail_url}: {e}")
        return "Error"

def scrape_with_browser(browser_id):
    dfolder = './downloads'
    geko_path = ''  # Add the correct GeckoDriver path
    link = 'https://www.booking.com/index.es.html'
    browser = start_browser(link, dfolder, geko_path)
    locations = [
        ("Barcelona", "2025", "03", "marzo", "01", "07", "hotels_barcelona_MWC.csv"),
        ("Madrid", "2025", "03", "marzo", "01", "07", "hotels_madrid_MWC.csv"),
        ("Barcelona", "2025", "03", "marzo", "22", "28", "hotels_barcelona_after_MWC.csv"),
        ("Madrid", "2025", "03", "marzo", "22", "28", "hotels_madrid_after_MWC.csv")
    ]
    
    city, year, month, month_text, arrival_day, departure_day, filename = locations[browser_id]
    search_and_scrape(browser, city, year, month, month_text, arrival_day, departure_day, filename)

if __name__ == "__main__":
    with ThreadPoolExecutor(max_workers=4) as executor:
        executor.map(scrape_with_browser, range(4))



A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.2.2 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/opt/anaconda3/envs/DS_enviroment/lib/python3.12/site-packages/ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "/opt/anaconda3/envs/DS_enviroment/lib/python3.12/site-packages/traitlets/config/application.py", line 1075, in launch_instance
    app.start()
  File "/opt/anaconda3/envs/DS_enviroment/lib/python3.12/site-packages/ipykernel/kernelapp.py", line 739, in start
    self.io

ImportError: 
A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.2.2 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.




A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.2.2 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/opt/anaconda3/envs/DS_enviroment/lib/python3.12/site-packages/ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "/opt/anaconda3/envs/DS_enviroment/lib/python3.12/site-packages/traitlets/config/application.py", line 1075, in launch_instance
    app.start()
  File "/opt/anaconda3/envs/DS_enviroment/lib/python3.12/site-packages/ipykernel/kernelapp.py", line 739, in start
    self.io

ImportError: 
A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.2.2 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.



Searching for Madrid (2025-03-01 to 2025-03-07)
Searching for Barcelona (2025-03-01 to 2025-03-07)
Searching for Barcelona (2025-03-22 to 2025-03-28)
Searching for Madrid (2025-03-22 to 2025-03-28)
Scrolling and clicking 'See More' for Madrid...
Scrolling and clicking 'See More' for Barcelona...
Scrolling and clicking 'See More' for Barcelona...
Scrolling and clicking 'See More' for Madrid...
The "See More" button was clicked 24 times
Scraping hotels for Madrid...
The "See More" button was clicked 35 times
Scraping hotels for Barcelona...
The "See More" button was clicked 37 times
Scraping hotels for Madrid...
The "See More" button was clicked 37 times
Scraping hotels for Barcelona...
Hotel data saved to hotels_madrid_MWC.csv
Closing browser for Madrid
Hotel data saved to hotels_barcelona_MWC.csv
Closing browser for Barcelona
Hotel data saved to hotels_madrid_after_MWC.csv
Closing browser for Madrid
Hotel data saved to hotels_barcelona_after_MWC.csv
Closing browser for Barcelona
