In [125]:
import json
import pandas as pd
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException, ElementClickInterceptedException, StaleElementReferenceException
from selenium import webdriver
import os
from bs4 import BeautifulSoup
import time
from selenium import webdriver
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.firefox.options import Options
import requests
import re
from datetime import datetime

# HTTP headers sent with the plain `requests` calls (a desktop-browser User-Agent).
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36'}
# Machine-specific locations. The defaults are the original author's paths;
# set BOOKING_DFOLDER / GECKODRIVER_PATH in the environment to run the
# notebook elsewhere without editing this cell.
dfolder = os.environ.get('BOOKING_DFOLDER', '/home/pere/ITMNLP_DSDM/booking')
geko_path = os.environ.get('GECKODRIVER_PATH', '/home/pere/ITMNLP_DSDM/geckodriver-v0.34.0-linux64/geckodriver')
# Page opened when the browser starts.
link='https://www.booking.com/index.es.html'

In [126]:
# functions

def ffx_preferences(dfolder, download=False):
    '''
    Build the Firefox options object used by the scraper.

    dfolder  -- directory stored in the "browser.download.dir" preference.
    download -- when true, the "neverAsk.saveToDisk" list is replaced by the
                PDF MIME types and "pdfjs.disabled" is set to True.

    Returns an `Options` instance whose `profile` carries the preferences.
    '''
    # (preference, value) pairs, applied in order; a later pair for the same
    # preference overrides an earlier one.
    settings = [
        ("browser.download.dir", dfolder),
        ("browser.download.folderList", 2),
        ("browser.download.manager.showWhenStarting", False),
        ("browser.helperApps.neverAsk.saveToDisk",
         "application/msword,application/rtf, application/csv,text/csv,image/png ,image/jpeg, application/pdf, text/html,text/plain,application/octet-stream"),
    ]
    if download:
        settings += [
            ("browser.helperApps.neverAsk.saveToDisk", "application/pdf,application/x-pdf"),
            ("pdfjs.disabled", True),
        ]

    ffx_profile = webdriver.FirefoxProfile()
    for pref_name, pref_value in settings:
        ffx_profile.set_preference(pref_name, pref_value)

    ffx_options = Options()
    ffx_options.profile = ffx_profile
    return ffx_options


def start_up(link, dfolder, geko_path,donwload=True):
    '''
    Launch Firefox through geckodriver and open `link`.

    link      -- URL loaded right after the browser starts.
    dfolder   -- download directory; created if it does not exist.
    geko_path -- path to the geckodriver executable.
    donwload  -- forwarded to `ffx_preferences` as its `download` flag
                 (the keyword keeps its historical spelling for callers).

    Returns the started browser object.
    '''
    os.makedirs(dfolder, exist_ok=True)

    driver = webdriver.Firefox(
        options=ffx_preferences(dfolder, donwload),
        service=Service(geko_path),
    )
    driver.get(link)
    # Fixed pause after requesting the page; adjust if the site needs longer.
    time.sleep(5)
    return driver


def check_and_click(browser, xpath, type, max_tries=15, wait=1):
    '''
    Click an element, retrying while it is missing, stale or covered.

    Each attempt is delegated to `check_obscures`, which performs the click
    itself and reports whether it went through.

    browser   -- browser object passed on to `check_obscures`.
    xpath     -- locator value (despite the name, interpreted according to `type`).
    type      -- locator kind understood by `check_obscures`.
    max_tries -- maximum number of click attempts (default 15).
    wait      -- seconds to pause between failed attempts (default 1).

    Returns True as soon as a click succeeds, False when every attempt failed.
    '''
    for attempt in range(max_tries):
        if check_obscures(browser, xpath, type):
            return True
        # Pause only when another attempt is still to come.
        if attempt < max_tries - 1:
            time.sleep(wait)
    return False

def check_obscures(browser, xpath, type):
    '''
    Try to click an element once and report whether the click went through.

    Important: this does not merely inspect the element -- on success the
    element HAS been clicked.

    browser -- object exposing `find_element(by, value)`.
    xpath   -- locator value, interpreted according to `type`.
    type    -- one of "xpath", "id", "css", "class", "link".

    Returns True after a successful click; False (after printing the error)
    when the element is missing, stale, or the click is intercepted.

    Raises ValueError for an unrecognised `type`; previously such a call fell
    through every branch and returned True without clicking anything.
    '''
    # Short locator names accepted by this helper -> strategy strings handed
    # to `find_element`.
    strategies = {
        "xpath": 'xpath',
        "id": 'id',
        "css": 'css selector',
        "class": 'class name',
        "link": 'link text',
    }
    if type not in strategies:
        raise ValueError(
            "unknown locator type {!r}; expected one of {}".format(type, sorted(strategies))
        )
    try:
        browser.find_element(strategies[type], xpath).click()
    except (ElementClickInterceptedException, NoSuchElementException, StaleElementReferenceException) as e:
        print(e)
        return False
    return True

In [127]:
# own functions


#function assumes that dates selected are within 2024
def select_dates(from_month, from_day, to_month, to_day):
    '''
    Advance the open date picker to `from_month` and return its day cells.

    Works on the module-level `browser`. The "next" button is pressed once
    per month between the current calendar month and `from_month`; nothing is
    pressed when that difference is zero or negative.

    from_month -- target month, anything `int()` accepts (e.g. "06").
    from_day, to_month, to_day -- accepted for signature symmetry with the
        caller but not used here.

    Returns the list of elements matched by the day-cell XPath.
    '''
    next_month_button = '//div[@id="calendar-searchboxdatepicker"]//button[@class="a83ed08757 c21c56c305 f38b6daa18 d691166b09 f671049264 deab83296e f4552b6561 dc72a8413c f073249358"]'
    day_cells = '//div[@id="calendar-searchboxdatepicker"]//table[@class="eb03f3f27f"]//tbody//td[@class="b80d5adb18"]//span[@class="cf06f772fa"]'

    clicks_left = int(from_month) - datetime.now().month
    while clicks_left > 0:
        browser.find_element(by='xpath', value=next_month_button).click()
        clicks_left -= 1

    return browser.find_elements('xpath', day_cells)

def get_number_pages(browser):
    '''
    Get the number of result pages.

    Reads the pagination `<li>` items of the current page and converts the
    text of the last one to an integer.

    Returns 1 when no pagination items are found, instead of failing with an
    IndexError -- presumably the site omits the pager when all results fit on
    a single page (TODO confirm).

    Raises ValueError if the last item's text is not an integer.
    '''
    page_items = browser.find_elements('xpath',
        '//li[@class="b16a89683f"]')
    if not page_items:
        return 1
    return int(page_items[-1].text)

def _text_or_none(card, tag, attrs):
    '''Return the stripped text of the first matching descendant, or None if absent.'''
    element = card.find(tag, attrs)
    return element.text.strip() if element is not None else None


def accommodation_per_page(soup):
    '''
    Extract one record per property card from a parsed results page.

    soup -- parsed document exposing `find_all`; each card must expose `find`.

    Returns a list of dicts with the keys 'name', 'location', 'distance',
    'rating_score', 'price' and 'description'. Every value is the stripped
    text of the matching element, or None when the card lacks that element.
    Previously only the rating and description tolerated a missing element;
    a card without e.g. a price aborted the whole page with AttributeError.
    '''
    # (output key, tag name, attribute filter) for each scraped field.
    fields = (
        ('name', 'div', {'data-testid': 'title'}),
        ('location', 'span', {'data-testid': 'address'}),
        ('distance', 'span', {'data-testid': 'distance'}),
        ('rating_score', 'div', {'class': 'a3b8729ab1 d86cee9b25'}),
        ('price', 'span', {'class': 'f6431b446c fbfd7c1165 e84eb96b1f'}),
        ('description', 'h4', {'class': 'abf093bdfe e8f7c070a7'}),
    )
    return [
        {key: _text_or_none(card, tag, attrs) for key, tag, attrs in fields}
        for card in soup.find_all('div', {'data-testid': 'property-card'})
    ]

In [128]:
#main pipeline function

def query_to_df(place, from_month, from_day, to_month, to_day):
    '''
    Run a search in the already-open module-level `browser` and return the
    listed accommodations as a DataFrame.

    Steps: accept cookies, close the Google sign-in frame, type `place`, pick
    the dates, press search, then fetch every result page with `requests` and
    parse it with `accommodation_per_page`.

    place -- text typed into the destination box.
    from_month, from_day, to_month, to_day -- strings interpolated verbatim
        into "2024-{month}-{day}" and compared with each day cell's
        `data-date` attribute, so the year is fixed to 2024; presumably the
        values must be zero-padded ("06", not "6") -- TODO confirm.

    Returns a pandas DataFrame with one row per scraped accommodation.
    '''
    #accept cookies
    browser.find_element(by='xpath',value='//button[@id="onetrust-accept-btn-handler"]').click()

    #close google account pop-up
    iframe_element = browser.find_element(by='xpath',value='//div[@class="google-one-tap"]//div[@id="credential_picker_container"]//iframe[@title="Cuadro de diálogo Iniciar sesión con Google"]')
    browser.switch_to.frame(iframe_element) # Switch to the iframe
    try:
        browser.find_element(by='xpath', value='//div[@id="close"]').click()
    finally:
        # Always leave the frame, even if the close button could not be clicked.
        browser.switch_to.default_content()

    #click on box and input place
    browser.find_element(by='xpath',value='//*[@id=":re:"]').click()
    search1 = browser.find_element(by='xpath',value='//*[@id=":re:"]')
    search1.send_keys(place)

    #select dates and click search
    css='button.ebbedaf8ac:nth-child(2) > span:nth-child(1)'
    browser.find_element('css selector',css).click()
    time.sleep(1)
    dates = select_dates(from_month, from_day, to_month, to_day)
    for date in dates:
        if date.get_attribute("data-date") == f"2024-{from_month}-{from_day}":
            date.click()
        if date.get_attribute("data-date") == f"2024-{to_month}-{to_day}":
            date.click()
            break
    my_xpath='/html/body/div[3]/div[2]/div/form/div[1]/div[4]/button/span'
    # NOTE(review): the search button is clicked once directly and then again
    # through the retrying helper. Kept as-is because the following steps may
    # rely on the delay this causes -- confirm before simplifying.
    check_obscures(browser,my_xpath , type='xpath')
    check_and_click(browser,my_xpath , type='xpath')

    #close pop-up for booking sign-in if it appears
    try:
        browser.find_element(by='xpath',value='//div[@class="abcc616ec7 cc1b961f14 c180176d40 f11eccb5e8 ff74db973c"]').click()
    except NoSuchElementException:
        pass

    #obtain number of pages and current url to use BS
    pages = get_number_pages(browser)
    url = browser.current_url

    # Accumulates the records of every page. The original appended to an
    # undefined name (`hotel_extended_data`), which raised NameError on a
    # fresh kernel.
    hotel_data = []

    #loop over pages and extract the necessary information (25 results per page)
    for offset in range(0,25*pages,25):
        result = requests.get(url + "&offset={}".format(offset),headers = headers)
        soup = BeautifulSoup(result.text,'html.parser')
        hotel_data.extend(accommodation_per_page(soup))

    # Convert into a DataFrame
    hotels = pd.DataFrame(hotel_data)

    return hotels

In [129]:
#example query


#open booking, run one example search, and always shut the browser down
browser=start_up(dfolder=dfolder,link=link,geko_path=geko_path)
try:
    hotels = query_to_df("Barcelona", "06", "17", "06", "23")
finally:
    # Release the Firefox/geckodriver processes even when the query fails, so
    # re-running this cell does not pile up orphaned browsers.
    browser.quit()

Message: Unable to locate element: /html/body/div[3]/div[2]/div/form/div[1]/div[4]/button/span; For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#no-such-element-exception
Stacktrace:
RemoteError@chrome://remote/content/shared/RemoteError.sys.mjs:8:8
WebDriverError@chrome://remote/content/shared/webdriver/Errors.sys.mjs:189:5
NoSuchElementError@chrome://remote/content/shared/webdriver/Errors.sys.mjs:507:5
dom.find/</<@chrome://remote/content/shared/DOM.sys.mjs:132:16

Message: Unable to locate element: /html/body/div[3]/div[2]/div/form/div[1]/div[4]/button/span; For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#no-such-element-exception
Stacktrace:
RemoteError@chrome://remote/content/shared/RemoteError.sys.mjs:8:8
WebDriverError@chrome://remote/content/shared/webdriver/Errors.sys.mjs:189:5
NoSuchElementError@chrome://remote/content/shared/webdriver/E

In [130]:
# Preview the first rows of the `hotels` DataFrame.
hotels.head()

Unnamed: 0,name,location,distance,rating_score,price,description
0,Majestic 4BR/4BATH apartment in Sagrada Famili...,"Gracia, Barcelona",a 2 km del centro,10.0,€ 7.646,Apartamento
1,Le Palacete powered by Sonder,"Gracia, Barcelona","a 2,4 km del centro",81.0,€ 1.642,Habitación con cama extragrande.
2,Nice apartment BCN two rooms wifi,"Sarrià-Sant Gervasi, Barcelona",a 2 km del centro,,€ 467,Apartamento de 1 dormitorio
3,Hotel & Spa Villa Olimpica Suites,"Sant Martí, Barcelona","a 2,2 km del centro",80.0,€ 1.267,Habitación Doble con acceso al spa
4,Catalonia Sagrada Familia,"Sant Martí, Barcelona","a 2,6 km del centro",82.0,€ 1.228,Room Assigned on Arrival
