In [None]:
import random
import re
import time
from datetime import datetime, timedelta
from urllib.parse import urlencode

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import TimeoutException, NoSuchElementException
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager

# Search endpoint and the query parameters of the first (base) search.
# All values are strings because they are substituted into the URL as-is.
base_url = "https://www.booking.com/searchresults.html"
initial_params = {
    "checkin_monthday": "11",
    "checkin_month": "7",
    "checkin_year": "2024",
    "checkout_monthday": "12",  # check-out is one day after check-in
    "checkout_month": "7",
    "checkout_year": "2024",
    "group_adults": "2",
    "group_children": "0",
    "no_rooms": "1",
    "ss": "New York",
}

# Browser and page setup: build the Chrome options first, then start one
# shared driver that every helper below uses through the global `driver`.
options = webdriver.ChromeOptions()
# Add '--headless' to this tuple to run in headless mode.
for chrome_flag in ('--no-sandbox', '--disable-dev-shm-usage'):
    options.add_argument(chrome_flag)

# webdriver_manager's install() result is handed to the Service as the
# chromedriver location.
chromedriver_path = ChromeDriverManager().install()
driver = webdriver.Chrome(service=Service(chromedriver_path), options=options)


# Hotel-card extraction: get_hotel_data() plus its private helpers.
_DECIMAL_RE = re.compile(r'\d+\.\d+')
_INTEGER_RE = re.compile(r'\d+')


def _joined_text(node, default='N/A'):
    """Return ``node.get_text(strip=True)``, or *default* when *node* is None."""
    return node.get_text(strip=True) if node is not None else default


def _trimmed_text(node, default='N/A'):
    """Return ``node.text.strip()``, or *default* when *node* is None."""
    return node.text.strip() if node is not None else default


def _first_match(pattern, text, default='N/A'):
    """Return the first match of *pattern* in *text*, or *default* if none."""
    match = pattern.search(text)
    return match.group() if match else default


def _yes_no(node):
    """Return "Yes" when the element was found, "No" otherwise."""
    return "Yes" if node is not None else "No"


def get_hotel_data(soup, checkin_date, checkout_date, search_date, order):
    """Extract one record per hotel card found in a parsed results page.

    Args:
        soup: Parsed page whose hotel-card ``div`` elements are scanned.
        checkin_date: Check-in date as a ``'YYYY-MM-DD'`` string.
        checkout_date: Check-out date as a ``'YYYY-MM-DD'`` string.
        search_date: ``datetime`` of the search; only its date part is used.
        order: Offset of the position counter; the first card on the page
            is stored with ``order + 1``, the next with ``order + 2``, etc.

    Returns:
        A list of dicts, one per card, in page order. Text fields that
        cannot be found are ``'N/A'``; the Yes/No flags and
        ``how_many_rooms_left`` fall back to ``"No"``.

    Raises:
        ValueError: If ``checkin_date`` or ``checkout_date`` is not in
            ``'YYYY-MM-DD'`` format.
    """
    # These values are identical for every card, so compute them once.
    snapshot_date = search_date.strftime('%Y-%m-%d')
    checkin = datetime.strptime(checkin_date, '%Y-%m-%d')
    # TTT (time to travel): whole days from the search date to check-in.
    # Round-tripping through the date string drops the time of day.
    ttt = (checkin - datetime.strptime(snapshot_date, '%Y-%m-%d')).days
    # LOS (length of stay): nights between check-in and check-out.
    los = (datetime.strptime(checkout_date, '%Y-%m-%d') - checkin).days

    cards = soup.find_all(
        'div',
        class_='e01df12ddf a0914461b0 d46a3604b5 ba1c6fdc7f f550b7da28 b9a2fd8068 cb4a416743',
    )

    hotels = []
    for position, hotel in enumerate(cards, start=order + 1):
        name = _joined_text(hotel.find('div', class_='e037993315 f5f8fe25fa'))

        # NOTE(review): the decimal pattern only matches numbers written with
        # a decimal point, so whole-number grades/distances become 'N/A'.
        grade_text = _joined_text(hotel.find('div', class_='d0522b0cca fd44f541d8'))
        grade = _first_match(_DECIMAL_RE, grade_text)

        descriptive_grades = _joined_text(
            hotel.find('div', class_='d0522b0cca eb02592978 f374b67e8c'))

        reviews_text = _joined_text(
            hotel.find('div', class_='e8acaa0d22 ab107395cb c60bada9e4'))
        num_of_reviews = extract_number_with_commas(reviews_text)

        neighborhood = _joined_text(
            hotel.find('span', {'class': 'cf35c10683 d57d1b7d64', 'data-testid': 'address'}))

        bed_type_li = hotel.find('li', class_='cdddd9123a')
        bed_type_div = (bed_type_li.find('div', class_='e8acaa0d22')
                        if bed_type_li is not None else None)
        bed_type = _trimmed_text(bed_type_div)

        room_type = _trimmed_text(hotel.find('h4', class_='e8acaa0d22 e7baf22fe8'))

        breakfast = _yes_no(hotel.find('span', class_='d335eeb005'))

        # The distance label is nested three levels deep; any missing level
        # yields 'N/A'.
        distance_node = None
        location_div = hotel.find('div', class_='e8acaa0d22 c4cbd52971')
        if location_div is not None:
            location_span = location_div.find('span', class_='cdebd92b49')
            if location_span is not None:
                distance_node = location_span.find('span', {'data-testid': 'distance'})
        distance_from_center = _first_match(_DECIMAL_RE, _trimmed_text(distance_node))

        # A rating link without its inner span used to raise an uncaught
        # AttributeError and abort the whole scrape; it now yields 'N/A'.
        rating_link = hotel.find('a', class_='dba1b3bddf d371fb5186 e2ebd44c68')
        rating_span = (rating_link.find('span', class_='ad3c4dc079')
                       if rating_link is not None else None)
        location_rating = _first_match(_DECIMAL_RE, _trimmed_text(rating_span))

        # Star rating is the number of star spans inside the rating container.
        star_rating_div = hotel.find('div', {'data-testid': 'rating-stars'})
        if star_rating_div is not None:
            star_rating = len(star_rating_div.find_all(
                'span', class_='dbf48415a7 bf229ead1b d8273c7b5a'))
        else:
            star_rating = 'N/A'

        price = _trimmed_text(
            hotel.find('span', class_='e037993315 ab91cb3011 d9315e4fb0'))
        price_before_discount = _trimmed_text(
            hotel.find('span', class_='f018fa3636 d9315e4fb0'))

        free_cancellation = _yes_no(hotel.find('div', class_='e8acaa0d22 d40b1dc96f'))
        no_prepayment = _yes_no(hotel.find('div', class_='b290e5dfa6 b0eee6023f'))

        limited_rooms_text = _trimmed_text(
            hotel.find('div', class_='dc47954a96'), default="No")
        how_many_rooms_left = _first_match(_INTEGER_RE, limited_rooms_text, default="No")

        hotels.append({
            'name': name,
            'grade': grade,
            'descriptive_grades': descriptive_grades,
            'num_of_reviews': num_of_reviews,
            'breakfast': breakfast,
            'distance_from_center': distance_from_center,
            'location_rating': location_rating,
            'neighborhood': neighborhood,
            'stars': star_rating,
            'bed_type': bed_type,
            'room_type': room_type,
            'price': price,
            'price_before_discount': price_before_discount,
            'free_cancellation': free_cancellation,
            'no_prepayment': no_prepayment,
            'how_many_rooms_left': how_many_rooms_left,
            'checkin_date': checkin_date,
            'checkout_date': checkout_date,
            'TTT': ttt,
            'LOS': los,
            'Snapshot Date': snapshot_date,
            'order': position,
        })
    return hotels


# Function to shift the check-in date and derive the matching check-out date
def update_dates(params, checkin_days_to_add, checkout_days_to_add):
    """Move the stay described by *params* and return the new dates.

    The check-in date stored in ``params`` is advanced by
    ``checkin_days_to_add`` days; the check-out date is then set to
    ``checkout_days_to_add`` days after the new check-in. The previous
    check-out values in ``params`` are not read.

    Args:
        params: Dict holding ``checkin_year``, ``checkin_month`` and
            ``checkin_monthday`` (anything ``int()`` accepts). It is
            updated in place, so pass a copy to keep the original intact.
        checkin_days_to_add: Days to add to the stored check-in date.
        checkout_days_to_add: Length of the stay in days.

    Returns:
        A tuple ``(params, checkin, checkout)`` where ``params`` is the same
        dict object that was passed in and both dates are ``'YYYY-MM-DD'``
        strings.
    """
    iso_format = '%Y-%m-%d'

    stored_checkin = datetime(
        year=int(params['checkin_year']),
        month=int(params['checkin_month']),
        day=int(params['checkin_monthday']),
    )
    arrival = stored_checkin + timedelta(days=checkin_days_to_add)
    departure = arrival + timedelta(days=checkout_days_to_add)

    # Write both dates back as the un-padded strings used in the URL.
    for prefix, moment in (('checkin', arrival), ('checkout', departure)):
        params[f'{prefix}_monthday'] = str(moment.day)
        params[f'{prefix}_month'] = str(moment.month)
        params[f'{prefix}_year'] = str(moment.year)

    return params, arrival.strftime(iso_format), departure.strftime(iso_format)


def extract_number_with_commas(text):
    """Return the first integer in *text*, ignoring thousands separators.

    Both ``"1,234 reviews"`` and ``"1234 reviews"`` give ``1234``. The
    leading digit group may have any length, so a number written without
    separators is no longer cut off after three digits.

    Args:
        text: String to search. ``None`` or an empty string is treated as
            containing no number.

    Returns:
        The number as an ``int``, or the string ``'N/A'`` when *text*
        contains no digits.
    """
    if not text:
        return 'N/A'
    # Leading run of digits, then any number of ",ddd" groups.
    match = re.search(r'\d+(?:,\d{3})*', text)
    if match is None:
        return 'N/A'
    return int(match.group().replace(',', ''))

# Function to dismiss the sign-in popup when the page shows one
def close_popup_if_exists():
    """Click the sign-in popup's dismiss button if it becomes clickable.

    Waits up to 10 seconds on the global ``driver`` for the button whose
    ``aria-label`` is 'Dismiss sign in information.'. A timeout or a missing
    element is treated as "no popup" and only reported on stdout.
    """
    dismiss_locator = (By.XPATH, "//button[@aria-label='Dismiss sign in information.']")
    try:
        dismiss_button = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable(dismiss_locator)
        )
        dismiss_button.click()
    except (TimeoutException, NoSuchElementException):
        print("No popup to close")
    else:
        print("Popup closed")

# Function to scroll down until the page stops growing
def scroll_to_button():
    """Scroll the global ``driver`` to the bottom of the page.

    Repeatedly jumps to the current bottom, pauses a random 2-4 seconds,
    and re-reads ``document.body.scrollHeight``. Returns once a scroll
    leaves the height unchanged.
    """
    height_script = "return document.body.scrollHeight"
    previous_height = driver.execute_script(height_script)
    page_still_growing = True
    while page_still_growing:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(random.uniform(2, 4))
        current_height = driver.execute_script(height_script)
        page_still_growing = current_height != previous_height
        previous_height = current_height

# Function to pause the run while a CAPTCHA is on screen
def check_for_captcha():
    """Block until the user confirms a visible CAPTCHA has been solved.

    Looks up the element with id ``captcha`` on the global ``driver``. If it
    exists and is displayed, a notice is printed and the script waits for
    Enter on stdin. A missing element is silently ignored.
    """
    try:
        captcha_element = driver.find_element(By.ID, "captcha")
        captcha_visible = captcha_element.is_displayed()
    except NoSuchElementException:
        return

    if captcha_visible:
        print("CAPTCHA detected. Manual intervention required.")
        input("Press Enter after solving the CAPTCHA...")

# Main scraping run: for each check-in day and each length of stay, load the
# results page, expand it, and collect one row per hotel.

# List to store all results
all_hotels = []

max_checkin_days = 30
# Upper bound on 'Load more results' clicks per results page.
max_click_LOAD_MORE = 4

# Record the search date
search_date = datetime.now()

try:
    for checkin_day in range(max_checkin_days):
        # Shift the base check-in date forward by checkin_day days.
        params, checkin_date, checkout_date = update_dates(initial_params.copy(), checkin_day, 1)

        for checkout_day in range(1, 6):
            # Keep the check-in date and stay for checkout_day nights.
            params, checkin_date, checkout_date = update_dates(params.copy(), 0, checkout_day)

            # urlencode escapes each value (e.g. the space in 'New York') and
            # emits the parameters in the dict's insertion order.
            url_with_params = f"{base_url}?{urlencode(params)}"

            # Open the page
            driver.get(url_with_params)
            time.sleep(random.uniform(7, 10))  # Random wait

            # Close the popup if it exists
            close_popup_if_exists()

            # Check for CAPTCHA
            check_for_captcha()

            count = 0
            while count < max_click_LOAD_MORE:
                try:
                    # Scroll to make the button visible
                    scroll_to_button()

                    # Find and click the 'Load more results' button
                    load_more_button = WebDriverWait(driver, 10).until(
                        EC.element_to_be_clickable((By.XPATH, "//button[.//span[contains(text(), 'Load more results')]]"))
                    )
                    load_more_button.click()
                    print(f"Button clicked #{count + 1}")
                    count += 1
                    time.sleep(random.uniform(7, 10))  # Random wait after clicking

                except (TimeoutException, NoSuchElementException) as e:
                    # No (more) button: everything available is already loaded.
                    print(f"Stopping scroll due to: {str(e)}")
                    break

            # Parse the HTML using BeautifulSoup
            soup = BeautifulSoup(driver.page_source, 'html.parser')

            # Pass 0 so 'order' is the 1-based position within this results
            # page. The previous running offset made later length-of-stay
            # searches start at 2, 3, ... instead of 1.
            # NOTE(review): assumes 'order' means per-page rank; confirm.
            hotels = get_hotel_data(soup, checkin_date, checkout_date, search_date, 0)
            all_hotels.extend(hotels)
finally:
    # Close the browser even when a search fails part-way through, so no
    # Chrome/chromedriver process is left behind.
    driver.quit()

# Create a DataFrame to display the data
df = pd.DataFrame(all_hotels)
print(df)

# Save the data to a CSV file
df.to_csv('booking_hotels_ny.csv', index=False, encoding='utf-8-sig')