In [None]:
import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
from datetime import datetime, timedelta
import re

# Search configuration: the site root and the baseline query values.
# The script's search loop passes copies of initial_params to update_dates,
# which shifts the two date fields for each search.
base_url = "https://www.expedia.com"
initial_params = dict(
    destination='New York (and vicinity), New York, United States of America',
    startDate='2024-07-11',
    endDate='2024-07-12',
    adults='2',
    rooms='1',
)

# Chrome launch flags
options = webdriver.ChromeOptions()
# options.add_argument('--headless')  # Uncomment to run in headless mode
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')

# Single shared browser session used by the rest of the script; the driver
# binary is obtained through webdriver_manager's ChromeDriverManager.
driver = webdriver.Chrome(
    options=options,
    service=Service(ChromeDriverManager().install()),
)


# Function to extract hotel data from HTML


# Patterns applied to every hotel card; compiled once instead of per card.
_ROOMS_LEFT_RE = re.compile(r'we have (\d+) left at', re.IGNORECASE)
_GRADE_RE = re.compile(r'\d+\.\d+')


def _text_or_na(element, currency=None):
    """Return the stripped text of *element*, or 'N/A' when *element* is None.

    If *currency* is given, that symbol is removed from the text and the
    result is stripped again.
    """
    if element is None:
        return 'N/A'
    text = element.get_text(strip=True)
    if currency is not None:
        text = text.replace(currency, '').strip()
    return text


def _first_word_or_na(element):
    """Return the first whitespace-separated word of *element*'s text.

    Yields 'N/A' when *element* is None or its text is empty.
    """
    if element is None:
        return 'N/A'
    words = element.get_text(strip=True).split()
    return words[0] if words else 'N/A'


def get_hotel_data(driver, soup, checkin_date, checkout_date, search_date, order):
    """Extract one record per hotel card found in a parsed results page.

    Args:
        driver: Unused; kept so existing callers do not need to change.
        soup: Parsed page exposing ``find_all`` (e.g. a BeautifulSoup object).
        checkin_date: Check-in date as a 'YYYY-MM-DD' string.
        checkout_date: Check-out date as a 'YYYY-MM-DD' string.
        search_date: Object with ``strftime`` (e.g. ``datetime``) giving the
            moment the search was run; only its calendar date is used.
        order: Integer offset; the first card gets ``order + 1``, the next
            ``order + 2`` and so on. The caller's variable is not modified.

    Returns:
        A list of dicts, one per card, in page order. Text fields that cannot
        be found are 'N/A'; 'rooms_left' is an int or None.
    """
    hotel_list = soup.find_all('div',
                               class_='uitk-card uitk-card-roundcorner-all uitk-card-has-border uitk-card-has-primary-theme')
    if not hotel_list:
        return []

    # These values are identical for every card on the page, so compute once.
    date_format = '%Y-%m-%d'
    snapshot = search_date.strftime(date_format)
    checkin = datetime.strptime(checkin_date, date_format)
    checkout = datetime.strptime(checkout_date, date_format)
    # TTT - Time to Travel: whole days from the snapshot date to check-in.
    # Round-tripping through the formatted string drops the time of day.
    ttt = (checkin - datetime.strptime(snapshot, date_format)).days
    # LOS - Length of Stay in nights.
    los = (checkout - checkin).days

    hotels = []
    for position, hotel in enumerate(hotel_list, start=order + 1):
        name = _text_or_na(hotel.find(
            'h3',
            class_='uitk-heading uitk-heading-5 overflow-wrap uitk-layout-grid-item uitk-layout-grid-item-has-row-start'))

        neighborhood = _text_or_na(hotel.find(
            'div',
            class_='uitk-text uitk-text-spacing-half truncate-lines-2 uitk-type-300 uitk-text-default-theme'))

        # NOTE(review): the two displayed prices strip '€' while the
        # after-taxes price strips '$' - confirm which currency the site
        # actually serves; the symbols are kept as in the original code.
        current_price = _text_or_na(
            hotel.find('div', class_='uitk-text uitk-type-500 uitk-type-medium uitk-text-emphasis-theme'),
            currency='€')
        original_price = _text_or_na(hotel.find('del'), currency='€')

        reviews_desc = _first_word_or_na(hotel.find(
            'span', class_='uitk-text uitk-type-300 uitk-type-medium uitk-text-emphasis-theme'))
        num_reviews = _first_word_or_na(hotel.find(
            'span', class_='uitk-text uitk-type-200 uitk-type-regular uitk-text-default-theme'))

        # Breakfast: look for the amenity text inside its dedicated row only.
        breakfast_row = hotel.find(
            'div',
            class_='uitk-layout-flex uitk-layout-flex-align-items-center uitk-layout-flex-gap-two uitk-spacing uitk-spacing-padding-inlineend-three uitk-spacing-padding-block-one')
        breakfast_info = None
        if breakfast_row is not None:
            breakfast_info = breakfast_row.find(
                'div', class_='uitk-text truncate-lines-2 uitk-type-200 uitk-text-default-theme')
        breakfast_text = breakfast_info.get_text(strip=True) if breakfast_info is not None else ''
        breakfast = breakfast_text if 'Breakfast included' in breakfast_text else 'No'

        # Free cancellation: the positive-theme text must mention it.
        refund_div = hotel.find('div', class_='uitk-text uitk-type-300 uitk-text-positive-theme')
        if refund_div is not None and 'Fully refundable' in refund_div.get_text(strip=True):
            free_cancellation = "Yes"
        else:
            free_cancellation = "No"

        # Rooms left: parsed from a badge such as "We have 3 left at ...".
        rooms_left = None
        badge_span = hotel.find('span', class_='uitk-badge-text')
        if badge_span is not None:
            rooms_match = _ROOMS_LEFT_RE.search(badge_span.get_text(strip=True))
            if rooms_match:
                rooms_left = int(rooms_match.group(1))

        # Grade: first decimal number in the rating container's text.
        grade_match = _GRADE_RE.search(_text_or_na(
            hotel.find('div', class_='uitk-layout-flex uitk-layout-flex-align-items-center')))
        grade = grade_match.group(0) if grade_match else 'N/A'

        # Exact-text spans for the two payment/refund labels.
        fully_refundable = "Yes" if hotel.find('span', string="Fully refundable") else "No"
        reserve_now_pay_later = "Yes" if hotel.find('span', string="Reserve now, pay later") else "No"

        # Guarded by its own element (the original tested current_price_div
        # here, which crashed when only the after-taxes div was missing).
        price_after_taxes = _text_or_na(
            hotel.find('div', class_='uitk-text uitk-type-end uitk-type-200 uitk-text-default-theme'),
            currency='$')

        hotels.append({
            'name': name,
            'neighborhood': neighborhood,
            'current_price': current_price,
            'original_price': original_price,
            'reviews_desc': reviews_desc,
            'num_reviews': num_reviews,
            'breakfast': breakfast,
            'free_cancellation': free_cancellation,
            'rooms_left': rooms_left,
            'grade': grade,
            'fully_refundable': fully_refundable,
            'reserve_now_pay_later': reserve_now_pay_later,
            'price_after_taxes': price_after_taxes,
            'checkin_date': checkin_date,
            'checkout_date': checkout_date,
            'TTT': ttt,
            'LOS': los,
            'Snapshot Date': snapshot,
            'order': position,
        })

    return hotels


# Function to update check-in and check-out dates
def update_dates(params, checkin_days_to_add, checkout_days_to_add):
    """Shift the stay window stored in *params* and return the new dates.

    The new check-in is ``params['startDate']`` plus *checkin_days_to_add*
    days; the new check-out is that check-in plus *checkout_days_to_add*
    days (the previous ``endDate`` value is not consulted).

    *params* is updated in place and also returned, together with the new
    check-in and check-out dates as 'YYYY-MM-DD' strings.
    """
    iso_day = '%Y-%m-%d'
    start = datetime.strptime(params['startDate'], iso_day) + timedelta(days=checkin_days_to_add)
    end = start + timedelta(days=checkout_days_to_add)

    start_text, end_text = start.strftime(iso_day), end.strftime(iso_day)
    params.update(startDate=start_text, endDate=end_text)

    return params, start_text, end_text


# Function to close popup if it exists
def close_popup_if_exists():
    """Dismiss the sign-in popup on the current page when one shows up.

    Waits up to 10 seconds on the module-level ``driver`` for the dismiss
    button to become clickable and clicks it. A timeout or a missing element
    is reported on stdout and otherwise ignored.
    """
    dismiss_locator = (By.XPATH, "//button[@aria-label='Dismiss sign in information.']")
    try:
        WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable(dismiss_locator)
        ).click()
    except (TimeoutException, NoSuchElementException):
        print("No popup to close")
    else:
        print("Popup closed")


# List to store all results
all_hotels = []

max_checkin_days = 30  # Max days to search in advance
max_click_LOAD_MORE = 1  # Max 'Show more' clicks per results page

# Record the search date
search_date = datetime.now()

# The browser must be closed even if a page interaction raises part-way
# through the run, otherwise the Chrome/chromedriver processes are leaked.
try:
    for checkin_day in range(max_checkin_days):
        # Move the check-in date forward by checkin_day days from the baseline
        params, checkin_date, checkout_date = update_dates(initial_params.copy(), checkin_day, 1)

        # NOTE(review): `order` is reset per check-in day but bumped by one
        # after every search, so later stay lengths start their numbering at
        # a higher offset. Kept as-is; confirm whether per-page rank
        # (always starting from 0) was intended.
        order = 0

        for checkout_day in range(1, 6):  # Check up to 5 days after the check-in
            # Same check-in, check-out set checkout_day days later
            params, checkin_date, checkout_date = update_dates(params.copy(), 0, checkout_day)

            # Build the URL with parameters
            url_with_params = f"{base_url}/Hotel-Search?destination={params['destination']}&flexibility=0_DAY&d1={params['startDate']}&startDate={params['startDate']}&d2={params['endDate']}&endDate={params['endDate']}&adults={params['adults']}&rooms={params['rooms']}&regionId=178293&theme=&userIntent=&semdtl=&useRewards=false&sort=RECOMMENDED"

            # Open the page
            driver.get(url_with_params)
            time.sleep(7)  # Wait a bit to allow the HTML to load

            # Close the popup if it exists
            close_popup_if_exists()

            # Scroll down to the bottom of the page
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(2)  # Wait for the page to load after scrolling

            # Click on 'Show more' button to load additional results
            count = 0
            while count < max_click_LOAD_MORE:
                try:
                    show_more_button = WebDriverWait(driver, 10).until(
                        EC.element_to_be_clickable((By.XPATH,
                                                    "//button[contains(@class, 'uitk-button uitk-button-medium uitk-button-has-text uitk-button-secondary') and contains(text(), 'Show more')]"))
                    )
                    driver.execute_script("arguments[0].scrollIntoView(true);", show_more_button)
                    time.sleep(1)
                    show_more_button.click()
                    print(f"Button clicked #{count + 1}")
                    count += 1
                    time.sleep(7)  # Wait for the page to load after clicking 'Show more'
                except (TimeoutException, NoSuchElementException) as e:
                    print(f"Failed to click 'Show more' button: {e}")
                    break  # Exit the loop if no more "Show more" buttons

            # Parse the HTML using BeautifulSoup
            soup = BeautifulSoup(driver.page_source, 'html.parser')

            # Extract hotel data from the current page
            hotels = get_hotel_data(driver, soup, checkin_date, checkout_date, search_date, order)
            all_hotels.extend(hotels)
            order += 1
finally:
    # Close the browser
    driver.quit()

# Create a DataFrame to display the data
df = pd.DataFrame(all_hotels)
print(df)

# Save the data to a CSV file
df.to_csv('expedia_hotels_ny.csv', index=False, encoding='utf-8-sig')