# In [None]:
import logging
import time
import pandas as pd
from datetime import datetime, timedelta
from concurrent.futures import ThreadPoolExecutor, as_completed
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import TimeoutException, NoSuchElementException
import re

# Configure the root logger once, at import time: INFO and above, each record
# prefixed with a timestamp and its level, sent to two destinations.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
    handlers=[
        logging.FileHandler("kayak050325.log"),  # relative path: resolved against the working directory
        logging.StreamHandler()  # no stream given, so the handler's default (stderr) is used
    ]
)

# CSV file that main() writes the combined results to.
OUTPUT_CSV_PATH = "flight_prices_kayak050325_withStops.csv"
# NOTE(review): not referenced anywhere in the visible code — confirm whether
# it is still needed.
CURRENCY = "USD"
# Number of snapshot dates main() generates (7 days apart, starting from now).
SNAPSHOTS = 1
# Day offsets added to the snapshot date to get the departure date (1..30).
# NOTE(review): "TTT" presumably stands for "time to travel" — confirm.
TTT_RANGE = range(1, 31)
# Day offsets added to the departure date to get the return date (1..5).
# NOTE(review): "LOS" presumably stands for "length of stay" — confirm.
LOS_RANGE = range(1, 6)
# (origin, destination) code pairs; each is interpolated into the search URL.
ROUTES = [
    ("PAR", "LON"),
    ("LON", "PAR"),
    ("ROM", "PAR"),
    ("PAR", "ROM"),
    ("LON", "ROM"),
    ("ROM", "LON")
]
# Upper bound on how many searches main() will schedule in one run.
MAX_SEARCHES = 1000
# Thread-pool size. Every running search starts its own Chrome instance, so
# this is also the maximum number of browsers open at the same time.
MAX_WORKERS = 10

def init_driver():
    """Create and return a new Chrome WebDriver session.

    The browser is launched with the ``--disable-gpu``, ``--no-sandbox`` and
    ``--disable-dev-shm-usage`` flags. Headless mode is not enabled, so the
    browser window is visible.
    """
    chrome_flags = (
        "--disable-gpu",
        "--no-sandbox",
        "--disable-dev-shm-usage",
    )
    chrome_options = webdriver.ChromeOptions()
    for flag in chrome_flags:
        chrome_options.add_argument(flag)
    return webdriver.Chrome(options=chrome_options)

def convert_duration_to_hours(duration):
    """Convert a duration string such as ``"2h 30m"`` to decimal hours.

    Spaces are ignored. The number before the first ``h`` is read as hours
    and the number before the following ``m`` as minutes; either part may be
    missing. Text after the minutes marker is ignored.

    Args:
        duration: Duration text, e.g. ``"2h 30m"``, ``"45m"`` or ``"3h"``.

    Returns:
        The duration in hours, rounded to two decimal places. A string with
        neither marker (for example ``"0"``) yields ``0.0``.

    Raises:
        ValueError: If the text in front of a marker is not an integer.
    """
    remaining = duration.replace(' ', '')
    minutes = 0

    if 'h' in remaining:
        # Keep only the first two pieces: the hour digits and whatever sits
        # between the first and the second 'h'.
        hour_text, remaining = remaining.split('h')[:2]
        if hour_text:
            minutes += 60 * int(hour_text)

    if 'm' in remaining:
        minute_text = remaining.split('m')[0]
        if minute_text:
            minutes += int(minute_text)

    return round(minutes / 60, 2)

def generate_dates(ttt, los, snapshot_date):
    """Return the departure and return dates of a search as ISO strings.

    Args:
        ttt: Days between ``snapshot_date`` and the departure date.
        los: Days between the departure date and the return date.
        snapshot_date: Date the offsets are counted from; must support
            ``+ timedelta`` and ``strftime``.

    Returns:
        A ``(departure, return)`` tuple of ``YYYY-MM-DD`` strings.
    """
    outbound = snapshot_date + timedelta(days=ttt)
    inbound = outbound + timedelta(days=los)
    return tuple(day.strftime("%Y-%m-%d") for day in (outbound, inbound))

def scrape_flight_data(origin, destination, ttt, los, snapshot_date):
    """Scrape one Kayak round-trip search and return its flight rows.

    Starts a fresh Chrome session, loads the search page for the given route
    and dates, expands and scrolls the result list, then reads up to 50
    result cards. The browser is always closed before returning.

    Args:
        origin: Origin code placed in the search URL.
        destination: Destination code placed in the search URL.
        ttt: Days between ``snapshot_date`` and the departure date.
        los: Days between the departure date and the return date.
        snapshot_date: Date the search is anchored to; must support
            ``+ timedelta`` and ``strftime``.

    Returns:
        A list of dicts, one per result card that could be parsed. Errors
        raised while loading or parsing the page are logged rather than
        raised, so the list may be empty.
    """
    driver = init_driver()
    flight_data = []

    try:
        departure_date, return_date = generate_dates(ttt, los, snapshot_date)
        # NOTE(review): "?stops=1" presumably restricts results to one-stop
        # itineraries — confirm against Kayak's URL parameters.
        search_url = (
            f"https://www.kayak.com/flights/{origin}-{destination}/{departure_date}/{return_date}?stops=1"
        )

        logging.info(f"Fetching flights: {origin} → {destination} ({departure_date} - {return_date})")
        driver.get(search_url)
        time.sleep(5)  # fixed wait for the result list to render

        max_clicks = 10
        click_count = 0

        # Expand the result list until the button disappears or the cap is hit.
        while click_count < max_clicks:
            try:
                view_more_button = driver.find_element(By.XPATH, "//div[contains(@class, 'show-more-button')]")
                view_more_button.click()
                click_count += 1
                time.sleep(3)
                logging.info(f"Clicked 'Show more' button {click_count} times.")
            except NoSuchElementException:
                break

        for _ in range(10):
            driver.find_element(By.TAG_NAME, "body").send_keys(Keys.PAGE_DOWN)
            time.sleep(1)

        # Collect the cards only after scrolling: references taken earlier
        # can go stale if the page re-renders, and would miss anything that
        # was rendered while scrolling.
        flights = driver.find_elements(By.CSS_SELECTOR, "div.yuAt.yuAt-pres-rounded.yuAt-mod-box-shadow.yuAt-mod-responsive-margins")

        for flight in flights[:50]:
            try:
                # Every lookup is scoped to this card. Searching from
                # `driver` would return the first match on the whole page
                # and give every row the same value.
                airline = flight.find_element(By.CSS_SELECTOR, "div.c_cgF.c_cgF-mod-variant-default").text
                price = flight.find_element(By.CSS_SELECTOR, "div.f8F1-price-text").text
                duration_str = flight.find_element(By.CSS_SELECTOR, "div.xdW8").text
                duration_hours = convert_duration_to_hours(duration_str)
                stops = flight.find_element(By.CSS_SELECTOR, "span.JWEO-stops-text").text.strip()
                airports = flight.find_element(By.CSS_SELECTOR, "div.EFvI").text
                departure_airport, arrival_airport = airports.split("-")
                time_element = flight.find_element(By.CSS_SELECTOR, "div.vmXl.vmXl-mod-variant-large")
                times = time_element.find_elements(By.TAG_NAME, "span")
                departure_time = times[0].text
                arrival_time = times[-1].text

                # Use the first tooltip in this card that mentions a layover.
                # A card without one keeps the "0" default instead of being
                # dropped.
                layover_duration = "0"
                for candidate in flight.find_elements(By.CSS_SELECTOR, ".c_cgF span span"):
                    # get_attribute returns None when the attribute is absent.
                    layover_text = candidate.get_attribute("title") or ""
                    match = re.search(r"\d+h \d+m layover", layover_text)
                    if match:
                        layover_duration = match.group(0)
                        break
                layover_duration_in_hours = convert_duration_to_hours(layover_duration)

                flight_data.append({
                    "Website": "Kayak",
                    "Snapshot date": snapshot_date.strftime("%Y-%m-%d"),
                    "Airline": airline,
                    "Price": price,
                    "TTT": ttt,
                    "LOS": los,
                    "Origin": origin,
                    "Destination": destination,
                    "Departure date": departure_date,
                    "Departure Airport": departure_airport,
                    "Departure Time": departure_time,
                    "Layover": layover_duration_in_hours,
                    "Stops": stops,
                    "Return date": return_date,
                    "Arrival Airport": arrival_airport,
                    "Arrival Time": arrival_time,
                    "Duration": duration_hours,
                })
            except Exception as e:
                # One unparsable card must not discard the rest of the page.
                logging.error(f"Error extracting flight data for {origin} → {destination}: {e}")

    except TimeoutException:
        logging.error(f"Timeout loading page for {origin} → {destination}")

    except Exception as e:
        logging.error(f"Unexpected error while scraping {origin} → {destination}: {e}")

    finally:
        driver.quit()

    logging.info(f"Completed: {origin} → {destination} (TTT: {ttt}, LOS: {los}), Found {len(flight_data)} flights.")
    return flight_data

def main():
    """Schedule every configured search, gather the rows and write the CSV.

    Searches are submitted to a thread pool in snapshot → route → TTT → LOS
    order, stopping once MAX_SEARCHES have been scheduled. Results are
    collected as the searches finish; a search that raises is logged and
    skipped. All rows are written to OUTPUT_CSV_PATH.
    """
    snapshot_dates = [datetime.now() + timedelta(days=7 * i) for i in range(SNAPSHOTS)]

    # Lazy stream of search parameters, produced in the same order as four
    # nested loops would produce them.
    search_space = (
        (snapshot_date, origin, destination, ttt, los)
        for snapshot_date in snapshot_dates
        for origin, destination in ROUTES
        for ttt in TTT_RANGE
        for los in LOS_RANGE
    )

    rows = []
    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as pool:
        pending = []
        for scheduled, (snapshot_date, origin, destination, ttt, los) in enumerate(search_space):
            if scheduled >= MAX_SEARCHES:
                break
            pending.append(
                pool.submit(scrape_flight_data, origin, destination, ttt, los, snapshot_date)
            )
            logging.info(f"Scheduled search {scheduled + 1} of {MAX_SEARCHES}")

        for finished in as_completed(pending):
            try:
                rows.extend(finished.result())
            except Exception as e:
                logging.error(f"Error in future execution: {e}")

    pd.DataFrame(rows).to_csv(OUTPUT_CSV_PATH, index=False)
    logging.info(f"Data successfully saved to {OUTPUT_CSV_PATH}")

# Start the scrape only when this file is run as a script, not when imported.
if __name__ == "__main__":
    main()