In [2]:
import logging
import time
import pandas as pd
import random
import json
from datetime import datetime, timedelta
from concurrent.futures import ThreadPoolExecutor, as_completed
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException
from fake_useragent import UserAgent
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from fp.fp import FreeProxy
import re

# Logging setup
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
    handlers=[
        logging.FileHandler("kayak.log"),  
        logging.StreamHandler()  
    ]
)

# Global Variables
OUTPUT_CSV_PATH = "flight_prices_kayak.csv"
CURRENCY = "USD"
SNAPSHOTS = 1
TTT_RANGE = range(1, 2)
LOS_RANGE = range(1, 2)
ROUTES = [
    ("PAR", "LON"),
    ("LON", "PAR"),
    ("ROM", "PAR"),
    ("PAR", "ROM"),
    ("LON", "ROM"),
    ("ROM", "LON")
]
MAX_SEARCHES = 1
MAX_WORKERS = 1

def init_driver():
    """Initializes a Selenium WebDriver with anti-bot measures and optional proxy."""
    options = webdriver.ChromeOptions()
    ua = UserAgent()
    user_agent = ua.random
    options.add_argument(f"user-agent={user_agent}")
    options.add_argument("--disable-blink-features=AutomationControlled")
    options.add_experimental_option("excludeSwitches", ["enable-automation"])
    options.add_experimental_option("useAutomationExtension", False)
    
    proxy = get_proxy()
    if proxy:
        options.add_argument(f"--proxy-server={proxy}")
    
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
    driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")
    return driver

def get_proxy():
    """Fetches a random proxy from FreeProxy."""
    try:
        proxy = FreeProxy(rand=True).get()
        logging.info(f"Using proxy: {proxy}")
        return proxy
    except Exception as e:
        logging.warning(f"Failed to get proxy: {e}. Continuing without proxy.")
        return None

def random_human_delay(min_t=1, max_t=3):
    """Simulates a random human-like delay."""
    time.sleep(random.uniform(min_t, max_t))

def convert_duration_to_hours(duration):
    """Converts flight duration (e.g., '2h 30m') into total hours."""
    total_minutes = 0
    duration = duration.replace(' ', '')

    hours_match = re.search(r"(\d+)h", duration)
    minutes_match = re.search(r"(\d+)m", duration)

    if hours_match:
        total_minutes += int(hours_match.group(1)) * 60
    if minutes_match:
        total_minutes += int(minutes_match.group(1))

    return round(total_minutes / 60, 2)

def generate_dates(ttt, los, snapshot_date):
    """Generates departure and return dates."""
    departure_date = snapshot_date + timedelta(days=ttt)
    return_date = departure_date + timedelta(days=los)
    return departure_date.strftime("%Y-%m-%d"), return_date.strftime("%Y-%m-%d")

def human_scroll(driver):
    """Simulates human-like scrolling behavior."""
    for _ in range(random.randint(5, 10)):
        driver.find_element(By.TAG_NAME, "body").send_keys(Keys.PAGE_DOWN)
        time.sleep(random.uniform(0.5, 2))

def scrape_flight_data(origin, destination, ttt, los, snapshot_date):
    """Scrapes flight data from Kayak."""
    driver = init_driver()
    flight_data = []

    try:
        departure_date, return_date = generate_dates(ttt, los, snapshot_date)
        search_url = f"https://www.kayak.com/flights/{origin}-{destination}/{departure_date}/{return_date}"

        logging.info(f"Fetching flights: {origin} → {destination} ({departure_date} - {return_date})")
        driver.get(search_url)
        random_human_delay(5, 10)

        try:
            view_more_button = WebDriverWait(driver, 5).until(
                EC.element_to_be_clickable((By.XPATH, "//button[contains(text(), 'View more')]"))
            )
            view_more_button.click()
            random_human_delay(5, 10)   
        except (NoSuchElementException, TimeoutException):
            logging.warning(f"No 'View more flights' button found for {origin} → {destination}")

        human_scroll(driver)

        flights = driver.find_elements(By.CSS_SELECTOR, "div.yuAt.yuAt-pres-rounded")

        for flight in flights[:100]:
            try:
                airline = flight.find_element(By.CSS_SELECTOR, "div.c_cgF").text
                price = flight.find_element(By.CSS_SELECTOR, "div.f8F1-price-text").text
                duration_str = flight.find_element(By.CSS_SELECTOR, "div.xdW8").text
                duration_hours = convert_duration_to_hours(duration_str)
                stops = flight.find_element(By.CSS_SELECTOR, "span.JWEO-stops-text").text.strip()
                airports = flight.find_element(By.CSS_SELECTOR, "div.EFvI").text
                departure_airport, arrival_airport = airports.split("-")
                time_element = flight.find_element(By.CSS_SELECTOR, "div.vmXl")
                times = time_element.find_elements(By.TAG_NAME, "span")
                departure_time = times[0].text
                arrival_time = times[-1].text

                flight_data.append({
                    "Website": "Kayak",
                    "Snapshot date": snapshot_date.strftime("%Y-%m-%d"),
                    "Airline": airline,
                    "Price": price,
                    "TTT": ttt,
                    "LOS": los,
                    "Origin": origin,
                    "Destination": destination,
                    "Departure date": departure_date,
                    "Departure Airport": departure_airport.strip(),
                    "Departure Time": departure_time,
                    "Stops": stops,
                    "Return date": return_date,
                    "Arrival Airport": arrival_airport.strip(),
                    "Arrival Time": arrival_time,
                    "Duration": duration_hours,
                })
            except Exception as e:
                logging.error(f"Error extracting flight data for {origin} → {destination}: {e}")

    except TimeoutException:
        logging.error(f"Timeout loading page for {origin} → {destination}")

    finally:
        driver.quit()

    logging.info(f"Completed: {origin} → {destination}, Found {len(flight_data)} flights.")
    return flight_data

def main():
    """Runs the scraper using multithreading."""
    data = []
    snapshot_dates = [datetime.now() + timedelta(days=7 * i) for i in range(SNAPSHOTS)]

    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
        futures = [
            executor.submit(scrape_flight_data, origin, destination, ttt, los, snapshot_date)
            for snapshot_date in snapshot_dates
            for ttt in TTT_RANGE
            for los in LOS_RANGE
            for origin, destination in ROUTES
        ]

        for future in as_completed(futures):
            data.extend(future.result())

    df = pd.DataFrame(data)
    df.to_csv(OUTPUT_CSV_PATH, index=False)
    logging.info(f"Data saved to {OUTPUT_CSV_PATH}")

if __name__ == "__main__":
    main()


2025-03-03 18:12:48,031 - INFO - Using proxy: http://3.96.208.91:3128
2025-03-03 18:12:48,165 - INFO - Get LATEST chromedriver version for google-chrome
2025-03-03 18:12:48,463 - INFO - Get LATEST chromedriver version for google-chrome
2025-03-03 18:12:48,764 - INFO - Driver [/Users/nadavcohen/.wdm/drivers/chromedriver/mac64/133.0.6943.141/chromedriver-mac-arm64/chromedriver] found in cache
2025-03-03 18:13:22,783 - INFO - Using proxy: http://3.71.239.218:3128
2025-03-03 18:13:22,909 - INFO - Get LATEST chromedriver version for google-chrome
2025-03-03 18:13:23,234 - INFO - Get LATEST chromedriver version for google-chrome
2025-03-03 18:13:23,578 - INFO - Driver [/Users/nadavcohen/.wdm/drivers/chromedriver/mac64/133.0.6943.141/chromedriver-mac-arm64/chromedriver] found in cache
2025-03-03 18:13:57,360 - INFO - Using proxy: http://13.246.184.110:3128
2025-03-03 18:13:57,475 - INFO - Get LATEST chromedriver version for google-chrome
2025-03-03 18:13:57,798 - INFO - Get LATEST chromedrive

WebDriverException: Message: Can not connect to the Service /Users/nadavcohen/.wdm/drivers/chromedriver/mac64/133.0.6943.141/chromedriver-mac-arm64/chromedriver
