# In [None]:
import logging
import time
from datetime import datetime, timedelta
from concurrent.futures import ThreadPoolExecutor, as_completed
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import NoSuchElementException, WebDriverException
import pandas as pd
import re

# --- Scraper configuration -------------------------------------------------

# File that save_to_csv() writes the collected rows to.
OUTPUT_CSV_PATH = 'Google_flights_050325.csv'
# Value sent as the `curr` URL parameter. NOTE: the price regex in
# parse_flight_details() only matches the literal text "US dollars", so
# changing this alone will make every price come back as "N/A".
CURRENCY = "USD"
# Number of snapshot dates main() generates (spaced 7 days apart from now).
SNAPSHOTS = 1
# Offsets, in days, from the snapshot date to the departure date
# (presumably "time to travel" - confirm the acronym with the author).
TTT_RANGE = range(1, 31)
# Offsets, in days, from the departure date to the return date
# (presumably "length of stay" - confirm the acronym with the author).
LOS_RANGE = range(1, 6)
# (origin, destination) pairs; each pair is searched for every TTT/LOS combination.
ROUTES = [
    ("Paris", "London"),
    ("Paris", "Rome"),
    ("London", "Paris"),
    ("London", "Rome"),
    ("Rome", "London"),
    ("Rome", "Paris")
]
# Upper bound on the number of searches run_scraping() will schedule.
MAX_SEARCHES = 10000
# Thread-pool size; every running search starts its own Chrome instance
# (scrape_flight_data() calls init_driver()), so this also bounds the number
# of concurrent browsers.
MAX_WORKERS = 12

def configure_logging():
    """Configure the root logger to emit INFO+ records to a log file and the console.

    Records are formatted as ``<time> - <level> - <message>`` and sent to
    ``Google_flights_040325.log`` (in the current working directory) as well
    as to a ``StreamHandler``.

    Note that ``logging.basicConfig`` leaves the root logger untouched when it
    already has handlers installed.
    """
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(levelname)s - %(message)s',
        handlers=[
            # Explicit UTF-8: messages logged by this script contain non-ASCII
            # characters (e.g. "→"), which a platform default encoding such as
            # cp1252 cannot write.
            logging.FileHandler("Google_flights_040325.log", encoding="utf-8"),
            logging.StreamHandler(),
        ],
    )

def init_driver():
    """Start a headless Chrome WebDriver.

    Returns:
        The new ``webdriver.Chrome`` instance, or ``None`` when Chrome raises
        ``WebDriverException`` on start-up (the error is logged).
    """
    chrome_flags = (
        "--headless",
        '--disable-gpu',
        '--no-sandbox',
        '--disable-dev-shm-usage',
    )
    chrome_options = webdriver.ChromeOptions()
    for flag in chrome_flags:
        chrome_options.add_argument(flag)

    try:
        browser = webdriver.Chrome(options=chrome_options)
    except WebDriverException as e:
        logging.error(f"WebDriver error: {e}")
        return None
    return browser

def generate_dates(ttt, los, snapshot_date):
    """Return the trip's (departure, return) dates as 'YYYY-MM-DD' strings.

    Args:
        ttt: Days between ``snapshot_date`` and the departure date.
        los: Days between the departure date and the return date.
        snapshot_date: Date/datetime the offsets are counted from.

    Returns:
        A 2-tuple ``(departure, return)`` of formatted date strings.
    """
    date_format = '%Y-%m-%d'
    outbound = snapshot_date + timedelta(days=ttt)
    inbound = outbound + timedelta(days=los)
    return tuple(day.strftime(date_format) for day in (outbound, inbound))

def _build_search_url(origin, destination, departure_date, return_date):
    """Return the Google Flights free-text search URL for one round trip.

    The whole query is percent-encoded, so place names containing spaces,
    '&', '#', '+' or non-ASCII characters cannot corrupt the query string.
    For plain single-word names the result is identical to hand-writing
    ``%20`` between the words.
    """
    from urllib.parse import quote

    query = (
        f"Flights from {origin} to {destination} "
        f"on {departure_date} returning on {return_date}"
    )
    return (
        "https://www.google.com/travel/flights"
        f"?q={quote(query, safe='')}&curr={CURRENCY}&hl=en-US"
    )


def scrape_flight_data(origin, destination, ttt, los, snapshot_date):
    """Scrape one Google Flights round-trip search and return the parsed rows.

    A dedicated Chrome driver is created for the search and always quit
    afterwards, so the function can be run from several threads at once.

    Args:
        origin: Departure place name used in the search query.
        destination: Arrival place name used in the search query.
        ttt: Days between ``snapshot_date`` and the departure date.
        los: Days between the departure date and the return date.
        snapshot_date: Date/datetime the search is made relative to.

    Returns:
        A list of row dicts as produced by ``parse_flight_details`` (at most
        100). The list is empty when the driver could not be started and may
        be empty or partial when scraping fails; failures are logged, never
        raised.
    """
    driver = init_driver()
    if not driver:
        return []

    flight_data = []
    try:
        departure_date, return_date = generate_dates(ttt, los, snapshot_date)
        driver.get(_build_search_url(origin, destination, departure_date, return_date))
        time.sleep(5)  # fixed wait for the results page to render

        # Expand the result list when the page offers the button; its absence
        # is not an error, the visible results are scraped as they are.
        try:
            view_more_button = driver.find_element(By.XPATH, "//button[@aria-label='View more flights']")
            # Click through JavaScript rather than WebElement.click().
            driver.execute_script("arguments[0].click();", view_more_button)
            time.sleep(3)
        except NoSuchElementException:
            logging.warning(f"No 'View more flights' button found for {origin} → {destination}")

        # Page down a few times before collecting the result items.
        page_body = driver.find_element(By.TAG_NAME, 'body')
        for _ in range(5):
            page_body.send_keys(Keys.PAGE_DOWN)
            time.sleep(1)

        flights = driver.find_elements(By.CSS_SELECTOR, "li.pIav2d")

        # Cap at 100 results per search; one bad item must not abort the rest.
        for flight in flights[:100]:
            try:
                flight_text = extract_flight_text(flight)
                parsed_flight = parse_flight_details(
                    flight_text, snapshot_date, origin, destination,
                    ttt, los, departure_date, return_date,
                )
                if parsed_flight:
                    flight_data.append(parsed_flight)
            except Exception as e:
                logging.error(f"Error parsing flight data: {e}")

    except Exception as e:
        # Best effort: keep whatever was collected before the failure.
        logging.error(f"Error during scraping for {origin} → {destination}: {e}")

    finally:
        # Always release the browser process, even after an error.
        driver.quit()

    return flight_data

def extract_flight_text(li_element):
    """Return the ``aria-label`` of the ``JMc5Xc`` element inside a result item.

    Args:
        li_element: WebElement of one flight result (an ``li`` found by the caller).

    Returns:
        The value of the ``aria-label`` attribute, as returned by
        ``get_attribute``.
    """
    return li_element.find_element(By.CLASS_NAME, "JMc5Xc").get_attribute("aria-label")

# A duration written as "<h> hr <m> min", "<h> hr" or "<m> min".
_DURATION_RE = r'\d+ hr \d+ min|\d+ hr|\d+ min'
# A clock time such as "7:05 AM".
_CLOCK_RE = r'\d{1,2}:\d{2} [AP]M'

# Patterns applied to a flight's aria-label text; group 1 is the extracted value.
_FLIGHT_PATTERNS = {
    # Digits with optional thousands separators ("1,234").
    "price": re.compile(r'From (\d[\d,]*) US dollars'),
    "stops": re.compile(r'(\d+ stop|Nonstop)'),
    "airline": re.compile(r'with ([\w\s]+)\.'),
    "departure_airport": re.compile(r'Leaves (.+?) at'),
    "departure_time": re.compile(rf'at ({_CLOCK_RE})'),
    "arrival_airport": re.compile(r'arrives at (.+?) at'),
    # Anchored to the "arrives at <airport> at" clause so a departure time
    # that is also followed by " on" cannot be picked up by mistake.
    "arrival_time": re.compile(rf'arrives at .+? at ({_CLOCK_RE})'),
    "duration": re.compile(rf'Total duration ({_DURATION_RE})'),
    "layover": re.compile(rf'Layover \(1 of 1\) is a ({_DURATION_RE})'),
}


def parse_flight_details(flight_text, snapshot_date, origin, destination, ttt, los, departure_date, return_date):
    """Parse one flight's aria-label text into a flat row dict.

    Args:
        flight_text: Descriptive text of a flight result (its aria-label).
        snapshot_date: Date/datetime of the snapshot; stored as 'YYYY-MM-DD'.
        origin: Search origin, copied into the row.
        destination: Search destination, copied into the row.
        ttt: Days from snapshot to departure, copied into the row.
        los: Days from departure to return, copied into the row.
        departure_date: Departure date string, copied into the row.
        return_date: Return date string, copied into the row.

    Returns:
        A dict with one entry per output column. Fields that cannot be found
        in ``flight_text`` are "N/A" ("None" for the layover). Returns ``None``
        (after logging the error) when parsing raises, e.g. because
        ``flight_text`` is ``None``.
    """
    try:
        # Times may be written with a narrow or regular no-break space before
        # AM/PM; normalise them so the patterns (and the output) use a plain
        # space.
        text = flight_text.replace("\u202f", " ").replace("\u00a0", " ")

        def field(name, default="N/A"):
            found = _FLIGHT_PATTERNS[name].search(text)
            return found.group(1) if found else default

        return {
            "Website": "Google flights",
            "Snapshot date": snapshot_date.strftime("%Y-%m-%d"),
            "Airline": field("airline"),
            # Drop thousands separators so the price stays a plain digit string.
            "Price": field("price").replace(",", ""),
            "TTT": ttt,
            "LOS": los,
            "Origin": origin,
            "Destination": destination,
            "Departure date": departure_date,
            "Departure Airport": field("departure_airport"),
            "Departure Time": field("departure_time"),
            "Layover": field("layover", default="None"),
            "Stops": field("stops"),
            "Return date": return_date,
            "Arrival Airport": field("arrival_airport"),
            "Arrival Time": field("arrival_time"),
            "Duration": field("duration"),
        }

    except Exception as e:
        # Per-item boundary: a malformed entry is logged and skipped by the caller.
        logging.error(f"Error parsing flight details: {e}")
        return None

def run_scraping(snapshot_dates):
    """Run every configured search on a thread pool and collect the rows.

    One search is scheduled per combination of snapshot date, route, TTT and
    LOS (in that nesting order), stopping after ``MAX_SEARCHES`` searches.

    Args:
        snapshot_dates: Iterable of snapshot dates to search for.

    Returns:
        A flat list with the rows of all searches, in completion order (not
        submission order). A search that raises is logged and skipped.
    """
    from itertools import islice, product

    # A single flat, capped iterator: once the limit is reached iteration
    # really stops, instead of a `break` that only leaves the innermost of
    # four nested loops while the outer ones keep spinning.
    searches = islice(
        product(snapshot_dates, ROUTES, TTT_RANGE, LOS_RANGE),
        max(MAX_SEARCHES, 0),
    )

    data = []
    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
        futures = []
        for search_count, (snapshot_date, (origin, destination), ttt, los) in enumerate(searches, start=1):
            futures.append(executor.submit(scrape_flight_data, origin, destination, ttt, los, snapshot_date))
            logging.info(f"Scheduled search {search_count} of {MAX_SEARCHES}")

        # Gather results as the searches finish.
        for future in as_completed(futures):
            try:
                flight_data = future.result()
                if flight_data:
                    data.extend(flight_data)
            except Exception as e:
                logging.error(f"Error in future execution: {e}")

    return data

def save_to_csv(data):
    """Write the collected rows to ``OUTPUT_CSV_PATH`` as CSV, without the index column.

    Args:
        data: Rows to store; passed straight to ``pd.DataFrame``.
    """
    pd.DataFrame(data).to_csv(OUTPUT_CSV_PATH, index=False)
    logging.info(f"Data saved to {OUTPUT_CSV_PATH}")

def main():
    """Set up logging, scrape every configured search and save the rows to CSV."""
    configure_logging()

    # One snapshot per week, starting from the current moment.
    snapshot_dates = []
    for week in range(SNAPSHOTS):
        snapshot_dates.append(datetime.now() + timedelta(days=7 * week))

    save_to_csv(run_scraping(snapshot_dates))

# Run the scraper only when this file is executed directly, not when imported.
if __name__ == "__main__":
    main()