# In [None]:
# 🏁 RunSignUp Race Scraper

import time
from pathlib import Path
from urllib.parse import urlencode

import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager

# ----------------------
# 1. Helper Functions
# ----------------------

def get_driver():
    """Build and return a Chrome WebDriver configured to run headless.

    The chromedriver binary is resolved through ``ChromeDriverManager`` and
    handed to Selenium via a ``Service`` object.
    """
    options = webdriver.ChromeOptions()
    for flag in ("--headless", "--disable-gpu", "--no-sandbox"):
        options.add_argument(flag)

    driver_path = ChromeDriverManager().install()
    return webdriver.Chrome(service=Service(driver_path), options=options)

def _split_location(location_text):
    """Split a ``"City, ST CC"`` style string into ``(city, state, country)``.

    The text is split on its last comma; the part after the comma is split on
    its first space into state and country. Any piece that is missing or
    empty is reported as ``"N/A"`` so every field uses the same sentinel.
    """
    if "," not in location_text:
        return "N/A", "N/A", "N/A"

    city, state_country = location_text.rsplit(",", 1)
    state, _, country = state_country.strip().partition(" ")
    return city.strip() or "N/A", state or "N/A", country.strip() or "N/A"


def extract_race_info(row, race_type):
    """Extract title, link, location, postal code and date from one race row.

    Extraction is best-effort: each field is looked up independently and any
    failure for a field (element not found, stale element, ...) yields
    ``"N/A"`` for that field instead of raising. Only ``Exception`` subclasses
    are swallowed, so ``KeyboardInterrupt``/``SystemExit`` still propagate and
    a long scrape can be interrupted.

    Args:
        row: A Selenium element (a table row) exposing ``find_element``.
        race_type: Label copied verbatim into the ``"Race Type"`` field.

    Returns:
        A dict with the keys ``"Race Title"``, ``"Race Link"``, ``"City"``,
        ``"State"``, ``"Country"``, ``"Postal Code"``, ``"Date"`` and
        ``"Race Type"``.
    """
    try:
        title_element = row.find_element(By.CSS_SELECTOR, "div.flex-1 > a")
        title = title_element.text.strip()
        race_link = title_element.get_attribute("href")
    except Exception:
        title, race_link = "N/A", "N/A"

    try:
        date_text = row.find_element(By.CSS_SELECTOR, "td.ta-left.fs-sm-2").text.strip()
        # Drops the first four characters of the cell text — presumably a
        # weekday prefix such as "Sat "; confirm against the page markup.
        date = date_text[4:]
    except Exception:
        date = "N/A"

    try:
        location_span = row.find_element(By.CSS_SELECTOR, "td.ta-left.fs-sm-2 > span > span")
        city, state, country = _split_location(location_span.text.strip())
    except Exception:
        city, state, country = "N/A", "N/A", "N/A"

    try:
        # find_element raises when nothing matches, so no None check is needed.
        postal_code_element = row.find_element(By.CSS_SELECTOR, "div.postalCode")
        # Keep at most the first six characters of the postal-code text.
        postal_code = postal_code_element.text.strip()[:6]
    except Exception:
        postal_code = "N/A"

    return {
        "Race Title": title,
        "Race Link": race_link,
        "City": city,
        "State": state,
        "Country": country,
        "Postal Code": postal_code,
        "Date": date,
        "Race Type": race_type,
    }

def _build_search_url(race_type, page, num_items, start_date):
    """Return the RunSignUp race-search URL for one event type and page."""
    query = urlencode({
        "name": "",
        "eventType": race_type,
        "radius": 5,
        "zipcodeRadius": "",
        "country": "US",
        "state": "",
        "distance": "",
        "max_distance": "",
        "units": "K",
        "start_date": start_date,
        "end_date": "",
        "num": num_items,
        "page": page,
    })
    return f"https://runsignup.com/Races?{query}"


def scrape_races(race_type_list, num_pages=3, num_items=250, start_date="2025-02-03"):
    """Scrape race listings from RunSignUp for the given event types.

    For every event type, pages ``1..num_pages`` of the search results are
    loaded in a fresh headless browser, every ``<tr>`` on the page is passed
    to ``extract_race_info``, and rows that yield a race title are collected.

    Args:
        race_type_list: Iterable of values for the ``eventType`` query
            parameter.
        num_pages: Number of result pages to fetch per event type.
        num_items: Value of the ``num`` query parameter (results per page).
        start_date: Value of the ``start_date`` query parameter, in
            ``YYYY-MM-DD`` form. Defaults to the previously hard-coded date.

    Returns:
        A ``pandas.DataFrame`` with one row per extracted race; empty when
        nothing was collected.
    """
    race_data = []
    for race_type in race_type_list:
        for page in range(1, num_pages + 1):
            url = _build_search_url(race_type, page, num_items, start_date)
            driver = get_driver()
            # try/finally guarantees the browser is closed even when loading
            # or scripting the page raises, so no Chrome processes are leaked.
            try:
                driver.get(url)
                time.sleep(2)
                # Scroll to the bottom, then give the page time to settle.
                driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                time.sleep(2)

                try:
                    race_rows = WebDriverWait(driver, 10).until(
                        EC.presence_of_all_elements_located((By.XPATH, "//tr"))
                    )
                except Exception:
                    # No table rows appeared in time: skip this page.
                    continue

                for row in race_rows:
                    try:
                        race_info = extract_race_info(row, race_type)
                    except Exception:
                        # Best-effort: one bad row must not abort the page.
                        continue
                    # Rows without a title are not race entries.
                    if race_info["Race Title"] != "N/A":
                        race_data.append(race_info)
            finally:
                driver.quit()
    return pd.DataFrame(race_data)

# ----------------------
# 2. Run Scraper
# ----------------------

# Event types to scrape; each is sent as the eventType query parameter.
race_types = ["triathlon", "duathlon", "bike_race", "swim", "swim_run", "aqua_bike"]
df = scrape_races(race_types)

# ----------------------
# 3. Export Data
# ----------------------

output_path = "data/race_listings_RunSignUp.csv"
# Create the target directory first so a missing "data/" folder does not
# discard the results of a long scrape when to_csv is called.
Path(output_path).parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path, index=False)
print(f"Scraping complete. Data saved to {output_path}")