In [4]:
'''
# Improved Ironman Elevation Data Scraper

This script extracts bike and run elevation data from multiple sources:
1. TriathlonCourseInfo.com (primary source)
2. PJammCycling.com (specialized cycling data)
3. Ironman.com official website (authoritative data)

It combines data from these sources with a priority system to ensure accuracy.
'''


import pandas as pd
import requests
from bs4 import BeautifulSoup
import time
import re
import random
from urllib.parse import quote, urljoin
import logging

# Set up logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler("elevation_scraper.log"),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger()

def clean_race_name(race):
    """
    Cleans a race name for better search results.
    Removes parentheses, handles special cases, and standardizes names.
    """
    # Remove any text in parentheses and trailing whitespace
    cleaned = re.sub(r'\s*\([^)]*\)', '', race).strip().lower()
    
    # Handle special cases (extend as needed)
    special_cases = {
        'hawaii': 'kona',
        'wc hawaii': 'kona',
        'lake placid': 'lake placid',
        'mont-tremblant': 'mont tremblant',
        'whistler': 'whistler',
        'wisconsin': 'wisconsin',
        'florida': 'florida',
        'texas': 'texas',
        'arizona': 'arizona',
        'chattanooga': 'chattanooga',
        'coeur': 'coeur d\'alene',
        'maryland': 'maryland',
        'louisville': 'louisville',
        'california': 'california'
    }
    
    for key, value in special_cases.items():
        if key in cleaned:
            return value
    
    return cleaned

def clean_location_name(location):
    """
    Cleans a location name for search, focusing on the city name.
    """
    # Remove parentheses and trailing whitespace
    location = re.sub(r'\s*\([^)]*\)', '', location).strip()
    
    # Split by comma and take first part (city name)
    parts = location.split(',')
    if len(parts) > 0:
        city = parts[0].strip().lower()
        
        # Handle special cases
        special_cases = {
            "whistler/pemberton": "whistler",
            "the woodlands": "the woodlands texas",
            "mont-tremblant": "mont tremblant",
            "alcúdia": "mallorca",
            "alcudia": "mallorca",
            "mar del plata": "mar del plata argentina",
            "lake placid": "lake placid",
            "kona": "kona hawaii",
            "santa rosa": "santa rosa california",
            "panama city beach": "panama city beach florida",
            "coeur d'alene": "coeur d'alene idaho"
        }
        
        for key, value in special_cases.items():
            if key in city:
                return value
        
        return city
    
    return location.lower()

def extract_number(text):
    """
    Extracts numeric values from text, accounting for various formats.
    Returns the largest number found, or None if no numbers are found.
    """
    # Handle both comma-separated and non-comma formats (e.g., 1,500 and 1500)
    matches = re.findall(r'(\d{1,3}(?:,\d{3})*(?:\.\d+)?|\d+(?:\.\d+)?)', text)
    
    if matches:
        # Convert matches to numbers, removing commas
        nums = [float(m.replace(',', '')) for m in matches]
        # Return the largest as an integer (most likely the total elevation)
        return int(max(nums))
    
    return None

def extract_units(text):
    """
    Identifies whether the elevation is in feet or meters.
    Returns 'ft' or 'm' or None if undetermined.
    """
    text_lower = text.lower()
    if 'feet' in text_lower or 'ft' in text_lower or "'" in text_lower:
        return 'ft'
    elif 'meter' in text_lower or 'm' in text_lower:
        return 'm'
    return None

def convert_to_feet(value, unit):
    """
    Converts elevation values to feet if they're in meters.
    """
    if unit == 'm':  # Convert meters to feet
        return int(value * 3.28084)
    return value  # Already in feet

def scrape_triathloncourseinfo(url):
    """
    Scrapes elevation data from TriathlonCourseInfo websites.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    
    try:
        logger.info(f"Scraping TriathlonCourseInfo: {url}")
        response = requests.get(url, headers=headers, timeout=20)
        
        if response.status_code != 200:
            logger.warning(f"Failed to access {url}. Status code: {response.status_code}")
            return None, None
        
        soup = BeautifulSoup(response.content, 'html.parser')
        bike_elev = None
        run_elev = None
        bike_unit = None
        run_unit = None
        
        # Look first inside any div.course-stats if present
        stats_section = soup.find("div", {"class": "course-stats"})
        if stats_section:
            candidates = stats_section.find_all(["div", "p", "li", "h3", "h4"])
        else:
            # Fallback: look through all relevant tags
            candidates = soup.find_all(["p", "div", "li", "h3", "h4", "span"])
            
        # Additional method: look for sections with specific headings
        bike_section = None
        run_section = None
        
        h_tags = soup.find_all(['h2', 'h3', 'h4', 'h5'])
        for h in h_tags:
            text = h.get_text().lower()
            if 'bike' in text or 'cycling' in text:
                bike_section = h
            elif 'run' in text or 'marathon' in text:
                run_section = h
                
        # Process sections if found
        if bike_section:
            siblings = []
            for sibling in bike_section.next_siblings:
                if sibling.name in ['h2', 'h3', 'h4', 'h5']:
                    break
                siblings.append(sibling)
            
            for sibling in siblings:
                if hasattr(sibling, 'get_text'):
                    text = sibling.get_text().lower()
                    if 'elevation' in text or 'gain' in text or 'climbing' in text:
                        bike_elev = extract_number(text)
                        bike_unit = extract_units(text)
                        break
        
        if run_section:
            siblings = []
            for sibling in run_section.next_siblings:
                if sibling.name in ['h2', 'h3', 'h4', 'h5']:
                    break
                siblings.append(sibling)
                
            for sibling in siblings:
                if hasattr(sibling, 'get_text'):
                    text = sibling.get_text().lower()
                    if 'elevation' in text or 'gain' in text or 'climbing' in text:
                        run_elev = extract_number(text)
                        run_unit = extract_units(text)
                        break
                        
        # Process all candidate elements
        for elem in candidates:
            text = elem.get_text(separator=" ").lower()
            
            # Extract bike elevation data
            bike_keywords = ['bike elevation', 'bike climbing', 'cycling elevation', 'bike course elevation',
                            'bike elevation gain', 'bike course climbing', 'bike vertical']
            if any(keyword in text for keyword in bike_keywords) and 'run' not in text:
                val = extract_number(text)
                unit = extract_units(text)
                if val and not bike_elev:
                    bike_elev = val
                    bike_unit = unit
                    
            # Extract run elevation data
            run_keywords = ['run elevation', 'run climbing', 'running elevation', 'run course elevation',
                           'run elevation gain', 'run course climbing', 'run vertical', 'marathon elevation']
            if any(keyword in text for keyword in run_keywords) and 'bike' not in text:
                val = extract_number(text)
                unit = extract_units(text)
                if val and not run_elev:
                    run_elev = val
                    run_unit = unit
                    
        # Convert to feet if necessary
        if bike_elev and bike_unit:
            bike_elev = convert_to_feet(bike_elev, bike_unit)
        if run_elev and run_unit:
            run_elev = convert_to_feet(run_elev, run_unit)
            
        logger.info(f"TriathlonCourseInfo data: Bike={bike_elev}ft, Run={run_elev}ft")
        return bike_elev, run_elev
        
    except Exception as e:
        logger.error(f"Error scraping {url}: {str(e)}")
        return None, None

def scrape_pjammcycling(race_name):
    """
    Scrapes elevation data from PJammCycling.com.
    This site specializes in detailed bike course information.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    
    # Attempt to find the appropriate page
    base_url = "https://pjammcycling.com/triathlon/"
    search_terms = clean_race_name(race_name).split()
    
    try:
        logger.info(f"Attempting to scrape PJammCycling for {race_name}")
        
        # First approach: Try to get the search results page
        response = requests.get("https://pjammcycling.com/search.php", 
                                params={"search": f"IRONMAN {race_name}"}, 
                                headers=headers, 
                                timeout=20)
        
        if response.status_code == 200:
            soup = BeautifulSoup(response.content, 'html.parser')
            links = soup.find_all('a', href=True)
            
            triathlon_links = [link for link in links if '/triathlon/' in link['href']]
            
            if triathlon_links:
                # Find the most relevant link
                most_relevant = None
                for link in triathlon_links:
                    link_text = link.get_text().lower()
                    if race_name.lower() in link_text and "ironman" in link_text:
                        most_relevant = link['href']
                        break
                
                if most_relevant:
                    # Use the found link
                    url = urljoin("https://pjammcycling.com/", most_relevant)
                    detail_response = requests.get(url, headers=headers, timeout=20)
                    
                    if detail_response.status_code == 200:
                        detail_soup = BeautifulSoup(detail_response.content, 'html.parser')
                        
                        # Find the elevation information
                        bike_elev = None
                        run_elev = None
                        
                        # PJammCycling typically has structured data in tables or specific formats
                        stats_divs = detail_soup.find_all('div', class_='stat')
                        
                        for stat in stats_divs:
                            text = stat.get_text().lower()
                            
                            if 'elevation gain' in text:
                                # Extract the bike or run elevation
                                if 'bike' in text:
                                    bike_elev = extract_number(text)
                                elif 'run' in text:
                                    run_elev = extract_number(text)
                        
                        # Alternative way: look for course summary sections
                        summary_texts = [p.get_text().lower() for p in detail_soup.find_all('p')]
                        for text in summary_texts:
                            # Look for bike elevation data
                            if 'bike' in text and ('elevation' in text or 'climbing' in text):
                                val = extract_number(text)
                                if val and not bike_elev:
                                    bike_elev = val
                                    
                            # Look for run elevation data
                            if 'run' in text and ('elevation' in text or 'climbing' in text):
                                val = extract_number(text)
                                if val and not run_elev:
                                    run_elev = val
                        
                        logger.info(f"PJammCycling data: Bike={bike_elev}ft, Run={run_elev}ft")
                        return bike_elev, run_elev
        
        logger.warning("PJammCycling - No relevant data found")
        return None, None
        
    except Exception as e:
        logger.error(f"Error scraping PJammCycling for {race_name}: {str(e)}")
        return None, None

def scrape_ironman_official(race_name):
    """
    Scrapes elevation data from the official Ironman website.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    
    # Attempt to create a URL for the race on ironman.com
    clean_name = clean_race_name(race_name).replace(" ", "-")
    
    # Try different URL patterns that Ironman.com might use
    url_patterns = [
        f"https://www.ironman.com/im-{clean_name}-course",
        f"https://www.ironman.com/im-{clean_name}",
        f"https://www.ironman.com/ironman-{clean_name}-course",
        f"https://www.ironman.com/ironman-{clean_name}"
    ]
    
    bike_elev = None
    run_elev = None
    
    for url in url_patterns:
        try:
            logger.info(f"Trying Ironman.com URL: {url}")
            response = requests.get(url, headers=headers, timeout=20)
            
            if response.status_code == 200:
                logger.info(f"Successfully accessed Ironman.com at {url}")
                soup = BeautifulSoup(response.content, 'html.parser')
                
                # Look for course sections
                course_section = None
                for section in soup.find_all('section'):
                    if section.get('id') == 'course' or 'course' in section.get('class', []):
                        course_section = section
                        break
                
                # If we found a dedicated course section, focus on it
                if course_section:
                    soup = course_section
                
                # Look for bike course information
                bike_section = None
                run_section = None
                
                # Find sections by headings
                for heading in soup.find_all(['h2', 'h3', 'h4']):
                    text = heading.get_text().lower()
                    if 'bike' in text and not bike_section:
                        bike_section = heading
                    elif 'run' in text and not run_section:
                        run_section = heading
                
                # Process bike section
                if bike_section:
                    next_elements = []
                    current = bike_section.next_sibling
                    
                    # Collect next elements until we hit another heading or section
                    while current and current.name not in ['h2', 'h3', 'h4', 'section']:
                        if hasattr(current, 'name') and current.name:
                            next_elements.append(current)
                        current = current.next_sibling
                    
                    for elem in next_elements:
                        text = elem.get_text().lower()
                        if 'elevation' in text or 'gain' in text or 'climbing' in text or 'ascent' in text:
                            bike_elev = extract_number(text)
                            unit = extract_units(text)
                            if unit:
                                bike_elev = convert_to_feet(bike_elev, unit)
                            break
                
                # Process run section
                if run_section:
                    next_elements = []
                    current = run_section.next_sibling
                    
                    # Collect next elements until we hit another heading or section
                    while current and current.name not in ['h2', 'h3', 'h4', 'section']:
                        if hasattr(current, 'name') and current.name:
                            next_elements.append(current)
                        current = current.next_sibling
                    
                    for elem in next_elements:
                        text = elem.get_text().lower()
                        if 'elevation' in text or 'gain' in text or 'climbing' in text or 'ascent' in text:
                            run_elev = extract_number(text)
                            unit = extract_units(text)
                            if unit:
                                run_elev = convert_to_feet(run_elev, unit)
                            break
                
                # If we found at least one value, we can stop trying URLs
                if bike_elev or run_elev:
                    logger.info(f"Ironman.com data: Bike={bike_elev}ft, Run={run_elev}ft")
                    return bike_elev, run_elev
        
        except Exception as e:
            logger.warning(f"Error accessing {url}: {str(e)}")
    
    logger.warning("No data found from Ironman.com")
    return None, None

def search_triathloncourseinfo(race_name, location=None):
    """
    Searches TriathlonCourseInfo.com for a specific race and returns elevation data.
    Uses both race name and location for better search results.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    
    # Try different search strategies
    search_queries = []
    
    # Option 1: Clean race name
    clean_race = clean_race_name(race_name)
    search_queries.append(f"ironman {clean_race}")
    
    # Option 2: Add location if available
    if location:
        clean_loc = clean_location_name(location)
        search_queries.append(f"ironman {clean_race} {clean_loc}")
        search_queries.append(f"ironman {clean_loc}")
    
    base_search = "https://triathloncourseinfo.com/?s="
    
    for query in search_queries:
        try:
            logger.info(f"Searching TriathlonCourseInfo for: {query}")
            search_url = base_search + quote(query)
            
            response = requests.get(search_url, headers=headers, timeout=20)
            
            if response.status_code == 200:
                soup = BeautifulSoup(response.content, 'html.parser')
                
                # Find all article results
                articles = soup.find_all("article")
                candidate_urls = []
                
                for art in articles:
                    h2 = art.find("h2", {"class": "entry-title"})
                    if not h2:
                        continue
                    
                    link = h2.find("a")
                    if not link or not link.get("href"):
                        continue
                    
                    title_text = h2.get_text().lower()
                    
                    # Only consider Ironman events and exclude 70.3 (half Ironman) if searching for full
                    ironman_keywords = ["ironman", "im ", "140.6"]
                    if any(kw in title_text for kw in ironman_keywords) and ("70.3" not in title_text or "70.3" in race_name.lower()):
                        relevance_score = 0
                        
                        # Higher score if race name is in title
                        if clean_race in title_text:
                            relevance_score += 3
                            
                        # Higher score if location is in title
                        if location and clean_location_name(location) in title_text:
                            relevance_score += 2
                            
                        # Add race name keywords for additional matching
                        for word in clean_race.split():
                            if word in title_text and word not in ["ironman", "im"]:
                                relevance_score += 1
                        
                        candidate_urls.append((link["href"], relevance_score))
                
                # Sort by relevance score (highest first)
                candidate_urls.sort(key=lambda x: x[1], reverse=True)
                
                # Check up to 3 most relevant pages
                for url, _ in candidate_urls[:3]:
                    logger.info(f"Checking relevant page: {url}")
                    bike, run = scrape_triathloncourseinfo(url)
                    
                    if bike is not None or run is not None:
                        return bike, run
                        
                # If we found candidates but no elevation data, try the next search query
                if candidate_urls:
                    logger.info(f"Found candidate pages but no elevation data for: {query}")
                    continue
            
        except Exception as e:
            logger.error(f"Error searching TriathlonCourseInfo for {query}: {str(e)}")
    
    # If we get here, we didn't find anything
    return None, None

def main():
    """
    Main function to process the CSV and extract elevation data.
    """
    # Load the CSV file
    try:
        csv_path = "S3_70.3_locations.csv_coord_elevation.csv"
        logger.info(f"Loading CSV file: {csv_path}")
        df = pd.read_csv(csv_path)
    except Exception as e:
        logger.error(f"Error loading CSV: {str(e)}")
        return
    
    # Check if elevation columns exist, create them if not
    if "bike elevation" not in df.columns:
        df["bike elevation"] = None
    if "run elevation" not in df.columns:
        df["run elevation"] = None
    
    # Create a dictionary to cache results by race name
    race_data_cache = {}
    
    # Count total races that need processing
    unique_races = set()
    for _, row in df.iterrows():
        if pd.isna(row["Race"]) or str(row["Race"]).strip() == "":
            continue
        unique_races.add(row["Race"])
    
    total_races = len(unique_races)
    processed = 0
    
    # Process each unique race
    for race_name in unique_races:
        processed += 1
        logger.info(f"Processing race {processed}/{total_races}: {race_name}")
        
        # Skip if already cached
        if race_name in race_data_cache:
            logger.info(f"Using cached data for {race_name}")
            continue
        
        # Get the first location for this race (used for searching)
        location = None
        for _, row in df.iterrows():
            if row["Race"] == race_name and pd.notna(row["Location"]):
                location = row["Location"]
                break
        
        bike_elevation = None
        run_elevation = None
        
        # Strategy 1: Try TriathlonCourseInfo search
        logger.info(f"Strategy 1: Searching TriathlonCourseInfo for {race_name}")
        bike_elevation, run_elevation = search_triathloncourseinfo(race_name, location)
        
        # Strategy 2: If not found, try PJammCycling
        if not bike_elevation or not run_elevation:
            logger.info(f"Strategy 2: Searching PJammCycling for {race_name}")
            bike_pjamm, run_pjamm = scrape_pjammcycling(race_name)
            
            # Use PJammCycling data if available
            if bike_pjamm and not bike_elevation:
                bike_elevation = bike_pjamm
            if run_pjamm and not run_elevation:
                run_elevation = run_pjamm
        
        # Strategy 3: As a last resort, check Ironman's official site
        if not bike_elevation or not run_elevation:
            logger.info(f"Strategy 3: Checking Ironman.com for {race_name}")
            bike_im, run_im = scrape_ironman_official(race_name)
            
            # Use Ironman.com data if available
            if bike_im and not bike_elevation:
                bike_elevation = bike_im
            if run_im and not run_elevation:
                run_elevation = run_im
        
        # Cache the results
        race_data_cache[race_name] = (bike_elevation, run_elevation)
        logger.info(f"Final data for {race_name}: Bike={bike_elevation}ft, Run={run_elevation}ft")
        
        # Add a delay between processing races
        time.sleep(1 + random.random() * 2)
    
    # Update the DataFrame with the collected data
    update_count = 0
    for idx, row in df.iterrows():
        race_name = row["Race"]
        
        if not pd.isna(race_name) and race_name in race_data_cache:
            bike_elev, run_elev = race_data_cache[race_name]
            
            # Update bike elevation if we found a value and the current cell is empty
            if bike_elev is not None and pd.isna(row["bike elevation"]):
                df.at[idx, "bike elevation"] = bike_elev
                update_count += 1
                
            # Update run elevation if we found a value and the current cell is empty
            if run_elev is not None and pd.isna(row["run elevation"]):
                df.at[idx, "run elevation"] = run_elev
                update_count += 1
    
    # Save the updated DataFrame to a new CSV file
    output_path = "S3_70.3_locations.csv_coord_elevat_updated.csv"
    df.to_csv(output_path, index=False)
    logger.info(f"Updated {update_count} elevation values")
    logger.info(f"Saved updated data to {output_path}")
    print(f"✅ Successfully updated {update_count} elevation values and saved to {output_path}")

# Run the script
if __name__ == "__main__":
    main()
'''</VSCode.Cell>

<VSCode.Cell language="markdown">
## Key Improvements in This Script

1. **Multiple data sources**: The script now checks three different websites for elevation data, increasing the chances of finding accurate information.

2. **Enhanced text extraction**: The script now searches for elevation data using more keywords and patterns, and handles both feet and meters.

3. **Better race and location matching**: The script uses more sophisticated matching logic to find relevant pages.

4. **Structured logging**: The script logs all its actions to both the console and a file, making debugging easier.

5. **Improved error handling**: The script gracefully handles network errors and parsing difficulties.

6. **Caching**: The script caches results for each unique race name to avoid redundant work.

To run this script, simply execute this notebook cell. The script will:
1. Load your existing CSV file
2. Extract elevation data from multiple sources
3. Update any missing values in your file
4. Save the result to a new CSV file called "S3_ironman_locations_coord_elevation_updated.csv"
</VSCode.Cell>'''

2025-05-31 10:39:26,778 - INFO - Loading CSV file: S3_70.3_locations.csv_coord_elevation.csv
2025-05-31 10:39:26,877 - INFO - Processing race 1/207: Ironman 70.3 Waco
2025-05-31 10:39:26,923 - INFO - Strategy 1: Searching TriathlonCourseInfo for Ironman 70.3 Waco
2025-05-31 10:39:26,923 - INFO - Searching TriathlonCourseInfo for: ironman ironman 70.3 waco
2025-05-31 10:39:29,507 - INFO - Checking relevant page: https://triathloncourseinfo.com/ironman-waco-70-3-2024-race-stats/
2025-05-31 10:39:29,507 - INFO - Scraping TriathlonCourseInfo: https://triathloncourseinfo.com/ironman-waco-70-3-2024-race-stats/
2025-05-31 10:39:30,880 - INFO - TriathlonCourseInfo data: Bike=2723ft, Run=1112ft
2025-05-31 10:39:30,883 - INFO - Final data for Ironman 70.3 Waco: Bike=2723ft, Run=1112ft
2025-05-31 10:39:31,910 - INFO - Processing race 2/207: Ironman 70.3 Muncie
2025-05-31 10:39:32,070 - INFO - Strategy 1: Searching TriathlonCourseInfo for Ironman 70.3 Muncie
2025-05-31 10:39:32,070 - INFO - Search

✅ Successfully updated 266 elevation values and saved to S3_70.3_locations.csv_coord_elevat_updated.csv


'</VSCode.Cell>\n\n<VSCode.Cell language="markdown">\n## Key Improvements in This Script\n\n1. **Multiple data sources**: The script now checks three different websites for elevation data, increasing the chances of finding accurate information.\n\n2. **Enhanced text extraction**: The script now searches for elevation data using more keywords and patterns, and handles both feet and meters.\n\n3. **Better race and location matching**: The script uses more sophisticated matching logic to find relevant pages.\n\n4. **Structured logging**: The script logs all its actions to both the console and a file, making debugging easier.\n\n5. **Improved error handling**: The script gracefully handles network errors and parsing difficulties.\n\n6. **Caching**: The script caches results for each unique race name to avoid redundant work.\n\nTo run this script, simply execute this notebook cell. The script will:\n1. Load your existing CSV file\n2. Extract elevation data from multiple sources\n3. Update a