We need to collect data on some athletes from the internet. Let's do a power analysis to determine the minimum number of height and weight measurements I would need. One assumption is that there is a medium effect size between height and weight.

In [21]:
from statsmodels.stats.power import TTestIndPower

# Power-analysis settings
alpha = 0.05        # significance level (95% confidence)
power = 0.95        # target probability of detecting a true effect (95% power)
effect_size = 0.5   # standardized effect size; 0.5 is treated as "medium"

# Solver for the independent two-sample t-test
analysis = TTestIndPower()

# The number of observations is the one argument not supplied,
# so it is the quantity solve_power works out and returns.
sample_size = analysis.solve_power(
    effect_size=effect_size,
    alpha=alpha,
    power=power,
    alternative='two-sided',
)
sample_size


104.92794663444455

105 athletes is the goal. I found a rankings site that lists the height and weight of many athletes. Let's make sure I am allowed to scrape the website by pulling its robots.txt file.

In [None]:
import requests

# Fetch the robots.txt file.  requests applies no timeout unless one is given,
# so pass one explicitly to keep the cell from hanging on an unresponsive server.
robots_url = 'https://stats.protriathletes.org/robots.txt'
response = requests.get(robots_url, timeout=10)

# Show the HTTP status first: a non-200 answer means the body printed below is
# an error page rather than the site's actual crawling rules.
print(f"HTTP status: {response.status_code}")

# Display the content of robots.txt
print(response.text)


It looks like there is nothing specific in the robots.txt file about scraping. Let's move forward, but respect the server by adding a delay between requests.

In [None]:
import asyncio
import time
import pandas as pd
from playwright.async_api import async_playwright
from tqdm.asyncio import tqdm
import random

# Add a delay function for respectful scraping
async def delay(min_seconds=1.5, max_seconds=3):
    """Pause for a random interval so consecutive requests are spaced out.

    Args:
        min_seconds: Lower bound of the pause, in seconds.
        max_seconds: Upper bound of the pause, in seconds.

    Called without arguments it sleeps between 1.5 and 3 seconds, exactly as
    the original fixed-window version did.
    """
    await asyncio.sleep(random.uniform(min_seconds, max_seconds))

async def _optional_text(page, selector, timeout):
    """Return the stripped text of *selector*, or None when it cannot be read."""
    try:
        text = await page.text_content(selector, timeout=timeout)
    except Exception:
        # Playwright reports an element that never appears by raising; for an
        # optional field that simply means "not listed on this profile".
        return None
    return text.strip() if text else None


async def scrape_athlete_profile(page, url, optional_field_timeout=5000):
    """Scrape an individual athlete's profile for name, height, and weight.

    Height and weight are read independently of each other, so a profile that
    lists only one of them still contributes the value it has instead of being
    discarded as a whole.

    Args:
        page: Playwright page used to load the profile.
        url: Address of the athlete's profile page.
        optional_field_timeout: Milliseconds to wait for the height and for
            the weight element before treating that field as missing.

    Returns:
        A dict with the keys "name", "height", "weight" and "url"; "height"
        and "weight" are None when the page does not show them.  Returns None
        when the page cannot be loaded or no athlete name is found.
    """
    try:
        start_time = time.perf_counter()
        print(f"Fetching {url}")

        # Visit the athlete profile page
        await page.goto(url)

        # Wait for the athlete information container to be present
        await page.wait_for_selector("div.athlete-info", timeout=60000)

        # The name is mandatory: without it the record cannot be matched later
        athlete_name = await page.text_content("h2.headline.font-weight-bold")
        if athlete_name is None:
            raise ValueError("athlete name not found")
        athlete_name = athlete_name.strip()

        # Height and weight are optional and looked up one at a time
        height = await _optional_text(
            page, "div.attribute:has-text('Height') .value", optional_field_timeout
        )
        weight = await _optional_text(
            page, "div.attribute:has-text('Weight') .value", optional_field_timeout
        )

        # Store athlete data
        athlete_data = {
            "name": athlete_name,
            "height": height,
            "weight": weight,
            "url": url,
        }

        elapsed_time = time.perf_counter() - start_time
        print(f"Successfully scraped {athlete_name} in {elapsed_time:.2f} seconds")
        return athlete_data

    except Exception as e:
        # Per-URL boundary: one broken profile must not abort the whole run
        print(f"Failed to scrape {url}: {e}")
        return None

async def scrape_all_athletes(
    output_path='/Users/ngirmay/Documents/GitHub/ironman_retrospective/IronMan_2023/body_composition/athlete_profiles.csv',
):
    """Scrape all athletes from the men's rankings page and save them as CSV.

    Args:
        output_path: Destination of the CSV file.  The default is the location
            the notebook has always written to.

    The rankings page is read for profile links, each distinct profile is then
    visited once with a pause after every request, and the collected records
    are written with the columns name, height, weight and url.
    """
    from urllib.parse import urljoin  # local import: only this function needs it

    base_url = "https://stats.protriathletes.org"
    athletes_data = []

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        try:
            page = await browser.new_page()

            # Step 1: Scrape the athlete URLs from the rankings page
            await page.goto(f"{base_url}/rankings/men")
            await page.wait_for_selector('.rankings', timeout=60000)

            # Extract athlete profile URLs
            athlete_links = await page.query_selector_all('div.trow a.athlete-pic-group')
            hrefs = [await link.get_attribute('href') for link in athlete_links]
            # Skip anchors without an href (get_attribute gives None for them)
            # and keep only the first occurrence of each URL, in page order.
            athlete_urls = list(
                dict.fromkeys(urljoin(base_url, href) for href in hrefs if href)
            )

            print(f"Found {len(athlete_urls)} athlete URLs.")

            # Step 2: Scrape each athlete's profile
            for url in tqdm(athlete_urls, desc="Scraping Athletes", unit="profile"):
                athlete_data = await scrape_athlete_profile(page, url)
                if athlete_data:
                    athletes_data.append(athlete_data)
                await delay()  # Add delay between requests to respect the server
        finally:
            # Close the browser even when a step above raised
            await browser.close()

    # Save the results to a CSV file.  Naming the columns explicitly keeps the
    # header row in place even when nothing was scraped.
    df = pd.DataFrame(athletes_data, columns=["name", "height", "weight", "url"])
    df.to_csv(output_path, index=False)

    print(f"Scraping completed. Total athletes scraped: {len(athletes_data)}")
    print(f"File saved successfully at: {output_path}")

# Run the scraping function
async def main():
    """Run the complete scrape and print how long it took."""
    started_at = time.time()
    await scrape_all_athletes()
    print(f"Total scraping time: {time.time() - started_at:.2f} seconds")

# NOTE: a bare top-level ``await`` is only accepted where the cell already runs
# inside an event loop (e.g. a Jupyter/IPython notebook); in a plain Python
# script this line would have to become ``asyncio.run(main())``.
await main()


All available heights and weights from the PTO rankings were acquired. Let's combine them with a dataset I found from the Rio 2016 Olympics.

In [11]:
import pandas as pd
import re

# Locations of the combined output and of the two input tables
output_combined_file_path = '/Users/ngirmay/Documents/GitHub/ironman_retrospective/IronMan_2023/body_composition/All_athletes_height_weight.csv'
file_profiles_path = '/Users/ngirmay/Documents/GitHub/ironman_retrospective/IronMan_2023/body_composition/athlete_profiles.csv'
file_rio_path = '/Users/ngirmay/Documents/GitHub/ironman_retrospective/IronMan_2023/body_composition/all-rio-2016-athletes-excel.csv'

# Read both inputs into DataFrames (Rio 2016 first, then the scraped profiles)
file_rio_df, file_profiles_df = (
    pd.read_csv(csv_path) for csv_path in (file_rio_path, file_profiles_path)
)

# Function to clean height and weight columns
# Optional leading digits, an optional decimal point, then at least one digit.
# Unlike a bare run of digits and dots, this can never match "." on its own or
# swallow a second decimal point, so float() below always succeeds.
_FIRST_NUMBER = re.compile(r'\d*\.?\d+')


def clean_height_weight(value):
    """Extract the first number from a height or weight entry.

    Args:
        value: A raw cell value.  Strings such as "1.80 m" or "75 kg" are
            searched for their first number; values that are already numeric
            are passed through as floats.

    Returns:
        The number as a float, or None when the value is missing (None, NaN),
        contains no digits, or is of an unsupported type.
    """
    import math
    import numbers

    # bool is a subclass of int but is never a measurement
    if isinstance(value, bool):
        return None

    # A column holding plain numbers is loaded as numeric rather than text;
    # keep such values instead of discarding them.
    if isinstance(value, numbers.Real):
        number = float(value)
        return None if math.isnan(number) else number

    if isinstance(value, str):
        match = _FIRST_NUMBER.search(value)
        if match:
            return float(match.group())
    return None

def _rescale_above(values, threshold, divisor):
    """Return *values* as floats, dividing entries above *threshold* by *divisor*."""
    values = values.astype(float)
    # NaN > threshold is False, so missing entries pass through unchanged
    return values.mask(values > threshold, values / divisor)


# Clean and adjust height and weight for Rio 2016 dataset
file_rio_df['height (m)'] = _rescale_above(file_rio_df['height (m)'], 2.5, 100)  # Convert heights like 180 to 1.80
file_rio_df['weight (kg)'] = _rescale_above(file_rio_df['weight (kg)'], 300, 10)  # Convert weights like 758 to 75.8 kg

# Filter male triathletes from Rio 2016 dataset
file_rio_df_triathletes = file_rio_df[(file_rio_df['sport'] == 'triathlon') & (file_rio_df['sex'] == 'male')]

# Clean and adjust height and weight for athlete profiles dataset
file_profiles_df['Cleaned_Height_Profiles'] = _rescale_above(
    file_profiles_df['height'].apply(clean_height_weight), 2.5, 100
)
file_profiles_df['Cleaned_Weight_Profiles'] = _rescale_above(
    file_profiles_df['weight'].apply(clean_height_weight), 300, 10
)

# Ensure 'name' column is consistent across datasets
if 'Name' in file_profiles_df.columns:
    file_profiles_df.rename(columns={"Name": "name"}, inplace=True)

# Merging the cleaned dataframes.  indicator=True adds a '_merge' column that
# records which input(s) each row came from; it is used for 'Source' below.
# NOTE(review): names are matched exactly as written; if the two sources spell
# or capitalise a name differently the athlete appears twice - confirm.
merged_df = pd.merge(
    file_rio_df_triathletes[['name', 'sport', 'nationality', 'date_of_birth', 'height (m)', 'weight (kg)']],
    file_profiles_df[['name', 'Cleaned_Height_Profiles', 'Cleaned_Weight_Profiles']],
    on='name',
    how='outer',
    indicator=True,
)

# Combine height and weight columns (use profiles data if available, then Rio data)
profile_height = merged_df['Cleaned_Height_Profiles']
profile_weight = merged_df['Cleaned_Weight_Profiles']
merged_df['Final_Height'] = profile_height.fillna(merged_df['height (m)'])
merged_df['Final_Weight'] = profile_weight.fillna(merged_df['weight (kg)'])

# Combine the source columns into one.  A row counts as 'PRO Triathletes' when
# either of its measurements was taken from the profiles, or when the athlete
# exists only in the profiles file; judging by height alone would label
# profile-only athletes without a height as Rio data.
from_profiles = (
    profile_height.notna()
    | profile_weight.notna()
    | (merged_df['_merge'] == 'right_only')
)
merged_df['Source'] = from_profiles.map({True: 'PRO Triathletes', False: 'Rio 2016 Olympics'})

# Drop extra columns
merged_df.drop(
    columns=['height (m)', 'weight (kg)', 'Cleaned_Height_Profiles', 'Cleaned_Weight_Profiles', '_merge'],
    inplace=True,
)

# Save the final dataset to a CSV file
merged_df.to_csv(output_combined_file_path, index=False)

print(f"File saved successfully at: {output_combined_file_path}")


File saved successfully at: /Users/ngirmay/Documents/GitHub/ironman_retrospective/IronMan_2023/body_composition/All_athletes_height_weight.csv


In the end I was able to collect 267 heights and 239 weights from elite male triathletes, which should be more than enough to complete the analysis.