In [3]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:

def scrape_athlete_data(url):
    """Scrapes athlete names and heights from a given URL.

    Args:
        url (str): The URL of the roster page.

    Returns:
        pd.DataFrame: A DataFrame containing athlete names and heights, or None if an error occurs.
    """

    try:
        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'html.parser')

        raw_heights = soup.find_all('td', class_='height')
        raw_names = soup.find_all('td', class_='sidearm-table-player-name')

        if not raw_heights or not raw_names:
            return None

        heights = []
        names = []
        for height, name in zip(raw_heights, raw_names):
            height_str = height.text.strip()
            if height_str:
                if '-' in height_str:
                    try:
                        feet, inches = map(float, height_str.split('-'))
                        height_in_inches = feet * 12 + inches
                        heights.append(height_in_inches)
                    except ValueError:
                        print(f"Invalid height format for {name.text}: {height_str}")
                        heights.append(None)
                else:
                    print(f"Unexpected height format for {name.text}: {height_str}")
                    heights.append(None)
            else:
                # Handle missing height values
                heights.append(None)
            names.append(name.text.strip())

        athletes = {'Name': names, 'Height': heights}
        return pd.DataFrame(athletes)
    except Exception as e:
        print(f"Error scraping {url}: {e}")
        return None

def main():
    # List of URLs for men's swimming teams
    mens_swimming_urls = [
      'https://csidolphins.com/sports/mens-swimming-and-diving/roster',
      'https://yorkathletics.com/sports/mens-swimming-and-diving/roster',
      'https://athletics.baruch.cuny.edu/sports/mens-swimming-and-diving/roster',
      'https://www.brooklyncollegeathletics.com/sports/mens-swimming-and-diving/roster',
      'https://lindenwoodlions.com/sports/mens-swimming-and-diving/roster',
      'https://mckbearcats.com/sports/mens-swimming-and-diving/roster',
      'https://ramapoathletics.com/sports/mens-swimming-and-diving/roster',
      'https://oneontaathletics.com/sports/mens-swimming-and-diving/roster',
      'https://bubearcats.com/sports/mens-swimming-and-diving/roster/2021-22',
      'https://albrightathletics.com/sports/mens-swimming-and-diving/roster/2021-22'

    ]

    # Create an empty DataFrame to store all data
    mens_swimming_data = pd.DataFrame(columns=['Name', 'Height'])

    # Scrape data from each URL and handle errors
    for url in mens_swimming_urls:
        data = scrape_athlete_data(url)
        if data is not None:
            mens_swimming_data = pd.concat([mens_swimming_data, data], ignore_index=True)

    # Save data to CSV file
    mens_swimming_data.to_csv('mens_swimming_data.csv', index=False)

if __name__ == '__main__':
    main()

Invalid height format for 
Aziz Abdusamiev
: -
Invalid height format for 
Aaron Brijbukhan
: -
Invalid height format for 
Zurab Chkhartishvili
: -
Invalid height format for 
Carlos Garcia-Nunez
: -
Invalid height format for 
Daniel Kalinin
: -
Invalid height format for 
Michael Kravets
: -
Invalid height format for 
Nicholai Krylyuk
: -
Invalid height format for 
Ian Lichaniu
: -
Invalid height format for 
Allen Mardakhayev
: -
Invalid height format for 
Mujibar Shaad
: -
Invalid height format for 
Jaxon Bradburn
: -
Invalid height format for 
Ondrej Dusa
: -
Invalid height format for 
Lance Godard
: -
Invalid height format for 
Piotr Kowalczyk
: -
Invalid height format for 
Bennet Loving
: -
Invalid height format for 
Polat Tasbasi
: -
Invalid height format for 
Max Wetteland
: -
Invalid height format for 
Nick Williams
: -


Data loading and data cleaning

In [4]:
men_swimming_data = pd.read_csv('mens_swimming_data.csv')
men_swimming_data = men_swimming_data.dropna(subset=['Height'])

Find the average height in each of the 4 dataframes (so you should have 4 averages in total). Print these values in your program

In [5]:
average_men_swimming_height = men_swimming_data['Height'].mean()
average_men_swimming_height = average_men_swimming_height.round(2)
print(f'Average height of men swimming: {average_men_swimming_height}')

Average height of men swimming: 71.36


Find the names and the heights of the athletes with the 5 tallest and 5 shortest heights for both the men’s and women’s teams. Note: There could be ties among the athletes. For example, 8 athletes could have the top 5 tallest heights. You must have 8 lists in total: tallest men swimmers, tallest men volleyball players, tallest women swimmers, tallest women volleyball players, shortest men swimmers, shortest men volleyball players, shortest women volleyball players, shortest women swimmers. Print these names and heights ONLY in your program.

In [6]:
#tallest men swimmers
sorted_men_swimmers = men_swimming_data.sort_values(by = 'Height',ascending = False)
top_5_heights_men = sorted_men_swimmers['Height'].unique()[:5]
top_5_heights_men
tallest_men_swimmers = sorted_men_swimmers[sorted_men_swimmers['Height'].isin(top_5_heights_men)]
tallest_men_swimmers
print(f'The top 5 heights of the men swimmers:{top_5_heights_men}')
print(f'The count of the number of players with top 5 heights:{tallest_men_swimmers.count()}')
print(tallest_men_swimmers[['Name','Height']])

The top 5 heights of the men swimmers:[79. 77. 76. 75. 74.]
The count of the number of players with top 5 heights:Name      34
Height    34
dtype: int64
                  Name  Height
115        Tyson Upton    79.0
92       Colton Grimes    79.0
164         Eric Kroon    77.0
26       Peter Gavroff    76.0
75       Adam Szczerba    76.0
180  Nicholas Griffith    76.0
62   Mattia Giurgevich    76.0
31      Bennett Maczka    76.0
110     Matthew Stooke    76.0
160      Brian Harding    76.0
129          Joe Swede    75.0
24   Youssef  Elkhouly    75.0
150      Gavin Weseman    75.0
108     Patryk Rozenek    75.0
186   Peter Vanderwerf    75.0
126          Jack Hill    75.0
43     Ryan Badre-Hume    75.0
167       Ryan Maierle    75.0
82       Owen Brubaker    75.0
175       Henry Shemet    75.0
94        Ethan Hanson    75.0
107     Nathan Pollard    74.0
55      Isaac Zambrano    74.0
112         Noah Tague    74.0
56        Ryan Boeding    74.0
120       Thomas Breen    74.0
85      Pa

In [7]:
#shortest men swimmers
sorted_men_swimmers = men_swimming_data.sort_values(by = 'Height',ascending = True)
shortest_5_heights_men = sorted_men_swimmers['Height'].unique()[:5]
shortest_5_heights_men
shortest_men_swimmers = sorted_men_swimmers[sorted_men_swimmers['Height'].isin(shortest_5_heights_men)]
shortest_men_swimmers
print(f'The shortest 5 heights of the men swimmers:{shortest_5_heights_men}')
print(f'The count of the number of players with shortest 5 heights:{shortest_men_swimmers.count()}')
print(shortest_men_swimmers[['Name','Height']])

The shortest 5 heights of the men swimmers:[63. 64. 65. 66. 67.]
The count of the number of players with shortest 5 heights:Name      15
Height    15
dtype: int64
                  Name  Height
89     Miles Fleischer    63.0
2     Charles Cusumano    64.0
97         Wyatt Kurtz    65.0
53    Joseph Scarpetta    65.0
122     Samuel De Leon    65.0
20       Richard Reyes    66.0
158   Christopher Egan    66.0
125         AJ Guevara    66.0
155    Zachary Ciriaco    66.0
142   James  Llewellyn    66.0
121      Tyler Canteen    66.0
88      Mason Enthoven    67.0
5        Eslam Hussein    67.0
36   Gregory Terentyev    67.0
14       Sam Rozenfeld    67.0
