In [3]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
def scrape_athlete_data(url):
    """Scrapes athlete names and heights from a given URL.

    Args:
        url (str): The URL of the roster page.

    Returns:
        pd.DataFrame: A DataFrame containing athlete names and heights, or None if an error occurs.
    """

    try:
        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'html.parser')

        raw_heights = soup.find_all('td', class_='height')
        raw_names = soup.find_all('td', class_='sidearm-table-player-name')

        if not raw_heights or not raw_names:
            return None

        heights = []
        names = []
        for height, name in zip(raw_heights, raw_names):
            height_str = height.text.strip()
            if height_str:
                if '-' in height_str:
                    try:
                        feet, inches = map(float, height_str.split('-'))
                        height_in_inches = feet * 12 + inches
                        heights.append(height_in_inches)
                    except ValueError:
                        print(f"Invalid height format for {name.text}: {height_str}")
                        heights.append(None)
                else:
                    print(f"Unexpected height format for {name.text}: {height_str}")
                    heights.append(None)
            else:
                # Handle missing height values
                heights.append(None)
            names.append(name.text.strip())

        athletes = {'Name': names, 'Height': heights}
        return pd.DataFrame(athletes)
    except Exception as e:
        print(f"Error scraping {url}: {e}")
        return None

def main():
    # List of URLs for men's swimming teams
    women_swimming_urls = [
      'https://csidolphins.com/sports/womens-swimming-and-diving/roster',
      'https://queensknights.com/sports/womens-swimming-and-diving/roster',
      'https://yorkathletics.com/sports/womens-swimming-and-diving/roster',
      'https://athletics.baruch.cuny.edu/sports/womens-swimming-and-diving/roster/2021-22?path=wswim',
      'https://www.brooklyncollegeathletics.com/sports/womens-swimming-and-diving/roster',
      'https://lindenwoodlions.com/sports/womens-swimming-and-diving/roster',
      'https://mckbearcats.com/sports/womens-swimming-and-diving/roster',
      'https://ramapoathletics.com/sports/womens-swimming-and-diving/roster',
      'https://keanathletics.com/sports/womens-swimming-and-diving/roster',
      'https://oneontaathletics.com/sports/womens-swimming-and-diving/roster',

    ]

    # Create an empty DataFrame to store all data
    women_swimming_data = pd.DataFrame(columns=['Name', 'Height'])

    # Scrape data from each URL and handle errors
    for url in women_swimming_urls:
        data = scrape_athlete_data(url)
        if data is not None:
            women_swimming_data = pd.concat([women_swimming_data, data], ignore_index=True)

    # Save data to CSV file
    women_swimming_data.to_csv('women_swimming_data.csv', index=False)

if __name__ == '__main__':
    main()

Invalid height format for 
Rupakshi Aggarwal
: -
Invalid height format for 
Claudia Cimino
: -
Invalid height format for 
Christine  Bailey
: -
Invalid height format for 
Elizabeth Bailey
: -
Invalid height format for 
Gurveen Dhallu
: -
Invalid height format for 
Ayatallah Elkotby
: -
Invalid height format for 
Erin  Forrest 
: -
Invalid height format for 
Sydney Gdanski
: -
Invalid height format for 
Julie  Huang
: -
Invalid height format for 
Hailey Johnston
: -
Invalid height format for 
Anastasia Kutuzov
: -
Invalid height format for 
Diana Plasencia
: -
Invalid height format for 
Veronika Tsiko
: -
Invalid height format for 
Evi Blennert
: -
Invalid height format for 
Kayla Bulseco
: -
Invalid height format for 
Kathryn Chaves
: -
Invalid height format for 
Selena Corona
: -
Invalid height format for 
Maddie Lynch
: -
Invalid height format for 
Lucy Price
: -
Invalid height format for 
Calahan Williams
: -
Invalid height format for 
Emma Grefski
: -


Data Loading and Data Cleaning

In [5]:
women_swimming_data = pd.read_csv('women_swimming_data.csv')
women_swimming_data = women_swimming_data.dropna(subset=['Height'])

Find the average height in each of the 4 dataframes (so you should have 4 averages in total). Print these values in your program

In [6]:
average_women_swimming_height = women_swimming_data['Height'].mean()
average_women_swimming_height = average_women_swimming_height.round(2)
print(f'Average height of women swimming: {average_women_swimming_height}')

Average height of women swimming: 65.86


Find the names and the heights of the athletes with the 5 tallest and 5 shortest heights for both the men’s and women’s teams. Note: There could be ties among the athletes. For example, 8 athletes could have the top 5 tallest heights. You must have 8 lists in total: tallest men swimmers, tallest men volleyball players, tallest women swimmers, tallest women volleyball players, shortest men swimmers, shortest men volleyball players, shortest women volleyball players, shortest women swimmers. Print these names and heights ONLY in your program.

In [7]:
#tallest women swimmers
sorted_women_swimmers = women_swimming_data.sort_values(by = 'Height',ascending = False)
top_5_heights_women = sorted_women_swimmers['Height'].unique()[:5]
top_5_heights_women
tallest_women_swimmers = sorted_women_swimmers[sorted_women_swimmers['Height'].isin(top_5_heights_women)]
tallest_women_swimmers
print(f'The top 5 heights of the women swimmers:{top_5_heights_women}')
print(f'The count of the number of players with top 5 heights:{tallest_women_swimmers.count()}')
print(tallest_women_swimmers[['Name','Height']])

The top 5 heights of the women swimmers:[75. 73. 72. 71. 70.]
The count of the number of players with top 5 heights:Name      13
Height    13
dtype: int64
                     Name  Height
78         McKenzie Fazio    75.0
66        Maja Piotrowicz    73.0
40     Hebatallah Elkotby    72.0
51        Kornelia Buszka    71.0
83      Presley Heitzmann    71.0
115           Grace Geyer    71.0
94      Karley Sonnenberg    70.0
58              Adi Luker    70.0
60        Stephanie Marks    70.0
91   Gabriela Novais Lima    70.0
88          Victoria Maki    70.0
96          Jocelyn Zgola    70.0
102           Grace Korey    70.0


In [8]:
#shortest women swimmers
sorted_women_swimmers = women_swimming_data.sort_values(by = 'Height',ascending = True)
shortest_5_heights_women = sorted_women_swimmers['Height'].unique()[:5]
shortest_5_heights_women
shortest_women_swimmers = sorted_women_swimmers[sorted_women_swimmers['Height'].isin(shortest_5_heights_women)]
shortest_women_swimmers
print(f'Theshortest 5 heights of the women swimmers:{shortest_5_heights_women}')
print(f'The count of the number of players with shortest 5 heights:{shortest_women_swimmers.count()}')
print(shortest_women_swimmers[['Name','Height']])

Theshortest 5 heights of the women swimmers:[58. 59. 61. 62. 63.]
The count of the number of players with shortest 5 heights:Name      31
Height    31
dtype: int64
                   Name  Height
25          Julia Wyman    58.0
24          Nakia Usher    58.0
23    Melanie Rodriguez    59.0
13           Asha Besaw    61.0
104       Valeria Pages    61.0
103      Camila Mercado    61.0
19       Kristen Flores    61.0
29           Amanda Lee    61.0
116     Taylor Jacobson    61.0
8         Deanna Soueid    62.0
132     Cadence  Dudley    62.0
22      Fatima Morrobel    62.0
98      Julia Caramagna    62.0
7     Kolleen Rodriguez    62.0
106       Magdalen Swat    62.0
126     Kylie Tininenko    62.0
143        Sophia Olson    62.0
30        Katherine Lee    62.0
101           Maddy Joe    62.0
100          Keila Fane    62.0
108         Cami Yovich    62.0
35             Amy Zeng    62.0
62     Beata Maruszczyk    63.0
81           Atzi Gomez    63.0
77   Terrah DeLorimiere    63.0
0   