In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
def scrape_athlete_data(url):
    """Scrapes athlete names and heights from a given URL.

    Args:
        url (str): The URL of the roster page.

    Returns:
        pd.DataFrame: A DataFrame containing athlete names and heights, or None if an error occurs.
    """

    try:
        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'html.parser')

        raw_heights = soup.find_all('td', class_='height')
        raw_names = soup.find_all('td', class_='sidearm-table-player-name')

        if not raw_heights or not raw_names:
            return None

        heights = []
        names = []
        for height, name in zip(raw_heights, raw_names):
            height_str = height.text.strip()
            if height_str:
                if '-' in height_str:
                    try:
                        feet, inches = map(float, height_str.split('-'))
                        height_in_inches = feet * 12 + inches
                        heights.append(height_in_inches)
                    except ValueError:
                        print(f"Invalid height format for {name.text}: {height_str}")
                        heights.append(None)
                else:
                    print(f"Unexpected height format for {name.text}: {height_str}")
                    heights.append(None)
            else:
                # Handle missing height values
                heights.append(None)
            names.append(name.text.strip())

        athletes = {'Name': names, 'Height': heights}
        return pd.DataFrame(athletes)
    except Exception as e:
        print(f"Error scraping {url}: {e}")
        return None

def main():
    # List of URLs for women's volleyball teams
    women_volleyball_urls = [
    'https://bmccathletics.com/sports/womens-volleyball/roster',
    'https://yorkathletics.com/sports/womens-volleyball/roster',
    'https://hostosathletics.com/sports/womens-volleyball/roster',
    'https://bronxbroncos.com/sports/womens-volleyball/roster/2021',
    'https://queensknights.com/sports/womens-volleyball/roster',
    'https://augustajags.com/sports/wvball/roster',
    'https://flaglerathletics.com/sports/womens-volleyball/roster'
    'https://pacersports.com/sports/womens-volleyball/roster',
    'https://www.golhu.com/sports/womens-volleyball/roster',
    ]

    # Create an empty DataFrame to store all data
    women_volleyball_data = pd.DataFrame(columns=['Name', 'Height'])

    # Scrape data from each URL and handle errors
    for url in women_volleyball_urls:
        data = scrape_athlete_data(url)
        if data is not None:
            women_volleyball_data = pd.concat([women_volleyball_data, data], ignore_index=True)

    # Save data to CSV file
    women_volleyball_data.to_csv('women_volleyball_data.csv', index=False)

if __name__ == '__main__':
    main()

Invalid height format for 
Marie Claire Hurtado
: -


Data Loading and Data Cleaning

In [3]:
women_volleyball_data = pd.read_csv('women_volleyball_data.csv')
women_volleyball_data = women_volleyball_data.dropna(subset=['Height'])

Find the average height in each of the 4 dataframes (so you should have 4 averages in total). Print these values in your program

In [4]:
average_women_volleyball_height = women_volleyball_data['Height'].mean()
average_women_volleyball_height = average_women_volleyball_height.round(2)
print(f'Average height of women volleyball: {average_women_volleyball_height}')

Average height of women volleyball: 66.98


Find the names and the heights of the athletes with the 5 tallest and 5 shortest heights for both the men’s and women’s teams. Note: There could be ties among the athletes. For example, 8 athletes could have the top 5 tallest heights. You must have 8 lists in total: tallest men swimmers, tallest men volleyball players, tallest women swimmers, tallest women volleyball players, shortest men swimmers, shortest men volleyball players, shortest women volleyball players, shortest women swimmers. Print these names and heights ONLY in your program.

In [5]:
#tallest women volleyball
sorted_women_volleyball = women_volleyball_data.sort_values(by = 'Height',ascending = False)
top_5_heights_women = sorted_women_volleyball['Height'].unique()[:5]
top_5_heights_women
tallest_women_volleyball = sorted_women_volleyball[sorted_women_volleyball['Height'].isin(top_5_heights_women)]
tallest_women_volleyball
print(f'The top 5 heights of the women volleyball players:{top_5_heights_women}')
print(f'The count of the number of players with top 5 heights:{tallest_women_volleyball.count()}')
print(tallest_women_volleyball[['Name','Height']])

The top 5 heights of the women volleyball players:[75. 74. 73. 72. 71.]
The count of the number of players with top 5 heights:Name      18
Height    18
dtype: int64
                    Name  Height
92          Alyssa Daley    75.0
63        Sophia Kruczko    75.0
68          Madelyn Eden    74.0
70        Marin Freeland    74.0
55    Lindsay Osterhoudt    73.0
64           Lauren Posa    73.0
67       Jada Suguturaga    73.0
74         Jacklyn Simms    72.0
14  Jachimma Onwuamaegbu    72.0
95        Katrina Cowder    71.0
86     Evie Jane Rembold    71.0
54        Alicia Cervera    71.0
58        Kendall Conrad    71.0
65       Abigail LeVines    71.0
51      Anjeline Arnakis    71.0
87      Gracelynn Wolzen    71.0
89          Gabby Gealey    71.0
94      Kyleigh McDermit    71.0


In [6]:
#shortest women volleyball
sorted_women_volleyball = women_volleyball_data.sort_values(by = 'Height',ascending = True)
shortest_5_heights_women = sorted_women_volleyball['Height'].unique()[:5]
shortest_5_heights_women
shortest_women_volleyball = sorted_women_volleyball[sorted_women_volleyball['Height'].isin(shortest_5_heights_women)]
shortest_women_volleyball
print(f'The shortest 5 heights of the women volleyball players:{shortest_5_heights_women}')
print(f'The count of the number of players with shortest 5 heights:{shortest_women_volleyball.count()}')
print(shortest_women_volleyball[['Name','Height']])

The shortest 5 heights of the women volleyball players:[60. 61. 62. 63. 64.]
The count of the number of players with shortest 5 heights:Name      27
Height    27
dtype: int64
                    Name  Height
26         Samara Correa    60.0
1       Yisneily Morales    61.0
4        Garyana Altidor    61.0
34         Rashel Torres    61.0
17         Jasmine  Vega    61.0
71         Libby Nickels    61.0
27       Genesis Sigaran    62.0
39     Brianna Rodriguez    62.0
81  Salma Villa  Morales    62.0
16       Shamonie Miller    62.0
24       Devina  Luckhoo    63.0
59            Sarah Munn    63.0
32          Nyla  Rivera    63.0
72      Guilia Rodrigues    63.0
36       Justine Pomales    63.0
8          Teonnie Blake    63.0
77         Kaitlyn Houck    63.0
35      Marisabel Pujols    64.0
9            Kamya Hayes    64.0
33          Sahara Ramos    64.0
38         Nicole  Rosas    64.0
40  Maynee De Los Santos    64.0
6    Cheyenne Gray-Taitt    64.0
30          Natalie Mora    64.0
