In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
def scrape_athlete_data(url):
    """Scrapes athlete names and heights from a given URL.

    Args:
        url (str): The URL of the roster page.

    Returns:
        pd.DataFrame: A DataFrame containing athlete names and heights, or None if an error occurs.
    """

    try:
        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'html.parser')

        raw_heights = soup.find_all('td', class_='height')
        raw_names = soup.find_all('td', class_='sidearm-table-player-name')

        if not raw_heights or not raw_names:
            return None

        heights = []
        names = []
        for height, name in zip(raw_heights, raw_names):
            height_str = height.text.strip()
            if height_str:
                if '-' in height_str:
                    try:
                        feet, inches = map(float, height_str.split('-'))
                        height_in_inches = feet * 12 + inches
                        heights.append(height_in_inches)
                    except ValueError:
                        print(f"Invalid height format for {name.text}: {height_str}")
                        heights.append(None)
                else:
                    print(f"Unexpected height format for {name.text}: {height_str}")
                    heights.append(None)
            else:
                # Handle missing height values
                heights.append(None)
            names.append(name.text.strip())

        athletes = {'Name': names, 'Height': heights}
        return pd.DataFrame(athletes)
    except Exception as e:
        print(f"Error scraping {url}: {e}")
        return None

def main():
    # List of URLs for men's volleyball teams
    mens_volleyball_urls = [
    'https://ccnyathletics.com/sports/mens-volleyball/roster',
    'https://lehmanathletics.com/sports/mens-volleyball/roster',
    'https://www.brooklyncollegeathletics.com/sports/mens-volleyball/roster',
    'https://johnjayathletics.com/sports/mens-volleyball/roster',
    'https://athletics.baruch.cuny.edu/sports/mens-volleyball/roster',
    'https://mecathletics.com/sports/mens-volleyball/roster',
    'https://www.huntercollegeathletics.com/sports/mens-volleyball/roster',
    'https://yorkathletics.com/sports/mens-volleyball/roster',
    'https://ballstatesports.com/sports/mens-volleyball/roster'
    ]

    # Create an empty DataFrame to store all data
    mens_volleyball_data = pd.DataFrame(columns=['Name', 'Height'])

    # Scrape data from each URL and handle errors
    for url in mens_volleyball_urls:
        data = scrape_athlete_data(url)
        if data is not None:
            mens_volleyball_data = pd.concat([mens_volleyball_data, data], ignore_index=True)

    # Save data to CSV file
    mens_volleyball_data.to_csv('mens_volleyball_data.csv', index=False)

if __name__ == '__main__':
    main()

Invalid height format for 
Tylar Pina
: -


Data Loading and Data Cleaning

In [4]:
men_volleyball_data = pd.read_csv('mens_volleyball_data.csv')
men_volleyball_data = men_volleyball_data.dropna(subset=['Height'])

Find the average height in each of the 4 dataframes (so you should have 4 averages in total). Print these values in your program

In [5]:
average_men_volleyball_height = men_volleyball_data['Height'].mean()
average_men_volleyball_height = average_men_volleyball_height.round(2)
print(f'Average height of men volleyball: {average_men_volleyball_height}')

Average height of men volleyball: 72.46


Find the names and the heights of the athletes with the 5 tallest and 5 shortest heights for both the men’s and women’s teams. Note: There could be ties among the athletes. For example, 8 athletes could have the top 5 tallest heights. You must have 8 lists in total: tallest men swimmers, tallest men volleyball players, tallest women swimmers, tallest women volleyball players, shortest men swimmers, shortest men volleyball players, shortest women volleyball players, shortest women swimmers. Print these names and heights ONLY in your program.

In [6]:
#tallest men volleyball
sorted_men_volleyball = men_volleyball_data.sort_values(by = 'Height',ascending = False)
top_5_heights_men = sorted_men_volleyball['Height'].unique()[:5]
top_5_heights_men
tallest_men_volleyball = sorted_men_volleyball[sorted_men_volleyball['Height'].isin(top_5_heights_men)]
tallest_men_volleyball
print(f'The top 5 heights of the men volleyball players:{top_5_heights_men}')
print(f'The count of the number of players with top 5 heights:{tallest_men_volleyball.count()}')
print(tallest_men_volleyball[['Name','Height']])

The top 5 heights of the men volleyball players:[82. 81. 79. 78. 77.]
The count of the number of players with top 5 heights:Name      23
Height    23
dtype: int64
                       Name  Height
129         Will  Patterson    82.0
130          Mateusz Karpow    81.0
128          Patrick Rogers    79.0
127  Braydon Savitski-Lynde    79.0
93           Annes Deljanin    79.0
54    Kyle Barriffe-Johnson    79.0
114     Griffin Satterfield    79.0
131         Peter  Zurawski    78.0
121             Eyal Rawitz    78.0
96             Jon Pflueger    78.0
115         Trevor Phillips    78.0
116          Vanis Buckholz    78.0
118          Rodney Wallace    78.0
117           Lucas Machado    78.0
98       Matisse Lee-Maarek    78.0
122               Dyer Ball    78.0
126   Tinaishe Ndavazocheva    78.0
69                Haoxin Hu    77.0
33                Jason Lin    77.0
102         Austinson Cooke    77.0
120              Ryan Bartz    77.0
89             Lucas George    77.0
123      

In [7]:
#shortest men volleyball
sorted_men_volleyball = men_volleyball_data.sort_values(by = 'Height',ascending = True)
shortest_5_heights_men = sorted_men_volleyball['Height'].unique()[:5]
shortest_5_heights_men
shortest_men_volleyball = sorted_men_volleyball[sorted_men_volleyball['Height'].isin(shortest_5_heights_men)]
shortest_men_volleyball
print(f'The shortest 5 heights of the men volleyball players:{shortest_5_heights_men}')
print(f'The count of the number of players with shortest 5 heights:{shortest_men_volleyball.count()}')
print(shortest_men_volleyball[['Name','Height']])

The shortest 5 heights of the men volleyball players:[63. 65. 66. 67. 68.]
The count of the number of players with shortest 5 heights:Name      26
Height    26
dtype: int64
                        Name  Height
14               Chris  Mata    63.0
27           Dion  Csikortos    65.0
26                  Wai  Lin    65.0
13   Joseph Gianfranco Teves    65.0
79            Kayshaun Higgs    66.0
25               Ali Zandani    66.0
78             Patrick Mateo    66.0
42             Benjamin Chen    66.0
66                Naoki Tani    67.0
19           Christian Agudo    67.0
16               Jimmy Evans    67.0
12             Ray Rodriguez    67.0
82               Brandon Yeh    67.0
7         David Reyes Galvez    67.0
112              Xander Pink    67.0
46            Evan Wandersee    68.0
106         Stanley  Sanchez    68.0
30                David Lema    68.0
48   Daniel Gonzalez Verdejo    68.0
43       Sebastian Rodriguez    68.0
72              Carlos Abreu    68.0
40           