In [3]:
# Requests and BeautifulSoup

import requests
from bs4 import BeautifulSoup
import pandas as pd

# Scrape football rosters (one page per season) for each configured university
# and write one CSV per university.

# List of years for which we want the rosters
years = list(range(2010, 2024))

# Season whose roster is served at the bare roster URL (no "/<year>" suffix)
CURRENT_SEASON = 2023

# Seconds to wait for a roster page before giving up on it
REQUEST_TIMEOUT = 30

# URL structure for each university
universities = {
    "UW_Badgers": "https://uwbadgers.com/sports/football/roster",
    # Add any universities you are interested in here
}

# CSS class of the <li> element that wraps a single player on the roster page
PLAYER_CARD_CLASS = 'sidearm-list-card-item'

# Output column -> (tag name, CSS class) of the element holding that value
# inside a player card. Order here is the column order of the CSV.
PLAYER_FIELDS = {
    'Position': ('div', 'sidearm-list-card-details-item sidearm-roster-player-position-short'),
    'Height': ('span', 'sidearm-roster-player-height'),
    'Weight': ('span', 'sidearm-roster-player-weight'),
    'Academic Year': ('span', 'sidearm-roster-player-academic-year'),
    'Major': ('span', 'sidearm-roster-player-player-major'),
    'Hometown': ('span', 'sidearm-roster-player-hometown'),
    'Highschool': ('span', 'sidearm-roster-player-highschool'),
}

# Loop through each university
for university_name, base_url in universities.items():
    # This will be our complete list of players across all years for current university
    all_players = []

    # Loop through each year
    for year in years:
        # Construct the URL for the year
        if year == CURRENT_SEASON:
            url = base_url
        else:
            url = f"{base_url}/{year}"

        # If the site's URLs end with a season (e.g. "2010-11") instead of a year, use:
        #     year1 = int(str(year)[-2:]) + 1
        #     url = f"{base_url}{year}-{year1}"

        # A single unreachable or missing season should not abort the whole scrape
        try:
            response = requests.get(url, timeout=REQUEST_TIMEOUT)
            response.raise_for_status()
        except requests.RequestException as exc:
            print(f"Skipping {university_name} {year}: {exc}")
            continue

        soup = BeautifulSoup(response.text, 'html.parser')

        # Find the list item for each player
        player_cards = soup.find_all('li', attrs={'class': PLAYER_CARD_CLASS})
        if not player_cards:
            # Most likely the page layout differs from what the selectors expect
            print(f"No players found for {university_name} {year} at {url}")

        for player in player_cards:
            player_info = {}

            # First and last name live in separate spans; join them in page order
            name_tags = player.find_all(
                'span',
                attrs={'class': ['sidearm-roster-player-first-name', 'sidearm-roster-player-last-name']},
            )
            player_info['Name'] = " ".join(tag.text for tag in name_tags)

            # Every other field is optional on the page: missing elements become None
            for column, (tag_name, css_class) in PLAYER_FIELDS.items():
                element = player.find(tag_name, attrs={'class': css_class})
                player_info[column] = element.text.strip() if element else None

            # Add the year
            player_info['Year'] = year

            all_players.append(player_info)

    # Convert to a DataFrame
    df = pd.DataFrame(all_players)

    # Save to a CSV file named according to the university name
    df.to_csv(f'{university_name}_football_players(workshop).csv', index=False)

[]

In [None]:
#Selenium

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
import time

# Scrape football rosters rendered by JavaScript: open each season's page in
# Chrome, switch the roster to its table view, read every table row, and write
# all seasons to a single CSV.

# Initialize Chrome Driver
driver = webdriver.Chrome()

# List of years
years = list(range(2010, 2024))

# Base URL
base_url = "https://ohiostatebuckeyes.com/sports/football/roster/"
# Replace with the university you are interested in

# Season whose URL ends in the plain year instead of a "YYYY-YY" season string
CURRENT_SEASON = 2023

# Seconds to wait for the view toggle / table rows to appear
WAIT_SECONDS = 3

# CSS selector matching every body row of the roster table
ROW_SELECTOR = '[class^="s-table-body__row"]'

# Output column for each <td> of a row, starting at cell 1 (cell 0 is not used)
TABLE_COLUMNS = [
    'Name',
    'Position',
    'Height',
    'Weight',
    'Academic Year',
    'Hometown / High School',
    'Previous School',
]

# Player records for every season; turned into one DataFrame after the loop
# (DataFrame.append was removed in pandas 2.0, and growing a frame per
# iteration is quadratic anyway)
all_players = []

try:
    # Loop through each year
    for year in years:
        # Construct the URL for the year
        if year == CURRENT_SEASON:
            url = f"{base_url}{year}"
        else:
            # Seasons are addressed as e.g. "2010-11"
            year1 = (year + 1) % 100
            url = f"{base_url}{year}-{year1:02d}"

        # Navigate to the URL
        driver.get(url)

        # Click the button to change the view to a table
        try:
            button = WebDriverWait(driver, WAIT_SECONDS).until(
                EC.presence_of_element_located((By.ID, "_viewType_table"))
            )
            button.click()
        except Exception:
            # Best effort: skip seasons whose page has no usable toggle, but let
            # KeyboardInterrupt/SystemExit through so the run can be stopped
            print(f"Could not find button for the year {year}")
            continue

        # Wait for the table to load
        try:
            WebDriverWait(driver, WAIT_SECONDS).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, ROW_SELECTOR))
            )
        except Exception:
            print(f"Timed out waiting for page to load for the year {year}")
            continue

        # Start scraping
        rows = driver.find_elements(by=By.CSS_SELECTOR, value=ROW_SELECTOR)

        for row in rows:
            player_info = {}
            class_attribute = row.get_attribute('class')

            # The row's position is encoded in its class name after "--index-"
            if '--index-' in class_attribute:
                index = class_attribute.split('--index-')[1]
            else:
                index = None

            tds = row.find_elements(by=By.TAG_NAME, value='td')

            player_info['Index'] = index
            # Rows with fewer cells than expected get None for the missing columns
            for position, column in enumerate(TABLE_COLUMNS, start=1):
                player_info[column] = tds[position].text if len(tds) > position else None
            player_info['Year'] = year  # Adding the year

            all_players.append(player_info)

        # Sleep for a second to be polite to the server
        time.sleep(1)

    # Build the combined DataFrame once from all collected records
    all_years_players = pd.DataFrame(all_players)

    # Save all_years_players DataFrame to a single CSV
    all_years_players.to_csv('Ohio_football_players.csv', index=False)
finally:
    # Close the browser even if scraping fails part-way through
    driver.quit()


In [None]:
import pandas as pd
import requests

# Download place-level ACS 5-year (2019) figures from the Census API and derive
# per-place age, household size and racial-share columns.
import os

# Read the key from the CENSUS_API_KEY environment variable so it does not have
# to be committed with the notebook; falls back to the original placeholder.
API_KEY = os.environ.get("CENSUS_API_KEY", "API")

# Seconds to wait for the Census API before giving up
REQUEST_TIMEOUT = 60

# Fetch data from the Census API
url = f"https://api.census.gov/data/2019/acs/acs5?get=NAME,B01002_001E,B02001_002E,B02001_003E,B25010_001E,B01003_001E&for=place:*&in=state:*&key={API_KEY}"
response = requests.get(url, timeout=REQUEST_TIMEOUT)
# Fail here with a clear HTTP error rather than later inside response.json()
response.raise_for_status()
data = response.json()

# Create a DataFrame from the fetched data (first row holds the column names)
df = pd.DataFrame(data[1:], columns=data[0])

# Separate City and State from the NAME column, splitting on the first comma only.
# expand=True returns the two parts as columns; unpacking the .str accessor and
# passing n positionally no longer work in pandas 2.x.
name_parts = df['NAME'].str.split(',', n=1, expand=True)
df['City'] = name_parts[0]
# NOTE(review): this column holds whatever follows the first comma in NAME,
# which looks like the state's name rather than a code -- confirm before renaming.
df['State_Code'] = name_parts[1].str.strip()

# Additional columns. The API returns every value as a string (or null), so
# convert explicitly; unparseable or missing values become NaN.
df["Average_Age"] = pd.to_numeric(df["B01002_001E"], errors="coerce")
df["Total_Population"] = pd.to_numeric(df["B01003_001E"], errors="coerce")
df["Average_Household_Size"] = pd.to_numeric(df["B25010_001E"], errors="coerce")

# Calculate Percent White and Percent Black
white_population = pd.to_numeric(df["B02001_002E"], errors="coerce")
black_population = pd.to_numeric(df["B02001_003E"], errors="coerce")
df["Percent_White"] = (white_population / df["Total_Population"]) * 100
df["Percent_Black"] = (black_population / df["Total_Population"]) * 100

# Keep only the relevant columns
df = df[["City", "State_Code", "Average_Age", "Percent_White", "Percent_Black", "Average_Household_Size"]]

print(df)
