## WNBA Team Stats 2024

RUN THIS PROGRAM BEFORE EACH WORKSHOP TO GET THE LATEST DATA.

In [None]:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Imported here (rather than in the import block above) so this cell's body is
# self-contained; StringIO lets pandas parse HTML that is already in memory.
from io import StringIO

# Set the target season
# NOTE(review): the section heading above says "WNBA", but this cell scrapes
# ESPN's NHL team stats and writes an nhl_* file — confirm which is intended.
end_year = 2025
start_year = end_year - 1

# Construct URL and filename
url = f"https://www.espn.com/nhl/stats/team/_/season/{end_year}/seasontype/2"
filename = f'nhl_team_stats_{start_year}-{end_year}.csv'

print(f"🔄 Scraping season {start_year}-{end_year}...")

# Set up Selenium options to mimic a real user
options = Options()
options.set_preference(
    "general.useragent.override",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36"
)

# XPath of the team links; shared by the wait and the extraction below
team_link_xpath = "//a[contains(@href, '/nhl/team/_/name/')]"

# Initialize WebDriver
wd = webdriver.Firefox(options=options)

# Flipped to True only after the CSV has been written, so the final
# "complete" message is never printed after a failure.
scrape_succeeded = False

try:
    # Navigation happens inside the try so the browser is always closed by
    # the finally clause, even when the page cannot be loaded.
    wd.get(url)

    # Wait for elements to load
    WebDriverWait(wd, 10).until(
        EC.presence_of_all_elements_located((By.XPATH, team_link_xpath))
    )

    # Extract team names (links with empty text are skipped)
    team_elements = wd.find_elements(By.XPATH, team_link_xpath)
    team_names = [element.text for element in team_elements if element.text]

    # Validation failures are raised instead of calling exit(): exit() does
    # not reliably stop a notebook cell, and in a plain script it bypasses the
    # except clause and makes wd.quit() run twice.
    if not team_names:
        raise RuntimeError("No team names found.")

    print(f"✅ Extracted {len(team_names)} teams for {start_year}-{end_year}.")

    # Read tables from the page Selenium already rendered. Passing the URL to
    # pandas would trigger a second download that ignores the user-agent
    # override configured above.
    tables = pd.read_html(StringIO(wd.page_source))

    if len(tables) < 2:
        raise RuntimeError(f"Expected at least 2 tables, but found {len(tables)}.")

    # Process table data: the first table is treated as rank/team, the second
    # as the statistics.
    rank_team = tables[0]
    stats = tables[1]

    # Remove the 'RK' and 'Team' columns from rank_team
    rank_team = rank_team.drop(columns=['RK', 'Team'], errors='ignore')  # Ignore missing columns

    # Ensure team names list matches row count
    if len(rank_team) != len(team_names):
        raise RuntimeError(
            f"Expected {len(rank_team)} teams, but got {len(team_names)}."
        )

    # Add team names
    rank_team['Team'] = team_names

    # Merge data column-wise (rows are aligned by index)
    df = pd.concat([rank_team, stats], axis=1)

    # Save to CSV (relative path, so it lands in the current working directory)
    df.to_csv(filename, index=False)
    print(f"✅ Data successfully saved to {filename}")
    scrape_succeeded = True

except Exception as e:
    # Top-level boundary of the cell: report the problem instead of a traceback
    print(f"❌ Error processing season {start_year}-{end_year}: {e}")

finally:
    wd.quit()  # Close WebDriver exactly once, on success and on failure

if scrape_succeeded:
    print("🎉 Scraping complete!")


🔄 Scraping season 2024-2025...
✅ Extracted 32 teams for 2024-2025.
✅ Data successfully saved to nhl_team_stats_2024-2025.csv
🎉 Scraping complete!


## Combine All Team Stats

RUN THIS AFTER UPDATING THE 2024 DATA.

In [None]:
import pandas as pd
import os

"""
Program: WNBA Team Stats Aggregator
Author: Peter
Date: 2024-05-19
Description: This program aggregates WNBA team statistics from multiple CSV files, each representing a year from 2007 to 2024. It reads each CSV file,
adds a 'Year' column, and combines the data into a single DataFrame. The final DataFrame is sorted by year (descending) and team (ascending), and saved
to a CSV file.
Usage: Ensure that all CSV files (wnba_team_stats_YYYY.csv) are in the same directory as this script. Run the script to generate a combined CSV file
(wnba_team_stats_all.csv) containing all the data.
Dependencies: pandas, os
Notes: Make sure that the CSV files follow a consistent format with the expected columns.
"""

# Seasons to combine (both ends inclusive)
FIRST_YEAR = 2007
LAST_YEAR = 2024

# Column order of the combined output
ordered_columns = ['Team', 'Year', 'GP', 'PTS', 'FGM', 'FGA', 'FG%', '3PM', '3PA', '3P%', 'FTM', 'FTA', 'FT%', 'OR', 'DR', 'REB', 'AST', 'STL', 'BLK', 'TO', 'PF']

# Define the output file path
output_file = 'wnba_team_stats_all.csv'

# Collect one DataFrame per season and concatenate once at the end; growing a
# DataFrame with pd.concat inside the loop recopies all earlier rows each time.
frames = []

# Loop through the years from FIRST_YEAR to LAST_YEAR
for year in range(FIRST_YEAR, LAST_YEAR + 1):
    filename = f'wnba_team_stats_{year}.csv'
    print(f'Processing {filename}...')

    try:
        # Read the CSV file into a DataFrame
        df = pd.read_csv(filename)
    except FileNotFoundError:
        print(f'File {filename} does not exist.')
        continue
    except Exception as e:
        # An unreadable file is reported and skipped so the other seasons
        # can still be combined.
        print(f'An error occurred while processing {filename}: {e}')
        continue

    # Add the 'Year' column
    df['Year'] = year
    frames.append(df)

if frames:
    combined_df = pd.concat(frames, ignore_index=True)

    # Sort the combined DataFrame by 'Year' (reversed) and 'Team'
    combined_df = combined_df.sort_values(by=['Year', 'Team'], ascending=[False, True])

    # Reorder the columns (raises KeyError if an expected column is missing)
    combined_df = combined_df[ordered_columns]

    # Save the combined DataFrame to a CSV file
    combined_df.to_csv(output_file, index=False)

    # Display a success message
    print(f"'{output_file}' created successfully.")

    # Display the combined DataFrame
    print(combined_df)
else:
    # Without any input there is no 'Year'/'Team' column to sort on, so keep
    # an empty, correctly-shaped frame and do not write an output file.
    combined_df = pd.DataFrame(columns=ordered_columns)
    print('No season files could be read; nothing was written.')


## Get All Team Data (2007-2024)

DO NOT USE THIS SCRIPT. IT WAS ONLY USED TO INITIALLY GET ALL THE DATA.

FOR UPDATED 2024 DATA, USE THE "WNBA Team Stats 2024" SECTION AT THE TOP OF THIS NOTEBOOK.

In [None]:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
import time

# Function to scrape data for a given season
def scrape_season_data(season):
    """Scrape one season of ESPN WNBA team stats and save it as a CSV file.

    Team names are read from the team links with Selenium; the statistics are
    parsed with pandas from the same page the browser loaded.

    Args:
        season: Season value inserted into the ESPN URL and into the output
            filename.

    Side effects:
        Prints the number of extracted team names and writes
        ``wnba_team_stats_{season}.csv`` (relative path, so it lands in the
        current working directory).

    Raises:
        Errors from Selenium or pandas are not caught here, e.g. when fewer
        than two tables are found, when the 'RK'/'Team' columns are missing,
        or when the number of team names differs from the number of rows.
    """
    # Function-scope import so this block does not depend on the cell's
    # import list; StringIO lets pandas parse HTML that is already in memory.
    from io import StringIO

    # Define the URL to scrape
    url = f"https://www.espn.com/wnba/stats/team/_/season/{season}/seasontype/2"

    # Initialize the WebDriver for Firefox. Ensure geckodriver is installed and in your PATH.
    wd = webdriver.Firefox()
    try:
        # Open the URL in the WebDriver
        wd.get(url)

        # Extract the team names using Selenium by locating elements that match the given XPath
        team_elements = wd.find_elements(By.XPATH, "//div[@class='ResponsiveTable ResponsiveTable--fixed-left mt4 Table2__title--remove-capitalization']//a[contains(@href, '/wnba/team/_/name/')]")
        # Extract the text from each WebElement and store it in a list
        team_names = [element.text for element in team_elements if element.text]

        # Keep the loaded HTML so the tables come from the same page load as
        # the team names, without a second download by pandas.
        page_html = wd.page_source
    finally:
        # Close the WebDriver even if loading or extraction failed, so no
        # browser process is left behind.
        wd.quit()

    # Print the number of team names extracted for verification
    print(f"Season {season}: Number of team names extracted: {len(team_names)}")

    # Use pandas to read the HTML tables from the captured page source
    tables = pd.read_html(StringIO(page_html))

    # Assuming the first table contains 'RK' and 'TEAM' columns, and the second table contains the rest of the statistics
    rank_team = tables[0]
    stats = tables[1]

    # Remove the 'RK' and 'Team' columns from the rank_team DataFrame
    rank_team = rank_team.drop(columns=['RK', 'Team'])
    # Add the extracted team names to the rank_team DataFrame
    rank_team['Team'] = team_names

    # Merge the rank_team and stats DataFrames on their index to create a complete dataset
    df = pd.concat([rank_team, stats], axis=1)

    # Save the DataFrame to a CSV file named f'wnba_team_stats_{season}.csv'
    df.to_csv(f'wnba_team_stats_{season}.csv', index=False)

# Scrape every season from 2007 through 2024, one after another
seasons = range(2007, 2024 + 1)
for season in seasons:
    scrape_season_data(season)
    # Pause for a few seconds between seasons to avoid overloading the server
    time.sleep(5)
