## WNBA Team Stats 2024

RUN THIS PROGRAM BEFORE EACH WORKSHOP TO GET THE LATEST DATA.

In [None]:
"""
Program: WNBA Team Stats Scraper for 2024
Author: Peter Beens
Date: 2024-05-19
Description: This program scrapes WNBA team statistics from ESPN's website for the 2024 season. It uses Selenium to extract team names and pandas to 
process the statistics tables. The final dataset is saved as a CSV file.
Usage: Ensure that Firefox and geckodriver are installed. Run the script in an environment where these dependencies are properly configured.
Dependencies: 
    - selenium
    - pandas
    - Firefox browser
    - geckodriver (ensure it is installed and in your PATH)
Notes: 
    - The script assumes the structure of the webpage remains constant.
    - Modify the URL or XPath expressions if the webpage structure changes.
"""

import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By

# Define the URL to scrape
url = "https://www.espn.com/wnba/stats/team/_/season/2024/seasontype/2"

# Initialize the WebDriver for Firefox. Ensure geckodriver is installed and in your PATH.
wd = webdriver.Firefox()
try:
    # Open the URL in the WebDriver
    wd.get(url)

    # Extract the team names using Selenium by locating elements that match the given XPath
    team_elements = wd.find_elements(By.XPATH, "//div[@class='ResponsiveTable ResponsiveTable--fixed-left mt4 Table2__title--remove-capitalization']//a[contains(@href, '/wnba/team/_/name/')]")

    # Extract the text from each WebElement (skipping empty ones) and store it in a list.
    # This must happen before the driver is closed, because the elements belong to the session.
    team_names = [element.text for element in team_elements if element.text]
finally:
    # Always close the browser, even if loading or scraping the page failed,
    # so no Firefox/geckodriver process is left running.
    wd.quit()

# Print the number of team names extracted and the team names themselves for verification
print(f"Number of team names extracted: {len(team_names)}")
print(team_names)

# Use pandas to read the HTML tables from the webpage source
tables = pd.read_html(url)

# Print the number of tables found on the webpage for verification
print(f"Number of tables found: {len(tables)}")

# Assuming the first table contains 'RK' and 'TEAM' columns, and the second table contains the rest of the statistics
rank_team = tables[0]
stats = tables[1]

# Fail early with a clear message if the scraped names cannot be lined up with the table rows
if len(team_names) != len(rank_team):
    raise ValueError(
        f"Extracted {len(team_names)} team names but the table has "
        f"{len(rank_team)} rows; the page structure may have changed."
    )

# Remove the 'RK' and 'Team' columns from the rank_team DataFrame
rank_team = rank_team.drop(columns=['RK', 'Team'])

# Add the extracted team names to the rank_team DataFrame
rank_team['Team'] = team_names

# Place the team names and the statistics side by side (aligned on the row index)
df = pd.concat([rank_team, stats], axis=1)

# Display the final DataFrame to ensure correctness
print(df)

# Save the DataFrame to a CSV file named 'wnba_team_stats_2024.csv'
df.to_csv('wnba_team_stats_2024.csv', index=False)


## Combine All Team Stats

RUN THIS AFTER UPDATING THE 2024 DATA.

In [None]:
import pandas as pd
import os

"""
Program: WNBA Team Stats Aggregator
Author: Peter
Date: 2024-05-19
Description: This program aggregates WNBA team statistics from multiple CSV files, each representing a year from 2007 to 2024. It reads each CSV file, 
adds a 'Year' column, and combines the data into a single DataFrame. The final DataFrame is sorted by year (descending) and team (ascending), and saved 
to a CSV file.
Usage: Ensure that all CSV files (wnba_team_stats_YYYY.csv) are in the same directory as this script. Run the script to generate a combined CSV file 
(wnba_team_stats_all.csv) containing all the data.
Dependencies: pandas, os
Notes: Make sure that the CSV files follow a consistent format with the expected columns.
"""

# Collect one DataFrame per season; they are concatenated once after the loop
# instead of growing a DataFrame inside the loop.
season_frames = []

# Loop through the years from 2007 to 2024
for year in range(2007, 2024 + 1):
    filename = f'wnba_team_stats_{year}.csv'
    print(f'Processing {filename}...')

    # Skip seasons whose file is missing
    if not os.path.exists(filename):
        print(f'File {filename} does not exist.')
        continue

    try:
        # Read the CSV file into a DataFrame
        df = pd.read_csv(filename)
    except Exception as e:
        # Best effort: report the unreadable file and carry on with the other seasons
        print(f'An error occurred while processing {filename}: {e}')
        continue

    # Add the 'Year' column
    df['Year'] = year
    season_frames.append(df)

# Column order of the combined output
ordered_columns = ['Team', 'Year', 'GP', 'PTS', 'FGM', 'FGA', 'FG%', '3PM', '3PA', '3P%', 'FTM', 'FTA', 'FT%', 'OR', 'DR', 'REB', 'AST', 'STL', 'BLK', 'TO', 'PF']

# Define the output file path
output_file = 'wnba_team_stats_all.csv'

if season_frames:
    # Combine all seasons into a single DataFrame
    combined_df = pd.concat(season_frames, ignore_index=True)

    # Sort the combined DataFrame by 'Year' (reversed) and 'Team'
    combined_df.sort_values(by=['Year', 'Team'], ascending=[False, True], inplace=True)

    # Reorder the columns
    combined_df = combined_df[ordered_columns]

    # Save the combined DataFrame to a CSV file
    combined_df.to_csv(output_file, index=False)

    # Display a success message
    print(f"'{output_file}' created successfully.")

    # Display the combined DataFrame
    print(combined_df)
else:
    # Nothing could be read: keep an empty frame with the expected columns
    # rather than failing on the sort, and do not write an output file.
    combined_df = pd.DataFrame(columns=ordered_columns)
    print('No season files could be read; nothing was combined.')


## Get All Team Data (2007-2024)

DO NOT USE THIS SCRIPT. ONLY USED TO INITIALLY GET ALL THE DATA. 

FOR UPDATED 2024 DATA, SEE THE "WNBA Team Stats 2024" SECTION ABOVE.

In [None]:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
import time

# Function to scrape data for a given season
def scrape_season_data(season):
    """Scrape ESPN's WNBA team statistics for one season and save them to CSV.

    Team names are read with Selenium (Firefox), the statistics tables are
    read with ``pandas.read_html``, and the combined result is written to
    ``wnba_team_stats_{season}.csv``.

    Args:
        season: Season year that is inserted into the ESPN URL (e.g. 2024).

    Raises:
        ValueError: If the number of scraped team names does not match the
            number of rows in the first table.
    """
    # Define the URL to scrape
    url = f"https://www.espn.com/wnba/stats/team/_/season/{season}/seasontype/2"

    # Initialize the WebDriver for Firefox. Ensure geckodriver is installed and in your PATH.
    wd = webdriver.Firefox()
    try:
        # Open the URL in the WebDriver
        wd.get(url)

        # Extract the team names using Selenium by locating elements that match the given XPath
        team_elements = wd.find_elements(By.XPATH, "//div[@class='ResponsiveTable ResponsiveTable--fixed-left mt4 Table2__title--remove-capitalization']//a[contains(@href, '/wnba/team/_/name/')]")
        # Read the text while the session is still open; skip elements without text
        team_names = [element.text for element in team_elements if element.text]
    finally:
        # Always close the browser, even when loading or scraping fails, so a
        # failed season does not leave a Firefox/geckodriver process behind.
        wd.quit()

    # Print the number of team names extracted for verification
    print(f"Season {season}: Number of team names extracted: {len(team_names)}")

    # Use pandas to read the HTML tables from the webpage source
    tables = pd.read_html(url)

    # Assuming the first table contains 'RK' and 'TEAM' columns, and the second table contains the rest of the statistics
    rank_team = tables[0]
    stats = tables[1]

    # Fail early with a clear message if the names cannot be lined up with the rows
    if len(team_names) != len(rank_team):
        raise ValueError(
            f"Season {season}: extracted {len(team_names)} team names but the "
            f"table has {len(rank_team)} rows; the page structure may have changed."
        )

    # Remove the 'RK' and 'Team' columns from the rank_team DataFrame
    rank_team = rank_team.drop(columns=['RK', 'Team'])
    # Add the extracted team names to the rank_team DataFrame
    rank_team['Team'] = team_names

    # Place the team names and the statistics side by side (aligned on the row index)
    df = pd.concat([rank_team, stats], axis=1)

    # Save the DataFrame to a CSV file named f'wnba_team_stats_{season}.csv'
    df.to_csv(f'wnba_team_stats_{season}.csv', index=False)

# Scrape every season from 2007 through 2024, writing one CSV file per season
for season in range(2007, 2024 + 1):
    scrape_season_data(season)
    # Pause after each season so the requests to the server are spaced out
    time.sleep(5)
