## WNBA Player Stats 2024

RUN THIS PROGRAM BEFORE EACH WORKSHOP TO GET THE LATEST DATA.

THE PROGRAM PROMPTS FOR YOU TO PRESS "SHOW MORE" TWICE.

In [3]:
"""
Program: WNBA Player Stats Scraper
Author: Peter Beens
Date: 2024-05-19
Description: This program scrapes WNBA player statistics from ESPN's website using Selenium and Pandas.
             It initializes a WebDriver, navigates to the stats page, and allows the user to manually
             interact with the page to load additional data. The program then extracts player names and
             their statistics, merges the data into a single DataFrame, and saves it to a CSV file.
Usage: Run the script in an environment with Selenium, Pandas, and a WebDriver installed. Ensure geckodriver
       is installed and in your PATH for Firefox.
Dependencies: 
    - selenium
    - pandas
    - time
Notes: 
    - Ensure you have the necessary WebDriver installed (geckodriver for Firefox).
    - Manual interaction is required to load additional data by pressing the 'Show More' button twice.
"""

import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By

# Initialize the WebDriver for Firefox. Ensure geckodriver is installed and in your PATH.
wd = webdriver.Firefox()

# Define the URL to scrape
url = "https://www.espn.com/wnba/stats/player/_/season/2024/seasontype/2"

# Open the URL in the WebDriver
wd.get(url)

# Pause to allow manual interaction
input("Press Enter after pressing 'Show More' button twice...")

# Extract the player names using Selenium by locating elements that match the given XPath
player_elements = wd.find_elements(By.XPATH, "//tr[contains(@class, 'Table__TR')]//td[2]//a")

# Extract the text from each WebElement and store it in a list
names = [element.text for element in player_elements if element.text]

# Print the number of player names extracted and the player names themselves for verification
print(f"Number of player names extracted: {len(names)}")
print(names)

# Use pandas to read the HTML tables from the webpage source fetched by Selenium
html_source = wd.page_source
tables = pd.read_html(html_source)

# Print the number of tables found on the webpage for verification
# print(f"Number of tables found: {len(tables)}")

# Assuming the first table contains 'RK' and 'NAME' columns, and the second table contains the rest of the statistics
players = tables[0]
stats = tables[1]

# Drop the 'RK' column from the players DataFrame
players = players.drop(columns=['RK'])

# Add the extracted player names to the players DataFrame
players['Name'] = names

# Merge the players and stats DataFrames on their index to create a complete dataset
df = pd.concat([players, stats], axis=1)

# Display the final DataFrame to ensure correctness
print(df)

# Save the DataFrame to a CSV file named 'wnba_player_stats_2024.csv'
df.to_csv('wnba_player_stats_2024.csv', index=False)

# Close the WebDriver
wd.quit()


Number of player names extracted: 111
["A'ja Wilson", 'Kahleah Copper', 'Arike Ogunbowale', 'Napheesa Collier', 'Jewell Loyd', 'Sabrina Ionescu', 'Dearica Hamby', 'Breanna Stewart', 'Kelsey Plum', 'Jackie Young', 'Nneka Ogwumike', 'Caitlin Clark', 'DeWanna Bonner', 'Kelsey Mitchell', 'Chennedy Carter', 'Kayla McBride', 'Diana Taurasi', 'Ariel Atkins', 'Jonquel Jones', 'Allisha Gray', 'Aliyah Boston', 'Marina Mabrey', 'Tina Charles', 'Angel Reese', 'Skylar Diggins-Smith', 'Ezi Magbegor', 'Brionna Jones', 'DiJonai Carrington', 'Betnijah Laney-Hamilton', 'Natasha Cloud', 'NaLyssa Smith', 'Alanna Smith', 'Teaira McCowan', 'Alyssa Thomas', 'Rickea Jackson', 'Tyasha Harris', 'Stefanie Dolson', 'Cheyenne Parker-Tyus', 'Karlie Samuelson', 'Courtney Williams', 'Aari McDonald', 'Bridget Carleton', 'Monique Billings', 'Aaliyah Edwards', 'Aerial Powers', 'Kamilla Cardoso', 'Lexie Brown', 'Julie Vanloo', 'Myisha Hines-Allen', 'Tiffany Hayes', 'Sophie Cunningham', 'Kia Nurse', 'Jordan Horston', 'Sha

  tables = pd.read_html(html_source)


## Combine All Years of WNBA Player Stats

RUN THIS PROGRAM BEFORE EACH WORKSHOP TO ADD THE LATEST DATA.

In [4]:
import pandas as pd
import glob

# Define the column order
column_order = ['Name', 'Year', 'POS', 'GP', 'MIN', 'PTS', 'FGM', 'FGA', 'FG%', '3PM', '3PA', '3P%', 'FTM', 'FTA', 'FT%', 'REB', 'AST', 'STL', 'BLK', 'TO']

# Initialize an empty list to hold DataFrames
all_data = []

# Use glob to get all filenames matching the pattern
file_pattern = "wnba_player_stats_*.csv"
files = glob.glob(file_pattern)

# Iterate through the list of files
for file in files:
    # Read the CSV file into a DataFrame
    df = pd.read_csv(file)
    
    # Append the DataFrame to the list
    all_data.append(df)

# Concatenate all DataFrames in the list
combined_df = pd.concat(all_data, ignore_index=True)

# Reorder the columns
combined_df = combined_df[column_order]

# Save the combined DataFrame to a new CSV file
combined_df.to_csv('wnba_player_stats_all.csv', index=False)

print("Files have been successfully combined and saved to 'wnba_player_stats_all.csv'.")


Files have been successfully combined and saved to 'wnba_player_stats_all.csv'.


## Get All Years of WNBA Player Stats

ONLY RUN ONCE!

In [None]:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By

# Initialize the WebDriver for Firefox. Ensure geckodriver is installed and in your PATH.
wd = webdriver.Firefox()

# Define a function to scrape data for a given year
def scrape_wnba_stats(year):
    # Define the URL to scrape for the specific year
    url = f"https://www.espn.com/wnba/stats/player/_/season/{year}/seasontype/2"

    # Open the URL in the WebDriver
    wd.get(url)

    # Pause to allow manual interaction for pressing the 'Show More' button twice
    input(f"Press Enter after pressing 'Show More' button twice for the year {year}...")

    # Extract the player names using Selenium by locating elements that match the given XPath
    player_elements = wd.find_elements(By.XPATH, "//tr[contains(@class, 'Table__TR')]//td[2]//a")

    # Extract the text from each WebElement and store it in a list
    names = [element.text for element in player_elements if element.text]

    # Use pandas to read the HTML tables from the webpage source fetched by Selenium
    html_source = wd.page_source
    tables = pd.read_html(html_source)

    # Assuming the first table contains 'RK' and 'NAME' columns, and the second table contains the rest of the statistics
    players = tables[0]
    stats = tables[1]

    # Drop the 'RK' column from the players DataFrame
    players = players.drop(columns=['RK'])

    # Add the extracted player names to the players DataFrame
    players['Name'] = names

    # Merge the players and stats DataFrames on their index to create a complete dataset
    df = pd.concat([players, stats], axis=1)

    return df

# Iterate through the years from 2007 to 2024
for year in range(2007, 2024 + 1):
    print(f"Scraping data for the year {year}...")
    df_year = scrape_wnba_stats(year)
    df_year['Year'] = year  # Add a column to identify the year of the data
    
    # Save the DataFrame to a CSV file named 'wnba_player_stats_{year}.csv'
    df_year.to_csv(f'wnba_player_stats_{year}.csv', index=False)

# Close the WebDriver
wd.quit()
