# Women's Basketball Roster Scraper
Scrapes roster data (player_sr_link, player_name, number, class, pos, height, plus the school slug and season) for every women's college basketball team listed on Sports Reference.

In [2]:
import scrapy
import numpy as np
import pandas as pd
import requests
from scrapy import Selector
import time
import re
import os

## Step 1: Get Women's Basketball Team Links

In [3]:
# Get the main college basketball page.
url = "https://www.sports-reference.com/cbb/schools/"
# A timeout keeps a stalled connection from hanging the notebook, and
# raise_for_status() stops here on an HTTP error (e.g. a 429 rate limit)
# instead of parsing an error page and silently finding zero links.
response = requests.get(url, timeout=10)
response.raise_for_status()
html = response.text
sel = Selector(text=html)

# 1. Find the wrapper element around the women's schools table.
wrapper = sel.css('#all_NCAAW_schools')

# 2. The table may be shipped inside an HTML comment that is a direct child
#    of the wrapper; .get() returns None when there is no such comment.
comment = wrapper.xpath('comment()').get()

if comment:
    # The table was hidden in a comment: strip the comment markers and
    # re-parse the inner markup as regular HTML.
    clean_html = comment.replace('<!--', '').replace('-->', '')
    table_sel = Selector(text=clean_html)
    print("Found women's table in comments. Unwrapping...")
else:
    # The table is present as normal markup: query the wrapper directly.
    table_sel = wrapper
    print("Women's table was not commented. Reading directly...")

# 3. Extract the links (these are women's basketball team pages).
#    The hrefs are site-relative, so prefix them with the site root.
wbb_links_list = [
    f"https://www.sports-reference.com{link}"
    for link in table_sel.css('td[data-stat="school_name"] a::attr(href)').getall()
]

print(f"Successfully found {len(wbb_links_list)} women's basketball team links.")
print(f"Sample link: {wbb_links_list[0] if wbb_links_list else 'None'}")

Women's table was not commented. Reading directly...
Successfully found 380 women's basketball team links.
Sample link: https://www.sports-reference.com/cbb/schools/abilene-christian/women/


## Step 2: Define Roster Scraper Function

In [4]:
def get_roster_df(school_url, year):
    """
    Fetch one team's roster page for a given season and return it as a table.

    Parameters:
    - school_url: URL of the team's school page (with or without a trailing '/')
    - year: Season used to build the '<school_url>/<year>.html' roster URL

    Returns:
    - The string "BLOCKED" when the server answers with HTTP 429
    - An empty DataFrame for any other non-200 status, when no roster rows
      are found, or when an exception is raised while scraping
    - Otherwise a DataFrame with one row per player and these columns:
      player_sr_link (absolute link to the player's page, or None),
      player_name, number, class, pos, height, school (slug taken from
      the URL) and season (the `year` argument)
    """

    # The slug is the last non-empty path segment, or the one before it
    # when the URL ends in a 'women' segment.
    segments = list(filter(None, school_url.split('/')))
    if segments[-1] == 'women':
        school_slug = segments[-2]
    else:
        school_slug = segments[-1]

    roster_url = f"{school_url.rstrip('/')}/{year}.html"

    try:
        response = requests.get(roster_url, timeout=10)
        status = response.status_code

        # 429 is reported with a sentinel so callers can stop scraping
        if status == 429:
            return "BLOCKED"
        if status != 200:
            return pd.DataFrame()

        page_source = response.text
        roster_marker = 'id="roster"'

        # When the table is not in the live markup, look for the first
        # HTML comment that contains it and parse that comment instead
        if roster_marker not in page_source:
            hidden_roster = next(
                (
                    body
                    for body in re.findall(r'<!--(.+?)-->', page_source, re.DOTALL)
                    if roster_marker in body
                ),
                None,
            )
            if hidden_roster is not None:
                page_source = hidden_roster

        table_rows = Selector(text=page_source).css('table#roster tbody tr')

        # Output column -> selector for the plain-text roster cells
        stat_selectors = (
            ('number', 'td[data-stat="number"]::text'),
            ('class', 'td[data-stat="class"]::text'),
            ('pos', 'td[data-stat="pos"]::text'),
            ('height', 'td[data-stat="height"]::text'),
        )

        records = []
        for row in table_rows:
            # Ignore repeated header rows and rows without any cells
            if row.css('.thead'):
                continue
            if not row.css('td, th'):
                continue

            # The player sits in the row's <th>; prefer the link text and
            # fall back to the bare cell text when there is no link text
            player_link = row.css('th[data-stat="player"] a::attr(href)').get()
            player_name = row.css('th[data-stat="player"] a::text').get()
            if not player_name:
                player_name = row.css('th[data-stat="player"]::text').get()

            if player_link:
                player_sr_link = f"https://www.sports-reference.com{player_link}"
            else:
                player_sr_link = None

            record = {
                'player_sr_link': player_sr_link,
                'player_name': player_name,
            }
            for field, css_query in stat_selectors:
                record[field] = row.css(css_query).get()
            record['school'] = school_slug
            record['season'] = year

            records.append(record)

        roster = pd.DataFrame(records)
        if roster.empty:
            return roster

        # Put the identifying columns first, keeping only those present
        preferred = ['player_sr_link', 'player_name', 'number', 'class', 'pos', 'height', 'school', 'season']
        ordered = [col for col in preferred if col in roster.columns]
        return roster[ordered]

    except Exception as e:
        print(f"Error scraping {roster_url}: {e}")
        return pd.DataFrame()


# Test with one team and year.
# get_roster_df returns the string "BLOCKED" on HTTP 429, so check for that
# before treating the result as a DataFrame (len("BLOCKED") would otherwise
# be reported as 7 players and `.empty` would raise AttributeError).
if not wbb_links_list:
    print("No team links available to test with.")
else:
    test_df = get_roster_df(wbb_links_list[0], 2025)
    if isinstance(test_df, str):
        print(f"Test scrape did not return data: {test_df}")
    else:
        print(f"Test scrape returned {len(test_df)} players")
        print(test_df.head() if not test_df.empty else "No data")

Test scrape returned 12 players
                                      player_sr_link     player_name number  \
0  https://www.sports-reference.com/cbb/players/b...     Bella Earle      3   
1  https://www.sports-reference.com/cbb/players/p...     Payton Hull     10   
2  https://www.sports-reference.com/cbb/players/e...    Emma Troxell     24   
3  https://www.sports-reference.com/cbb/players/m...  Meredith Mayes     14   
4  https://www.sports-reference.com/cbb/players/e...    Erin Woodson      4   

  class pos height             school  season  
0    SR   G    5-9  abilene-christian    2025  
1    SO   G   5-11  abilene-christian    2025  
2    SO   F   5-11  abilene-christian    2025  
3    SO   C    6-2  abilene-christian    2025  
4    SO   G    6-0  abilene-christian    2025  


## Step 3: Multi-Year Scraper Function

In [5]:
def scrape_rosters_multi_year(urls, years_list, base_folder="data/rosters", delay=4, skip_existing=False):
    """
    Scrapes roster data for multiple teams across multiple years.
    Saves individual CSV files for each school and year.

    Parameters:
    - urls: A single school URL string or an iterable of school URLs
    - years_list: Iterable of seasons; processed from most recent to oldest
    - base_folder: Root output folder; files are written to
      <base_folder>/<year>/<school_slug>_roster_<year>.csv
    - delay: Seconds to wait after each request (default 4); values <= 0
      disable the wait
    - skip_existing: When True, a school/year whose CSV already exists is
      skipped without making a request, so an interrupted run can be
      resumed. Defaults to False, which re-scrapes and overwrites.

    Returns None. Stops early, without raising, as soon as get_roster_df
    reports "BLOCKED".
    """
    # Safety catch for single strings; other iterables are materialised so
    # they can be counted and re-iterated once per year
    if isinstance(urls, str):
        urls = [urls]
    else:
        urls = list(urls)

    # Reverse the years list to go in reverse chronological order
    years_to_scrape = sorted(years_list, reverse=True)

    for year in years_to_scrape:
        # Create year folder
        year_folder = os.path.join(base_folder, str(year))
        os.makedirs(year_folder, exist_ok=True)

        print(f"========== STARTING YEAR: {year} ==========")

        for url_idx, url in enumerate(urls):
            # Extract school slug: last non-empty path segment, or the one
            # before it when the URL ends in 'women'
            parts = [p for p in url.split('/') if p]
            school_slug = parts[-2] if parts[-1] == 'women' else parts[-1]

            print(f"[{year}] Processing {url_idx + 1}/{len(urls)}: {school_slug}")

            # Path: data/rosters/2025/school_slug_roster_2025.csv
            filename = f"{school_slug}_roster_{year}.csv"
            file_path = os.path.join(year_folder, filename)

            # Resume support: no request (and so no delay) for saved files
            if skip_existing and os.path.exists(file_path):
                print(f"      Already saved, skipping: {file_path}")
                continue

            # Fetch the data
            result = get_roster_df(url, year)

            # Block detection
            if isinstance(result, str) and result == "BLOCKED":
                print(f"!!! BLOCKED !!! Stopped at {school_slug} in {year}. Restart later.")
                return

            if not result.empty:
                result.to_csv(file_path, index=False)
                print(f"      Saved: {file_path}")
            else:
                print(f"      No data found for {year}.")

            # Polite scraper delay between requests
            if delay > 0:
                time.sleep(delay)

        print(f"========== COMPLETED YEAR: {year} ==========\n")

    print("Scrape complete.")

## Step 4: Run Scraper
There is no need to re-run this step: the scrape has already been run and the CSV files were saved under `data/rosters/`.

In [None]:
# Scrape recent years
# my_years = [2026]
# scrape_rosters_multi_year(wbb_links_list, my_years)

In [None]:
# Scrape middle years
# my_years = range(2017,2026) # scraping 2017-2025
# scrape_rosters_multi_year(wbb_links_list, my_years)

[2025] Processing 1/380: abilene-christian
      Saved: data/rosters\2025\abilene-christian_roster_2025.csv
[2025] Processing 2/380: air-force
      Saved: data/rosters\2025\air-force_roster_2025.csv
[2025] Processing 3/380: akron
      Saved: data/rosters\2025\akron_roster_2025.csv
[2025] Processing 4/380: alabama
      Saved: data/rosters\2025\alabama_roster_2025.csv
[2025] Processing 5/380: alabama-am
      Saved: data/rosters\2025\alabama-am_roster_2025.csv
[2025] Processing 6/380: alabama-state
      Saved: data/rosters\2025\alabama-state_roster_2025.csv
[2025] Processing 7/380: albany-ny
      Saved: data/rosters\2025\albany-ny_roster_2025.csv
[2025] Processing 8/380: alcorn-state
      Saved: data/rosters\2025\alcorn-state_roster_2025.csv
[2025] Processing 9/380: american
      Saved: data/rosters\2025\american_roster_2025.csv
[2025] Processing 10/380: appalachian-state
      Saved: data/rosters\2025\appalachian-state_roster_2025.csv
[2025] Processing 11/380: arizona
      Saved