Sports Reference Scraper

In [None]:
# installing necessary packages. uncomment if you need to use any


# %pip install scrapy
# %pip install crochet

In [4]:
import scrapy
import numpy as np
import pandas as pd

In [9]:
import requests
from scrapy import Selector

url = "https://www.sports-reference.com/cbb/schools/"
html = requests.get(url).text
sel = Selector(text=html)

# 1. Find the wrapper
wrapper = sel.css('#all_NCAAW_schools')

# 2. Try to get the comment inside
comment = wrapper.xpath('comment()').get()

if comment:
    # It was hidden! Clean it.
    clean_html = comment.replace('', '')
    table_sel = Selector(text=clean_html)
    print("Found table in comments. Unwrapping...")
else:
    # It wasn't hidden! Use the wrapper directly.
    table_sel = wrapper
    print("Table was not commented. Reading directly...")

# 3. Extract the links
links_list = [
    f"https://www.sports-reference.com{link}" 
    for link in table_sel.css('td[data-stat="school_name"] a::attr(href)').getall()
]

print(f"Successfully found {len(links_list)} links.")

Table was not commented. Reading directly...
Successfully found 380 links.


In [10]:
links_list

['https://www.sports-reference.com/cbb/schools/abilene-christian/women/',
 'https://www.sports-reference.com/cbb/schools/air-force/women/',
 'https://www.sports-reference.com/cbb/schools/akron/women/',
 'https://www.sports-reference.com/cbb/schools/alabama/women/',
 'https://www.sports-reference.com/cbb/schools/alabama-am/women/',
 'https://www.sports-reference.com/cbb/schools/alabama-state/women/',
 'https://www.sports-reference.com/cbb/schools/albany-ny/women/',
 'https://www.sports-reference.com/cbb/schools/alcorn-state/women/',
 'https://www.sports-reference.com/cbb/schools/american/women/',
 'https://www.sports-reference.com/cbb/schools/appalachian-state/women/',
 'https://www.sports-reference.com/cbb/schools/arizona/women/',
 'https://www.sports-reference.com/cbb/schools/arizona-state/women/',
 'https://www.sports-reference.com/cbb/schools/arkansas/women/',
 'https://www.sports-reference.com/cbb/schools/arkansas-state/women/',
 'https://www.sports-reference.com/cbb/schools/arkans

In [19]:
import requests
from scrapy import Selector
import pandas as pd
import time

def get_2026_player_stats(school_url):
    # 1. Format the URL for the 2026 season
    target_url = f"{school_url.rstrip('/')}/2026.html"
    
    try:
        # 2. Get the page
        resp = requests.get(target_url, timeout=10)
        if resp.status_code != 200:
            print(f"Skipping {target_url}: Status {resp.status_code}")
            return []
        
        sel = Selector(text=resp.text)
        
        # 3. Target the specific table from your screenshot: id="players_per_game"
        # We look for rows inside the <tbody>
        rows = sel.css('table#players_per_game tbody tr')
        
        school_name = sel.css('h1 span[itemprop="name"]::text').get()
        school_data = []
        
        for row in rows:
            # Skip sub-headers or empty rows
            if row.css('.thead') or not row.css('td'):
                continue
            
            # 4. DYNAMIC COLUMN EXTRACTION
            # This creates a dictionary by looking at every cell's 'data-stat' attribute
            player_row = {'school': school_name}
            
            # Scrapy finds all <th> and <td> cells in this row
            for cell in row.css('th, td'):
                stat_name = cell.attrib.get('data-stat')
                if stat_name:
                    # Get the text inside the cell
                    player_row[stat_name] = cell.css('::text').get()
            
            school_data.append(player_row)
            
        return school_data

    except Exception as e:
        print(f"Error on {target_url}: {e}")
        return []

pd.DataFrame(get_2026_player_stats(links_list[0]))

Unnamed: 0,school,ranker,name_display,pos,games,games_started,mp_per_g,fg_per_g,fga_per_g,fg_pct,...,orb_per_g,drb_per_g,trb_per_g,ast_per_g,stl_per_g,blk_per_g,tov_per_g,pf_per_g,pts_per_g,awards
0,,1,Payton Hull,G,19,19,30.2,5.8,14.4,0.405,...,0.9,3.1,4.0,2.9,2.7,0.3,3.0,2.1,17.0,
1,,2,Meredith Mayes,C,16,16,28.4,6.3,9.3,0.682,...,3.3,4.6,7.9,1.6,2.5,0.8,1.4,2.5,15.2,
2,,3,Erin Woodson,G,19,19,31.6,5.0,10.6,0.473,...,1.7,4.3,5.9,2.2,1.9,0.4,2.0,1.9,14.5,
3,,4,Emma Troxell,F,18,18,28.6,3.2,8.4,0.377,...,1.8,4.3,6.1,1.2,1.7,0.1,1.1,1.8,9.8,
4,,5,Natalia Chavez,G,19,4,15.1,2.8,5.7,0.5,...,0.9,1.2,2.2,0.6,0.7,0.2,0.6,1.0,7.5,
5,,6,Breanna Davis,G,19,19,26.6,1.4,4.2,0.329,...,0.4,1.6,2.0,3.5,1.1,0.1,2.3,2.7,3.9,
6,,7,Jazmyn Stone,G,19,0,10.7,0.8,2.6,0.306,...,0.4,0.6,1.0,1.6,0.5,0.1,1.8,1.2,2.5,
7,,8,Riley Grohman,G,19,0,12.0,1.0,3.1,0.328,...,0.4,0.8,1.2,0.7,0.6,0.1,0.7,0.9,2.5,
8,,9,Molly Daugherty,G,17,0,7.6,0.7,2.1,0.333,...,0.2,0.5,0.6,0.5,0.2,0.0,0.6,0.9,2.1,
9,,10,Jordyn Coleman,C,15,0,5.7,0.9,1.8,0.519,...,0.6,1.1,1.7,0.2,0.1,0.5,0.3,0.9,2.0,


In [18]:
import requests
from scrapy import Selector
import pandas as pd
import time

def get_2026_stats_df(school_url):
    year = 2026
    season_val = f"{year-1}-{str(year)[-2:]}"
    
    # Extract school slug
    parts = [p for p in school_url.split('/') if p]
    school_slug = parts[-2] if parts[-1] == 'women' else parts[-1]
    
    target_url = f"{school_url.rstrip('/')}/{year}.html"
    
    try:
        resp = requests.get(target_url, timeout=10)
        if resp.status_code != 200:
            return pd.DataFrame()
        
        sel = Selector(text=resp.text)
        rows = sel.css('table#players_per_game tbody tr')
        
        all_rows = []
        for row in rows:
            if row.css('.thead') or not row.css('td'):
                continue
            
            # 1. Grab everything with original keys
            row_data = {cell.attrib['data-stat']: cell.css('::text').get() 
                        for cell in row.css('th, td') if 'data-stat' in cell.attrib}
            
            # 2. Add our custom metadata
            row_data['school'] = school_slug
            row_data['season'] = season_val
            all_rows.append(row_data)
        
        df = pd.DataFrame(all_rows)
        
        if not df.empty:
            # 3. Remove unwanted columns
            cols_to_drop = [c for c in ['ranker', 'awards'] if c in df.columns]
            df = df.drop(columns=cols_to_drop)
            
            # 4. Rename the name column
            # Check for 'name_display' first, then fallback to 'player'
            if 'name_display' in df.columns:
                df = df.rename(columns={'name_display': 'player_name'})
            elif 'player' in df.columns:
                df = df.rename(columns={'player': 'player_name'})
            
            # 5. Reorder: school, season, player_name, then everything else
            cols = ['school', 'season', 'player_name']
            # Get all other columns that aren't in our "start" list
            other_cols = [c for c in df.columns if c not in cols]
            df = df[cols + other_cols]

        return df

    except Exception as e:
        print(f"Error on {school_slug}: {e}")
        return pd.DataFrame()

get_2026_stats_df(links_list[0])

Unnamed: 0,school,season,player_name,pos,games,games_started,mp_per_g,fg_per_g,fga_per_g,fg_pct,...,ft_pct,orb_per_g,drb_per_g,trb_per_g,ast_per_g,stl_per_g,blk_per_g,tov_per_g,pf_per_g,pts_per_g
0,abilene-christian,2025-26,Payton Hull,G,19,19,30.2,5.8,14.4,0.405,...,0.813,0.9,3.1,4.0,2.9,2.7,0.3,3.0,2.1,17.0
1,abilene-christian,2025-26,Meredith Mayes,C,16,16,28.4,6.3,9.3,0.682,...,0.714,3.3,4.6,7.9,1.6,2.5,0.8,1.4,2.5,15.2
2,abilene-christian,2025-26,Erin Woodson,G,19,19,31.6,5.0,10.6,0.473,...,0.694,1.7,4.3,5.9,2.2,1.9,0.4,2.0,1.9,14.5
3,abilene-christian,2025-26,Emma Troxell,F,18,18,28.6,3.2,8.4,0.377,...,0.804,1.8,4.3,6.1,1.2,1.7,0.1,1.1,1.8,9.8
4,abilene-christian,2025-26,Natalia Chavez,G,19,4,15.1,2.8,5.7,0.5,...,1.0,0.9,1.2,2.2,0.6,0.7,0.2,0.6,1.0,7.5
5,abilene-christian,2025-26,Breanna Davis,G,19,19,26.6,1.4,4.2,0.329,...,0.591,0.4,1.6,2.0,3.5,1.1,0.1,2.3,2.7,3.9
6,abilene-christian,2025-26,Jazmyn Stone,G,19,0,10.7,0.8,2.6,0.306,...,0.625,0.4,0.6,1.0,1.6,0.5,0.1,1.8,1.2,2.5
7,abilene-christian,2025-26,Riley Grohman,G,19,0,12.0,1.0,3.1,0.328,...,1.0,0.4,0.8,1.2,0.7,0.6,0.1,0.7,0.9,2.5
8,abilene-christian,2025-26,Molly Daugherty,G,17,0,7.6,0.7,2.1,0.333,...,0.6,0.2,0.5,0.6,0.5,0.2,0.0,0.6,0.9,2.1
9,abilene-christian,2025-26,Jordyn Coleman,C,15,0,5.7,0.9,1.8,0.519,...,0.4,0.6,1.1,1.7,0.2,0.1,0.5,0.3,0.9,2.0
