Sports Reference Scraper

In [None]:
# Install the third-party packages this notebook needs. The lines below are active; comment them out once the packages are installed.


%pip install scrapy
%pip install crochet

In [None]:
import scrapy
import numpy as np
import pandas as pd

In [3]:
import requests
from scrapy import Selector

url = "https://www.sports-reference.com/cbb/schools/"
# A timeout keeps the cell from hanging forever, and raise_for_status surfaces
# HTTP errors (e.g. rate limiting) instead of silently producing 0 links.
response = requests.get(url, timeout=10)
response.raise_for_status()
html = response.text
sel = Selector(text=html)

# 1. Find the wrapper
wrapper = sel.css('#all_NCAAW_schools')

# 2. Try to get the comment inside
comment = wrapper.xpath('comment()').get()

# CSS query for the school links; used both to detect a live table and to extract.
link_css = 'td[data-stat="school_name"] a::attr(href)'

if comment and not wrapper.css(link_css):
    # It was hidden! Strip the HTML comment delimiters so the markup inside is
    # parsed as real elements. The delimiters are assembled from pieces so the
    # literals survive tooling that strips HTML comments out of notebook text.
    comment_open = '<' + '!--'
    comment_close = '--' + '>'
    clean_html = comment.replace(comment_open, '').replace(comment_close, '')
    table_sel = Selector(text=clean_html)
    print("Found table in comments. Unwrapping...")
else:
    # It wasn't hidden! Use the wrapper directly.
    table_sel = wrapper
    print("Table was not commented. Reading directly...")

# 3. Extract the links (hrefs are site-relative, so prefix the host)
links_list = [
    f"https://www.sports-reference.com{link}"
    for link in table_sel.css(link_css).getall()
]

print(f"Successfully found {len(links_list)} links.")

Table was not commented. Reading directly...
Successfully found 380 links.


In [4]:
import requests
from scrapy import Selector
import pandas as pd
import time
import re

def _find_stats_table(html_content, table_id):
    """Return a Selector in which ``table#<table_id>`` can be queried.

    The page is parsed as-is first. If the table is not present as a live
    element, the bodies of HTML comments are searched and the first one that
    mentions the table id is parsed instead. When nothing matches, the
    selector for the full page is returned so the caller simply finds no rows.
    """
    page_sel = Selector(text=html_content)
    if page_sel.css(f'table#{table_id}'):
        return page_sel

    # The comment delimiters are assembled from pieces so the pattern survives
    # tooling that strips HTML comments out of notebook text.
    comment_pattern = '<' + '!--(.*?)--' + '>'
    id_marker = f'id="{table_id}"'
    for comment_body in re.findall(comment_pattern, html_content, re.DOTALL):
        if id_marker in comment_body:
            return Selector(text=comment_body)
    return page_sel


def get_stats_df(school_url, year):
    """Scrape the per-game player table for one school and one season.

    Parameters
    ----------
    school_url : str
        School page URL; ``/{year}.html`` is appended to it (after stripping a
        trailing slash) to build the season page URL.
    year : int
        Season page year. The ``season`` column is derived from it, e.g.
        2024 becomes "2023-24".

    Returns
    -------
    pandas.DataFrame or str
        * the string ``"BLOCKED"`` when the server answers HTTP 429;
        * an empty DataFrame for any other non-200 status, when no player rows
          are found, or when the request/parsing raises;
        * otherwise one row per player, with ``player_sr_link``,
          ``player_name``, ``school`` and ``season`` first, followed by the
          table's ``data-stat`` columns (``ranker`` and ``awards`` dropped).
    """
    table_id = 'players_per_game'

    # Dynamic season string (e.g., 2024 becomes "2023-24")
    season_val = f"{year-1}-{str(year)[-2:]}"

    # The slug is the last path segment, or the one before a trailing "women".
    parts = [p for p in school_url.split('/') if p]
    school_slug = parts[-2] if parts[-1] == 'women' else parts[-1]
    target_url = f"{school_url.rstrip('/')}/{year}.html"

    try:
        resp = requests.get(target_url, timeout=10)

        if resp.status_code == 429:
            return "BLOCKED"
        if resp.status_code != 200:
            # If a school didn't exist in a specific year, skip it gracefully
            return pd.DataFrame()

        sel = _find_stats_table(resp.text, table_id)
        rows = sel.css(f'table#{table_id} tbody tr')

        all_rows = []
        for row in rows:
            # Skip repeated header rows and rows without any data cell.
            if row.css('.thead') or not row.css('td'):
                continue

            row_data = {cell.attrib['data-stat']: cell.css('::text').get()
                        for cell in row.css('th, td') if 'data-stat' in cell.attrib}

            # The player link may sit in either a td or a th, depending on the layout.
            relative_link = row.css('td[data-stat="name_display"] a::attr(href), th[data-stat="player"] a::attr(href)').get()
            row_data['player_sr_link'] = f"https://www.sports-reference.com{relative_link}" if relative_link else None
            row_data['school'] = school_slug
            row_data['season'] = season_val
            all_rows.append(row_data)

        df = pd.DataFrame(all_rows)
        if not df.empty:
            # Normalise the two possible name columns to a single 'player_name'.
            if 'name_display' in df.columns:
                df = df.rename(columns={'name_display': 'player_name'})
            elif 'player' in df.columns:
                df = df.rename(columns={'player': 'player_name'})

            df = df.drop(columns=[c for c in ['ranker', 'awards'] if c in df.columns])

            # Only reorder columns that exist: a missing 'player_name' must not
            # raise KeyError and discard an otherwise valid table.
            first_cols = [c for c in ['player_sr_link', 'player_name', 'school', 'season']
                          if c in df.columns]
            other_cols = [c for c in df.columns if c not in first_cols]
            df = df[first_cols + other_cols]

        return df

    except Exception as exc:
        # Best effort: one bad page must not abort a long scrape, but say what
        # went wrong instead of failing silently.
        print(f"      Warning: could not scrape {target_url}: {exc!r}")
        return pd.DataFrame()

# Smoke test: scrape the first school link for the 2025 season page and show the result.
get_stats_df(links_list[0], 2025)

Unnamed: 0,player_sr_link,player_name,school,season,pos,games,games_started,mp_per_g,fg_per_g,fga_per_g,...,ft_pct,orb_per_g,drb_per_g,trb_per_g,ast_per_g,stl_per_g,blk_per_g,tov_per_g,pf_per_g,pts_per_g
0,https://www.sports-reference.com/cbb/players/p...,Payton Hull,abilene-christian,2024-25,G,22,21,32.3,6.6,14.7,...,0.818,0.8,2.7,3.5,2.8,1.9,0.4,3.5,2.2,19.0
1,https://www.sports-reference.com/cbb/players/b...,Bella Earle,abilene-christian,2024-25,G,34,34,34.6,5.2,12.0,...,0.728,2.2,4.9,7.1,4.2,2.4,0.2,3.8,2.1,13.9
2,https://www.sports-reference.com/cbb/players/m...,Meredith Mayes,abilene-christian,2024-25,C,32,32,28.4,4.7,7.9,...,0.58,3.3,3.8,7.2,1.3,1.5,1.1,1.9,2.3,11.3
3,https://www.sports-reference.com/cbb/players/e...,Emma Troxell,abilene-christian,2024-25,F,34,34,31.7,3.8,9.3,...,0.782,1.6,3.6,5.2,1.4,1.3,0.1,1.8,2.4,11.1
4,https://www.sports-reference.com/cbb/players/e...,Erin Woodson,abilene-christian,2024-25,G,29,14,22.2,2.8,6.7,...,0.78,0.8,1.4,2.1,1.3,1.4,0.1,1.1,1.6,8.2
5,https://www.sports-reference.com/cbb/players/b...,Breanna Davis,abilene-christian,2024-25,G,35,20,22.0,2.1,5.0,...,0.747,0.1,1.6,1.7,1.8,0.7,0.1,1.9,2.1,6.5
6,https://www.sports-reference.com/cbb/players/m...,Mia Rivers,abilene-christian,2024-25,G,18,16,23.8,2.3,6.2,...,0.696,1.2,2.4,3.6,2.6,2.2,0.2,2.2,2.1,6.0
7,https://www.sports-reference.com/cbb/players/n...,Natalia Chavez,abilene-christian,2024-25,G,35,3,14.3,1.5,4.6,...,0.857,0.7,1.1,1.8,0.5,0.4,0.1,0.6,1.0,4.2
8,https://www.sports-reference.com/cbb/players/z...,Zoe Jackson,abilene-christian,2024-25,G,19,0,13.4,1.1,2.3,...,0.6,0.4,0.8,1.2,1.2,0.5,0.1,1.5,1.0,2.8
9,https://www.sports-reference.com/cbb/players/p...,Paula Pique,abilene-christian,2024-25,G,32,1,8.1,0.6,1.7,...,0.5,0.3,0.6,0.9,0.5,0.3,0.1,0.7,1.0,1.6


In [5]:
import os
import time

import os
import time

def scrape_multi_year_data(urls, years_list, base_folder="data", delay=4, skip_existing=False):
    """Scrape every school for every requested year and save one CSV per school/year.

    Files are written to ``<base_folder>/<year>/<school_slug>_wbb_<year>.csv``.
    Years are processed newest first. The run stops early (returning ``None``)
    as soon as ``get_stats_df`` reports ``"BLOCKED"``.

    Parameters
    ----------
    urls : str or iterable of str
        One school URL or an iterable of them.
    years_list : int or iterable of int
        One year or an iterable of years.
    base_folder : str, default "data"
        Root folder under which the per-year folders are created.
    delay : float, default 4
        Seconds to sleep after each request; values <= 0 disable the pause.
    skip_existing : bool, default False
        When True, a school/year whose CSV already exists is skipped without a
        request, so an interrupted run can be resumed. When False, existing
        files are overwritten.

    Returns
    -------
    None
    """
    # Safety catch for single values
    if isinstance(urls, str):
        urls = [urls]
    if isinstance(years_list, int):
        years_list = [years_list]

    # Materialise once: len() is needed for progress output, and a generator
    # would otherwise be exhausted after the first year.
    urls = list(urls)

    # Go in reverse chronological order
    years_to_scrape = sorted(years_list, reverse=True)

    for year in years_to_scrape:
        # Create year folder (only if not exists)
        year_folder = os.path.join(base_folder, str(year))
        os.makedirs(year_folder, exist_ok=True)

        print(f"========== STARTING YEAR: {year} ==========")

        for url_idx, url in enumerate(urls):
            # Extract school slug: last path segment, or the one before "women"
            parts = [p for p in url.split('/') if p]
            school_slug = parts[-2] if parts[-1] == 'women' else parts[-1]

            # Path: data/2026/school_slug_wbb_2026.csv
            filename = f"{school_slug}_wbb_{year}.csv"
            file_path = os.path.join(year_folder, filename)

            print(f"[{year}] Processing {url_idx + 1}/{len(urls)}: {school_slug}")

            # Resume support: no request was made, so no polite delay is needed.
            if skip_existing and os.path.exists(file_path):
                print(f"      Skipped (already saved): {file_path}")
                continue

            # Fetch the data
            result = get_stats_df(url, year)

            # Block detection
            if isinstance(result, str) and result == "BLOCKED":
                print(f"!!! BLOCKED !!! Stopped at {school_slug} in {year}. Restart later.")
                return

            if not result.empty:
                # to_csv defaults to overwriting the file
                result.to_csv(file_path, index=False)
                print(f"      Saved: {file_path}")
            else:
                print(f"      No data found for {year}.")

            # The "Polite Scraper" delay between requests
            if delay > 0:
                time.sleep(delay)

        print(f"========== COMPLETED YEAR: {year} ==========\n")

    print("Scrape complete.")

In [63]:
# Scrape the 2025 and 2026 season pages for every school in links_list.
# Years are processed newest first; CSVs land in data/<year>/ (default base_folder).
my_years = [2025, 2026]
scrape_multi_year_data(links_list, my_years)

[2026] Processing 1/380: abilene-christian
      Saved: data\2026\abilene-christian_wbb_2026.csv
[2026] Processing 2/380: air-force
      Saved: data\2026\air-force_wbb_2026.csv
[2026] Processing 3/380: akron
      Saved: data\2026\akron_wbb_2026.csv
[2026] Processing 4/380: alabama
      Saved: data\2026\alabama_wbb_2026.csv
[2026] Processing 5/380: alabama-am
      Saved: data\2026\alabama-am_wbb_2026.csv
[2026] Processing 6/380: alabama-state
      Saved: data\2026\alabama-state_wbb_2026.csv
[2026] Processing 7/380: albany-ny
      Saved: data\2026\albany-ny_wbb_2026.csv
[2026] Processing 8/380: alcorn-state
      Saved: data\2026\alcorn-state_wbb_2026.csv
[2026] Processing 9/380: american
      Saved: data\2026\american_wbb_2026.csv
[2026] Processing 10/380: appalachian-state
      Saved: data\2026\appalachian-state_wbb_2026.csv
[2026] Processing 11/380: arizona
      Saved: data\2026\arizona_wbb_2026.csv
[2026] Processing 12/380: arizona-state
      Saved: data\2026\arizona-state_

In [64]:
# Scrape the 2022-2024 season pages for every school in links_list
# (processed newest first: 2024, then 2023, then 2022).
my_years = [2022, 2023, 2024]
scrape_multi_year_data(links_list, my_years)

[2024] Processing 1/380: abilene-christian
      Saved: data\2024\abilene-christian_wbb_2024.csv
[2024] Processing 2/380: air-force
      Saved: data\2024\air-force_wbb_2024.csv
[2024] Processing 3/380: akron
      Saved: data\2024\akron_wbb_2024.csv
[2024] Processing 4/380: alabama
      Saved: data\2024\alabama_wbb_2024.csv
[2024] Processing 5/380: alabama-am
      Saved: data\2024\alabama-am_wbb_2024.csv
[2024] Processing 6/380: alabama-state
      Saved: data\2024\alabama-state_wbb_2024.csv
[2024] Processing 7/380: albany-ny
      Saved: data\2024\albany-ny_wbb_2024.csv
[2024] Processing 8/380: alcorn-state
      Saved: data\2024\alcorn-state_wbb_2024.csv
[2024] Processing 9/380: american
      Saved: data\2024\american_wbb_2024.csv
[2024] Processing 10/380: appalachian-state
      Saved: data\2024\appalachian-state_wbb_2024.csv
[2024] Processing 11/380: arizona
      Saved: data\2024\arizona_wbb_2024.csv
[2024] Processing 12/380: arizona-state
      Saved: data\2024\arizona-state_

In [66]:
# Scrape the 2021 season pages for every school in links_list.
my_years = [2021]
scrape_multi_year_data(links_list, my_years)

[2021] Processing 1/380: abilene-christian
      Saved: data\2021\abilene-christian_wbb_2021.csv
[2021] Processing 2/380: air-force
      Saved: data\2021\air-force_wbb_2021.csv
[2021] Processing 3/380: akron
      Saved: data\2021\akron_wbb_2021.csv
[2021] Processing 4/380: alabama
      Saved: data\2021\alabama_wbb_2021.csv
[2021] Processing 5/380: alabama-am
      Saved: data\2021\alabama-am_wbb_2021.csv
[2021] Processing 6/380: alabama-state
      Saved: data\2021\alabama-state_wbb_2021.csv
[2021] Processing 7/380: albany-ny
      Saved: data\2021\albany-ny_wbb_2021.csv
[2021] Processing 8/380: alcorn-state
      Saved: data\2021\alcorn-state_wbb_2021.csv
[2021] Processing 9/380: american
      Saved: data\2021\american_wbb_2021.csv
[2021] Processing 10/380: appalachian-state
      Saved: data\2021\appalachian-state_wbb_2021.csv
[2021] Processing 11/380: arizona
      Saved: data\2021\arizona_wbb_2021.csv
[2021] Processing 12/380: arizona-state
      Saved: data\2021\arizona-state_

In [None]:
# TODO: need to run this still -- scrape the 2020 season pages for every school.
# NOTE(review): this cell has no execution count but does carry saved output;
# confirm whether data/2020 is complete before relying on it.
my_years = [2020]
scrape_multi_year_data(links_list, my_years)

[2020] Processing 1/380: abilene-christian
      Saved: data\2020\abilene-christian_wbb_2020.csv
[2020] Processing 2/380: air-force
      Saved: data\2020\air-force_wbb_2020.csv
[2020] Processing 3/380: akron
      Saved: data\2020\akron_wbb_2020.csv
[2020] Processing 4/380: alabama
      Saved: data\2020\alabama_wbb_2020.csv
[2020] Processing 5/380: alabama-am
      Saved: data\2020\alabama-am_wbb_2020.csv
[2020] Processing 6/380: alabama-state
      Saved: data\2020\alabama-state_wbb_2020.csv
[2020] Processing 7/380: albany-ny
      Saved: data\2020\albany-ny_wbb_2020.csv
[2020] Processing 8/380: alcorn-state
      Saved: data\2020\alcorn-state_wbb_2020.csv
[2020] Processing 9/380: american
      Saved: data\2020\american_wbb_2020.csv
[2020] Processing 10/380: appalachian-state
      Saved: data\2020\appalachian-state_wbb_2020.csv
[2020] Processing 11/380: arizona
      Saved: data\2020\arizona_wbb_2020.csv
[2020] Processing 12/380: arizona-state
      Saved: data\2020\arizona-state_

: 

In [6]:
# Scrape the 2019 season pages for every school in links_list.
my_years = [2019]
scrape_multi_year_data(links_list, my_years)

[2019] Processing 1/380: abilene-christian
      Saved: data\2019\abilene-christian_wbb_2019.csv
[2019] Processing 2/380: air-force
      Saved: data\2019\air-force_wbb_2019.csv
[2019] Processing 3/380: akron
      Saved: data\2019\akron_wbb_2019.csv
[2019] Processing 4/380: alabama
      Saved: data\2019\alabama_wbb_2019.csv
[2019] Processing 5/380: alabama-am
      Saved: data\2019\alabama-am_wbb_2019.csv
[2019] Processing 6/380: alabama-state
      Saved: data\2019\alabama-state_wbb_2019.csv
[2019] Processing 7/380: albany-ny
      Saved: data\2019\albany-ny_wbb_2019.csv
[2019] Processing 8/380: alcorn-state
      Saved: data\2019\alcorn-state_wbb_2019.csv
[2019] Processing 9/380: american
      Saved: data\2019\american_wbb_2019.csv
[2019] Processing 10/380: appalachian-state
      Saved: data\2019\appalachian-state_wbb_2019.csv
[2019] Processing 11/380: arizona
      Saved: data\2019\arizona_wbb_2019.csv
[2019] Processing 12/380: arizona-state
      Saved: data\2019\arizona-state_

In [7]:
# Scrape the 2018 season pages for every school in links_list.
my_years = [2018]
scrape_multi_year_data(links_list, my_years)

[2018] Processing 1/380: abilene-christian
      Saved: data\2018\abilene-christian_wbb_2018.csv
[2018] Processing 2/380: air-force
      Saved: data\2018\air-force_wbb_2018.csv
[2018] Processing 3/380: akron
      Saved: data\2018\akron_wbb_2018.csv
[2018] Processing 4/380: alabama
      Saved: data\2018\alabama_wbb_2018.csv
[2018] Processing 5/380: alabama-am
      Saved: data\2018\alabama-am_wbb_2018.csv
[2018] Processing 6/380: alabama-state
      Saved: data\2018\alabama-state_wbb_2018.csv
[2018] Processing 7/380: albany-ny
      Saved: data\2018\albany-ny_wbb_2018.csv
[2018] Processing 8/380: alcorn-state
      Saved: data\2018\alcorn-state_wbb_2018.csv
[2018] Processing 9/380: american
      Saved: data\2018\american_wbb_2018.csv
[2018] Processing 10/380: appalachian-state
      Saved: data\2018\appalachian-state_wbb_2018.csv
[2018] Processing 11/380: arizona
      Saved: data\2018\arizona_wbb_2018.csv
[2018] Processing 12/380: arizona-state
      Saved: data\2018\arizona-state_

In [8]:
# Scrape the 2017 season pages for every school in links_list.
my_years = [2017]
scrape_multi_year_data(links_list, my_years)

[2017] Processing 1/380: abilene-christian
      Saved: data\2017\abilene-christian_wbb_2017.csv
[2017] Processing 2/380: air-force
      Saved: data\2017\air-force_wbb_2017.csv
[2017] Processing 3/380: akron
      Saved: data\2017\akron_wbb_2017.csv
[2017] Processing 4/380: alabama
      Saved: data\2017\alabama_wbb_2017.csv
[2017] Processing 5/380: alabama-am
      Saved: data\2017\alabama-am_wbb_2017.csv
[2017] Processing 6/380: alabama-state
      Saved: data\2017\alabama-state_wbb_2017.csv
[2017] Processing 7/380: albany-ny
      Saved: data\2017\albany-ny_wbb_2017.csv
[2017] Processing 8/380: alcorn-state
      Saved: data\2017\alcorn-state_wbb_2017.csv
[2017] Processing 9/380: american
      Saved: data\2017\american_wbb_2017.csv
[2017] Processing 10/380: appalachian-state
      Saved: data\2017\appalachian-state_wbb_2017.csv
[2017] Processing 11/380: arizona
      Saved: data\2017\arizona_wbb_2017.csv
[2017] Processing 12/380: arizona-state
      Saved: data\2017\arizona-state_

In [9]:
import pandas as pd
import glob
import os

# Aggregate the per-school CSVs into one CSV per year.

# 1. Setup paths
output_dir = 'data/yearly_data'
os.makedirs(output_dir, exist_ok=True) # Creates the folder if it doesn't exist

# 2. Process each year
years = range(2017, 2027)
for year in years:
    path = f'data/{year}/*.csv'
    # glob returns matches in filesystem-dependent order; sort so the row order
    # of the combined file is the same on every machine and every run.
    files = sorted(glob.glob(path))

    if files:
        # Combine all files for the specific year
        year_df = pd.concat((pd.read_csv(f) for f in files), ignore_index=True)

        # 3. Save to the new folder (overwrites any previous aggregate)
        output_filename = f'{output_dir}/sr_data_{year}.csv'
        year_df.to_csv(output_filename, index=False)
        print(f"Successfully created: {output_filename}")
    else:
        print(f"Skip: No files found for year {year}")

Successfully created: data/yearly_data/sr_data_2017.csv
Successfully created: data/yearly_data/sr_data_2018.csv
Successfully created: data/yearly_data/sr_data_2019.csv
Successfully created: data/yearly_data/sr_data_2020.csv
Successfully created: data/yearly_data/sr_data_2021.csv
Successfully created: data/yearly_data/sr_data_2022.csv
Successfully created: data/yearly_data/sr_data_2023.csv
Successfully created: data/yearly_data/sr_data_2024.csv
Successfully created: data/yearly_data/sr_data_2025.csv
Successfully created: data/yearly_data/sr_data_2026.csv
