This notebook does two things:
1. Aggregates roster data into one giant dataset of rosters encompassing every team, from 2016-2017 to 2025-2026
2. Creates a dataset showing each individual transfer occurrence from these seasons

### Part 1: Roster Aggregation

In [30]:
import pandas as pd
import glob
import os

# All paths below are relative, so this cell only works when the notebook is
# run from the repository root (BSA-Basketball-W26).
print(f"Current working directory: {os.getcwd()}")

# Path to rosters
path = 'data/rosters/'

# Collect every roster CSV under the rosters directory, at any depth.
# glob returns matches in a file-system-dependent order, so sort them to keep
# the row order of the combined dataset reproducible across machines.
all_files = sorted(glob.glob(os.path.join(path, '**/*.csv'), recursive=True))

print(f"Found {len(all_files)} files")

# Fail with an actionable message instead of the opaque
# "No objects to concatenate" error that pd.concat raises on an empty list.
if not all_files:
    raise FileNotFoundError(
        f"No roster CSVs found under {path!r}; "
        "make sure the working directory is BSA-Basketball-W26"
    )

# Read each file into its own frame, then stack them into one dataset with a
# fresh 0..n-1 index.
df_list = [pd.read_csv(file) for file in all_files]

all_rosters = pd.concat(df_list, ignore_index=True)
all_rosters.shape

Current working directory: c:\Users\rajak\OneDrive - UCLA IT Services\UCLA\BSA\Basketball\BSA-Basketball-W26
Found 3544 files


(46230, 8)

In [31]:
# Cast the `number` column to pandas' "Int64" dtype, then preview the first
# rows of the combined rosters.
all_rosters["number"] = all_rosters["number"].astype("Int64")
all_rosters.head()

Unnamed: 0,player_sr_link,player_name,number,class,pos,height,school,season
0,https://www.sports-reference.com/cbb/players/a...,Alexis Mason,15,SR,G,5-9,abilene-christian,2017
1,https://www.sports-reference.com/cbb/players/s...,Suzzy Dimba,23,SR,F,5-11,abilene-christian,2017
2,https://www.sports-reference.com/cbb/players/l...,Lizzy Dimba,32,SR,F,5-11,abilene-christian,2017
3,https://www.sports-reference.com/cbb/players/s...,Sydney Shelstead,33,SR,F,6-1,abilene-christian,2017
4,https://www.sports-reference.com/cbb/players/b...,Breanna Wright,10,FR,G,5-8,abilene-christian,2017


### Part 2: Creating the list of transfers

In [32]:
# Only these four columns are needed to detect a transfer; narrowing the frame
# before the self-join keeps the intermediate result small.
# Rows without a player link are dropped first: pandas merge matches null join
# keys to each other, which could pair unrelated players and fabricate
# transfers. Exact duplicate roster rows are dropped so that a single move is
# not counted more than once.
# NOTE(review): whether the scraped rosters actually contain null links or
# duplicate rows is not visible here; if they do not, the result is unchanged.
roster_keys = (
    all_rosters[['player_sr_link', 'player_name', 'school', 'season']]
    .dropna(subset=['player_sr_link'])
    .drop_duplicates()
)

# Pair every roster appearance of a player with every appearance of that same
# player (same link and same name).
transfers = roster_keys.merge(roster_keys, on=['player_sr_link', 'player_name'], how='inner', suffixes=('_old', '_new'))

# Keep only pairs from consecutive seasons at two different schools.
school_mask = transfers['school_old'] != transfers['school_new']
season_mask = transfers['season_old'] == transfers['season_new'] - 1

transfers = transfers[school_mask & season_mask]

# transfer_year is the season at the old school, i.e. the season immediately
# before the matched season at the new school.
transfers = transfers[['player_sr_link', 'player_name', 'season_old', 'school_old', 'school_new']].rename(columns={'season_old': 'transfer_year'})
transfers.head()

Unnamed: 0,player_sr_link,player_name,transfer_year,school_old,school_new
68,https://www.sports-reference.com/cbb/players/a...,Alli Ball,2017,akron,illinois
119,https://www.sports-reference.com/cbb/players/k...,Keonna Farmer,2017,alabama-birmingham,north-florida
298,https://www.sports-reference.com/cbb/players/a...,Amber Driver,2017,appalachian-state,prairie-view
306,https://www.sports-reference.com/cbb/players/m...,Mikaya Wilson,2017,appalachian-state,north-carolina-at
312,https://www.sports-reference.com/cbb/players/j...,Julia Buehler,2017,appalachian-state,north-carolina-wilmington


In [33]:
# Count the transfer rows recorded for each transfer_year and show the result
# as a two-column table.
(
    transfers
    .groupby('transfer_year')
    .size()
    .reset_index(name='num_transfers')
)

Unnamed: 0,transfer_year,num_transfers
0,2017,69
1,2018,80
2,2019,133
3,2020,251
4,2021,551
5,2022,679
6,2023,650
7,2024,830
8,2025,992


### Saving data as CSVs

In [34]:
# Write both datasets to the data/ directory without the pandas index column.
for frame, csv_path in (
    (transfers, 'data/transfers_17_25.csv'),
    (all_rosters, 'data/rosters_17_25.csv'),
):
    frame.to_csv(csv_path, index=False)