# NBA to NCCA Data Entity Resolution

In [139]:
import pandas as pd
import string
import numpy as np
from fuzzywuzzy import fuzz
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import pairwise_distances

## Utility Functions

In [2]:
def clean_text(text):
    clean_text = text.translate(str.maketrans('', '', string.punctuation)).upper()
    clean_text = ' '.join(clean_text.split())
    return clean_text

def token_sort_ratio(str1, str2):
    return fuzz.token_sort_ratio(str1, str2)

## Load/Explore NBA data

In [3]:
nba_data = pd.read_csv('./nba_player_avgs_2008-2025.csv')

In [4]:
nba_data.head()

Unnamed: 0.1,Unnamed: 0,personId,season,gameId,numMinutes,points,assists,blocks,steals,fieldGoalsAttempted,...,lastAttended,country,height,bodyWeight,guard,forward,center,draftYear,draftRound,draftNumber
0,0,87,2008-2009,9,10.333333,1.777778,0.0,1.222222,0.0,1.444444,...,Georgetown,Congo,86.0,260.0,False,False,True,1991.0,1.0,4.0
1,1,255,2008-2009,82,29.341463,12.012195,2.268293,0.658537,1.097561,9.085366,...,Duke,USA,80.0,225.0,False,True,False,1994.0,1.0,3.0
2,2,255,2009-2010,81,29.54321,11.259259,2.382716,0.444444,0.740741,8.679012,...,Duke,USA,80.0,225.0,False,True,False,1994.0,1.0,3.0
3,3,255,2010-2011,80,29.5875,13.2,2.5,0.4375,0.8,10.3125,...,Duke,USA,80.0,225.0,False,True,False,1994.0,1.0,3.0
4,4,255,2011-2012,49,27.612245,10.183673,2.183673,0.591837,0.836735,9.204082,...,Duke,USA,80.0,225.0,False,True,False,1994.0,1.0,3.0


In [5]:
nba_data.columns

Index(['Unnamed: 0', 'personId', 'season', 'gameId', 'numMinutes', 'points',
       'assists', 'blocks', 'steals', 'fieldGoalsAttempted', 'fieldGoalsMade',
       'fieldGoalsPercentage', 'threePointersAttempted', 'threePointersMade',
       'threePointersPercentage', 'freeThrowsAttempted', 'freeThrowsMade',
       'freeThrowsPercentage', 'reboundsDefensive', 'reboundsOffensive',
       'reboundsTotal', 'foulsPersonal', 'turnovers', 'plusMinusPoints',
       'firstName', 'lastName', 'birthdate', 'lastAttended', 'country',
       'height', 'bodyWeight', 'guard', 'forward', 'center', 'draftYear',
       'draftRound', 'draftNumber'],
      dtype='object')

In [6]:
len(nba_data)

8617

In [7]:
len(nba_data.drop_duplicates(subset = ['personId', 'season']))

8617

In [8]:
# need to aggregate nba season data
# duplicate personIds for multiple seasons
len(nba_data.drop_duplicates(subset = ['personId']))

1921

## Load/Explore NCAA Data

In [9]:
ncaa_2024data = pd.read_csv('./trank_data_2024.csv')

In [10]:
ncaa_2024data.head()

Unnamed: 0,player_name,team,conf,GP,Min_per,ORtg,usg,eFG,TS_per,ORB_per,...,ogbpm,dgbpm,oreb,dreb,treb,ast,stl,blk,pts,pos_class
0,DeJuan Clayton,Manhattan,MAAC,2,4.3,88.3,27.8,47.4,49.27,0.0,...,-3.00139,-4.18666,0.0,3.5,3.5,3.0,0.0,0.0,11.0,Wing G
1,Seth Towns,Howard,MEAC,30,73.4,108.2,22.1,45.2,52.19,4.2,...,0.710116,-1.20958,1.1562,5.375,6.5312,2.4688,0.9688,0.4375,14.125,Stretch 4
2,Terrence Lewis,Grambling St.,SWAC,20,26.9,116.1,19.0,63.9,64.03,3.2,...,3.84756,-0.389764,0.6522,1.6087,2.2609,0.3478,0.4783,0.1739,7.913,Wing G
3,Austin Williams,Rutgers,B10,26,32.1,85.7,16.5,50.0,48.56,5.3,...,-3.75734,2.32482,0.8462,1.8077,2.6538,0.6154,0.4615,0.2692,3.9615,Wing G
4,Avery Wilson,Southeastern Louisiana,Slnd,29,27.8,87.8,16.7,45.1,47.15,4.1,...,-5.71204,-0.732916,0.3438,1.0,1.3438,0.7812,0.4062,0.0938,3.1875,Wing G


In [11]:
ncaa_2024data.columns

Index(['player_name', 'team', 'conf', 'GP', 'Min_per', 'ORtg', 'usg', 'eFG',
       'TS_per', 'ORB_per', 'DRB_per', 'AST_per', 'TO_per', 'FTM', 'FTA',
       'FT_per', 'twoPM', 'twoPA', 'twoP_per', 'TPM', 'TPA', 'TP_per',
       'blk_per', 'stl_per', 'ftr', 'yr', 'ht', 'num', 'porpag', 'adjoe',
       'pfr', 'year', 'pid', 'type', 'Rec Rank', 'ast/tov', 'rimmade',
       'rimmade+rimmiss', 'midmade', 'midmade+midmiss',
       'rimmade/(rimmade+rimmiss)', 'midmade/(midmade+midmiss)', 'dunksmade',
       'dunksmiss+dunksmade', 'dunksmade/(dunksmade+dunksmiss)', 'pick',
       'drtg', 'adrtg', 'dporpag', 'stops', 'bpm', 'obpm', 'dbpm', 'gbpm',
       'mp', 'ogbpm', 'dgbpm', 'oreb', 'dreb', 'treb', 'ast', 'stl', 'blk',
       'pts', 'pos_class'],
      dtype='object')

In [12]:
# load aggregated college data
player_data_college = pd.read_csv('./player_data_college.csv')

In [13]:
player_data_college.head()

Unnamed: 0,player_name,team,height,min_year,max_year,pos_class,type,avg_box_plus_minus,avg_box_offensive_plus_minus,avg_box_defensive_plus_minus,...,avg_offensive_reb,avg_defensive_reb,avg_total_reb,avg_assists,avg_steals,avg_blocks,avg_points_scored,avg_freethrow_pct,avg_2p_pct,avg_3p_pct
0,A'Jahni Levias,North Dakota,198.12,2023,2023,Wing G,"Vallejo, CA",-5.32495,-2.80885,-2.5161,...,14.66181,28.4625,43.12431,14.66181,8.62569,0.0,85.3875,0.6875,0.457143,0.323529
1,A'Torey Everett,Tennessee St.,193.04,2017,2017,Combo G,"Dallas, TX",-6.37451,-4.92224,-1.45227,...,2.375,14.25,16.625,13.4577,1.5827,0.7923,45.9173,0.782609,0.3,0.571429
2,A'Torri Shine,Grambling St.,198.12,2014,2015,Wing G,,-6.934815,-2.114325,-4.820495,...,180.00152,428.27386,608.27538,151.07596,105.84388,34.3294,2066.74504,0.769133,0.449183,0.308231
3,A'Uston Calhoun,Bowling Green,200.66,2013,2013,Wing F,,-3.12361,-1.28575,-1.83786,...,189.94946,399.44946,589.39054,67.04,30.72946,39.10946,1276.55054,0.79646,0.46131,0.351852
4,A'lahn Sumler,Charleston Southern,193.04,2024,2024,Combo G,"Waterbury, CT",-0.456725,0.617211,-1.07394,...,43.14432,185.26592,228.41024,119.28352,68.5216,20.30624,1048.16704,0.71831,0.454054,0.339286


In [14]:
player_data_college.columns

Index(['player_name', 'team', 'height', 'min_year', 'max_year', 'pos_class',
       'type', 'avg_box_plus_minus', 'avg_box_offensive_plus_minus',
       'avg_box_defensive_plus_minus', 'avg_global_plus_minus',
       'avg_offensive_global_plus_minus', 'avg_defensive_global_plus_minus',
       'total_min_per', 'avg_offensive_rating', 'avg_defensive_rating',
       'avg_defensive_points_ov_replacement', 'avg_defensive_stop',
       'avg_usage_rate', 'avg_effective_field_goal_pct', 'avg_shooting_pct',
       'avg_offensive_rebound_pct', 'avg_defensive_rebound_pct',
       'avg_freethrow_made', 'avg_freethrow_attempted', 'avg_2p_made',
       'avg_2p_attempted', 'avg_3p_made', 'avg_3p_attempted', 'avg_block_pct',
       'avg_steal_pct', 'avg_pick', 'avg_offensive_reb', 'avg_defensive_reb',
       'avg_total_reb', 'avg_assists', 'avg_steals', 'avg_blocks',
       'avg_points_scored', 'avg_freethrow_pct', 'avg_2p_pct', 'avg_3p_pct'],
      dtype='object')

In [15]:
len(player_data_college)

39094

In [16]:
# unique on player name and team. 
# may have some dupes for cases where players have transfered. not a huge deal
len(player_data_college.drop_duplicates(subset = ['player_name', 'team']))

39094

## Generate unique IDs

In [17]:
nba_data['nba_id'] = nba_data['personId']
nba_data.drop('personId', axis = 1, inplace = True)
nba_data.drop('Unnamed: 0', axis = 1, inplace = True)

In [18]:
nba_data.to_csv('./resolved_data/nba_player_season_avgs_2008-2025.csv', index = False)

In [19]:
player_data_college['ncaa_id'] = pd.RangeIndex(start = 1, stop = len(player_data_college) + 1)

In [20]:
player_data_college.head()

Unnamed: 0,player_name,team,height,min_year,max_year,pos_class,type,avg_box_plus_minus,avg_box_offensive_plus_minus,avg_box_defensive_plus_minus,...,avg_defensive_reb,avg_total_reb,avg_assists,avg_steals,avg_blocks,avg_points_scored,avg_freethrow_pct,avg_2p_pct,avg_3p_pct,ncaa_id
0,A'Jahni Levias,North Dakota,198.12,2023,2023,Wing G,"Vallejo, CA",-5.32495,-2.80885,-2.5161,...,28.4625,43.12431,14.66181,8.62569,0.0,85.3875,0.6875,0.457143,0.323529,1
1,A'Torey Everett,Tennessee St.,193.04,2017,2017,Combo G,"Dallas, TX",-6.37451,-4.92224,-1.45227,...,14.25,16.625,13.4577,1.5827,0.7923,45.9173,0.782609,0.3,0.571429,2
2,A'Torri Shine,Grambling St.,198.12,2014,2015,Wing G,,-6.934815,-2.114325,-4.820495,...,428.27386,608.27538,151.07596,105.84388,34.3294,2066.74504,0.769133,0.449183,0.308231,3
3,A'Uston Calhoun,Bowling Green,200.66,2013,2013,Wing F,,-3.12361,-1.28575,-1.83786,...,399.44946,589.39054,67.04,30.72946,39.10946,1276.55054,0.79646,0.46131,0.351852,4
4,A'lahn Sumler,Charleston Southern,193.04,2024,2024,Combo G,"Waterbury, CT",-0.456725,0.617211,-1.07394,...,185.26592,228.41024,119.28352,68.5216,20.30624,1048.16704,0.71831,0.454054,0.339286,5


In [21]:
player_data_college.to_csv('./resolved_data/player_data_college.csv', index = False)

## Execute Entity Resolution

- resolve nba_ids to ncaa_ids
- first block on college team team name 
- then try comparing max year and draft year (draft year >= max year) for a block

### Pull Relevant Fields

In [22]:
nba_identifiers = [
    'nba_id',
    'firstName',
    'lastName',
    'birthdate',
    'lastAttended',
    'country',
    'draftYear',
    'draftRound',
    'draftNumber',
]

# only get unique nba ids
nba_er_data = nba_data[nba_identifiers].drop_duplicates(subset = ['nba_id']).replace({np.nan: None})
nba_er_data = nba_er_data[nba_er_data['lastAttended'].notnull()]

In [23]:
len(nba_er_data)

1440

In [24]:
ncaa_identifiers = [
    'ncaa_id',
    'player_name', 
    'team', 
    'min_year', 
    'max_year'
]

ncaa_er_data = player_data_college[ncaa_identifiers].drop_duplicates(subset = ['ncaa_id'])

### Clean Fields used for ER

In [25]:
# nba player name
nba_er_data['full_name'] = nba_er_data['firstName'] + ' ' + nba_er_data['lastName']
nba_er_data['clean_full_name'] = nba_er_data['full_name'].apply(clean_text)

In [26]:
# nba player name
ncaa_er_data['clean_player_name'] = ncaa_er_data['player_name'].apply(clean_text)

In [27]:
# save er data for review
nba_er_data.to_csv('resolved_data/nba_er_data.csv', index = False)
ncaa_er_data.to_csv('resolved_data/ncaa_er_data.csv', index = False)


In [28]:
# clean team names
nba_er_data['clean_lastAttended'] = nba_er_data['lastAttended'].apply(clean_text)
ncaa_er_data['clean_team'] = ncaa_er_data['team'].apply(clean_text)


In [29]:
nba_er_data.head()

Unnamed: 0,nba_id,firstName,lastName,birthdate,lastAttended,country,draftYear,draftRound,draftNumber,full_name,clean_full_name,clean_lastAttended
0,87,Dikembe,Mutombo,1966-06-25,Georgetown,Congo,1991,1,4,Dikembe Mutombo,DIKEMBE MUTOMBO,GEORGETOWN
1,255,Grant,Hill,1972-10-05,Duke,USA,1994,1,3,Grant Hill,GRANT HILL,DUKE
6,283,Lindsey,Hunter,1970-12-03,Jackson State,USA,1993,1,10,Lindsey Hunter,LINDSEY HUNTER,JACKSON STATE
8,406,Shaquille,O'Neal,1972-03-06,Louisiana State,USA,1992,1,1,Shaquille O'Neal,SHAQUILLE ONEAL,LOUISIANA STATE
11,436,Juwan,Howard,1973-02-07,Michigan,USA,1994,1,5,Juwan Howard,JUWAN HOWARD,MICHIGAN


In [30]:
# check if draft year is null 
nba_er_data[nba_er_data['draftYear'].isnull()]

Unnamed: 0,nba_id,firstName,lastName,birthdate,lastAttended,country,draftYear,draftRound,draftNumber,full_name,clean_full_name,clean_lastAttended


In [31]:
ncaa_er_data.head()

Unnamed: 0,ncaa_id,player_name,team,min_year,max_year,clean_player_name,clean_team
0,1,A'Jahni Levias,North Dakota,2023,2023,AJAHNI LEVIAS,NORTH DAKOTA
1,2,A'Torey Everett,Tennessee St.,2017,2017,ATOREY EVERETT,TENNESSEE ST
2,3,A'Torri Shine,Grambling St.,2014,2015,ATORRI SHINE,GRAMBLING ST
3,4,A'Uston Calhoun,Bowling Green,2013,2013,AUSTON CALHOUN,BOWLING GREEN
4,5,A'lahn Sumler,Charleston Southern,2024,2024,ALAHN SUMLER,CHARLESTON SOUTHERN


In [32]:
# check if max_year is ever null
ncaa_er_data[ncaa_er_data['max_year'].isnull()]

Unnamed: 0,ncaa_id,player_name,team,min_year,max_year,clean_player_name,clean_team


### Execute College Name Tocken Block
- ensure that draftYear is greater than or equal to max year
- drop cases where team name is less than 4 characters (matching on acronyms introduces too much noise)

In [33]:
# get nba team tokens
nba_er_data['college_token'] = nba_er_data['clean_lastAttended'].str.split()
nba_er_data_exploded = nba_er_data.explode('college_token')
nba_er_data_exploded = nba_er_data_exploded[nba_er_data_exploded['college_token'].str.len() >= 4]

In [34]:
nba_er_data_exploded.head()

Unnamed: 0,nba_id,firstName,lastName,birthdate,lastAttended,country,draftYear,draftRound,draftNumber,full_name,clean_full_name,clean_lastAttended,college_token
0,87,Dikembe,Mutombo,1966-06-25,Georgetown,Congo,1991,1,4,Dikembe Mutombo,DIKEMBE MUTOMBO,GEORGETOWN,GEORGETOWN
1,255,Grant,Hill,1972-10-05,Duke,USA,1994,1,3,Grant Hill,GRANT HILL,DUKE,DUKE
6,283,Lindsey,Hunter,1970-12-03,Jackson State,USA,1993,1,10,Lindsey Hunter,LINDSEY HUNTER,JACKSON STATE,JACKSON
6,283,Lindsey,Hunter,1970-12-03,Jackson State,USA,1993,1,10,Lindsey Hunter,LINDSEY HUNTER,JACKSON STATE,STATE
8,406,Shaquille,O'Neal,1972-03-06,Louisiana State,USA,1992,1,1,Shaquille O'Neal,SHAQUILLE ONEAL,LOUISIANA STATE,LOUISIANA


In [35]:
len(nba_er_data_exploded)

1956

In [36]:
# get ncaa team tokens
ncaa_er_data['college_token'] = ncaa_er_data['clean_team'].str.split()
ncaa_er_data_exploded = ncaa_er_data.explode('college_token')
ncaa_er_data_exploded = ncaa_er_data_exploded[ncaa_er_data_exploded['college_token'].str.len() >= 4]

In [37]:
ncaa_er_data_exploded.head()

Unnamed: 0,ncaa_id,player_name,team,min_year,max_year,clean_player_name,clean_team,college_token
0,1,A'Jahni Levias,North Dakota,2023,2023,AJAHNI LEVIAS,NORTH DAKOTA,NORTH
0,1,A'Jahni Levias,North Dakota,2023,2023,AJAHNI LEVIAS,NORTH DAKOTA,DAKOTA
1,2,A'Torey Everett,Tennessee St.,2017,2017,ATOREY EVERETT,TENNESSEE ST,TENNESSEE
2,3,A'Torri Shine,Grambling St.,2014,2015,ATORRI SHINE,GRAMBLING ST,GRAMBLING
3,4,A'Uston Calhoun,Bowling Green,2013,2013,AUSTON CALHOUN,BOWLING GREEN,BOWLING


In [38]:
len(ncaa_er_data_exploded)

49952

In [39]:
len(ncaa_er_data)

39094

In [40]:
# execute block
college_token_block = pd.merge(nba_er_data_exploded, ncaa_er_data_exploded, on = 'college_token', how = 'inner')

In [41]:
college_token_block_filtered = college_token_block[college_token_block['draftYear'] >= college_token_block['max_year']].copy()

In [42]:
len(college_token_block)

478116

In [43]:
len(college_token_block_filtered)

57521

In [44]:
college_token_block.to_csv('./resolved_data/college_token_block_filtered.csv', index = False)

### Execute Fuzzy Name Match
- fuzzy name match threshold of 85

In [45]:
college_token_block_filtered['fuzzy_name_score'] = college_token_block_filtered.apply(lambda r: token_sort_ratio(r['clean_full_name'], r['clean_player_name']), axis = 1)

In [46]:
college_token_block_filtered.head()

Unnamed: 0,nba_id,firstName,lastName,birthdate,lastAttended,country,draftYear,draftRound,draftNumber,full_name,...,clean_lastAttended,college_token,ncaa_id,player_name,team,min_year,max_year,clean_player_name,clean_team,fuzzy_name_score
585,202328,Greg,Monroe,1990-06-04,Georgetown,USA,2010,1,7,Greg Monroe,...,GEORGETOWN,GEORGETOWN,14236,Greg Monroe,Georgetown,2010,2010,GREG MONROE,GEORGETOWN,100
631,202328,Greg,Monroe,1990-06-04,Georgetown,USA,2010,1,7,Greg Monroe,...,GEORGETOWN,GEORGETOWN,29300,Nikita Mescheriakov,Georgetown,2010,2010,NIKITA MESCHERIAKOV,GEORGETOWN,20
646,202328,Greg,Monroe,1990-06-04,Georgetown,USA,2010,1,7,Greg Monroe,...,GEORGETOWN,GEORGETOWN,34406,Stephen Stepka,Georgetown,2010,2010,STEPHEN STEPKA,GEORGETOWN,24
995,203490,Otto,Porter Jr.,1993-06-03,Georgetown,USA,2013,1,3,Otto Porter Jr.,...,GEORGETOWN,GEORGETOWN,2779,Austin Freeman,Georgetown,2010,2011,AUSTIN FREEMAN,GEORGETOWN,14
999,203490,Otto,Porter Jr.,1993-06-03,Georgetown,USA,2013,1,3,Otto Porter Jr.,...,GEORGETOWN,GEORGETOWN,3950,Brandon Bolden,Georgetown,2013,2013,BRANDON BOLDEN,GEORGETOWN,14


In [47]:
college_token_block_fuzzy = college_token_block_filtered[college_token_block_filtered['fuzzy_name_score'] >= 85.0]

In [48]:
college_token_block_fuzzy.to_csv('./resolved_data/college_token_block_fuzzy.csv')

In [49]:
idx_max_scores = college_token_block_fuzzy.groupby('nba_id')['fuzzy_name_score'].idxmax()
nba_ncaa_matches = college_token_block_fuzzy.loc[idx_max_scores]

In [50]:
nba_ncaa_matches.to_csv('./resolved_data/nba_ncaa_matches.csv', index = False)

In [51]:
len(nba_ncaa_matches)

402

In [52]:
len(nba_ncaa_matches.drop_duplicates(subset = ['ncaa_id']))

402

### Get ER Match Rates

In [53]:
len(nba_er_data)

1440

In [54]:
nba_er_data.head()

Unnamed: 0,nba_id,firstName,lastName,birthdate,lastAttended,country,draftYear,draftRound,draftNumber,full_name,clean_full_name,clean_lastAttended,college_token
0,87,Dikembe,Mutombo,1966-06-25,Georgetown,Congo,1991,1,4,Dikembe Mutombo,DIKEMBE MUTOMBO,GEORGETOWN,[GEORGETOWN]
1,255,Grant,Hill,1972-10-05,Duke,USA,1994,1,3,Grant Hill,GRANT HILL,DUKE,[DUKE]
6,283,Lindsey,Hunter,1970-12-03,Jackson State,USA,1993,1,10,Lindsey Hunter,LINDSEY HUNTER,JACKSON STATE,"[JACKSON, STATE]"
8,406,Shaquille,O'Neal,1972-03-06,Louisiana State,USA,1992,1,1,Shaquille O'Neal,SHAQUILLE ONEAL,LOUISIANA STATE,"[LOUISIANA, STATE]"
11,436,Juwan,Howard,1973-02-07,Michigan,USA,1994,1,5,Juwan Howard,JUWAN HOWARD,MICHIGAN,[MICHIGAN]


In [55]:
nba_drafted_post_2010 = nba_er_data[nba_er_data['draftYear'] >= 2010]

In [56]:
nba_drafted_post_2010.head()

Unnamed: 0,nba_id,firstName,lastName,birthdate,lastAttended,country,draftYear,draftRound,draftNumber,full_name,clean_full_name,clean_lastAttended,college_token
3291,202322,John,Wall,1990-09-06,Kentucky,USA,2010,1,1,John Wall,JOHN WALL,KENTUCKY,[KENTUCKY]
3302,202323,Evan,Turner,1988-10-27,Ohio State,USA,2010,1,2,Evan Turner,EVAN TURNER,OHIO STATE,"[OHIO, STATE]"
3312,202324,Derrick,Favors,1991-07-15,Georgia Tech,USA,2010,1,3,Derrick Favors,DERRICK FAVORS,GEORGIA TECH,"[GEORGIA, TECH]"
3324,202325,Wesley,Johnson,1987-07-11,Syracuse,USA,2010,1,4,Wesley Johnson,WESLEY JOHNSON,SYRACUSE,[SYRACUSE]
3333,202326,DeMarcus,Cousins,1990-08-13,Kentucky,USA,2010,1,5,DeMarcus Cousins,DEMARCUS COUSINS,KENTUCKY,[KENTUCKY]


In [57]:
len(nba_drafted_post_2010)

509

In [58]:
print(f"Raw Match Rate for all NBA records: {round((len(nba_ncaa_matches) / len(nba_data.drop_duplicates(subset = ['nba_id']))) * 100, 3)}%")
print(f"Match Rate for NBA records with nonnull last attended (college name): {round((len(nba_ncaa_matches) / len(nba_er_data)) * 100, 3)}%")
print(f"Match Rate for NBA records with draftYear >= 2010: {round((len(nba_ncaa_matches) / len(nba_drafted_post_2010)) * 100, 3)}%")

Raw Match Rate for all NBA records: 20.927%
Match Rate for NBA records with nonnull last attended (college name): 27.917%
Match Rate for NBA records with draftYear >= 2010: 78.978%


### Generate QA Review 

In [59]:
min(nba_data['draftYear'])

-22.0

In [60]:
nba_ncaa_matches[['nba_id', 'ncaa_id']].to_csv('./resolved_data/nba_ncaa_map.csv', index = False)

In [61]:
nba_ncaa_matches.head()

Unnamed: 0,nba_id,firstName,lastName,birthdate,lastAttended,country,draftYear,draftRound,draftNumber,full_name,...,clean_lastAttended,college_token,ncaa_id,player_name,team,min_year,max_year,clean_player_name,clean_team,fuzzy_name_score
332368,202322,John,Wall,1990-09-06,Kentucky,USA,2010,1,1,John Wall,...,KENTUCKY,KENTUCKY,19599,John Wall,Kentucky,2010,2010,JOHN WALL,KENTUCKY,100
363139,202323,Evan,Turner,1988-10-27,Ohio State,USA,2010,1,2,Evan Turner,...,OHIO STATE,OHIO,13070,Evan Turner,Ohio St.,2010,2010,EVAN TURNER,OHIO ST,100
176738,202324,Derrick,Favors,1991-07-15,Georgia Tech,USA,2010,1,3,Derrick Favors,...,GEORGIA TECH,GEORGIA,10482,Derrick Favors,Georgia Tech,2010,2010,DERRICK FAVORS,GEORGIA TECH,100
381231,202325,Wesley,Johnson,1987-07-11,Syracuse,USA,2010,1,4,Wesley Johnson,...,SYRACUSE,SYRACUSE,37984,Wes Johnson,Syracuse,2010,2010,WES JOHNSON,SYRACUSE,88
332700,202326,DeMarcus,Cousins,1990-08-13,Kentucky,USA,2010,1,5,DeMarcus Cousins,...,KENTUCKY,KENTUCKY,9911,DeMarcus Cousins,Kentucky,2010,2010,DEMARCUS COUSINS,KENTUCKY,100


In [62]:
nba_ncaa_matches.columns

Index(['nba_id', 'firstName', 'lastName', 'birthdate', 'lastAttended',
       'country', 'draftYear', 'draftRound', 'draftNumber', 'full_name',
       'clean_full_name', 'clean_lastAttended', 'college_token', 'ncaa_id',
       'player_name', 'team', 'min_year', 'max_year', 'clean_player_name',
       'clean_team', 'fuzzy_name_score'],
      dtype='object')

In [63]:
col_rename_dict = {
    'firstName': 'nba_first_name',
    'lastName': 'nba_last_name',
    'birthdate': 'nba_birthdate',
    'lastAttended': 'nba_college_name',
    'country': 'nba_country',
    'draftYear': 'nba_draft_year',
    'draftRound': 'nba_draft_round',
    'full_name': 'nba_full_name',
    'clean_full_name': 'nba_clean_full_name',
    'clean_lastAttended': 'nba_clean_college_name',
    'player_name': 'ncaa_full_name',
    'team': 'ncaa_college_name',
    'min_year': 'ncaa_min_year',
    'max_year': 'ncaa_max_year',
    'clean_player_name' : 'ncaa_clean_full_name',
    'clean_team': 'clean_ncaa_college_name'
}
matches_review = nba_ncaa_matches.rename(columns = col_rename_dict)
matches_review.to_csv('./resolved_data/nba_ncaa_matches_review.csv', index = False)

In [64]:
matches_review.head()

Unnamed: 0,nba_id,nba_first_name,nba_last_name,nba_birthdate,nba_college_name,nba_country,nba_draft_year,nba_draft_round,draftNumber,nba_full_name,...,nba_clean_college_name,college_token,ncaa_id,ncaa_full_name,ncaa_college_name,ncaa_min_year,ncaa_max_year,ncaa_clean_full_name,clean_ncaa_college_name,fuzzy_name_score
332368,202322,John,Wall,1990-09-06,Kentucky,USA,2010,1,1,John Wall,...,KENTUCKY,KENTUCKY,19599,John Wall,Kentucky,2010,2010,JOHN WALL,KENTUCKY,100
363139,202323,Evan,Turner,1988-10-27,Ohio State,USA,2010,1,2,Evan Turner,...,OHIO STATE,OHIO,13070,Evan Turner,Ohio St.,2010,2010,EVAN TURNER,OHIO ST,100
176738,202324,Derrick,Favors,1991-07-15,Georgia Tech,USA,2010,1,3,Derrick Favors,...,GEORGIA TECH,GEORGIA,10482,Derrick Favors,Georgia Tech,2010,2010,DERRICK FAVORS,GEORGIA TECH,100
381231,202325,Wesley,Johnson,1987-07-11,Syracuse,USA,2010,1,4,Wesley Johnson,...,SYRACUSE,SYRACUSE,37984,Wes Johnson,Syracuse,2010,2010,WES JOHNSON,SYRACUSE,88
332700,202326,DeMarcus,Cousins,1990-08-13,Kentucky,USA,2010,1,5,DeMarcus Cousins,...,KENTUCKY,KENTUCKY,9911,DeMarcus Cousins,Kentucky,2010,2010,DEMARCUS COUSINS,KENTUCKY,100


## Calculate Euclidean Distances for to find Similar Players

In [206]:
# load college player data
ncaa_data_raw = pd.read_csv('./resolved_data/player_data_college_latest_season.csv')

# add percentages
ncaa_data_raw['3p_pct'] = (ncaa_data_raw['3p_made'] / ncaa_data_raw['3p_attempted']) * 100
ncaa_data_raw['3p_pct'] = ncaa_data_raw['3p_pct'].fillna(0)
ncaa_data_raw['2p_pct'] = (ncaa_data_raw['2p_made'] / ncaa_data_raw['2p_attempted']) * 100
ncaa_data_raw['2p_pct'] = ncaa_data_raw['2p_pct'] .fillna(0)

# scale data
scaler = MinMaxScaler()
stats_of_interest = [
    'height', 
    'total_reb', 
    'assists', 
    'steals', 
    'blocks', 
    'shooting_pct',
    'points_scored',
    '3p_pct',
    '2p_pct',
    'shooting_pct'
]

ncaa_data = ncaa_data_raw[['ncaa_id', 'max_year'] + stats_of_interest].copy()
ncaa_data[stats_of_interest] = scaler.fit_transform(ncaa_data[stats_of_interest])

# get incoming prospects
incoming_prospects = ncaa_data[ncaa_data['max_year'] == 2025]

# get players to match to
ncaa_players_to_match_to = pd.merge(ncaa_data, nba_ncaa_matches['ncaa_id'].to_frame(), on = 'ncaa_id', how = 'inner')
ncaa_players_to_match_to = ncaa_players_to_match_to[ncaa_players_to_match_to['max_year'] != 2025]

# convert dfs to arrays 
incoming_prospects_arr = incoming_prospects.to_numpy()
ncaa_players_to_match_to_arr = ncaa_players_to_match_to.to_numpy()

In [155]:
incoming_prospects.head()

Unnamed: 0,ncaa_id,max_year,height,total_reb,assists,steals,blocks,shooting_pct,points_scored,3p_pct,2p_pct,shooting_pct.1
23880,32016,2025,0.483333,0.15536,0.106667,0.225456,0.045644,0.551449,0.372628,0.361702,0.505495,0.551449
23881,11675,2025,0.458333,0.089497,0.212721,0.108398,0.014625,0.550264,0.199518,0.421053,0.410256,0.550264
23882,21738,2025,0.533333,0.157265,0.028672,0.08265,0.079763,0.563301,0.112352,0.0,0.634921,0.563301
23883,15748,2025,0.533333,0.472478,0.200721,0.2562,0.067477,0.572137,0.304519,0.347458,0.652174,0.572137
23884,32041,2025,0.508333,0.170531,0.047881,0.20186,0.132557,0.583019,0.362624,0.380531,0.540984,0.583019


In [None]:
len(incoming_prospects)

3673

In [157]:
ncaa_players_to_match_to.head()

Unnamed: 0,ncaa_id,max_year,height,total_reb,assists,steals,blocks,shooting_pct,points_scored,3p_pct,2p_pct,shooting_pct.1
0,32811,2010,0.7,0.26838,0.111506,0.14875,0.098153,0.478289,0.224003,0.0,0.491018,0.478289
1,18618,2010,0.8,0.475346,0.072622,0.210981,0.341207,0.642495,0.332338,0.25,0.673367,0.642495
2,14293,2010,0.5,0.321181,0.626548,0.440357,0.065385,0.515354,0.665206,0.360215,0.471875,0.515354
3,7422,2010,1.0,0.680724,0.085047,0.19215,0.660353,0.56481,0.373886,0.0,0.562264,0.56481
4,24405,2010,0.5,0.518763,0.151052,0.482271,0.100683,0.500377,0.60152,0.349462,0.478659,0.500377


In [158]:
len(ncaa_players_to_match_to)

401

In [167]:
np.linalg.norm(np.array([1,2,3]) - np.array([1,2,3]))

0.0

In [185]:
similarity_results = []
for prospect in incoming_prospects_arr:
    prospect_id = prospect[0]
    prospect_stats = prospect[2:] # remove id and max year

    for target_player in ncaa_players_to_match_to_arr:
        target_player_id = target_player[0]
        target_stats = target_player[2:]

        # calculate euclidean distance
        euclidean_dist = np.linalg.norm(np.array(prospect_stats) - np.array(target_stats))

        tmp_res = {'prospect_ncaa_id': int(prospect_id), 'similar_player_ncaa_id': int(target_player_id), 'euclidean_distance': euclidean_dist}
        similarity_results.append(tmp_res)

similarity_results_df = pd.DataFrame(similarity_results)

In [196]:
# add similarity score
def reverse_min_max_scale(series):
    min_val = series.min()
    max_val = series.max()
    return 1 - ((series - min_val) / (max_val - min_val))

similarity_results_df['similarity_score'] = reverse_min_max_scale(similarity_results_df['euclidean_distance'])

# order df on similarity score
similarity_results_df_ordered = similarity_results_df.sort_values(by = ['prospect_ncaa_id', 'similarity_score'], ascending = False)
similarity_results_df_ordered['rank'] = similarity_results_df_ordered.groupby('prospect_ncaa_id')['similarity_score'].rank(method = 'first', ascending = False)

In [200]:
similarity_results_df_ordered[similarity_results_df_ordered['prospect_ncaa_id'] == 39094].to_csv('./test.csv')

In [203]:
similarity_results_df_ordered.to_csv('./player_similarity_results.csv', index = False)

In [None]:
len(similarity_results_df_ordered) 

1472873

## Generate Similarity Review

In [204]:
similarity_results_df_ordered.head()

Unnamed: 0,prospect_ncaa_id,similar_player_ncaa_id,euclidean_distance,similarity_score,rank
797260,39094,20880,0.274481,0.851543,1.0
797495,39094,14382,0.329531,0.816308,2.0
797420,39094,19779,0.331741,0.814893,3.0
797337,39094,14094,0.336692,0.811725,4.0
797560,39094,20991,0.368759,0.7912,5.0


In [None]:
ncaa_review = ncaa_data_raw[['ncaa_id', 'player_name', 'team', 'pos_class'] + stats_of_interest].copy()

In [210]:
ncaa_review.head()

Unnamed: 0,ncaa_id,player_name,team,height,pos_class,height.1,total_reb,assists,steals,blocks,shooting_pct,points_scored,3p_pct,2p_pct,shooting_pct.1
0,8686,Dana Smith,Longwood,167.64,Wing F,167.64,7.0741,2.4074,0.8148,0.4815,53.76,17.5556,36.263736,48.275862,53.76
1,5795,Carlos Strong,Boston University,106.68,Combo G,106.68,4.5714,1.8,1.6571,0.4,60.93,10.8571,47.2,47.794118,60.93
2,12966,Eulis Stephens,Detroit Mercy,137.16,Wing G,137.16,1.9032,0.5806,0.3871,0.129,55.91,3.5806,18.181818,61.111111,55.91
3,4471,Brett Gifford,Albany,350.52,C,350.52,3.3,0.6,0.4,0.8333,46.77,2.0667,0.0,47.540984,46.77
4,37615,Valdas Sirutis,Boston University,228.6,Stretch 4,228.6,2.4688,0.5938,0.2188,0.0625,43.49,2.0312,24.137931,45.16129,43.49


In [246]:
ncaa_review = ncaa_data_raw[['ncaa_id', 'player_name', 'team', 'pos_class'] + stats_of_interest].copy()
similarity_review = pd.merge(similarity_results_df_ordered, ncaa_review, left_on = 'prospect_ncaa_id', right_on = 'ncaa_id', how = 'inner')
similarity_review = pd.merge(similarity_review, ncaa_review, left_on = 'similar_player_ncaa_id', right_on = 'ncaa_id', how = 'inner')
review_cols = {
    'prospect_ncaa_id': 'prospect_ncaa_id', 
    'similar_player_ncaa_id': 'similar_player_ncaa_id', 
    'euclidean_distance': 'euclidean_distance',
    'similarity_score': 'similarity_score', 
    'rank': 'rank', 
    'player_name_x': 'prospect_player_name', 
    'player_name_y': 'similar_player_player_name', 
    'team_x': 'prospect_team',
    'team_y': 'similar_player_team',
    'height_x': 'prospect_height', 
    'height_y': 'similar_player_height', 
    'pos_class_x': 'prospect_pos_class', 
    'pos_class_y': 'similar_player_pos_class', 
    'points_scored_x': 'prospect_points_scored', 
    'points_scored_y': 'similar_player_points_score',
    'total_reb_x': 'prospect_total_reb', 
    'total_reb_y': 'similar_player_total_reb',
    'assists_x': 'prospect_assists',
    'assists_y': 'similar_player_assists',
    'steals_x': 'prospect_steals', 
    'steals_y': 'similar_player_steals',
    'blocks_x': 'prospect_blocks', 
    'blocks_y': 'similar_player_blocks',
    'shooting_pct_x': 'prospect_shooting_pct', 
    'shooting_pct_y': 'similar_player_shooting_pct',
    '3p_pct_x': 'prospect_3p_pct',
    '3p_pct_y': 'similar_player_3p_pct',
    '2p_pct_x': 'prospect_2p_pct',    
    '2p_pct_y': 'similar_player_2p_pct'
}
similarity_review = similarity_review[list(review_cols.keys())].rename(columns = review_cols)
similarity_review = similarity_review[similarity_review['rank'] <= 3.0].sort_values(by = ['prospect_ncaa_id', 'rank'], ascending = True)

In [247]:
similarity_review.columns

Index(['prospect_ncaa_id', 'similar_player_ncaa_id', 'euclidean_distance',
       'similarity_score', 'rank', 'prospect_player_name',
       'similar_player_player_name', 'prospect_team', 'similar_player_team',
       'prospect_height', 'similar_player_height', 'prospect_pos_class',
       'similar_player_pos_class', 'prospect_points_scored',
       'similar_player_points_score', 'prospect_total_reb',
       'similar_player_total_reb', 'prospect_assists',
       'similar_player_assists', 'prospect_steals', 'similar_player_steals',
       'prospect_blocks', 'similar_player_blocks', 'prospect_shooting_pct',
       'prospect_shooting_pct', 'similar_player_shooting_pct',
       'similar_player_shooting_pct', 'prospect_3p_pct',
       'similar_player_3p_pct', 'prospect_2p_pct', 'similar_player_2p_pct'],
      dtype='object')

In [248]:
similarity_review.head()

Unnamed: 0,prospect_ncaa_id,similar_player_ncaa_id,euclidean_distance,similarity_score,rank,prospect_player_name,similar_player_player_name,prospect_team,similar_player_team,prospect_height,...,prospect_blocks,similar_player_blocks,prospect_shooting_pct,prospect_shooting_pct.1,similar_player_shooting_pct,similar_player_shooting_pct.1,prospect_3p_pct,similar_player_3p_pct,prospect_2p_pct,similar_player_2p_pct
128554,39,26265,0.11378,0.9544,1.0,A.J. Hoggard,Marquis Teague,Vanderbilt,Kentucky,193.04,...,0.2581,0.275,45.21,45.21,49.08,49.08,27.368421,32.5,39.89899,43.75
40402,39,1791,0.204733,0.896185,2.0,A.J. Hoggard,Andrew Harrison,Vanderbilt,Kentucky,193.04,...,0.2581,0.2051,45.21,45.21,53.13,53.13,27.368421,38.297872,39.89899,37.572254
418721,39,29224,0.205924,0.895423,3.0,A.J. Hoggard,Nico Mannion,Vanderbilt,Arizona,193.04,...,0.2581,0.0,45.21,45.21,52.07,52.07,27.368421,32.716049,39.89899,44.660194
227724,52,37108,0.0992,0.963732,1.0,A.J. Lopez,Tyler Dorsey,Maine,Oregon,195.58,...,0.1515,0.1026,60.09,60.09,60.49,60.49,39.694656,42.364532,52.261307,51.030928
135899,52,11768,0.108065,0.958058,2.0,A.J. Lopez,Dusty Hannahs,Maine,Arkansas,195.58,...,0.1515,0.1111,60.09,60.09,58.93,58.93,39.694656,38.743455,52.261307,48.958333


In [249]:
similarity_review.to_csv('./similarity_review.csv', index = False)