## Bayesian Model for Predicting Player Impact Based on College and Draft Position

In [87]:
from nba_api.stats.endpoints import commonplayerinfo
from nba_api.stats.static import players
import pandas as pd
import numpy as np
import time
import requests

In [148]:
win_share_25 = pd.read_csv('Data/NBA Win Shares Leaders 24-25.csv')
#will specify starting year of season
win_share_25['Year'] = 2024
#rearrange columns for readibility
new_order = ['Rk','Year','Player', 'Age', 'Team', 'Pos', 'G', 'GS', 'MP', 'PER', 'TS%',
            '3PAr', 'FTr', 'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%', 'BLK%', 'TOV%',
            'USG%', 'OWS', 'DWS', 'WS▼', 'WS/48', 'OBPM', 'DBPM', 'BPM', 'VORP',
            'Awards']
win_share_25 = win_share_25[new_order]
win_share_24 = pd.read_csv('Data/NBA Win Shares Leaders 23-24.csv')
win_share_24['Year'] = 2023
win_share_24 = win_share_24[new_order]
win_share_23 = pd.read_csv('Data/NBA Win Shares Leaders 22-23.csv')
win_share_23['Year'] = 2022
win_share_23 = win_share_23[new_order]
win_share_22 = pd.read_csv('Data/NBA Win Shares Leaders 21-22.csv')
win_share_22['Year'] = 2021
win_share_22 = win_share_22[new_order]
win_share_21 = pd.read_csv('Data/NBA Win Shares Leaders 20-21.csv')
win_share_21['Year'] = 2020
win_share_21 = win_share_21[new_order]

win_shares = pd.concat([win_share_25, win_share_24, win_share_23, win_share_22, win_share_21], ignore_index=True, sort=False)
win_shares.head()

Unnamed: 0,Rk,Year,Player,Age,Team,Pos,G,GS,MP,PER,...,USG%,OWS,DWS,WS▼,WS/48,OBPM,DBPM,BPM,VORP,Awards
0,1,2024,Shai Gilgeous-Alexander,26.0,OKC,PG,76.0,76.0,2598.0,30.7,...,34.8,11.9,4.8,16.7,0.309,8.9,2.6,11.5,8.9,"MVP-1,DPOY-10,CPOY-8,AS,NBA1"
1,2,2024,Nikola Jokić,29.0,DEN,C,70.0,70.0,2571.0,32.0,...,29.5,12.7,3.8,16.4,0.307,9.9,3.3,13.3,9.8,"MVP-2,CPOY-2,AS,NBA1"
2,3,2024,Ivica Zubac,27.0,LAC,C,80.0,80.0,2624.0,22.3,...,19.5,7.0,4.7,11.7,0.215,2.3,0.8,3.1,3.4,"DPOY-6,DEF2"
3,4,2024,Jarrett Allen,26.0,CLE,C,82.0,82.0,2296.0,22.1,...,15.9,7.6,4.0,11.6,0.243,3.0,1.2,4.2,3.6,
4,5,2024,Giannis Antetokounmpo,30.0,MIL,PF,67.0,67.0,2289.0,30.5,...,35.2,7.8,3.7,11.5,0.241,6.9,2.5,9.5,6.6,"MVP-3,DPOY-8,AS,NBA1"


Next pull all college data and store as a dictionary

Creating a function to ensure we don't hit rate limits

In [93]:
def get_player_info(player_id, retries=3, delay=30):
    for attempt in range(retries):
        try:
            time.sleep(delay)
            return commonplayerinfo.CommonPlayerInfo(player_id=player_id)
        except requests.exceptions.ReadTimeout:
            if attempt < retries - 1:
                time.sleep(delay*2)
            else:
                raise 

Pulling for one subset since most players should be repeating

In [None]:
active_players = win_shares['Player'].str.strip().unique()

info_dict = {}

for player in active_players:
    try:
        if player in info_dict:
            continue
        
        # 1. Try full name search
        result = players.find_players_by_full_name(player)
        
        # 2. If still no match → skip
        if not result:
            print("Could not find:", player)
            info_dict[player] = {'College': 'NA', 'Draft Pick': 'NA'}
            continue
        
        # 3. Extract the player_id
        curr_player_id = result[0]['id']
        
        # 4. Pull player info
        player_info = get_player_info(player_id=curr_player_id)
        
        info = player_info.common_player_info.get_data_frame()
        
        college = info.loc[0, 'SCHOOL']
        draft_pick = info.loc[0, 'DRAFT_NUMBER']
        
        info_dict[player] = {'College': college, 'Draft Pick': draft_pick}

    except Exception as e:
        print("Failed:", player, "| Reason:", e)
        info_dict[player] = {'College': 'NA', 'Draft Pick': 'NA'}

    time.sleep(15)


Failed: A.J. Green | Reason: module 'nba_api.stats.static.players' has no attribute 'find_players_by_partial_name'
Failed: Ron Holland | Reason: module 'nba_api.stats.static.players' has no attribute 'find_players_by_partial_name'
Failed: Brandon Boston Jr. | Reason: module 'nba_api.stats.static.players' has no attribute 'find_players_by_partial_name'
Failed: Jamal Cain | Reason: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Failed: GG Jackson II | Reason: module 'nba_api.stats.static.players' has no attribute 'find_players_by_partial_name'
Failed: Xavier Tillman Sr. | Reason: module 'nba_api.stats.static.players' has no attribute 'find_players_by_partial_name'
Failed: Jeenathan Williams | Reason: module 'nba_api.stats.static.players' has no attribute 'find_players_by_partial_name'
Failed: Matthew Hurt | Reason: module 'nba_api.stats.static.players' has no attribute 'find_players_by_partial_name'
Failed: B.J. Johnson | Reason: module 'nba_

Saving as dictionary since it took nearly 12 hours to pull all info safely

In [97]:
import pickle

with open('player_info_dict.pkl', 'wb') as f:
    pickle.dump(info_dict, f)

Loading in my info dict to then pull info for the new players I added

In [149]:
with open('player_info_dict.pkl', 'rb') as file:
    info_dict = pickle.load(file)

In [156]:
all_players = win_shares['Player'].unique()

player_w_info = info_dict.keys()

for player in all_players:
    if player not in player_w_info:
        try:
            if player in info_dict:
                continue
            
            # 1. Try full name search
            result = players.find_players_by_full_name(player)
            
            # 2. If still no match → skip
            if not result:
                print("Could not find:", player)
                info_dict[player] = {'College': 'NA', 'Draft Pick': 'NA'}
                continue
            
            # 3. Extract the player_id
            curr_player_id = result[0]['id']
            
            # 4. Pull player info
            player_info = get_player_info(player_id=curr_player_id)
            
            info = player_info.common_player_info.get_data_frame()
            
            college = info.loc[0, 'SCHOOL']
            draft_pick = info.loc[0, 'DRAFT_NUMBER']
            
            info_dict[player] = {'College': college, 'Draft Pick': draft_pick}

        except Exception as e:
            print("Failed:", player, "| Reason:", e)
            info_dict[player] = {'College': 'NA', 'Draft Pick': 'NA'}

        time.sleep(15)

Failed: Ulrich Chomche | Reason: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Could not find: Cui Yongxi
Failed: nan | Reason: normalize() argument 2 must be str, not float
Could not find: RJ Nembhard Jr.
Could not find: C.J. Miles
Could not find: M.J. Walker


In [197]:
win_shares['College'] = win_shares['Player'].map(lambda x: info_dict[x]['College'])
win_shares['Draft Pick'] = win_shares['Player'].map(lambda x: info_dict[x]['Draft Pick'])

cleaned = win_shares.drop(columns=['G','GS','MP','TS%','3PAr', 'FTr', 'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%', 'BLK%',
       'TOV%', 'USG%', 'OWS', 'DWS','WS/48', 'OBPM', 'DBPM','VORP', 'Awards'])

cleaned = cleaned.rename(columns={'WS▼':'WS'})
cleaned

cleaned[cleaned['College'] == 'NA']['Player'].unique()

KeyError: nan

In [198]:
#Ulrich Chomche -> 1642279
#Cui Yongxi -> Cui Cui
#RJ Nembhard Jr. -> Ruben Nembhard Jr.
#C.J. Miles -> CJ Miles
#M.J. Walker -> Walker

cleaned = cleaned.dropna(subset='Player')
info_dict = {k: v for k, v in info_dict.items() if not pd.isna(k)}
missing_info_players = cleaned[cleaned['College'] == 'NA']['Player'].unique()

renamed_players = {'Ulrich Chomche': 'Chomche',
                   'Cui Yongxi': 'Cui Cui',
                   'RJ Nembhard Jr.': 'Ruben Nembhard Jr.',
                   'C.J. Miles': 'CJ Miles',
                   'M.J. Walker': 'MJ Walker'}

for player in missing_info_players:
    if player == 'Ulrich Chomche':
        playerid = 1642279
        player_info = commonplayerinfo.CommonPlayerInfo(player_id=playerid)
        info = player_info.common_player_info.get_data_frame()

        college = info.loc[0, 'SCHOOL']
        draft_pick = info.loc[0, 'DRAFT_NUMBER']
        
        info_dict[player] = {'College': college, 'Draft Pick': draft_pick}
    else:
        corrected_name = renamed_players[player]
        result = players.find_players_by_full_name(corrected_name)
        curr_player_id = result[0]['id']
        player_info = get_player_info(player_id=curr_player_id)
        info = player_info.common_player_info.get_data_frame()

        college = info.loc[0, 'SCHOOL']
        draft_pick = info.loc[0, 'DRAFT_NUMBER']
        
        info_dict[player] = {'College': college, 'Draft Pick': draft_pick}
        

In [200]:
win_shares = win_shares.dropna(subset='Player')
win_shares['College'] = win_shares['Player'].map(lambda x: info_dict[x]['College'])
win_shares['Draft Pick'] = win_shares['Player'].map(lambda x: info_dict[x]['Draft Pick'])

cleaned = win_shares.drop(columns=['G','GS','MP','TS%','3PAr', 'FTr', 'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%', 'BLK%',
       'TOV%', 'USG%', 'OWS', 'DWS','WS/48', 'OBPM', 'DBPM','VORP', 'Awards'])

cleaned = cleaned.rename(columns={'WS▼':'WS'})
cleaned

Unnamed: 0,Rk,Year,Player,Age,Team,Pos,PER,WS,BPM,College,Draft Pick
0,1,2024,Shai Gilgeous-Alexander,26.0,OKC,PG,30.7,16.7,11.5,Kentucky,11
1,2,2024,Nikola Jokić,29.0,DEN,C,32.0,16.4,13.3,Mega Basket,41
2,3,2024,Ivica Zubac,27.0,LAC,C,22.3,11.7,3.1,Mega Basket,32
3,4,2024,Jarrett Allen,26.0,CLE,C,22.1,11.6,4.2,Texas,22
4,5,2024,Giannis Antetokounmpo,30.0,MIL,PF,30.5,11.5,9.5,Filathlitikos,15
...,...,...,...,...,...,...,...,...,...,...,...
2934,536,2020,Théo Maledon,19.0,OKC,PG,8.2,-0.8,-5.0,ASVEL,34
2935,537,2020,Josh Hall,20.0,OKC,SF,1.0,-0.9,-11.9,Moravian Prep (NC),Undrafted
2936,538,2020,Darius Bazley,20.0,OKC,PF,9.7,-0.9,-4.4,Princeton HS (OH),23
2937,539,2020,Killian Hayes,19.0,DET,PG,5.3,-1.1,-7.2,Ratiopharm Ulm,7


In [202]:
with open('player_info_dict.pkl', 'wb') as f:
    pickle.dump(info_dict, f)

Creating the College and Draft Number columns in the dataframe 

In [204]:
win_shares['College'] = win_shares['Player'].map(lambda x: info_dict[x]['College'])
win_shares['Draft Pick'] = win_shares['Player'].map(lambda x: info_dict[x]['Draft Pick'])

cleaned = win_shares.drop(columns=['G','GS','TS%','3PAr', 'FTr', 'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%', 'BLK%',
       'TOV%', 'USG%', 'OWS', 'DWS', 'OBPM', 'DBPM','VORP', 'Awards'])

cleaned = cleaned.rename(columns={'WS▼':'WS'})
cleaned

Unnamed: 0,Rk,Year,Player,Age,Team,Pos,MP,PER,WS,WS/48,BPM,College,Draft Pick
0,1,2024,Shai Gilgeous-Alexander,26.0,OKC,PG,2598.0,30.7,16.7,0.309,11.5,Kentucky,11
1,2,2024,Nikola Jokić,29.0,DEN,C,2571.0,32.0,16.4,0.307,13.3,Mega Basket,41
2,3,2024,Ivica Zubac,27.0,LAC,C,2624.0,22.3,11.7,0.215,3.1,Mega Basket,32
3,4,2024,Jarrett Allen,26.0,CLE,C,2296.0,22.1,11.6,0.243,4.2,Texas,22
4,5,2024,Giannis Antetokounmpo,30.0,MIL,PF,2289.0,30.5,11.5,0.241,9.5,Filathlitikos,15
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2934,536,2020,Théo Maledon,19.0,OKC,PG,1778.0,8.2,-0.8,-0.021,-5.0,ASVEL,34
2935,537,2020,Josh Hall,20.0,OKC,SF,336.0,1.0,-0.9,-0.126,-11.9,Moravian Prep (NC),Undrafted
2936,538,2020,Darius Bazley,20.0,OKC,PF,1714.0,9.7,-0.9,-0.026,-4.4,Princeton HS (OH),23
2937,539,2020,Killian Hayes,19.0,DET,PG,670.0,5.3,-1.1,-0.081,-7.2,Ratiopharm Ulm,7


Seeing which players we have missing info on 

In [None]:
missing_info_players = cleaned[cleaned['College'] == 'NA']['Player'].unique()

array(['A.J. Green', 'Ron Holland', 'Brandon Boston Jr.', 'Jamal Cain',
       'GG Jackson II', 'Xavier Tillman Sr.', 'Jeenathan Williams',
       'Matthew Hurt', 'B.J. Johnson', 'Tre Scott', 'Cameron Reynolds'],
      dtype=object)

Making the changes and filling in correct info for these players

In [140]:
#change A.J. Green -> AJ Green
#Ron Holland -> Ronald Holland II
#Brandon Boston Jr. -> Brandon Boston
#Jamal Cain -> Jamal Cain (lost connection during data pull)
#GG Jackson II -> GG Jackson
#Xavier Tillman Sr. -> Xavier Tillman
#Jeenathan Williams -> Nate Williams (id = 1631466)
#Matthew Hurt -> Matt Hurt
#B.J. Johnson -> BJ Johnson
#Tre Scott -> Trevon Scott
#Cameron Reynolds -> Cam Reynolds

missing_info_players = cleaned[cleaned['College'] == 'NA']['Player'].unique()

renamed_players = {'A.J. Green': 'AJ Green',
                   'Ron Holland': 'Ronald Holland II',
                   'Brandon Boston Jr.': 'Brandon Boston',
                   'GG Jackson II': 'GG Jackson',
                   'Xavier Tillman Sr.': 'Xavier Tillman',
                   'Jeenathan Williams': 'Nate Williams',
                   'Matthew Hurt': 'Matt Hurt',
                   'B.J. Johnson': 'BJ Johnson',
                   'Tre Scott': 'Trevon Scott',
                   'Cameron Reynolds': 'Cam Reynolds'}

for player in missing_info_players:
    if player == 'Jeenathan Williams':
        player_id = 1631466
        player_info = commonplayerinfo.CommonPlayerInfo(player_id=1631466)
        info = player_info.common_player_info.get_data_frame()

        college = info.loc[0, 'SCHOOL']
        draft_pick = info.loc[0, 'DRAFT_NUMBER']
        
        info_dict[player] = {'College': college, 'Draft Pick': draft_pick}
    elif player == 'Jamal Cain':
        result = players.find_players_by_full_name(player)
        curr_player_id = result[0]['id']
        player_info = get_player_info(player_id=curr_player_id)
        info = player_info.common_player_info.get_data_frame()

        college = info.loc[0, 'SCHOOL']
        draft_pick = info.loc[0, 'DRAFT_NUMBER']
        
        info_dict[player] = {'College': college, 'Draft Pick': draft_pick}
    else:
        corrected_name = renamed_players[player]
        result = players.find_players_by_full_name(corrected_name)
        curr_player_id = result[0]['id']
        player_info = get_player_info(player_id=curr_player_id)
        info = player_info.common_player_info.get_data_frame()

        college = info.loc[0, 'SCHOOL']
        draft_pick = info.loc[0, 'DRAFT_NUMBER']
        
        info_dict[player] = {'College': college, 'Draft Pick': draft_pick}
        

Resaving my info dictionary

In [143]:
with open('player_info_dict.pkl', 'wb') as f:
    pickle.dump(info_dict, f)

Creating the column again and ensuring we have no more missing info

In [144]:
win_shares['College'] = win_shares['Player'].map(lambda x: info_dict[x]['College'])
win_shares['Draft Pick'] = win_shares['Player'].map(lambda x: info_dict[x]['Draft Pick'])

cleaned = win_shares.drop(columns=['G','GS','MP','TS%','3PAr', 'FTr', 'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%', 'BLK%',
       'TOV%', 'USG%', 'OWS', 'DWS','WS/48', 'OBPM', 'DBPM','VORP', 'Awards'])

cleaned = cleaned.rename(columns={'WS▼':'WS'})
cleaned

Unnamed: 0,Rk,Year,Player,Age,Team,Pos,PER,WS,BPM,College,Draft Pick
0,1,2024,Shai Gilgeous-Alexander,26,OKC,PG,30.7,16.7,11.5,Kentucky,11
1,2,2024,Nikola Jokić,29,DEN,C,32.0,16.4,13.3,Mega Basket,41
2,3,2024,Ivica Zubac,27,LAC,C,22.3,11.7,3.1,Mega Basket,32
3,4,2024,Jarrett Allen,26,CLE,C,22.1,11.6,4.2,Texas,22
4,5,2024,Giannis Antetokounmpo,30,MIL,PF,30.5,11.5,9.5,Filathlitikos,15
...,...,...,...,...,...,...,...,...,...,...,...
2495,496,2020,Chandler Hutchison,24,2TM,SF,6.8,-0.1,-5.6,Boise State,22
2496,497,2020,Andre Roberson,29,BRK,SG,4.2,-0.1,-6.1,Colorado,26
2497,498,2020,Tyrell Terry,20,DAL,PG,6.0,-0.1,-5.0,Stanford,31
2498,499,2020,Jalen Lecque,20,IND,PG,4.7,-0.1,-13.0,Brewster Academy (NH),Undrafted


In [145]:
cleaned[cleaned['College'] == 'NA']['Player'].unique()

array([], dtype=object)

Creating a csv!

In [205]:
cleaned.to_csv('Data/Win_Shares_College_Draft.csv')