# Fetching the data of players from NBA.com, through the nba_api package

## Players career stats

In [None]:
import pandas as pd
import numpy as np
import time

#Career stats API calls
from nba_api.stats.static import players
from nba_api.stats.endpoints import playercareerstats as career

#Player biography API calls
from nba_api.stats.endpoints import leaguedashplayerbiostats as bio

Most of the code is adapted from the examples given in the `nba_api` documentation.

Let's collect all players:

In [50]:
# get_players returns a list of dictionaries, each representing a player.
nba_players = players.get_players()
# Report how many player records the lookup returned.
print("Number of players fetched: {}".format(len(nba_players)))
# Display the first record to see which fields each dictionary carries.
nba_players[0]

Number of players fetched: 5011


{'id': 76001,
 'full_name': 'Alaa Abdelnaby',
 'first_name': 'Alaa',
 'last_name': 'Abdelnaby',
 'is_active': False}

In [51]:
# Keep only the numeric ID of every player record; the cell displays the count.
ids = [record['id'] for record in nba_players]
len(ids)

5011

This is fast and gives us the player IDs, but the records contain only basic information - we still need each player's career stats and bio.

We can run `career.PlayerCareerStats(id)` to gather data about a player:

In [48]:
# Fetch the career table of one sample player (ID 76001, the first record
# shown earlier). The ID is stored under a descriptive name so the builtin
# id() is not shadowed for the rest of the notebook.
sample_player_id = 76001
career_stats = career.PlayerCareerStats(player_id=sample_player_id, timeout=2)
# Take the first data frame the endpoint returns and display it.
careers = career_stats.get_data_frames()[0]
careers

Unnamed: 0,PLAYER_ID,SEASON_ID,LEAGUE_ID,TEAM_ID,TEAM_ABBREVIATION,PLAYER_AGE,GP,GS,MIN,FGM,...,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PTS
0,76001,1990-91,0,1610612757,POR,23.0,43,0,290.0,55,...,0.568,27,62,89,12,4,12,22,39,135
1,76001,1991-92,0,1610612757,POR,24.0,71,1,934.0,178,...,0.752,81,179,260,30,25,16,66,132,432
2,76001,1992-93,0,1610612749,MIL,25.0,12,0,159.0,26,...,0.75,12,25,37,10,6,4,13,24,64
3,76001,1992-93,0,1610612738,BOS,25.0,63,52,1152.0,219,...,0.76,114,186,300,17,19,22,84,165,514
4,76001,1992-93,0,0,TOT,25.0,75,52,1311.0,245,...,0.759,126,211,337,27,25,26,97,189,578
5,76001,1993-94,0,1610612738,BOS,26.0,13,0,159.0,24,...,0.64,12,34,46,3,2,3,17,20,64
6,76001,1994-95,0,1610612758,SAC,27.0,51,0,476.0,117,...,0.571,34,72,106,13,15,12,40,102,254
7,76001,1994-95,0,1610612755,PHL,27.0,3,0,30.0,1,...,0.0,3,5,8,0,0,0,5,2,2
8,76001,1994-95,0,0,TOT,27.0,54,0,506.0,118,...,0.571,37,77,114,13,15,12,45,104,256


The data we have collected so far for the 5011 players is just their names and IDs.<br>
`PlayerCareerStats` data seems to contain most (if not all) of the information we can collect about player match statistics, so let's fetch it for all players.

We fetch the players in 4 batches, and any player who was not successfully fetched is stored in a list, to try again later. (The NBA API is rate-limited quite strictly (especially during matches), which is why data collection failed for some players.)

In [52]:
# Split the ID list into halves and quarters so the slow downloads can be
# run batch by batch.
id_len = len(ids)
half_cut = id_len // 2
quarter_cut = id_len // 4
three_quarter_cut = 3 * id_len // 4

ids_half, ids_half2 = ids[:half_cut], ids[half_cut:]
ids_quarter1 = ids[:quarter_cut]
ids_quarter2 = ids[quarter_cut:half_cut]
ids_quarter3 = ids[half_cut:three_quarter_cut]
ids_quarter4 = ids[three_quarter_cut:]

# IDs whose download failed; filled by the batch runs and retried later.
wrong = []

In [None]:
from tqdm import tqdm

def add_career(id, careers, timeout=2):
    """Download one player's career table and append it to *careers*.

    Args:
        id: Player ID passed to ``PlayerCareerStats`` as ``player_id``.
        careers: DataFrame holding the rows collected so far.
        timeout: Request timeout forwarded to the endpoint.

    Returns:
        A new DataFrame: *careers* followed by the first data frame the
        endpoint returned for this player, with a fresh 0..n-1 index.
    """
    player_frames = career.PlayerCareerStats(
        player_id=id, timeout=timeout
    ).get_data_frames()
    return pd.concat([careers, player_frames[0]], ignore_index=True)

def get_careers(ids, careers, wrong=None, timeouts=(1, 2)):
    """Fetch career stats for every player in *ids*, retrying failed requests.

    Each player gets up to three attempts: first with ``timeouts[0]``, then
    after a 0.3 s pause with ``timeouts[1]``, and finally after a 1 s pause
    with a fixed 1.5 s timeout.  A player whose three attempts all fail is
    recorded in *wrong* instead of aborting the run.

    Args:
        ids: Iterable of player IDs to download.
        careers: DataFrame with the rows collected so far.
        wrong: List that receives the IDs that could not be fetched; it is
            appended to in place.  A fresh list is created when omitted, so
            failures never leak between independent calls.
        timeouts: Request timeouts for the first and second attempt.

    Returns:
        Tuple ``(careers, wrong)``: the extended DataFrame and the list of
        failed IDs.
    """
    if wrong is None:
        wrong = []

    # `except Exception` rather than a bare `except`, so that Ctrl-C
    # (KeyboardInterrupt) can still stop this potentially very long loop.
    for player_id in tqdm(ids, desc="Processing IDs"):
        try:
            careers = add_career(player_id, careers, timeout=timeouts[0])
            continue
        except Exception:
            # Retry with delay
            time.sleep(0.3)

        try:
            careers = add_career(player_id, careers, timeout=timeouts[1])
            continue
        except Exception:
            pass  # fall through to the last attempt below

        try:
            time.sleep(1)
            careers = add_career(player_id, careers, timeout=1.5)
        except Exception:
            wrong.append(player_id)
    return careers, wrong

Depending on many things (current website traffic, also from where you visit the website), fetching each quarter can take varying time. It usually takes 40-80mins for each, but I've also had cases when some batches surprisingly took only 5-10 minutes (no rate limiting).

In [None]:
#Warning: This will typically take 30-60 minutes to run
careers, wrong = get_careers(ids_quarter1, careers, wrong)
# Forward slash instead of 'data\career.csv': "\c" is an invalid escape
# sequence, and a backslash is a path separator only on Windows.
careers.to_csv('data/career.csv', index=False)

2nd batch:

In [None]:
careers, wrong = get_careers(ids_quarter2, careers, wrong)
# Forward slash instead of 'data\career.csv': "\c" is an invalid escape
# sequence, and a backslash is a path separator only on Windows.
careers.to_csv('data/career.csv', index=False)

3rd batch (removed process bar):

In [None]:
# Both the first and the second attempt use a 1 s timeout for this batch.
careers, wrong = get_careers(ids_quarter3, careers, wrong, timeouts = [1, 1])
# Forward slash instead of 'data\career.csv': "\c" is an invalid escape
# sequence, and a backslash is a path separator only on Windows.
careers.to_csv('data/career.csv', index=False)

4th batch:

In [None]:
careers, wrong = get_careers(ids_quarter4, careers, wrong)
# Forward slash instead of 'data\career.csv': "\c" is an invalid escape
# sequence, and a backslash is a path separator only on Windows.
careers.to_csv('data/career.csv', index=False)

Let's try to refetch the "wrong ID" players: we create a second list of wrong IDs and alternate between the two, retrying the IDs stored in one list while collecting the new failures in the other.<br>
Each missing 

In [None]:
# Ping-pong between two failure lists for ten rounds: each round retries the
# IDs held in one list, collects the new failures in the other, and then
# empties the list that was just retried.
wrong2 = []
for i in range(10):
    if i % 2:
        # Odd round: retry what is in `wrong2`, new failures go to `wrong`.
        careers, wrong = get_careers(wrong2, careers, wrong)
        wrong2 = []
    else:
        # Even round: retry what is in `wrong`, new failures go to `wrong2`.
        careers, wrong2 = get_careers(wrong, careers, wrong2)
        wrong = []

After 10 iterations of trying to fetch the previously unsuccessful IDs, we're left with only 7 IDs that never yielded a result.<br>
These don't seem to fetch with further tries either.

In [None]:
# Display the IDs that are still in the failure list after the retry rounds.
wrong

[76415, 1432, 202408, 76468, 201986, 1629602, 76564]

I select only 20 seasons, 2003-04 to 2022-23, and save the data into a CSV.<br>
(Because I did not succeed in gathering the 2023-24 season bio data, probably due to high subpage traffic, I take 2022-23 as the "last" season.)

In [None]:
# `careers` was seeded with the sample table of player 76001 and that ID is
# also the first one fetched in the first batch, so its rows are present
# twice. Drop exact duplicate rows before saving.
careers = careers.drop_duplicates().reset_index(drop=True)
# Forward slashes instead of 'data\...': "\c" is an invalid escape sequence,
# and a backslash is a path separator only on Windows.
careers.to_csv('data/career.csv', index=False)

# The 20 seasons to keep, newest first (2022-23 is the first entry).
seasons = ["2022-23", "2021-22","2020-21",'2019-20', '2018-19', '2017-18', '2016-17', '2015-16', '2014-15', '2013-14','2012-13', '2011-12', '2010-11', '2009-10', '2008-09','2007-08', '2006-07', '2005-06', '2004-05', '2003-04',]
# isin() alone selects every wanted season, so no extra 2022-23 clause is needed.
careers_filtered = careers[careers['SEASON_ID'].isin(seasons)].reset_index(drop=True)
careers_filtered.to_csv('data/career_filtered.csv', index=False)

## Player Bio Data

We also need info about player height, weight etc. which is not included in the career stats.

One important information missing is player position.<br>
Sadly, the NBA does not provide this information as it is not trivially defined. Positions changed a lot over the years, and can differ team by team.<br>

There are some sources that predict the positions, but I would rather skip using them.<br>
|POS|Position   |G: Guard (shooting guard and point guard), F: Forward (power forward and small forward), C: Center   |

The previous season players:

In [30]:
# Bio table for the 2022-23 season, tagged with its season label so that rows
# from different seasons stay distinguishable once more seasons are appended.
# Author's note on the endpoint: LeagueID, PerMode, Season, SeasonType
bio_stats = bio.LeagueDashPlayerBioStats(timeout=4, season="2022-23")
bios = bio_stats.get_data_frames()[0]
bios["SEASON_ID"] = ["2022-23"] * len(bios)
bios.head()

Unnamed: 0,PLAYER_ID,PLAYER_NAME,TEAM_ID,TEAM_ABBREVIATION,AGE,PLAYER_HEIGHT,PLAYER_HEIGHT_INCHES,PLAYER_WEIGHT,COLLEGE,COUNTRY,...,PTS,REB,AST,NET_RATING,OREB_PCT,DREB_PCT,USG_PCT,TS_PCT,AST_PCT,SEASON_ID
0,1630639,A.J. Lawson,1610612742,DAL,22.0,6-6,78,179,South Carolina,Canada,...,56,21,2,-20.1,0.046,0.152,0.189,0.589,0.032,2022-23
1,1631260,AJ Green,1610612749,MIL,23.0,6-5,77,190,Northern Iowa,USA,...,154,45,22,-4.9,0.016,0.105,0.159,0.607,0.092,2022-23
2,1631100,AJ Griffin,1610612737,ATL,19.0,6-6,78,220,Duke,USA,...,639,153,73,1.5,0.026,0.08,0.174,0.577,0.07,2022-23
3,203932,Aaron Gordon,1610612743,DEN,27.0,6-8,80,235,Arizona,USA,...,1109,446,203,12.1,0.086,0.136,0.206,0.617,0.129,2022-23
4,1628988,Aaron Holiday,1610612737,ATL,26.0,6-0,72,185,UCLA,USA,...,247,74,89,0.9,0.028,0.059,0.129,0.528,0.135,2022-23


Fetch seasons, with some delay-retry if the request fails:

In [34]:
def add_season_bios(season, bios, timeout=2):
    """Download one season's player bio table and append it to *bios*.

    Args:
        season: Season label passed to ``LeagueDashPlayerBioStats``.
        bios: DataFrame holding the bio rows collected so far.
        timeout: Request timeout forwarded to the endpoint.

    Returns:
        A new DataFrame: *bios* followed by the season's rows, each tagged
        with *season* in a ``SEASON_ID`` column, with a fresh 0..n-1 index.
    """
    frames = bio.LeagueDashPlayerBioStats(
        season=season, timeout=timeout
    ).get_data_frames()
    season_rows = frames[0]
    # Label every row with the season it was fetched for.
    season_rows["SEASON_ID"] = [season] * len(season_rows)
    return pd.concat([bios, season_rows], ignore_index=True)

def get_seasons_bios(seasons, bios, timeouts=(4, 4)):
    """Fetch the player bio table of every season in *seasons*.

    Each season gets two attempts: first with ``timeouts[0]`` and, if that
    fails, after a 2 s pause with ``timeouts[1]``.  A season whose two
    attempts both fail is reported on stdout and collected instead of
    aborting the run.

    Args:
        seasons: Iterable of season labels to download.
        bios: DataFrame with the bio rows collected so far.
        timeouts: Request timeouts for the first and second attempt.

    Returns:
        Tuple ``(bios, wrongs)``: the extended DataFrame and the list of
        seasons that could not be fetched.
    """
    wrongs = []
    # `except Exception` rather than a bare `except`, so that Ctrl-C
    # (KeyboardInterrupt) can still stop the download loop.
    for season in seasons:
        try:
            bios = add_season_bios(season, bios, timeout=timeouts[0])
            continue
        except Exception:
            # Give the server a moment before the second attempt.
            time.sleep(2)

        try:
            bios = add_season_bios(season, bios, timeout=timeouts[1])
        except Exception:
            print(f"Failed to fetch season {season}")
            wrongs.append(season)
    return bios, wrongs

# Remaining seasons, newest first: "2021-22" down to "2003-04"
# (2022-23 is already in `bios`).
seasons = [
    f"{start_year}-{(start_year + 1) % 100:02d}"
    for start_year in range(2021, 2002, -1)
]
wrongs = []
bios, wrongs = get_seasons_bios(seasons, bios, timeouts=[4, 4])

No season missing. We can save our table:

In [38]:
# Forward slash instead of 'data\player_bios.csv': "\p" is an invalid escape
# sequence, and a backslash is a path separator only on Windows.
bios.to_csv('data/player_bios.csv', index=False)