In [34]:
#!pip install nba_api

In [35]:
import urllib.request
from nba_api.stats.endpoints import leaguegamefinder 
import json
import csv
import datetime
import pandas as pd
import time
import numpy as np

We are going to retrieve and preprocess data from the NBA API using the `nba_api` package, following the project documentation at https://github.com/swar/nba_api/tree/master/docs/nba_api<br>
For the medium sports analytics analysis, the last date of data retrieval is 06/13/2021 (i.e., Facundo Campazzo's last game of this season).

In [60]:
# This variable controls the last date (MM/DD/YYYY) of data to retrieve from the
# different API endpoints; it is passed as `date_to_nullable` in the calls below.
last_date_to_update = "06/13/2021"

# GAMES DATA

In [61]:
# league_id '00' is the NBA (if not filtered, the data also has other leagues
# such as the WNBA and the G League).
games_finder = leaguegamefinder.LeagueGameFinder(
    league_id_nullable='00',
    season_nullable="2020-21",
    date_to_nullable=last_date_to_update,
)
# The first returned frame is the one used as the game log for the rest of the notebook.
Games = games_finder.get_data_frames()[0]

In [62]:
# Preprocess the raw game log.
# NOTE: SEASON_ID is dropped at the end, so this cell needs a freshly downloaded
# `Games` frame each time it is run.
def _opponent_of(matchup):
    """Return the text after "vs." / "@" in a MATCHUP string, stripped of spaces."""
    return matchup.replace("vs.", "@").split("@")[1].strip()


season_type_by_id = {
    '22020': 'REGULAR',
    '32020': 'ALL-STAR',
    '42020': 'PLAYOFFS',
    '52020': 'PLAYIN',
}

Games = Games.assign(
    SEASON="2020-21",
    RIVAL=Games["MATCHUP"].map(_opponent_of),
    TEAM_NAME=Games["TEAM_NAME"].str.upper(),
    TEAM_ABBREVIATION=Games["TEAM_ABBREVIATION"].str.strip(),
    GAME_DATE=pd.to_datetime(Games["GAME_DATE"]),
    # SEASON_ID values not listed in the mapping become NaN.
    SEASON_TYPE=Games["SEASON_ID"].map(season_type_by_id),
).drop(columns="SEASON_ID")

In [63]:
# Build a copy of the per-team box score with every column prefixed by "RIVAL_",
# so it can be merged back onto `Games` as the opponent's statistics.
rival_stat_columns = [
    'TEAM_ABBREVIATION', 'TEAM_NAME', 'GAME_DATE', 'WL', 'MIN', 'PTS',
    'FGM', 'FGA', 'FG_PCT', 'FG3M', 'FG3A', 'FG3_PCT', 'FTM', 'FTA',
    'FT_PCT', 'OREB', 'DREB', 'REB', 'AST', 'STL', 'BLK', 'TOV', 'PF',
    'PLUS_MINUS',
]
Games_rivals = Games[rival_stat_columns].add_prefix("RIVAL_")

In [64]:
# Attach the opponent's box score to each team row: a row matches the rival row
# whose team abbreviation equals RIVAL and that was played on the same date.
# A left join keeps every row of `Games` even when no rival row is found.
Final_Games = pd.merge(
    Games,
    Games_rivals,
    how="left",
    left_on=["RIVAL", "GAME_DATE"],
    right_on=["RIVAL_TEAM_ABBREVIATION", "RIVAL_GAME_DATE"],
)

In [65]:
#Final Preprocessing
# Keep only the first row of every game. `drop_duplicates(subset=...)` selects by
# value instead of feeding index labels to `iloc`, so the cell no longer depends on
# the frame having a fresh 0..n-1 index and can be re-run safely. `.copy()` makes
# the result an independent frame before columns are assigned below.
Final_Games = Final_Games.drop_duplicates(subset="GAME_ID").copy()
# Explicit int64 (instead of the platform-dependent "int") matches the dtype used
# for players["GAME_ID"] later in the notebook.
Final_Games["GAME_ID"] = Final_Games["GAME_ID"].astype("int64")
# Normalise the matchup text so both "A @ B" and "A vs. B" read "A vs. B".
Final_Games["MATCHUP"] = Final_Games["MATCHUP"].str.replace("@", "vs.", regex=False)

# Two-point field goals = all field goals minus three-pointers.
Final_Games["FG2M"]    = Final_Games["FGM"] - Final_Games["FG3M"]
Final_Games["FG2A"]    = Final_Games["FGA"] - Final_Games["FG3A"]
Final_Games["FG2_PCT"] = (Final_Games["FG2M"] / Final_Games["FG2A"]).round(3)

In [66]:
# Forward slash works on every OS; "data\G..." relied on an invalid escape
# sequence and only acted as a directory separator on Windows.
Final_Games.to_csv("data/GamesDB.csv")

# PLAYERS DATA

In [67]:
from nba_api.stats.endpoints import LeaguePlayerOnDetails, PlayerGameLog

In [68]:
# Download one frame per team that appears in the game log and stack them.
# Frames are collected in a list and concatenated once: DataFrame.append was
# removed in pandas 2.0 and re-copied the whole table on every iteration.
# NOTE(review): no season is passed here, so the endpoint's default season is
# used — confirm it is the intended one when re-running later.
team_frames = []
for teamID in Final_Games.TEAM_ID.unique():
    temporal = LeaguePlayerOnDetails(team_id=teamID,
                    date_to_nullable=last_date_to_update).get_data_frames()[0]
    team_frames.append(temporal)

# pd.concat raises on an empty list, so fall back to an empty frame.
teams = pd.concat(team_frames, ignore_index=True) if team_frames else pd.DataFrame()

In [69]:
# Store TEAM_ID as text and rename the VS_PLAYER_* columns to the plain
# PLAYER_ID / PLAYER_NAME names that the rest of the notebook uses.
teams = (
    teams
    .astype({"TEAM_ID": "str"})
    .rename(columns={"VS_PLAYER_ID": "PLAYER_ID", "VS_PLAYER_NAME": "PLAYER_NAME"})
)

In [70]:
# Forward slash works on every OS; "data\T..." relied on an invalid escape
# sequence and only acted as a directory separator on Windows.
teams.to_csv("data/TeamsDB.csv")

In [71]:
# Load the previously built players database (first CSV column is the index) and
# parse GAME_DATE into datetimes so the newest stored game can be found later.
# NOTE(review): this reads "PlayersDB.csv" from the working directory, while the
# notebook later saves the updated table under the "data" folder — confirm which
# file is meant to be the source for incremental updates.
players = (
    pd.read_csv("PlayersDB.csv", index_col=0)
      .assign(GAME_DATE=lambda frame: pd.to_datetime(frame["GAME_DATE"]))
)

In [72]:
def game_date_process(x):
    """Convert a game-log date string such as ``"APR 10, 2021"`` to a ``datetime.date``.

    Parameters
    ----------
    x : str
        Text of the form ``"<MON> <day>, <year>"`` where ``<MON>`` is a
        three-letter month abbreviation (matched case-insensitively).

    Returns
    -------
    datetime.date

    Raises
    ------
    KeyError
        If the month abbreviation is not recognised.
    ValueError
        If the day/year are not numeric or do not form a valid calendar date.
    """
    # "AUG" was missing (only the Spanish "AGO" was present), so every August
    # game raised KeyError. "AGO" is kept so previously accepted input still works.
    callendar = {"JAN":1, "FEB":2, "MAR":3, "APR":4, "MAY":5, "JUN":6, "JUL":7, "AUG":8, "AGO":8,
                 "SEP":9, "OCT":10, "NOV": 11, "DEC":12}

    # split() without an argument tolerates repeated or surrounding whitespace.
    parts = x.split()
    numeric_month = callendar[parts[0].upper()]
    day           = int(parts[1].replace(",",""))
    year          = int(parts[2])

    return datetime.date(year, numeric_month, day)

Update Regular Season

In [73]:
# Resume one day after the newest game already stored. Adding a Timedelta (rather
# than incrementing the day number) rolls over month and year ends correctly,
# e.g. 05/31 -> 06/01 instead of the invalid "5/32/2021".
last_date_updated = (
    pd.to_datetime(players["GAME_DATE"]).max() + pd.Timedelta(days=1)
).strftime("%m/%d/%Y")

indicator = 0
failure_mark = 0
previous_database_instances = players.shape[0]
# New game logs are collected here and concatenated once after the loop
# (DataFrame.append was removed in pandas 2.0 and was quadratic in a loop).
new_game_logs = []
print("Reading new data...")

#Players in Regular Season
for playerID in teams.PLAYER_ID.unique():
    indicator = indicator + 1
    #sleep to avoid errors
    time.sleep(.600)
    # date_to_nullable keeps this loop bounded by the same cut-off date as the
    # other endpoints in the notebook.
    # NOTE(review): no season is passed, so the endpoint's default season applies.
    temporal = PlayerGameLog(player_id=playerID, date_from_nullable=last_date_updated,
                             date_to_nullable=last_date_to_update).get_data_frames()[0]
    if temporal.shape[0] == 0:
        failure_mark = failure_mark + 1
        # Stop after 20 empty answers in a row: assume there is nothing new.
        if failure_mark == 20:
            print("No more data to read")
            break
    else:
        # Only consecutive empty answers count, so a hit resets the counter
        # (same rule as the playoffs loop).
        failure_mark = 0
        temporal["SEASON_TYPE"] = "Regular"
        # Convert to datetimes so GAME_DATE keeps a single dtype after the concat
        # and `.max()` keeps working on the combined column.
        temporal["GAME_DATE"] = pd.to_datetime(temporal["GAME_DATE"].apply(game_date_process))
        temporal.columns = temporal.columns.str.upper()
        new_game_logs.append(temporal)

    if indicator%50 == 0:
        print("Working...")

if new_game_logs:
    players = pd.concat([players, *new_game_logs], ignore_index=True)

print(f"Se cargaron {players.shape[0] - previous_database_instances} instancias a la base")

Reading new data...
No more data to read
Se cargaron 0 instancias a la base


Update Playoffs

In [76]:
# Resume one day after the newest game already stored. Adding a Timedelta (rather
# than incrementing the day number) rolls over month and year ends correctly.
# pd.to_datetime also guards against GAME_DATE holding a mix of date objects and
# Timestamps after the regular-season update.
last_date_updated = (
    pd.to_datetime(players["GAME_DATE"]).max() + pd.Timedelta(days=1)
).strftime("%m/%d/%Y")


#Players in Playoffs
teamsInPlayoffs = Final_Games[Final_Games["SEASON_TYPE"]=="PLAYOFFS"].\
                                                    TEAM_ABBREVIATION.unique()
indicator = 1
failure_mark = 0
previous_database_instances = players.shape[0]
# New game logs are collected here and concatenated once after the loop
# (DataFrame.append was removed in pandas 2.0 and was quadratic in a loop).
new_game_logs = []
# PLAYER_ID -> PLAYER_NAME lookup built once instead of filtering `teams` for
# every downloaded row; the first name listed for an id wins.
player_names = teams.drop_duplicates(subset="PLAYER_ID").set_index("PLAYER_ID")["PLAYER_NAME"]
print("Reading new data...")

#Get only players from teams that are in playoffs
for playerID in teams[teams["TEAM_ABBREVIATION"].isin(teamsInPlayoffs)].\
                                                            PLAYER_ID.unique():
    indicator = indicator + 1
    # Pause between requests to avoid errors from the API.
    time.sleep(.600)
    temporal = PlayerGameLog(player_id=playerID, date_from_nullable=last_date_updated,
                            date_to_nullable=last_date_to_update,\
                            season_type_all_star="Playoffs").get_data_frames()[0]
    if temporal.shape[0] == 0:
        failure_mark = failure_mark+1
        if failure_mark == 100:
            print("No more data to read")
            break
    else:
        #We reset the failure_mark as we want to sum only the empty DF in a row
        failure_mark = 0
        temporal["SEASON_TYPE"] = "Playoffs"
        # Convert to datetimes so GAME_DATE keeps a single dtype after the concat.
        temporal["GAME_DATE"] = pd.to_datetime(temporal["GAME_DATE"].apply(game_date_process))
        temporal.columns = temporal.columns.str.upper()
        temporal.rename(columns={"VS_PLAYER_ID":"PLAYER_ID","VS_PLAYER_NAME":"PLAYER_NAME"},\
                                                                        inplace=True)
        temporal["PLAYER_NAME"] = temporal["PLAYER_ID"].map(player_names)
        new_game_logs.append(temporal)

    if indicator%50 == 0:
        print("Working...")

if new_game_logs:
    players = pd.concat([players, *new_game_logs], ignore_index=True)

print(f"Se cargaron {players.shape[0] - previous_database_instances} instancias a la base")

Reading new data...
Working...
Working...
No more data to read
Se cargaron 0 instancias a la base


In [79]:
# Normalise the combined players table: uniform "A vs. B" matchup text, datetime
# game dates, 64-bit integer ids and a constant season label.
players["MATCHUP"] = [matchup.replace("@", "vs.") for matchup in players["MATCHUP"]]
players["GAME_DATE"] = pd.to_datetime(players["GAME_DATE"])
players = players.astype({"PLAYER_ID": "int64", "GAME_ID": "int64"})
players["SEASON"] = "2020-21"
# The player's own team is whatever precedes "vs." in the normalised matchup.
players["TEAM_ABBREVIATION"] = [
    matchup.split("vs.")[0].strip() for matchup in players["MATCHUP"]
]

In [80]:
# Two-point field goals = all field goals minus three-pointers.
players["FG2M"]    = players["FGM"] - players["FG3M"]
players["FG2A"]    = players["FGA"] - players["FG3A"]
# A player with no two-point attempts yields 0/0 = NaN; store 0 instead.
# The result is assigned back to the column: `players[col].fillna(..., inplace=True)`
# acts on a temporary selection, which warns in pandas 2.x and does not modify
# the frame when Copy-on-Write is enabled.
players["FG2_PCT"] = (players["FG2M"] / players["FG2A"]).round(3).fillna(0)

In [81]:
# Label each player row "LOCAL" when the player's team equals the
# TEAM_ABBREVIATION of the (first) Final_Games row for that game, else "AWAY".
# NOTE(review): the reference team is whichever team's row Final_Games holds for
# the game, which is not necessarily the home side — confirm the intended meaning.
#
# Vectorised replacement for the row-by-row loop: it does not assume a 0..n-1
# index on `players` and does not rescan Final_Games for every row.
reference_team_by_game = (
    Final_Games.drop_duplicates(subset="GAME_ID")
               .set_index("GAME_ID")["TEAM_ABBREVIATION"]
)
reference_team = players["GAME_ID"].map(reference_team_by_game)

# Fail loudly (as the loop did) when a game is missing from Final_Games, but say
# which games are affected instead of raising a bare IndexError.
unknown_games = players.loc[reference_team.isna(), "GAME_ID"].unique()
if len(unknown_games) > 0:
    raise KeyError(f"GAME_ID values not found in Final_Games: {list(unknown_games[:10])}")

players["LOCAL_AWAY"] = np.where(
    players["TEAM_ABBREVIATION"] == reference_team, "LOCAL", "AWAY"
)

In [82]:
# Forward slash works on every OS; "data\P..." relied on an invalid escape
# sequence and only acted as a directory separator on Windows.
players.to_csv("data/PlayersDB.csv")

# Players personal data

In [84]:
from nba_api.stats.endpoints import CommonPlayerInfo

In [85]:
# Download the CommonPlayerInfo record of every player listed in `teams`.
print("Reading new data...")
# Frames are collected in a list and concatenated once: DataFrame.append was
# removed in pandas 2.0 and re-copied the whole table on every iteration.
player_info_frames = []
indicator = 0

for playerID in teams.PLAYER_ID.unique():
    indicator = indicator + 1
    # Pause between requests to avoid errors from the API.
    time.sleep(.600)
    temporal = CommonPlayerInfo(player_id=playerID).get_data_frames()[0]
    temporal.columns = temporal.columns.str.upper()
    player_info_frames.append(temporal)

    if indicator%50 == 0:
        print("Working...")

# pd.concat raises on an empty list, so fall back to an empty frame.
df = pd.concat(player_info_frames, ignore_index=True) if player_info_frames else pd.DataFrame()
print("Done!")

Reading new data...
Working...
Working...
Working...
Working...
Working...
Working...
Working...
Working...
Working...
Working...
Done!


In [86]:
# Forward slash works on every OS; "data\P..." relied on an invalid escape
# sequence and only acted as a directory separator on Windows.
df.to_csv("data/PlayersCommonDB.csv")

# PLAYER SHOT LOCATIONS

In [87]:
from nba_api.stats.endpoints import LeagueDashPlayerShotLocations, LeagueDashPlayerPtShot, ShotChartDetail

In [89]:
# Shot chart (field-goal attempts) for one player/team pair: first with the
# endpoint's default season type, then for the playoffs.
# NOTE(review): the ids are hard-coded; judging by the notebook's headings they
# are presumably Facundo Campazzo and the Denver Nuggets — confirm.
shot_dist = ShotChartDetail(player_id=1630267, team_id=1610612743,
                            context_measure_simple='FGA').get_data_frames()[0]
shot_dist_play_offs = ShotChartDetail(player_id=1630267, team_id=1610612743,
                                      context_measure_simple='FGA',
                                      season_type_all_star="Playoffs").get_data_frames()[0]

# pd.concat replaces DataFrame.append, which was removed in pandas 2.0. The
# original row labels of both frames are kept, exactly as append did.
shot_dist = pd.concat([shot_dist, shot_dist_play_offs])

In [90]:
# Denver shots: request the per-player shot-location table filtered to one team.
denver_team_id = 1610612743
shots = LeagueDashPlayerShotLocations(team_id_nullable=denver_team_id)

In [91]:
# The response carries two header rows. Row 0 names the floor zones and row 1
# names every column. Each zone owns three columns (FGM, FGA, FG_PCT), and the
# first six columns are labelled 'player_data' instead of a zone.
shots_result = shots.get_dict()['resultSets']
zone_names = shots_result['headers'][0]['columnNames']
stat_names = shots_result['headers'][1]['columnNames']

all_columns = ['player_data'] * 6
for zone in zone_names:
    all_columns.extend([zone] * 3)

# Both header levels must describe the same number of columns.
assert len(stat_names) == len(all_columns)

cols = pd.MultiIndex.from_arrays([all_columns, stat_names])

player_shooting_df = pd.DataFrame(data=shots_result['rowSet'], columns=cols)

In [92]:
# Forward slash works on every OS; "data\s..." relied on an invalid escape
# sequence and only acted as a directory separator on Windows.
shot_dist.to_csv("data/shot_loc.csv")
player_shooting_df.to_csv("data/shots_type_stats.csv")

# Campazzo data by period

In [93]:
from nba_api.stats.endpoints import PlayerDashboardByGameSplits

In [94]:
# Game-split dashboards for player 1630267; the third returned frame is kept.
# NOTE(review): index 2 is presumably the by-period split, as the variable name
# suggests — confirm against the endpoint's data-set order.
campa_game_splits = PlayerDashboardByGameSplits(player_id=1630267).get_data_frames()
campa_by_period = campa_game_splits[2]

In [95]:
# Forward slash works on every OS; "data\C..." relied on an invalid escape
# sequence and only acted as a directory separator on Windows.
campa_by_period.to_csv("data/Campa_by_period.csv")