# Set Up

In [73]:
import numpy as np
import pandas as pd
import altair as alt
import os
import NBAStatsFetcher
import importlib
from NBAStatsFetcher import StatsFetcher

# Load Data

In [123]:
importlib.reload(NBAStatsFetcher)

fetch_player_data_totals = StatsFetcher(
    api_name='player_data_totals',
    api_url='http://rest.nbaapi.com/api/playerdatatotals/query',
    season=2025,
    sort_by="PlayerName",
    ascending=False,
    delay=0.5,
    page_size=50
)

fetch_player_data_advanced = StatsFetcher(
    api_name='player_data_advanced',
    api_url='http://rest.nbaapi.com/api/playerdataadvanced/query',
    season=2025,
    sort_by="PlayerName",
    ascending=False,
    delay=0.5,
    page_size=50
)

player_data_totals_df = fetch_player_data_totals.get_dataframe()
player_data_advanced_df = fetch_player_data_advanced.get_dataframe()

2025-04-20 11:35:08,335 - Fetching NBA player stats from player_data_totals...
Teams processed: 100%|██████████| 30/30 [00:19<00:00,  1.51it/s]
2025-04-20 11:35:28,225 - No failed requests.
2025-04-20 11:35:28,233 - Fetching NBA player stats from player_data_advanced...
Teams processed: 100%|██████████| 30/30 [00:19<00:00,  1.52it/s]
2025-04-20 11:35:48,011 - No failed requests.


In [124]:
# Columns in advanced that are not in totals (excluding the key 'playerId')
unique_advanced_cols = [
    col for col in player_data_advanced_df.columns
    if col != 'playerId' and col not in player_data_totals_df.columns
]

# Keep only playerId + the unique columns from advanced
advanced_subset = player_data_advanced_df[['playerId'] + unique_advanced_cols]

# Now merge cleanly — no duplicates or suffixes!
stats_df = pd.merge(player_data_totals_df, advanced_subset, on='playerId', how='inner')


In [125]:
stats_df.head()

Unnamed: 0,id,playerName,position,age,games,gamesStarted,minutesPg,fieldGoals,fieldAttempts,fieldPercent,...,turnoverPercent,usagePercent,offensiveWS,defensiveWS,winShares,winSharesPer,offensiveBox,defensiveBox,box,vorp
0,18813,Zaccharie Risacher,SF,19,75,73,1843.0,357,779,0.458,...,9.8,20.9,0.6,1.1,1.8,0.046,-1.7,-1.8,-3.5,-0.7
1,19081,Vit Krejci,PG,24,57,16,1153.0,145,292,0.497,...,13.8,12.9,1.8,0.8,2.6,0.11,0.0,0.7,0.7,0.8
2,18733,Trae Young,PG,26,76,76,2739.0,566,1376,0.411,...,17.9,29.6,4.4,1.3,5.7,0.1,3.3,-2.7,0.5,1.7
3,19383,Terance Mann,SG,28,30,1,682.0,118,218,0.541,...,8.6,15.4,1.3,0.3,1.6,0.111,-0.1,-1.2,-1.3,0.1
4,19383,Terance Mann,SG,28,30,1,682.0,118,218,0.541,...,10.7,14.0,0.4,1.0,1.4,0.093,-2.5,1.2,-1.3,0.1


# Data Profiling

In [126]:
stats_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 833 entries, 0 to 832
Data columns (total 53 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   id                  833 non-null    int64  
 1   playerName          833 non-null    object 
 2   position            833 non-null    object 
 3   age                 833 non-null    int64  
 4   games               833 non-null    int64  
 5   gamesStarted        833 non-null    int64  
 6   minutesPg           833 non-null    float64
 7   fieldGoals          833 non-null    int64  
 8   fieldAttempts       833 non-null    int64  
 9   fieldPercent        828 non-null    float64
 10  threeFg             833 non-null    int64  
 11  threeAttempts       833 non-null    int64  
 12  threePercent        776 non-null    float64
 13  twoFg               833 non-null    int64  
 14  twoAttempts         833 non-null    int64  
 15  twoPercent          819 non-null    float64
 16  effectFg

In [127]:
missing_values = stats_df.isnull().sum()
missing_values = missing_values[missing_values > 0]
missing_values

fieldPercent        5
threePercent       57
twoPercent         14
effectFgPercent     5
ftPercent          62
tsPercent           5
threePAR            5
ftr                 5
turnoverPercent     3
dtype: int64

In [128]:
stats_df.isnull().sum()[stats_df.isnull().sum() > 0]

fieldPercent        5
threePercent       57
twoPercent         14
effectFgPercent     5
ftPercent          62
tsPercent           5
threePAR            5
ftr                 5
turnoverPercent     3
dtype: int64

# Data Cleaning

In [129]:
# Impute missing values
stats_df['fieldPercent'] = stats_df['fieldPercent'].fillna(
    (stats_df['fieldGoals'] / stats_df['fieldAttempts']).fillna(0)
)

stats_df['threePercent'] = stats_df['threePercent'].fillna(
    (stats_df['threeFg'] / stats_df['threeAttempts']).fillna(0)
)

stats_df['twoPercent'] = stats_df['twoPercent'].fillna(
    (stats_df['twoFg'] / stats_df['twoAttempts']).fillna(0)
)

stats_df['effectFgPercent'] = stats_df['effectFgPercent'].fillna(
    ((stats_df['fieldGoals'] + (stats_df['threeFg'] / 2)) / stats_df['fieldAttempts']).fillna(0)
)

stats_df['ftPercent'] = stats_df['ftPercent'].fillna(
    (stats_df['ft'] / stats_df['ftAttempts']).fillna(0)
)

stats_df['tsPercent'] = stats_df['tsPercent'].fillna(
    (stats_df['points'] / (2 * (stats_df['fieldAttempts'] + 0.44 * stats_df['ftAttempts']))).fillna(0)
)

stats_df['threePAR'] = stats_df['threePAR'].fillna(
    (stats_df['threeAttempts'] / stats_df['fieldAttempts']).fillna(0)
)

stats_df['ftr'] = stats_df['ftr'].fillna(
    (stats_df['ftAttempts'] / stats_df['fieldAttempts']).fillna(0)
)

stats_df['turnoverPercent'] = stats_df['turnoverPercent'].fillna(
    (stats_df['ftAttempts'] / stats_df['fieldAttempts']).fillna(0)
)

stats_df['turnoverPercent'] = stats_df['turnoverPercent'].fillna(
    ((stats_df['turnovers'] / (stats_df['fieldAttempts'] + (0.44 * stats_df['ftAttempts']) + stats_df['turnovers'])) * 100).fillna(0)
)

In [131]:
missing_values = stats_df.isnull().sum()
missing_values = missing_values[missing_values > 0]

if missing_values.empty:
    print("✅ No missing values")
else:
    print("⚠️ Missing values found:")
    display(missing_values)

✅ No missing values
