# Set Up

In [145]:
import numpy as np
import pandas as pd
import altair as alt
import os
import NBAStatsFetcher
import importlib
from NBAStatsFetcher import StatsFetcher

In [146]:
# Show every column when a DataFrame is rendered (None disables the column limit).
pd.options.display.max_columns = None

# Load Data

In [147]:
# Re-execute the local NBAStatsFetcher module so edits made to it since the
# kernel started are picked up without a restart.
importlib.reload(NBAStatsFetcher)
# reload() does not rebind names imported with `from ... import ...`, so take
# the class from the reloaded module; otherwise the fetchers below would still
# be built from the stale, pre-reload StatsFetcher.
StatsFetcher = NBAStatsFetcher.StatsFetcher

# Query options shared by both endpoints, kept in one place so the two
# fetchers cannot drift apart.
fetch_options = dict(
    season=2025,
    sort_by="PlayerName",
    ascending=False,
    delay=0.5,
    page_size=50,
)

fetch_player_data_totals = StatsFetcher(
    api_name='player_data_totals',
    api_url='http://rest.nbaapi.com/api/playerdatatotals/query',
    **fetch_options,
)

fetch_player_data_advanced = StatsFetcher(
    api_name='player_data_advanced',
    api_url='http://rest.nbaapi.com/api/playerdataadvanced/query',
    **fetch_options,
)

# One DataFrame per endpoint (the recorded run logged roughly 20 s for each).
player_data_totals_df = fetch_player_data_totals.get_dataframe()
player_data_advanced_df = fetch_player_data_advanced.get_dataframe()

2025-04-20 11:50:52,322 - Fetching NBA player stats from player_data_totals...
Teams processed:   0%|          | 0/30 [00:00<?, ?it/s]

Teams processed: 100%|██████████| 30/30 [00:20<00:00,  1.47it/s]
2025-04-20 11:51:12,715 - No failed requests.
2025-04-20 11:51:12,721 - Fetching NBA player stats from player_data_advanced...
Teams processed: 100%|██████████| 30/30 [00:19<00:00,  1.51it/s]
2025-04-20 11:51:32,527 - No failed requests.


In [151]:
# Both frames are joined on these columns.
merge_keys = ['playerId', 'team']

# Advanced-only columns: everything in the advanced frame that is neither a
# merge key nor already present in the totals frame (original order preserved).
totals_columns = set(player_data_totals_df.columns)
unique_advanced_cols = [
    col for col in player_data_advanced_df.columns
    if not (col in merge_keys or col in totals_columns)
]

# Merge keys + the advanced-only columns
advanced_subset = player_data_advanced_df[merge_keys + unique_advanced_cols]

# No non-key column overlaps any more, so the merge adds no _x/_y suffixes.
stats_df = player_data_totals_df.merge(advanced_subset, on=merge_keys, how='inner')


In [153]:
# Reduce to one row per player: order each player's rows by descending `id`
# and keep the first, i.e. the row with the highest `id`.
# NOTE(review): this treats the highest `id` as the player's last team of the
# season; the assertions in the next cell spot-check that assumption.
stats_df = (
    stats_df
    .sort_values(by=['playerId', 'id'], ascending=[True, False])
    .drop_duplicates(subset=['playerId'], keep='first')
)

In [160]:
# Spot-check the de-duplication: each listed player must be attached to the
# expected team in stats_df.
expected_last_team = {
    'Luka Dončić': 'LAL',
    'Anthony Davis': 'DAL',
}

for checked_player, checked_team in expected_last_team.items():
    matched_teams = stats_df.loc[
        stats_df['playerName'].str.contains(checked_player, case=False, na=False),
        'team',
    ]
    # Fail with a readable message instead of an IndexError from .iloc[0]
    # when the player is missing from the data altogether.
    assert not matched_teams.empty, f"{checked_player} was not found in stats_df"
    assert matched_teams.iloc[0] == checked_team, f"{checked_player} is not in team '{checked_team}'"

# Data Profiling

In [161]:
# Print each column's dtype and non-null count to spot columns with missing values.
stats_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 570 entries, 416 to 273
Data columns (total 53 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   id                  570 non-null    int64  
 1   playerName          570 non-null    object 
 2   position            570 non-null    object 
 3   age                 570 non-null    int64  
 4   games               570 non-null    int64  
 5   gamesStarted        570 non-null    int64  
 6   minutesPg           570 non-null    float64
 7   fieldGoals          570 non-null    int64  
 8   fieldAttempts       570 non-null    int64  
 9   fieldPercent        566 non-null    float64
 10  threeFg             570 non-null    int64  
 11  threeAttempts       570 non-null    int64  
 12  threePercent        539 non-null    float64
 13  twoFg               570 non-null    int64  
 14  twoAttempts         570 non-null    int64  
 15  twoPercent          560 non-null    float64
 16  effectFgPer

In [162]:
# Null count per column, restricted to the columns that have at least one null.
null_counts = stats_df.isna().sum()
missing_values = null_counts[null_counts.gt(0)]
missing_values

fieldPercent        4
threePercent       31
twoPercent         10
effectFgPercent     4
ftPercent          31
tsPercent           4
threePAR            4
ftr                 4
turnoverPercent     3
dtype: int64

# Data Cleaning

In [163]:
# Impute missing rate stats by recomputing them from the counting stats in the
# same row. Values that are already present are never overwritten.


def _safe_ratio(numerator, denominator, scale=1):
    """Return ``numerator / denominator * scale`` with undefined results set to 0.

    In pandas a zero denominator yields NaN (0/0) or +/-inf (x/0). Both are
    mapped to 0 so the fallback never introduces non-finite values.
    """
    ratio = (numerator / denominator) * scale
    return ratio.replace([np.inf, -np.inf], np.nan).fillna(0)


field_attempts = stats_df['fieldAttempts']
ft_attempts = stats_df['ftAttempts']
turnovers = stats_df['turnovers']

# Target column -> fallback series computed from the raw counts.
rate_fallbacks = {
    'fieldPercent': _safe_ratio(stats_df['fieldGoals'], field_attempts),
    'threePercent': _safe_ratio(stats_df['threeFg'], stats_df['threeAttempts']),
    'twoPercent': _safe_ratio(stats_df['twoFg'], stats_df['twoAttempts']),
    # Effective FG%: a made three counts as 1.5 field goals.
    'effectFgPercent': _safe_ratio(stats_df['fieldGoals'] + (stats_df['threeFg'] / 2), field_attempts),
    'ftPercent': _safe_ratio(stats_df['ft'], ft_attempts),
    # True shooting: points per two scoring attempts (FGA + 0.44 * FTA).
    'tsPercent': _safe_ratio(stats_df['points'], 2 * (field_attempts + 0.44 * ft_attempts)),
    'threePAR': _safe_ratio(stats_df['threeAttempts'], field_attempts),
    'ftr': _safe_ratio(ft_attempts, field_attempts),
    # Turnover percentage, on a 0-100 scale. The previous version filled this
    # column with ftAttempts / fieldAttempts first (copied from `ftr`), so this
    # formula never took effect.
    'turnoverPercent': _safe_ratio(
        turnovers,
        field_attempts + (0.44 * ft_attempts) + turnovers,
        scale=100,
    ),
}

for rate_column, fallback in rate_fallbacks.items():
    stats_df[rate_column] = stats_df[rate_column].fillna(fallback)

In [164]:
# Re-run the null count after imputation and report the outcome.
null_counts = stats_df.isna().sum()
missing_values = null_counts[null_counts > 0]

if len(missing_values) > 0:
    print("⚠️ Missing values found:")
    display(missing_values)
else:
    print("✅ No missing values")

✅ No missing values


In [170]:
# Derived column: points per game, rounded to one decimal (0 where undefined).
stats_df['ppg'] = stats_df['points'].div(stats_df['games']).round(1).fillna(0)

# Visualizations

**TODO:** Explain why the Lakers and Timberwolves were chosen for this comparison.

In [172]:
# Keep only the rows whose team is LAL (Lakers) or MIN (Timberwolves).
is_selected_team = stats_df['team'].isin(['LAL', 'MIN'])
teams_df = stats_df[is_selected_team]

## Insight into Player Performance

**Goal:** Identify top-performing players based on scoring and efficiency. \
**Visualization:** Points per game vs. PER (Player Efficiency Rating), with marker size showing win shares.

In [None]:
# Interactive scatter plot: PER on x, points per game on y, one circle per
# player, coloured by team and sized by win shares.
per_axis = alt.X('per:Q', title='Player Efficiency Rating (PER)')
ppg_axis = alt.Y('ppg:Q', title='Points Per Game')
team_color = alt.Color(
    'team:N',
    scale=alt.Scale(domain=['LAL', 'MIN'], range=['purple', 'skyblue']),
)
win_share_size = alt.Size('winShares:Q', title='Win Shares')

scatter = (
    alt.Chart(teams_df)
    .mark_circle()
    .encode(
        x=per_axis,
        y=ppg_axis,
        color=team_color,
        size=win_share_size,
        tooltip=['playerName', 'team', 'ppg', 'per', 'winShares'],
    )
    .properties(
        title='Player Performance: Lakers vs Timberwolves',
        width=600,
        height=400,
    )
    .interactive()
)

scatter

## Team Performance Comparison

In [177]:
# Aggregate team-level stats by summing the per-player rows of each team.
# NOTE(review): total_games adds up the `games` value of every player row, so
# the "*_per_game" metrics below are averages per player appearance rather
# than per team game. Confirm this is the intended denominator.
team_stats = teams_df.groupby('team').agg(
    total_points=('points', 'sum'),
    total_field_goals=('fieldGoals', 'sum'),
    total_three_pointers=('threeFg', 'sum'),
    total_field_attempts=('fieldAttempts', 'sum'),
    total_rebounds=('totalRb', 'sum'),
    total_assists=('assists', 'sum'),
    total_games=('games', 'sum')
).reset_index()

# Derived team-level metrics
team_stats['field_goal_percentage'] = team_stats['total_field_goals'] / team_stats['total_field_attempts'] * 100

# Per-game metric -> the summed column it is derived from.
per_game_sources = {
    'points_per_game': 'total_points',
    'rebounds_per_game': 'total_rebounds',
    'assists_per_game': 'total_assists',
    'three_pointers_per_game': 'total_three_pointers',
}
for per_game_col, total_col in per_game_sources.items():
    team_stats[per_game_col] = team_stats[total_col] / team_stats['total_games']


def _team_bar_chart(data, metric, title):
    """Return a bar chart of one team-level metric, one bar per team.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding a 'team' column and the `metric` column.
    metric : str
        Name of the quantitative column drawn on the y axis and in the tooltip.
    title : str
        Chart title.
    """
    return alt.Chart(data).mark_bar().encode(
        x='team:N',
        y=f'{metric}:Q',
        color=alt.Color('team:N', scale=alt.Scale(domain=['LAL', 'MIN'], range=['purple', 'skyblue'])),
        tooltip=['team', metric]
    ).properties(
        title=title
    )


# One two-column frame per stat, so each chart embeds only the data it needs
ppg_chart_data = team_stats[['team', 'points_per_game']]
rpg_chart_data = team_stats[['team', 'rebounds_per_game']]
apg_chart_data = team_stats[['team', 'assists_per_game']]
fg_percent_chart_data = team_stats[['team', 'field_goal_percentage']]
three_pointers_chart_data = team_stats[['team', 'three_pointers_per_game']]

ppg_chart = _team_bar_chart(ppg_chart_data, 'points_per_game', "Points Per Game")
rpg_chart = _team_bar_chart(rpg_chart_data, 'rebounds_per_game', "Rebounds Per Game")
apg_chart = _team_bar_chart(apg_chart_data, 'assists_per_game', "Assists Per Game")
tpg_chart = _team_bar_chart(three_pointers_chart_data, 'three_pointers_per_game', "3PT Per Game")
fg_percent_chart = _team_bar_chart(
    fg_percent_chart_data,
    'field_goal_percentage',
    "Field Goal Percentage: Lakers vs Timberwolves",
)

# Combine the five charts into a single side-by-side view
final_chart = alt.hconcat(
    ppg_chart,
    rpg_chart,
    apg_chart,
    tpg_chart,
    fg_percent_chart
)

final_chart

## Player Position Analysis

In [183]:
# Work on a copy of the LAL/MIN rows so the per-game columns added below do
# not touch stats_df.
positional_data = stats_df[stats_df['team'].isin(['LAL', 'MIN'])].copy()

# Per-game column -> the season-total column it is derived from.
# Note: 'ppg' is recomputed here without rounding, replacing the rounded value
# carried over from stats_df in this copy only.
per_game_totals = {
    'ppg': 'points',
    'rpg': 'totalRb',
    'apg': 'assists',
    'bpg': 'blocks',
    'spg': 'steals',
}
for per_game_stat, total_stat in per_game_totals.items():
    positional_data[per_game_stat] = positional_data[total_stat] / positional_data['games']

# Mean of the players' per-game values for every (team, position) pair.
position_stats = positional_data.groupby(['team', 'position']).agg(
    avg_ppg=('ppg', 'mean'),
    avg_rpg=('rpg', 'mean'),
    avg_apg=('apg', 'mean'),
    avg_bpg=('bpg', 'mean'),
    avg_spg=('spg', 'mean'),
).reset_index()

# NOTE(review): color_map is not used by the charts in this cell (they set the
# colour scale explicitly); kept in case a later cell relies on it.
color_map = {'LAL': 'purple', 'MIN': 'skyblue'}

position_order = ['PG', 'SG', 'SF', 'PF', 'C']


def _position_bar_chart(data, metric, title, order):
    """Return a bar chart of one averaged metric by position, coloured by team.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with 'team', 'position' and the `metric` column.
    metric : str
        Name of the quantitative column drawn on the y axis and in the tooltip.
    title : str
        Chart title.
    order : list of str
        Order in which the positions appear on the x axis.

    NOTE(review): both teams share one x slot per position, so Altair stacks
    their bars by default; add ``xOffset='team:N'`` (Altair 5+) if side-by-side
    bars are wanted.
    """
    return alt.Chart(data).mark_bar().encode(
        x=alt.X('position:N', sort=order),
        y=f'{metric}:Q',
        color=alt.Color('team:N', scale=alt.Scale(domain=['LAL', 'MIN'], range=['purple', 'skyblue'])),
        tooltip=['team:N', 'position:N', f'{metric}:Q']
    ).properties(
        title=title
    )


ppg_position_chart = _position_bar_chart(
    position_stats, 'avg_ppg', 'Average Points Per Game by Position', position_order
)
rpg_position_chart = _position_bar_chart(
    position_stats, 'avg_rpg', 'Average Rebounds Per Game by Position', position_order
)
apg_position_chart = _position_bar_chart(
    position_stats, 'avg_apg', 'Average Assists Per Game by Position', position_order
)
spg_position_chart = _position_bar_chart(
    position_stats, 'avg_spg', 'Average Steals Per Game by Position', position_order
)
bpg_position_chart = _position_bar_chart(
    position_stats, 'avg_bpg', 'Average Blocks Per Game by Position', position_order
)

# Lay the five position charts out side by side
position_analysis_side_by_side = alt.hconcat(
    ppg_position_chart,
    rpg_position_chart,
    apg_position_chart,
    spg_position_chart,
    bpg_position_chart
)

position_analysis_side_by_side

## Age and Experience Trends