# Introduction to Machine Learning With NBA Data
#### A hands-on project for learning machine learning through NBA data journalism.

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.neighbors import NearestNeighbors

### Question 1:
Cluster NBA players by playing style, using `nba_api` player stats expressed per 100 possessions.

In [4]:
from nba_api.stats.endpoints import LeagueDashPlayerStats
import time

In [15]:
def fetch_player_base_stats(season):
    """Download league-wide base player stats for a single season.

    Queries ``LeagueDashPlayerStats`` for regular-season figures in
    per-100-possessions mode with the ``Base`` measure type.

    Parameters
    ----------
    season : str
        Season identifier handed to the endpoint unchanged, e.g. ``'2025-26'``.

    Returns
    -------
    pandas.DataFrame
        The first frame of the endpoint response, with a ``SEASON`` column
        set to ``season`` on every row.
    """
    query = dict(
        season=season,
        per_mode_detailed='Per100Possessions',
        season_type_all_star='Regular Season',
        measure_type_detailed_defense='Base',
        plus_minus='N',
        rank='N',
        pace_adjust='N',
    )
    endpoint = LeagueDashPlayerStats(**query)

    # Only the first frame of the response is used.
    stats = endpoint.get_data_frames()[0]

    # Tag each row with its season so several seasons can be stacked later.
    return stats.assign(SEASON=season)

# Pull one season of base stats and list the columns the endpoint returns.
df_2026 = fetch_player_base_stats(season='2025-26')
df_2026.columns


Index(['PLAYER_ID', 'PLAYER_NAME', 'NICKNAME', 'TEAM_ID', 'TEAM_ABBREVIATION',
       'AGE', 'GP', 'W', 'L', 'W_PCT', 'MIN', 'FGM', 'FGA', 'FG_PCT', 'FG3M',
       'FG3A', 'FG3_PCT', 'FTM', 'FTA', 'FT_PCT', 'OREB', 'DREB', 'REB', 'AST',
       'TOV', 'STL', 'BLK', 'BLKA', 'PF', 'PFD', 'PTS', 'PLUS_MINUS',
       'NBA_FANTASY_PTS', 'DD2', 'TD3', 'WNBA_FANTASY_PTS', 'GP_RANK',
       'W_RANK', 'L_RANK', 'W_PCT_RANK', 'MIN_RANK', 'FGM_RANK', 'FGA_RANK',
       'FG_PCT_RANK', 'FG3M_RANK', 'FG3A_RANK', 'FG3_PCT_RANK', 'FTM_RANK',
       'FTA_RANK', 'FT_PCT_RANK', 'OREB_RANK', 'DREB_RANK', 'REB_RANK',
       'AST_RANK', 'TOV_RANK', 'STL_RANK', 'BLK_RANK', 'BLKA_RANK', 'PF_RANK',
       'PFD_RANK', 'PTS_RANK', 'PLUS_MINUS_RANK', 'NBA_FANTASY_PTS_RANK',
       'DD2_RANK', 'TD3_RANK', 'WNBA_FANTASY_PTS_RANK', 'TEAM_COUNT',
       'SEASON'],
      dtype='object')

In [16]:
# fetch advanced stats
def fetch_player_advanced_stats(season):
    """Download league-wide advanced player stats for a single season.

    Queries ``LeagueDashPlayerStats`` for regular-season figures in
    per-100-possessions mode with the ``Advanced`` measure type.

    Parameters
    ----------
    season : str
        Season identifier handed to the endpoint unchanged, e.g. ``'2025-26'``.

    Returns
    -------
    pandas.DataFrame
        The first frame of the endpoint response, with a ``SEASON`` column
        set to ``season`` on every row.
    """
    query = dict(
        season=season,
        per_mode_detailed='Per100Possessions',
        season_type_all_star='Regular Season',
        measure_type_detailed_defense='Advanced',
        plus_minus='N',
        rank='N',
        pace_adjust='N',
    )
    endpoint = LeagueDashPlayerStats(**query)

    # Only the first frame of the response is used.
    stats = endpoint.get_data_frames()[0]

    # Tag each row with its season so several seasons can be stacked later.
    return stats.assign(SEASON=season)

# Pull one season of advanced stats and list the columns the endpoint returns.
df_2026_advanced = fetch_player_advanced_stats(season='2025-26')
df_2026_advanced.columns

Index(['PLAYER_ID', 'PLAYER_NAME', 'NICKNAME', 'TEAM_ID', 'TEAM_ABBREVIATION',
       'AGE', 'GP', 'W', 'L', 'W_PCT', 'MIN', 'E_OFF_RATING', 'OFF_RATING',
       'sp_work_OFF_RATING', 'E_DEF_RATING', 'DEF_RATING',
       'sp_work_DEF_RATING', 'E_NET_RATING', 'NET_RATING',
       'sp_work_NET_RATING', 'AST_PCT', 'AST_TO', 'AST_RATIO', 'OREB_PCT',
       'DREB_PCT', 'REB_PCT', 'TM_TOV_PCT', 'E_TOV_PCT', 'EFG_PCT', 'TS_PCT',
       'USG_PCT', 'E_USG_PCT', 'E_PACE', 'PACE', 'PACE_PER40', 'sp_work_PACE',
       'PIE', 'POSS', 'FGM', 'FGA', 'FGM_PG', 'FGA_PG', 'FG_PCT', 'GP_RANK',
       'W_RANK', 'L_RANK', 'W_PCT_RANK', 'MIN_RANK', 'E_OFF_RATING_RANK',
       'OFF_RATING_RANK', 'sp_work_OFF_RATING_RANK', 'E_DEF_RATING_RANK',
       'DEF_RATING_RANK', 'sp_work_DEF_RATING_RANK', 'E_NET_RATING_RANK',
       'NET_RATING_RANK', 'sp_work_NET_RATING_RANK', 'AST_PCT_RANK',
       'AST_TO_RANK', 'AST_RATIO_RANK', 'OREB_PCT_RANK', 'DREB_PCT_RANK',
       'REB_PCT_RANK', 'TM_TOV_PCT_RANK', 'E_TOV_PCT_

Loop over several seasons, fetch each one, and collect the per-season results in a list of DataFrames called `dfs`.

In [23]:
# Columns kept from the "Base" stats pull. PLAYER_ID and SEASON are the keys
# used to join against the advanced stats; PLAYER_NAME is carried along as a
# human-readable label.
BASE_FEATURES = [
    "PLAYER_ID",
    "PLAYER_NAME",
    "SEASON",
    "PTS",
    "FGA",
    "FG3A",
    "FTA",
    "AST",
    "REB",
    "OREB",
    "DREB",
    "STL",
    "BLK",
    "TOV",
    "PF",
    "PFD",
    "PLUS_MINUS"
]

# Columns kept from the "Advanced" stats pull. PLAYER_ID and SEASON repeat the
# join keys; the remaining columns are the advanced metrics themselves.
ADV_FEATURES = [
    "PLAYER_ID",
    "SEASON",
    "TS_PCT",
    "USG_PCT",
    "AST_PCT",
    "REB_PCT",
    "OFF_RATING",
    "DEF_RATING"
]

# NOTE: this cell only declares constants. The multi-season frames do not
# exist yet at this point in the notebook, so indexing them here raised
# NameError under Restart & Run All; the column selection is applied in the
# cell that fetches and concatenates the seasons.

In [None]:
# Seasons to download, in chronological order.
seasons = ['2021-22', '2022-23', '2023-24', '2024-25', '2025-26']

# Per-season result buffers: base stats and advanced stats.
dfs = []
dfs_advanced = []

# Base stats: one request per season.
for season in seasons:
    df = fetch_player_base_stats(season)
    dfs.append(df)
    time.sleep(2)  # 2-second pause between consecutive API calls

# Advanced stats: same seasons, same pacing.
for season in seasons:
    df_advanced = fetch_player_advanced_stats(season)
    dfs_advanced.append(df_advanced)
    time.sleep(2)  # 2-second pause between consecutive API calls

# Stack the per-season frames into one frame per stat family.
players_raw_base = pd.concat(dfs, ignore_index=True)
players_raw_advanced = pd.concat(dfs_advanced, ignore_index=True)

# Keep only the columns named in the feature lists.
players_raw_base = players_raw_base.loc[:, BASE_FEATURES]
players_raw_advanced = players_raw_advanced.loc[:, ADV_FEATURES]


In [25]:
# Join base and advanced stats on (PLAYER_ID, SEASON), keeping only the key
# pairs that appear in both frames.
players_merged = players_raw_base.merge(
    players_raw_advanced,
    how="inner",
    on=["PLAYER_ID", "SEASON"],
)
players_merged.columns

Index(['PLAYER_ID', 'PLAYER_NAME', 'SEASON', 'PTS', 'FGA', 'FG3A', 'FTA',
       'AST', 'REB', 'OREB', 'DREB', 'STL', 'BLK', 'TOV', 'PF', 'PFD',
       'PLUS_MINUS', 'TS_PCT', 'USG_PCT', 'AST_PCT', 'REB_PCT', 'OFF_RATING',
       'DEF_RATING'],
      dtype='object')

In [28]:
# Compute advanced metrics from basic metrics:
#   3PAr = FG3A / FGA   (three-point attempt rate)
#   FTr  = FTA  / FGA   (free-throw attempt rate)
# A row with FGA == 0 would give inf whenever the numerator is positive, and
# inf is not treated as missing by dropna(). Masking zero denominators to NaN
# makes both ratios NaN for such rows, so the subsequent dropna() removes them
# instead of letting infinities leak into later steps.
fga_nonzero = players_merged['FGA'].replace(0, np.nan)
players_merged['3PAr'] = players_merged['FG3A'] / fga_nonzero
players_merged['FTr'] = players_merged['FTA'] / fga_nonzero


In [29]:
# Discard every row that has a missing value, then renumber the index from 0.
players_merged = players_merged.dropna().reset_index(drop=True)
players_merged.columns

Index(['PLAYER_ID', 'PLAYER_NAME', 'SEASON', 'PTS', 'FGA', 'FG3A', 'FTA',
       'AST', 'REB', 'OREB', 'DREB', 'STL', 'BLK', 'TOV', 'PF', 'PFD',
       'PLUS_MINUS', 'TS_PCT', 'USG_PCT', 'AST_PCT', 'REB_PCT', 'OFF_RATING',
       'DEF_RATING', '3PAr', 'FTr'],
      dtype='object')

In [30]:

# Sanity checks on the merged table. A notebook cell only displays its final
# expression, so the first two checks are printed explicitly; as bare
# expressions their values were computed and silently discarded.
print("shape (rows, columns):", players_merged.shape)
print(
    "duplicate (PLAYER_ID, SEASON) rows:",
    players_merged.duplicated(["PLAYER_ID", "SEASON"]).sum(),
)

# Columns with the most missing values (shown as the cell output).
players_merged.isna().sum().sort_values(ascending=False).head()



PLAYER_ID      0
PLAYER_NAME    0
SEASON         0
PTS            0
FGA            0
dtype: int64