<a href="https://colab.research.google.com/github/nshejwalkar/bballML/blob/main/nba_player_cluster.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from google.colab import drive
import os
# Mount Google Drive at /content/drive (Colab only); the data directories used
# by the loading cells below live under this mount point.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Data Prep

Load the per-stat CSV files for the 2022-23 season from Google Drive.

In [None]:
# Read every stat file of the 2022-23 season into `files`, keyed by the file
# name without its extension (e.g. 'post_ups.csv' -> files['post_ups']).
directory = '/content/drive/My Drive/Machine Learning/NBA Cluster/data/OG 2022-2023'
files = {}
for filename in os.listdir(directory):
  path = os.path.join(directory, filename)
  # os.listdir also returns hidden files and sub-folders (Colab checkpoint
  # folders, OS metadata files); those are not stat tables, so skip them.
  if filename.startswith('.') or not os.path.isfile(path):
    continue
  # splitext only strips the final extension, so names with extra dots (or
  # none at all) no longer break the unpacking.
  stat, _ = os.path.splitext(filename)
  table = pd.read_csv(path)
  # Column headers may carry stray whitespace; strip it so that later merges
  # on PLAYER/TEAM line up.
  table.columns = table.columns.str.strip()
  files[stat] = table

In [None]:
# Show which stat tables were loaded (one key per file read above).
files.keys()

dict_keys(['catch_and_shoot', 'post_ups', 'isolation', 'pnr_handler', 'pnr_man', 'handoff', 'cut', 'putback', 'screen_assists', 'spot_up', 'off_screen', 'shot_chart', 'touches', 'height_weight', 'drives', 'box_scores', 'assists'])

In [None]:
# Print a schema summary (row count, columns, dtypes) for every loaded table.
for stat_name, table in files.items():
  print(f"STAT: {stat_name}")
  table.info()
  print()

STAT: catch_and_shoot
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 539 entries, 0 to 538
Data columns (total 4 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   PLAYER               539 non-null    object 
 1   TEAM                 539 non-null    object 
 2   CATCH AND SHOOT FGA  539 non-null    float64
 3   CATCH AND SHOOT 3PA  539 non-null    object 
dtypes: float64(1), object(3)
memory usage: 17.0+ KB

STAT: post_ups
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 539 entries, 0 to 538
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   PLAYER    539 non-null    object 
 1   TEAM      539 non-null    object 
 2   POST UPS  539 non-null    float64
dtypes: float64(1), object(2)
memory usage: 12.8+ KB

STAT: isolation
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 252 entries, 0 to 251
Data columns (total 3 columns):
 #   Column          Non-Null Co

Merge the per-stat DataFrames into a single table, joined on PLAYER and TEAM.

In [None]:
# Fold the per-stat tables into one frame: the first non-empty table is the
# base and every later table is left-joined onto it by PLAYER and TEAM.
players = pd.DataFrame()
for table in files.values():
  players = table if players.empty else pd.merge(players, table, how="left", on=["PLAYER", "TEAM"])

players.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 539 entries, 0 to 538
Data columns (total 35 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   PLAYER               539 non-null    object 
 1   TEAM                 539 non-null    object 
 2   CATCH AND SHOOT FGA  539 non-null    float64
 3   CATCH AND SHOOT 3PA  539 non-null    object 
 4   POST UPS             539 non-null    float64
 5   ISOLATION FREQ       227 non-null    float64
 6   PNR HANDLER FREQ     282 non-null    float64
 7   PNR MAN FREQ         239 non-null    float64
 8   HANDOFF FREQ         250 non-null    float64
 9   CUT FREQ             314 non-null    float64
 10  PUTBACK FREQ         295 non-null    float64
 11  SCREEN ASSISTS       539 non-null    float64
 12  SPOT UP FREQ         402 non-null    float64
 13  OFF SCREEN FREQ      200 non-null    float64
 14  RESTRICTED AREA FGA  539 non-null    float64
 15  IN THE PAINT FGA     539 non-null    obj

In [None]:
def load_season_tables(directory):
  """Read every stat file in `directory` into a dict of DataFrames.

  Keys are the file names without their extension; column headers are
  stripped of surrounding whitespace. Hidden entries and sub-folders are
  skipped because they are not stat tables.
  """
  tables = {}
  for filename in os.listdir(directory):
    path = os.path.join(directory, filename)
    if filename.startswith('.') or not os.path.isfile(path):
      continue
    # splitext only strips the final extension, so extra dots in a file name
    # do not break the key extraction.
    stat, _ = os.path.splitext(filename)
    table = pd.read_csv(path)
    table.columns = table.columns.str.strip()
    tables[stat] = table
  return tables


def merge_season_tables(tables):
  """Left-join all tables on PLAYER and TEAM, in dict order.

  The first non-empty table is the base of the join, so players missing from
  that table do not appear in the result.
  """
  merged = pd.DataFrame()
  for table in tables.values():
    if merged.empty:
      merged = table
    else:
      merged = pd.merge(merged, table, how="left", on=["PLAYER", "TEAM"])
  return merged


# 2015-16 season: same loading and merging steps as for 2022-23.
directory = '/content/drive/My Drive/Machine Learning/NBA Cluster/data/2015-16'
files = load_season_tables(directory)
players15_16 = merge_season_tables(files)

players15_16.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 476 entries, 0 to 475
Data columns (total 35 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   PLAYER               476 non-null    object 
 1   TEAM                 476 non-null    object 
 2   RESTRICTED AREA FGA  476 non-null    float64
 3   IN THE PAINT FGA     476 non-null    float64
 4   MID RANGE FGA        476 non-null    float64
 5   CORNER 3 FGA         476 non-null    object 
 6   ABOVE THE BREAK FGA  476 non-null    float64
 7   MIN                  476 non-null    float64
 8   FGA                  476 non-null    float64
 9   3PA                  476 non-null    float64
 10  AST                  476 non-null    float64
 11  TOUCHES              476 non-null    float64
 12  FRONT CT TOUCHES     476 non-null    float64
 13  TIME OF POSS         476 non-null    float64
 14  AVG SEC PER TOUCH    476 non-null    float64
 15  AVG DRIB PER TOUCH   476 non-null    flo

In [None]:
from sklearn.base import TransformerMixin, ClusterMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_samples, silhouette_score

class CleanDataset(TransformerMixin):
  """Pipeline step that cleans the merged per-player stat table.

  Missing values become 0, players below a minutes threshold are dropped,
  HEIGHT is converted from a 'feet-inches' string to inches, and every column
  other than PLAYER and TEAM is coerced to a numeric dtype.

  Parameters
  ----------
  min_minutes : number, default 15
      Players whose MIN value is below this threshold are removed.
  """

  def __init__(self, min_minutes=15):
    self.min_minutes = min_minutes

  def fit(self, X, y=None):
    """Nothing is learned from the data; returns self so the step can be chained."""
    return self

  def height_to_inches(self, height):
    """Convert a 'feet-inches' string such as '6-10' to total inches (82).

    Non-string values are returned unchanged: transform() replaces missing
    heights with the numeric placeholder 0 before calling this, and a number
    has no ``split`` method.
    """
    if not isinstance(height, str):
      return height
    ft, inches = height.split('-')
    return int(ft) * 12 + int(inches)

  def transform(self, X, y=None):
    """Return a cleaned copy of X (requires MIN, HEIGHT, PLAYER and TEAM columns).

    The caller's DataFrame is not modified, so the step can be re-run on the
    same input.
    """
    # fill NULL stats (not in place, so the caller's frame stays untouched)
    X = X.fillna(0)

    # filter out players below the minutes threshold; MIN is coerced for the
    # comparison only, and .copy() makes the filtered frame independent so the
    # column assignments below do not write into a slice
    minutes = pd.to_numeric(X['MIN'], errors='coerce')
    X = X.loc[minutes >= self.min_minutes].copy()

    # convert height (must happen before the numeric coercion below, which
    # would turn a 'feet-inches' string into NaN)
    X['HEIGHT'] = X['HEIGHT'].apply(self.height_to_inches)

    # convert all stat columns to numeric; unparseable entries become NaN -> 0
    numeric_columns = X.columns.difference(['PLAYER', 'TEAM'])
    X[numeric_columns] = X[numeric_columns].apply(pd.to_numeric, errors='coerce')

    return X.fillna(0)

class NormalizeDataset(TransformerMixin):
  """Pipeline step that turns raw counting stats into rates.

  Counts are expressed relative to assists, drives, touches, three-point
  attempts or field-goal attempts, the affected columns are renamed, and the
  denominators that are no longer needed (MIN, FGA, 3PA, TOUCHES) are dropped.
  """

  def fit(self, X, y=None):
    """Nothing is learned from the data; returns self so the step can be chained."""
    return self

  @staticmethod
  def _safe_denominator(values):
    """Return `values` with zeros replaced by 1 so that dividing never yields inf/NaN.

    A numerator divided by a zero denominator is therefore left at its raw value.
    """
    return values.where(values != 0, 1)

  def transform(self, X, y=None):
    """Return a normalized copy of X with a fresh 0..n-1 index.

    The caller's DataFrame is not modified, so the step can be re-run on the
    same input.
    """
    X = X.copy()
    safe = self._safe_denominator

    # Normalize 'SCREEN ASSISTS' by raw assists (rows with AST == 0 keep their raw value)
    X['SCREEN ASSISTS'] = X['SCREEN ASSISTS'] / safe(X['AST'])

    # Normalize 'PASS OFF DRIVE' by raw drives (rows with DRIVES == 0 keep their raw value)
    X['PASS OFF DRIVE'] = X['PASS OFF DRIVE'] / safe(X['DRIVES'])

    # Express touch-related counts as a percentage of total touches.
    # This must come after the two ratios above, which need the raw AST/DRIVES.
    cols_norm_touches = ['AST', 'PASSES MADE', 'DRIVES', 'FRONT CT TOUCHES', 'ELBOW TOUCHES', 'PAINT TOUCHES']
    touches = safe(X['TOUCHES'])
    for stat in cols_norm_touches:
      X[stat] = (X[stat] / touches) * 100

    # Rename columns related to touches
    X = X.rename(columns={'AST': 'AST PER TOUCHES', 'PASSES MADE': 'PASSES PER TOUCHES', 'DRIVES': 'DRIVES PER TOUCHES', 'FRONT CT TOUCHES': 'FRONT CT TOUCH FREQ', 'PAINT TOUCHES':'PAINT TOUCHES FREQ', 'ELBOW TOUCHES':'ELBOW TOUCH FREQ'})

    # Share of three-point attempts that are catch-and-shoot / from the corner.
    # Computed from the raw attempt counts, i.e. before the FGA normalization below.
    three_point_attempts = safe(X['3PA'])
    X['CATCH AND SHOOT 3 FREQ'] = 100 * X['CATCH AND SHOOT 3PA'] / three_point_attempts
    X['CORNER 3 FREQ'] = 100 * X['CORNER 3 FGA'] / three_point_attempts

    # Express shot-type counts as a percentage of all field-goal attempts
    cols_norm_fga = ['CATCH AND SHOOT FGA', 'RESTRICTED AREA FGA', 'IN THE PAINT FGA', 'MID RANGE FGA', 'ABOVE THE BREAK FGA', 'CATCH AND SHOOT 3PA', 'CORNER 3 FGA', 'POST UPS']
    field_goal_attempts = safe(X['FGA'])
    for stat in cols_norm_fga:
      X[stat] = 100 * X[stat] / field_goal_attempts

    # Rename columns related to FGA
    X = X.rename(columns={'CATCH AND SHOOT FGA': 'CATCH AND SHOOT FREQ', 'RESTRICTED AREA FGA': 'RESTRICTED AREA FREQ', 'IN THE PAINT FGA': 'IN THE PAINT FREQ', 'MID RANGE FGA': 'MID RANGE FREQ', 'ABOVE THE BREAK FGA': 'ABOVE THE BREAK FREQ', 'CATCH AND SHOOT 3PA': 'CATCH AND SHOOT 3 TFREQ', 'CORNER 3 FGA': 'CORNER 3 TFREQ', 'POST UPS': 'POST UPS FREQ'})

    # Drop the denominators, which are no longer needed
    X = X.drop(['MIN', 'FGA', '3PA', 'TOUCHES'], axis=1)

    # Renumber rows 0..n-1 so the result lines up positionally with the frame
    # returned by the clustering step. PLAYER and TEAM are still present here;
    # the caller drops them before scaling.
    return X.reset_index(drop=True)

class Cluster(ClusterMixin):
  """K-means step that returns hard labels, centroid distances and silhouette scores.

  Parameters
  ----------
  kmeans : fitted KMeans or None, default None
      A pre-fitted model to use as-is. When None, fit() trains a new model.
  num_clusters : int, default 8
      Number of clusters for the model created by fit().
  random_state : int or None, default None
      Seed passed to the KMeans created by fit(); set it for reproducible clusters.
  """

  def __init__(self, kmeans = None, num_clusters = 8, random_state = None) -> None:
    self.kmeans = kmeans
    self.num_clusters = num_clusters
    self.random_state = random_state
    # The model most recently created by fit(), used to tell it apart from a
    # model supplied by the caller.
    self._auto_kmeans = None

  def fit(self, X, y=None):
    """Train a fresh KMeans on X unless the caller supplied a model.

    A model created by an earlier fit() call is replaced, so calling fit() on
    a new dataset really refits instead of silently reusing the old centroids.
    A caller-supplied model is never retrained.
    """
    if self.kmeans is None or self.kmeans is self._auto_kmeans:
      self._auto_kmeans = KMeans(n_clusters=self.num_clusters, random_state=self.random_state)
      self._auto_kmeans.fit(X)
      self.kmeans = self._auto_kmeans
    return self

  def transform(self, X, y=None):
    """Return a DataFrame describing the cluster assignment of every row of X.

    Columns: 'CLUSTER' (predicted label), one 'CLUSTER i DISTANCE' column per
    centroid, and 'SILHOUETTE COEFFICIENT'. Rows are indexed 0..n-1.
    """
    # get cluster labels for the rows actually passed in
    labels = self.kmeans.predict(X)

    # get soft cluster distances; the column count follows the model, which
    # may differ from num_clusters when a pre-fitted model was supplied
    distances = self.kmeans.transform(X)
    col_names = [f'CLUSTER {i} DISTANCE' for i in range(distances.shape[1])]

    newX = pd.concat([pd.DataFrame({'CLUSTER': labels}),
                      pd.DataFrame(distances, columns=col_names)], axis=1)

    # get silhouette scores from the labels predicted for X, not the training
    # labels, which only match when X is the training data
    newX['SILHOUETTE COEFFICIENT'] = silhouette_samples(X, labels)

    return newX

# Stage 1: clean the raw merged table, then convert counts into rates.
preprocess_pipeline = Pipeline(steps=[
    ('clean', CleanDataset()),
    ('normalize', NormalizeDataset()),
])

# Stage 2: standardize the features, then cluster with k-means (8 clusters).
cluster_pipeline = Pipeline(steps=[
    ('std_scaler', StandardScaler()),
    ('cluster', Cluster(None, num_clusters=8)),
])



In [None]:
# 2022-23 season. `result` is read by later cells, so it has to be produced
# here for the notebook to run top to bottom. The season gets its own
# clustering pipeline so that its scaler and k-means model are fitted on this
# season's data only.
players_clean = preprocess_pipeline.fit_transform(players)
cluster_pipeline_22_23 = Pipeline([('std_scaler', StandardScaler()),
                                   ('cluster', Cluster(None, num_clusters=8))])
clusters = cluster_pipeline_22_23.fit_transform(players_clean.drop(['PLAYER', 'TEAM', 'PASSES PER TOUCHES'], axis=1))
result = pd.concat([players_clean, clusters], axis=1)

# 2015-16 season. PLAYER and TEAM are identifiers, not features, so they are
# dropped before scaling; the cluster columns are then re-attached by position.
players15_16_clean = preprocess_pipeline.fit_transform(players15_16)
cluster15_16 = cluster_pipeline.fit_transform(players15_16_clean.drop(['PLAYER', 'TEAM', 'PASSES PER TOUCHES'], axis=1))
result15_16 = pd.concat([players15_16_clean, cluster15_16], axis=1)
result15_16

should fire twice
something


Unnamed: 0,PLAYER,TEAM,RESTRICTED AREA FREQ,IN THE PAINT FREQ,MID RANGE FREQ,CORNER 3 TFREQ,ABOVE THE BREAK FREQ,AST PER TOUCHES,FRONT CT TOUCH FREQ,TIME OF POSS,...,CLUSTER,CLUSTER 0 DISTANCE,CLUSTER 1 DISTANCE,CLUSTER 2 DISTANCE,CLUSTER 3 DISTANCE,CLUSTER 4 DISTANCE,CLUSTER 5 DISTANCE,CLUSTER 6 DISTANCE,CLUSTER 7 DISTANCE,SILHOUETTE COEFFICIENT
0,Aaron Brooks,CHI,26.470588,17.647059,16.176471,4.411765,35.294118,7.471264,34.770115,3.1,...,0,2.813249,9.093946,7.378479,11.331063,12.167642,5.679775,8.522355,10.015589,0.370368
1,Aaron Gordon,ORL,47.297297,10.810811,17.567568,9.459459,13.513514,4.155844,50.129870,1.2,...,6,6.762526,3.511444,4.896892,6.451446,8.100575,4.897155,2.391658,4.481521,0.156974
2,Al Horford,ATL,32.031250,11.718750,32.031250,5.468750,18.750000,5.039370,66.299213,1.7,...,1,8.142464,2.982596,5.823554,7.148440,7.169157,5.927338,4.616355,4.037792,0.149832
3,Al Jefferson,CHA,30.841121,32.710280,37.383178,0.000000,0.000000,5.263158,77.192982,1.0,...,7,11.498640,8.719746,10.898409,6.701234,6.799794,9.502650,9.170703,5.408780,0.124200
4,Al-Farouq Aminu,POR,28.409091,10.227273,12.500000,20.454545,28.409091,4.657534,64.383562,1.3,...,2,6.800263,3.762776,2.238820,9.455172,10.478710,4.822253,3.579621,7.158025,0.190959
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
312,Willie Cauley-Stein,SAC,66.666667,21.568627,11.764706,0.000000,0.000000,2.564103,50.854701,0.5,...,3,10.278271,8.101370,9.946070,2.810331,6.194490,8.932358,7.589797,5.930813,0.357906
313,Xavier Munford,MEM,23.636364,20.000000,25.454545,9.090909,23.636364,4.597701,38.793103,2.7,...,0,3.221983,6.620638,5.541364,10.096483,10.987758,4.753134,5.987142,8.057815,0.230939
314,Zach LaVine,MIN,30.769231,6.837607,29.059829,5.128205,28.205128,6.066536,48.727984,3.7,...,0,2.961060,6.775890,4.980418,9.654302,10.631563,3.230971,6.307347,7.838589,0.063422
315,Zach Randolph,MEM,37.593985,24.812030,35.338346,0.751880,2.255639,4.259635,70.791075,1.8,...,7,9.185722,6.792071,8.701042,6.947005,7.138902,6.753090,6.535475,3.793342,0.267222


In [None]:
# NOTE: disabled -- this line cannot run as written. `col_names` only exists as
# a local variable inside Cluster.transform, and the frame returned by the
# clustering pipeline has no 'PLAYER' column (only CLUSTER, the distance
# columns and SILHOUETTE COEFFICIENT).
# clusters.loc[:,['PLAYER', 'SILHOUETTE COEFFICIENT'] + col_names].sort_values(by=['SILHOUETTE COEFFICIENT'],ascending=False)

NameError: name 'col_names' is not defined

In [None]:
# List the players assigned to each of the 8 clusters.
for cluster_id in range(8):
  in_cluster = result['CLUSTER'] == cluster_id
  print(f"CLUSTER {cluster_id}")
  print(result.loc[in_cluster, 'PLAYER'])
  print()

CLUSTER 0
5                    Alec Burks
9               Andrew Nembhard
15                Austin Reaves
17                  Ayo Dosunmu
31                  Bruce Brown
42                 Caris LeVert
52                   Coby White
53                  Cody Martin
59                  Daishen Nix
66              Davion Mitchell
69            De'Anthony Melton
79                Derrick White
82                Devin Vassell
83              Devonte' Graham
87             Donte DiVincenzo
94                Dyson Daniels
95                  Eric Gordon
101                Gabe Vincent
106                 George Hill
109                Goran Dragic
128               Jacob Gilyard
139                 Jalen Suggs
140              Jalen Williams
142             James Bouknight
151               Jaylen Nowell
159                Jevon Carter
162                  Joe Ingles
172           Jordan McLaughlin
175               Jose Alvarado
178                   Josh Hart
180             Josh Richardso

In [None]:
# Per-player view: assigned cluster, silhouette score, and the distance to
# each of the 8 cluster centroids.
cols = ['PLAYER', 'CLUSTER', 'SILHOUETTE COEFFICIENT']
cols.extend(f'CLUSTER {k} DISTANCE' for k in range(8))
result.loc[:, cols]

Unnamed: 0,PLAYER,CLUSTER,SILHOUETTE COEFFICIENT,CLUSTER 0 DISTANCE,CLUSTER 1 DISTANCE,CLUSTER 2 DISTANCE,CLUSTER 3 DISTANCE,CLUSTER 4 DISTANCE,CLUSTER 5 DISTANCE,CLUSTER 6 DISTANCE,CLUSTER 7 DISTANCE
0,AJ Griffin,6,0.306394,5.28893,4.962627,11.461931,6.546205,5.053584,8.221201,2.642939,7.054615
1,Aaron Gordon,1,0.172455,6.537851,4.025907,6.898849,5.708841,7.188509,4.54763,6.396021,7.15225
2,Aaron Nesmith,4,0.110632,4.26511,3.3634,11.034669,7.226817,2.039485,7.978,2.956031,7.127784
3,Aaron Wiggins,1,0.175744,5.050617,2.830754,9.404603,7.453711,4.44016,6.670096,4.762047,7.247731
4,Al Horford,4,0.298356,6.217327,5.009354,11.568462,8.872178,2.751485,8.052794,5.320128,9.024551
5,Alec Burks,0,0.050312,3.156062,5.044047,11.357761,4.221239,5.750247,8.3455,3.468522,4.18011
6,Aleksej Pokusevski,1,0.123522,5.908954,3.141061,8.880814,7.793288,4.278638,5.557436,5.316394,8.086138
7,Alex Caruso,4,0.047296,3.805125,4.155919,11.195785,7.533895,3.310351,8.344131,5.040748,6.524515
8,Alperen Sengun,5,0.220247,9.539772,6.838682,7.392529,8.170576,9.886858,3.848614,9.101228,9.819939
9,Andrew Nembhard,0,0.080323,1.97499,5.385623,12.0128,4.663259,6.157848,8.941788,4.770378,2.757471


# 2015-2016 Season Clusters

In [None]:
# Lift pandas' display limits (rows, columns, width) so the full cluster
# rosters print untruncated. These are global options and stay in effect for
# later cells.
for option in ('display.max_rows', 'display.max_columns', 'display.width'):
  pd.set_option(option, None)

# List the players assigned to each of the 8 clusters for 2015-16.
for cluster_id in range(8):
  members = result15_16.loc[result15_16['CLUSTER'] == cluster_id, 'PLAYER']
  print(f"CLUSTER {cluster_id}")
  print(members)
  print()

CLUSTER 0
0                 Aaron Brooks
18              Archie Goodwin
21               Austin Rivers
24                  Beno Udrih
33            Brandon Jennings
34              Brandon Knight
35               Briante Weber
38                 C.J. Watson
39                 CJ McCollum
47                  Chris Paul
51                 Cory Joseph
53            D'Angelo Russell
54               D.J. Augustin
56              Damian Lillard
61             Darren Collison
68             Dennis Schroder
69              Deron Williams
71                Derrick Rose
77                Donald Sloan
85               Elfrid Payton
86             Emmanuel Mudiay
88                Eric Bledsoe
98                 George Hill
102               Goran Dragic
106            Greivis Vasquez
114              Isaiah Thomas
115                  Ish Smith
116                 J.J. Barea
125              Jameer Nelson
126               James Harden
130               Jarrett Jack
134                Jeff Teagu