In [1]:
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import StandardScaler

In [2]:
# Path to the raw player statistics CSV.
filename = "Data/PlayersStats.csv"

# Read the whole CSV into a DataFrame; later cells filter and trim it.
rawData = pd.read_csv(filepath_or_buffer=filename)

In [3]:
# Preview the raw table; pandas truncates the rendering (see the "..." row and column in the output).
rawData

Unnamed: 0,GROUP_SET,PLAYER_NAME,GROUP_VALUE,TEAM_ID,TEAM_ABBREVIATION,MAX_GAME_DATE,GP,W,L,W_PCT,...,BLKA_RANK,PF_RANK,PFD_RANK,PTS_RANK,PLUS_MINUS_RANK,NBA_FANTASY_PTS_RANK,DD2_RANK,TD3_RANK,CFID,CFPARAMS
0,Overall,Alex Abrines,2018-19,1610612760,OKC,2019-02-01T00:00:00,31,21,10,0.677,...,1,1,1,1,1,1,1,1,265,2018-19
1,Overall,Quincy Acy,2018-19,1610612756,PHX,2019-01-25T00:00:00,10,2,8,0.200,...,1,1,1,1,1,1,1,1,265,2018-19
2,Overall,Jaylen Adams,2018-19,1610612737,ATL,2019-04-10T00:00:00,34,13,21,0.382,...,1,1,1,1,1,1,1,1,265,2018-19
3,Overall,Steven Adams,2018-19,1610612760,OKC,2019-04-10T00:00:00,80,47,33,0.588,...,1,1,1,1,1,1,1,1,265,2018-19
4,Overall,Bam Adebayo,2018-19,1610612748,MIA,2019-04-10T00:00:00,82,39,43,0.476,...,1,1,1,1,1,1,1,1,265,2018-19
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
703,Overall,Tyler Zeller,2018-19,1610612737,ATL,2019-03-13T00:00:00,2,1,1,0.500,...,1,1,2,2,2,2,1,1,265,2018-19
704,Overall,Ante Zizic,2018-19,1610612739,CLE,2019-04-09T00:00:00,59,13,46,0.220,...,1,1,1,1,1,1,1,1,265,2018-19
705,Overall,Ivica Zubac,2018-19,-1,TOT,2019-04-10T00:00:00,59,32,27,0.542,...,1,1,1,1,1,1,1,1,265,2018-19
706,Overall,Ivica Zubac,2018-19,1610612746,LAC,2019-04-10T00:00:00,26,18,8,0.692,...,1,1,2,2,2,1,1,1,265,2018-19


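Some players have a TEAM_ID of -1. In the preview above these rows carry the TEAM_ABBREVIATION `TOT` and sit next to a per-team row for the same player (e.g. Ivica Zubac), so they appear to be combined season totals for players who appeared for more than one team. Remove those rows.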

In [4]:
# Discard every row whose TEAM_ID is the -1 placeholder.
has_real_team = rawData["TEAM_ID"].ne(-1)
modelData = rawData.loc[has_real_team]

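For accurate results, I only want players who played more than half a season's worth of games. Each team played 82 games in the 2018-19 regular season, so half a season is 41 games; I therefore keep players who played 42 or more games. Note that, with the combined rows removed above, GP is counted per team row, so a player who split the season between teams must reach 42 games with a single team to be kept.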

In [5]:
# Keep rows with at least 42 games played (more than half of an 82-game season).
played_enough = modelData["GP"].ge(42)
modelData = modelData.loc[played_enough]

Unnamed: 0,GROUP_SET,PLAYER_NAME,GROUP_VALUE,TEAM_ID,TEAM_ABBREVIATION,MAX_GAME_DATE,GP,W,L,W_PCT,...,BLKA_RANK,PF_RANK,PFD_RANK,PTS_RANK,PLUS_MINUS_RANK,NBA_FANTASY_PTS_RANK,DD2_RANK,TD3_RANK,CFID,CFPARAMS
3,Overall,Steven Adams,2018-19,1610612760,OKC,2019-04-10T00:00:00,80,47,33,0.588,...,1,1,1,1,1,1,1,1,265,2018-19
4,Overall,Bam Adebayo,2018-19,1610612748,MIA,2019-04-10T00:00:00,82,39,43,0.476,...,1,1,1,1,1,1,1,1,265,2018-19
7,Overall,LaMarcus Aldridge,2018-19,1610612759,SAS,2019-04-10T00:00:00,81,48,33,0.593,...,1,1,1,1,1,1,1,1,265,2018-19
10,Overall,Jarrett Allen,2018-19,1610612751,BKN,2019-04-10T00:00:00,80,42,38,0.525,...,1,1,1,1,1,1,1,1,265,2018-19
12,Overall,Al-Farouq Aminu,2018-19,1610612757,POR,2019-04-09T00:00:00,81,52,29,0.642,...,1,1,1,1,1,1,1,1,265,2018-19
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,Overall,Delon Wright,2018-19,1610612761,TOR,2019-02-05T00:00:00,49,34,15,0.694,...,1,2,2,1,2,2,2,2,265,2018-19
698,Overall,Thaddeus Young,2018-19,1610612754,IND,2019-04-07T00:00:00,81,47,34,0.580,...,1,1,1,1,1,1,1,1,265,2018-19
699,Overall,Trae Young,2018-19,1610612737,ATL,2019-04-10T00:00:00,81,29,52,0.358,...,1,1,1,1,1,1,1,1,265,2018-19
700,Overall,Cody Zeller,2018-19,1610612766,CHA,2019-03-09T00:00:00,49,22,27,0.449,...,1,1,1,1,1,1,1,1,265,2018-19


Next, drop the columns that are not of interest or that can be derived by combining other columns.

In [6]:
# Remove the columns that are not used further in this notebook.
# The column list is passed via the `columns=` keyword: the original positional
# axis argument (`drop([...], 1)`) was deprecated in pandas 1.3 and raises a
# TypeError from pandas 2.0 onwards.
# NOTE(review): "Unnamed: 0" (visible in the preview output) is not in this
# list, so it is kept and written out with the rest -- confirm this is intended.
modelData = modelData.drop(
    columns=[
        # Grouping, team and schedule columns.
        "GROUP_SET", "GROUP_VALUE", "TEAM_ID", "MAX_GAME_DATE", "GP",
        "W", "L", "W_PCT",
        # Percentages, totals and summary counts.
        "FG_PCT", "FG3_PCT", "FT_PCT", "REB", "NBA_FANTASY_PTS", "DD2", "TD3",
        # Every *_RANK column.
        "GP_RANK", "W_RANK", "L_RANK", "W_PCT_RANK", "MIN_RANK",
        "FGM_RANK", "FGA_RANK", "FG_PCT_RANK",
        "FG3M_RANK", "FG3A_RANK", "FG3_PCT_RANK",
        "FTM_RANK", "FTA_RANK", "FT_PCT_RANK",
        "OREB_RANK", "DREB_RANK", "REB_RANK", "AST_RANK",
        "TOV_RANK", "STL_RANK", "BLK_RANK", "BLKA_RANK",
        "PF_RANK", "PFD_RANK", "PTS_RANK",
        "PLUS_MINUS_RANK", "NBA_FANTASY_PTS_RANK", "DD2_RANK", "TD3_RANK",
        # Remaining trailing columns of the raw export.
        "CFID", "CFPARAMS",
    ]
)

In [7]:
# Save the trimmed table to CSV with a header row and without the DataFrame index.
modelData.to_csv(
    "Data/ModelDataKMeans.csv",
    header=True,
    index=False,
)