<a href="https://colab.research.google.com/github/mnocerino23/NBA-Player-Classifier/blob/main/featureEngineering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [75]:
import pandas as pd
from google.colab import drive
# Mount Google Drive at /content/drive; the data file loaded below lives under this path.
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [76]:
# Load the combined end-of-season 2022-2023 CSV from the mounted Drive folder.
season_csv_path = '/content/drive/MyDrive/Data_Science_Projects/NBA/endSeasonCombined2022-2023.csv'
complete = pd.read_csv(season_csv_path)

# For the purposes of our modeling we will only be considering players who played over 15 games in 2022-2023

In [77]:
# Keep only the rows whose games-played column 'G' is strictly greater than 15.
played_over_15_games = complete['G'].gt(15)
complete = complete.loc[played_over_15_games]

In [78]:
# Remove the column flagged as redundant ('Games') and rebind the trimmed frame.
complete = complete.drop(columns=['Games'])

In [79]:
# Count the missing values in every column of the dataset.
# Result (see output): 7 missing for 3P% and 2 for FT%; all other columns are complete.
null_counts = complete.isna().sum()
print(null_counts)

Player    0
Pos       0
Age       0
Tm        0
G         0
GS        0
MP        0
FG        0
FGA       0
FG%       0
3P        0
3PA       0
3P%       7
2P        0
2PA       0
2P%       0
eFG%      0
FT        0
FTA       0
FT%       2
ORB       0
DRB       0
TRB       0
AST       0
STL       0
BLK       0
TOV       0
PF        0
PTS       0
PER       0
TS%       0
3PAr      0
FTr       0
ORB%      0
DRB%      0
TRB%      0
AST%      0
STL%      0
BLK%      0
TOV%      0
USG%      0
OWS       0
DWS       0
WS        0
WS/48     0
OBPM      0
DBPM      0
BPM       0
VORP      0
Salary    0
dtype: int64


In [80]:
# Pull out the 7 players whose 3P% is null so they can be inspected.
missing_3p_pct = complete['3P%'].isna()
no_threes = complete.loc[missing_3p_pct]
no_threes

Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,...,USG%,OWS,DWS,WS,WS/48,OBPM,DBPM,BPM,VORP,Salary
17,Udoka Azubuike,C,23,UTA,36,4,10.0,1.6,2.0,0.819,...,11.5,0.6,0.4,1.0,0.134,-1.2,0.5,-0.6,0.1,2.17488
42,Bismack Biyombo,C,30,PHO,61,14,14.3,2.0,3.4,0.578,...,14.0,0.2,1.6,1.9,0.102,-3.4,2.5,-0.9,0.2,1.83609
65,Moses Brown,C,23,TOT,36,1,8.2,1.7,2.7,0.635,...,21.2,0.7,0.4,1.1,0.179,0.6,-1.2,-0.6,0.1,0.824041
154,Daniel Gafford,C,24,WAS,78,47,20.6,3.7,5.1,0.732,...,15.2,4.2,1.9,6.1,0.184,0.4,0.6,1.0,1.2,1.930681
383,Mason Plumlee,C,32,TOT,79,60,26.0,4.2,6.1,0.68,...,15.2,5.5,2.4,7.9,0.185,1.2,1.0,2.2,2.2,9.080417
414,Mitchell Robinson,C,24,NYK,59,58,27.0,3.2,4.7,0.671,...,10.1,4.0,2.6,6.5,0.198,1.0,1.4,2.4,1.7,17.045454
520,Mark Williams,C,21,CHO,43,17,19.3,3.7,5.8,0.637,...,17.0,1.5,1.3,2.8,0.163,-0.5,0.6,0.1,0.4,3.72204


In [81]:
# Impute missing 3P% with a fixed value of 0.15.
# The filled frame is assigned back instead of calling fillna(inplace=True) on the
# selected column: that chained in-place form emits a FutureWarning in pandas 2.x
# and leaves `complete` unchanged when Copy-on-Write is active, so the NaNs would
# silently survive.
complete = complete.fillna({'3P%': 0.15})

# Having checked on espn.com, it turns out that all the individuals with null 3P% are centers who attempted 0 threes. We will impute with .15 so that our model doesn't undervalue centers (the other option would be to impute with 0). We don't impute with the mean, though, because the lack of attempts implies poor shooting.

In [82]:
# FT% is null for only two players, because they attempted no free throws.
# Both are okay shooters, so they will be imputed with the mean FT% rather than a low value.

missing_ft_pct = complete['FT%'].isna()
no_fts = complete.loc[missing_ft_pct]
no_fts

Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,...,USG%,OWS,DWS,WS,WS/48,OBPM,DBPM,BPM,VORP,Salary
14,Ryan Arcidiacono,PG,28,TOT,20,4,8.6,0.5,1.9,0.243,...,11.1,-0.2,0.1,-0.2,-0.043,-7.3,-1.5,-8.8,-0.3,1.83609
129,PJ Dozier,SG,26,SAC,16,0,4.9,0.6,2.1,0.303,...,20.6,-0.3,0.1,-0.2,-0.097,-8.9,0.9,-8.1,-0.1,0.53945


In [83]:
# Impute missing FT% with the mean FT% of the players in `complete`
# (Series.mean skips NaN by default, so the missing rows do not affect the mean).
# The filled frame is assigned back instead of calling fillna(inplace=True) on the
# selected column: that chained in-place form emits a FutureWarning in pandas 2.x
# and leaves `complete` unchanged when Copy-on-Write is active.
ft_pct_mean = complete['FT%'].mean()
complete = complete.fillna({'FT%': ft_pct_mean})

In [84]:
# After imputing 3P% and FT%, the dataset should contain no missing values.
# Verify that explicitly rather than assuming it, then show the final (rows, columns) shape.
assert not complete.isna().any().any(), "unexpected missing values remain in `complete`"
complete.shape

(450, 50)

# Feature Engineering: creating new features that could be relevant