# 04 Pre-Processing and Training Data Development

## Goals

 - Create dummy or indicator features for categorical variables<br>
 - Standardize the magnitude of numeric features using a scaler<br>
 - Split your data into testing and training datasets

In [18]:
%reset_selective -f regex
import os
import pandas as pd
import datetime
import seaborn as sns
import matplotlib.pyplot as plt
import pprint
import numpy as np
import seaborn as sns
from scipy.stats import zscore
from sklearn import preprocessing
%matplotlib inline

In [19]:
df=pd.read_csv('players_cleaned_final.csv')
del[df['Unnamed: 0']]
df.columns

Index(['Player', 'Stats_Year', 'Ht', 'Wt', 'Colleges', 'Pos', 'Age', 'Tm',
       'Team', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG%', '3P', '3PA', '3P%', '2P',
       '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB', 'DRB', 'TRB', 'AST',
       'STL', 'BLK', 'TOV', 'PF', 'PTS', 'Pts Won', 'Pts Max', 'Share', 'W',
       'L', 'W/L%', 'GB', 'PS/G', 'PA/G', 'SRS', 'Contract', 'rank',
       'Contract_Year', 'Contract_Team', 'Salary', 'Salary Cap',
       'Adjusted Salary 2022'],
      dtype='object')

dummy variables

In [20]:
# Creating dummy variables for Position first
positions = pd.get_dummies(df['Pos'])
df = pd.concat([df, positions], axis=1)

# Creating dummy variables for Contract_Team
con_team = pd.get_dummies(df['Contract_Team'])
df = pd.concat([df, con_team], axis=1)

In [21]:
df.columns

Index(['Player', 'Stats_Year', 'Ht', 'Wt', 'Colleges', 'Pos', 'Age', 'Tm',
       'Team', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG%', '3P', '3PA', '3P%', '2P',
       '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB', 'DRB', 'TRB', 'AST',
       'STL', 'BLK', 'TOV', 'PF', 'PTS', 'Pts Won', 'Pts Max', 'Share', 'W',
       'L', 'W/L%', 'GB', 'PS/G', 'PA/G', 'SRS', 'Contract', 'rank',
       'Contract_Year', 'Contract_Team', 'Salary', 'Salary Cap',
       'Adjusted Salary 2022', 'C', 'PF', 'PF-C', 'PG', 'PG-SG', 'SF', 'SF-PF',
       'SG', 'SG-PG', 'SG-SF', 'Atlanta Hawks', 'Boston Celtics',
       'Brooklyn Nets', 'Charlotte Bobcats', 'Charlotte Hornets',
       'Chicago Bulls', 'Cleveland Cavaliers', 'Dallas Mavericks',
       'Denver Nuggets', 'Detroit Pistons', 'Golden State Warriors',
       'Houston Rockets', 'Indiana Pacers', 'Los Angeles Clippers',
       'Los Angeles Lakers', 'Memphis Grizzlies', 'Miami Heat',
       'Milwaukee Bucks', 'Minnesota Timberwolves', 'New Jersey Nets',
       '

In [22]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1236 entries, 0 to 1235
Data columns (total 98 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   Player                             1236 non-null   object 
 1   Stats_Year                         1236 non-null   int64  
 2   Ht                                 1236 non-null   object 
 3   Wt                                 1236 non-null   float64
 4   Colleges                           1236 non-null   object 
 5   Pos                                1236 non-null   object 
 6   Age                                1236 non-null   int64  
 7   Tm                                 1236 non-null   object 
 8   Team                               1236 non-null   object 
 9   G                                  1236 non-null   int64  
 10  GS                                 1236 non-null   int64  
 11  MP                                 1236 non-null   float

In [23]:
df.shape

(1236, 98)

In [24]:
X = df.drop(['Pos','Adjusted Salary 2022','Player', 'Ht', 'Colleges', 'Tm', 'Team', 'Contract_Team', 'Contract', 'rank', 'Contract_Year', 'Salary', 'Salary Cap'], axis=1)
y = df['Adjusted Salary 2022']

In [25]:
X.columns

Index(['Stats_Year', 'Wt', 'Age', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG%', '3P',
       '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB',
       'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'Pts Won',
       'Pts Max', 'Share', 'W', 'L', 'W/L%', 'GB', 'PS/G', 'PA/G', 'SRS', 'C',
       'PF', 'PF-C', 'PG', 'PG-SG', 'SF', 'SF-PF', 'SG', 'SG-PG', 'SG-SF',
       'Atlanta Hawks', 'Boston Celtics', 'Brooklyn Nets', 'Charlotte Bobcats',
       'Charlotte Hornets', 'Chicago Bulls', 'Cleveland Cavaliers',
       'Dallas Mavericks', 'Denver Nuggets', 'Detroit Pistons',
       'Golden State Warriors', 'Houston Rockets', 'Indiana Pacers',
       'Los Angeles Clippers', 'Los Angeles Lakers', 'Memphis Grizzlies',
       'Miami Heat', 'Milwaukee Bucks', 'Minnesota Timberwolves',
       'New Jersey Nets', 'New Orleans Hornets', 'New Orleans Pelicans',
       'New Orleans/Oklahoma City Hornets', 'New York Knicks',
       'Oklahoma City Thunder', 'Orlando Magic', 'Philadelphia 

In [26]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

# 05 Modeling

The goal of the modeling step is to develop a final model that effectively predicts a NBA player's salary. After looking at the types of models that would be appropriate given the modeling response and the features in the dataset, I will build two to three models.

In [27]:
import statsmodels.api as sm
from numpy import mean
from numpy import std
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler,PolynomialFeatures
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import KFold
from sklearn.model_selection import RepeatedKFold
from sklearn import metrics
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet 
from sklearn.linear_model import LassoCV, RidgeCV, ElasticNetCV
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.inspection import permutation_importance
import shap
from sklearn import preprocessing

In [28]:
X_train

Unnamed: 0,Stats_Year,Wt,Age,G,GS,MP,FG,FGA,FG%,3P,...,Phoenix Suns,Portland Trail Blazers,Sacramento Kings,San Antonio Spurs,Seattle SuperSonics,Toronto Raptors,Utah Jazz,Vancouver Grizzlies,Washington Bullets,Washington Wizards
326,2000,220.0,22,49,1,7.0,1.1,2.9,0.389,0.3,...,0,0,0,0,0,0,0,0,0,0
509,2017,209.0,22,35,1,7.1,0.7,2.5,0.292,0.4,...,0,1,0,0,0,0,0,0,0,0
1224,2011,220.0,19,38,16,13.9,1.7,4.2,0.406,0.1,...,0,0,0,0,0,0,0,0,0,0
1205,2000,185.0,23,75,11,23.9,3.5,8.7,0.405,0.7,...,0,0,0,0,0,0,0,0,0,0
59,2016,210.0,23,29,11,20.7,1.3,4.3,0.310,0.7,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
715,2012,215.0,21,66,29,24.4,4.8,10.9,0.443,1.7,...,0,0,0,0,0,0,0,0,0,0
905,1996,220.0,31,81,80,30.1,5.3,10.1,0.527,0.0,...,0,0,1,0,0,0,0,0,0,0
1096,2006,215.0,23,22,0,8.0,1.0,2.5,0.389,0.1,...,0,1,0,0,0,0,0,0,0,0
235,1996,180.0,34,79,8,22.3,3.2,7.5,0.428,1.0,...,0,0,0,0,1,0,0,0,0,0


In [29]:
# making dummy model 
from sklearn.metrics import mean_absolute_percentage_error, mean_squared_error
y_pred = np.ones(len(y_test))*y_train.mean()
print(mean_absolute_percentage_error(y_test, y_pred))
print(mean_squared_error(y_test,y_pred, squared=False))

3.046949970816524
11369831.43931254


In [30]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Ridge

params = {'alpha': [.1, 1, 10]}
gs = GridSearchCV(Ridge(), param_grid = params, scoring='neg_mean_absolute_error').fit(X_train, y_train)
print(gs.best_estimator_)
print(gs.best_score_)

Ridge(alpha=10)
-3461968.0903638275


In [31]:
ridge = Ridge(alpha = 10)
ridge.fit(X_train, y_train)
y_pred = ridge.predict(X_test)
print(mean_absolute_percentage_error(y_test, y_pred))
print(mean_squared_error(y_test,y_pred, squared=False))

1.7206571629418244
7147890.970616082


In [32]:
from sklearn.ensemble import RandomForestRegressor
#OTHER PARAMETERS: max_features, max_depth

params = {'n_estimators': [10, 50, 200]}
gs = GridSearchCV(RandomForestRegressor(), param_grid = params, scoring = 'neg_mean_absolute_percentage_error').fit(X_train, y_train)
print(gs.best_estimator_)
print(gs.best_score_)

RandomForestRegressor(n_estimators=200)
-1.3205734048360243


In [33]:
rf = RandomForestRegressor(n_estimators = 200)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
print(mean_absolute_percentage_error(y_test, y_pred))
print(mean_squared_error(y_test,y_pred, squared=False))

1.7441946494769882
8476479.877478095
