In [32]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error 
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import ElasticNetCV
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from math import sqrt
from sklearn.model_selection import train_test_split
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_validate
%matplotlib inline

In [2]:
dataset = pd.read_csv("../nba_forecast/data/dataset.csv")
conf_rating_df = pd.read_csv("../nba_forecast/data/conf_ratings.csv")
athletics_df = pd.read_csv("../nba_forecast/data/final_athletics.csv")

In [3]:
def get_conf_rating(conf, season):
    for row in conf_rating_df.iterrows():
        if row[1].season == str(season) and row[1].conference == str(conf):
            return row[1].rating
    return None

In [4]:
dataset["conf_rating"] = dataset.apply(lambda x: get_conf_rating(x['conf_abbr'], x['season']), axis=1)

dataset['gs_pct'] = (dataset['gs']/dataset['g']).round(2)
dataset = dataset.drop(columns=['g','gs'])

dataset

Unnamed: 0,player_name,season,school_name,conf_abbr,mp,per,ts_pct,efg_pct,fg3a_per_fga_pct,fta_per_fga_pct,...,hand_width,height_wo_shoes,height_w_shoes,standing_reach,weight,wingspan,ratio_off,ratio_def,conf_rating,gs_pct
0,Anthony Davis,2011-12,Kentucky,SEC,1281,35.1,0.654,0.628,0.059,0.602,...,216,2064,2096,2743,221.8,2273,1.08,0.88,0.562488,1.00
1,Michael Kidd-Gilchrist,2011-12,Kentucky,SEC,1245,21.2,0.570,0.511,0.156,0.589,...,260,1975,2019,2654,232.8,2134,0.98,0.96,0.562488,0.98
2,Bradley Beal,2011-12,Florida,SEC,1267,22.0,0.575,0.525,0.473,0.440,...,229,1911,1949,2540,201.8,2032,0.93,0.86,0.562488,1.00
3,Dion Waiters,2011-12,Syracuse,Big East,891,26.3,0.565,0.534,0.317,0.331,...,241,1892,1930,2489,221.0,2013,0.79,0.73,0.568068,0.00
4,Thomas Robinson,2011-12,Kansas,Big 12,1242,27.4,0.549,0.512,0.027,0.462,...,267,2026,2051,2692,244.2,2216,0.88,0.81,0.564916,1.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
240,Keita Bates-Diop,2017-18,Ohio State,Big Ten,1125,27.5,0.577,0.544,0.357,0.274,...,216,2013,2045,2705,223.8,2216,0.74,0.79,0.552393,1.00
241,Chimezie Metu,2017-18,USC,Pac-12,1053,23.5,0.574,0.538,0.102,0.388,...,235,2045,2070,2743,219.6,2146,0.99,0.72,0.550836,0.97
242,Alize Johnson,2017-18,Missouri State,MVC,1028,24.1,0.528,0.481,0.365,0.363,...,248,2013,2032,2616,216.6,2051,1.08,1.16,0.527264,1.00
243,Shake Milton,2017-18,SMU,AAC,800,24.2,0.606,0.551,0.471,0.405,...,241,1943,1968,2527,207.2,2153,0.94,0.84,0.475291,1.00


## Baseline

L'erreur choisie est la MSE car on veut chercher à minimiser les erreurs entre nos prédictions et les ratios de conversion université/NBA réels

In [5]:
Y_def = dataset[['ratio_def']]
Y_off = dataset[['ratio_off']]
y_base_train, y_base_test = train_test_split(Y_def, test_size = 0.2, random_state = 0)
y_base_off_train, y_base_off_test = train_test_split(Y_off, test_size = 0.2, random_state = 0)

baseline_mse = ((y_base_test - y_base_train.mean())**2 ).mean()[0].round(3)
print(f'La MSE a battre par nos modèles pour le ratio def est {baseline_mse}')

baseline_mse_off = ((y_base_off_test-y_base_off_train.mean())**2).mean()[0].round(3)
print(f'La MSE a battre par nos modèles pour le ratio off est {baseline_mse_off}')

La MSE a battre par nos modèles pour le ratio def est 0.075
La MSE a battre par nos modèles pour le ratio off est 0.053


## Scaling des données

In [6]:
off_features = ['last_uni_age', 'pos', 'per','ts_pct','fg3a_per_fga_pct','fta_per_fga_pct','orb_pct','ast_pct','tov_pct','usg_pct','ows','obpm']
def_features = ['last_uni_age', 'pos', 'stl_pct','blk_pct','dws','drb_pct','dbpm']
athletics_features = list(athletics_df)[1:-1]

In [7]:
dataset.columns

Index(['player_name', 'season', 'school_name', 'conf_abbr', 'mp', 'per',
       'ts_pct', 'efg_pct', 'fg3a_per_fga_pct', 'fta_per_fga_pct', 'pprod',
       'orb_pct', 'drb_pct', 'trb_pct', 'ast_pct', 'stl_pct', 'blk_pct',
       'tov_pct', 'usg_pct', 'ows', 'dws', 'ws', 'ws_per_40', 'obpm', 'dbpm',
       'bpm', 'years', 'player_id', 'pos', 'last_uni_age', 'position',
       'body_fat_pct', 'hand_length', 'hand_width', 'height_wo_shoes',
       'height_w_shoes', 'standing_reach', 'weight', 'wingspan', 'ratio_off',
       'ratio_def', 'conf_rating', 'gs_pct'],
      dtype='object')

In [8]:
X_def = dataset[def_features+athletics_features]
X_def = pd.get_dummies(X_def)
X_def.columns

Index(['last_uni_age', 'stl_pct', 'blk_pct', 'dws', 'drb_pct', 'dbpm',
       'body_fat_pct', 'hand_length', 'hand_width', 'height_wo_shoes',
       'height_w_shoes', 'standing_reach', 'weight', 'wingspan', 'pos_C',
       'pos_PF', 'pos_PG', 'pos_SF', 'pos_SG'],
      dtype='object')

RobustScaler choisi pour éviter d'être impacté par les outliers (joueurs qui superforment par rapport aux autres)

In [9]:
rb_scaler = RobustScaler()


X_train, X_test, y_train, y_test = train_test_split(X_def, Y_def, test_size = 0.2, random_state = 0)

X_def_scaled = rb_scaler.fit_transform(X_def)
X_def_scaled = pd.DataFrame(X_def_scaled)

X_train_scaled = rb_scaler.fit_transform(X_train)
X_train_scaled = pd.DataFrame(X_train_scaled)

X_test_scaled = rb_scaler.transform(X_test)
X_test_scaled = pd.DataFrame(X_test_scaled)

## Linear Regression

In [10]:
linreg = LinearRegression()

# 5-Fold Cross validate model
cv_results = cross_validate(linreg, X_train_scaled, y_train, cv=10, scoring=['neg_mean_squared_error'])

# Mean of scores
print(f"MSE: {-cv_results['test_neg_mean_squared_error'].mean()}")

MSE: 0.06140413954202042


In [11]:
linreg.fit(X_train_scaled,y_train)

LinearRegression()

In [27]:
coef_df = pd.DataFrame(linreg.coef_[0], index=X_def.columns, columns=['coef'])
coef_df['abs_coef'] = abs(coef_df['coef'])

In [26]:
X_train_scaled.columns = X_train.columns
X_train_scaled_w_constant = sm.add_constant(X_train_scaled)

y_train.index = X_train_scaled_w_constant.index

mod = sm.OLS(y_train,X_train_scaled_w_constant)
fit = mod.fit()

fit.summary()

  x = pd.concat(x[::order], 1)


0,1,2,3
Dep. Variable:,ratio_def,R-squared:,0.261
Model:,OLS,Adj. R-squared:,0.186
Method:,Least Squares,F-statistic:,3.474
Date:,"Mon, 30 Aug 2021",Prob (F-statistic):,9.3e-06
Time:,09:37:37,Log-Likelihood:,15.334
No. Observations:,196,AIC:,7.331
Df Residuals:,177,BIC:,69.62
Df Model:,18,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.8233,0.021,40.146,0.000,0.783,0.864
last_uni_age,-0.0313,0.023,-1.375,0.171,-0.076,0.014
stl_pct,-0.0755,0.033,-2.279,0.024,-0.141,-0.010
blk_pct,-0.0981,0.037,-2.618,0.010,-0.172,-0.024
dws,-0.1172,0.028,-4.125,0.000,-0.173,-0.061
drb_pct,0.0185,0.044,0.419,0.676,-0.069,0.106
dbpm,0.0622,0.044,1.415,0.159,-0.025,0.149
body_fat_pct,0.0017,0.026,0.064,0.949,-0.050,0.053
hand_length,0.0619,0.035,1.762,0.080,-0.007,0.131

0,1,2,3
Omnibus:,5.572,Durbin-Watson:,1.995
Prob(Omnibus):,0.062,Jarque-Bera (JB):,6.295
Skew:,-0.232,Prob(JB):,0.043
Kurtosis:,3.745,Cond. No.,1.67e+16


## Elastic Net

In [30]:
elastic_net = ElasticNetCV(cv=20, max_iter=10000)
elastic_net.fit(X_train_scaled, y_train)
((y_test.values[:,0] - elastic_net.predict(X_test_scaled))**2).mean()

  return f(*args, **kwargs)


0.06543229326967477

In [37]:
elastic_model = ElasticNet()
elastic_grid = GridSearchCV(elastic_model, {'alpha': [0, 0.5, 0.1, 0.01, 0.001],
                                            'l1_ratio': [0, 0.25, 0.5, 0.75, 1],
                                            'max_iter':[10000]},
                   scoring='neg_mean_squared_error')
elastic_grid.fit(X_train_scaled,y_train)
print(f"MSE: {-elastic_grid.best_score_}")
print(f"Best estimator: {elastic_grid.best_estimator_}")

  estimator.fit(X_train, y_train, **fit_params)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  estimator.fit(X_train, y_train, **fit_params)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  estimator.fit(X_train, y_train, **fit_params)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  estimator.fit(X_train, y_train, **fit_params)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  estimator.fit(X_train, y_train, **fit_params)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  estimator.fit(X_train, y_train, **fit_params)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  estimator.fit(X_train, y_train, **fit_params)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  estimator.fit(X_train, y_train, **fit_params)
  model = cd_f

  model = cd_fast.enet_coordinate_descent(
  estimator.fit(X_train, y_train, **fit_params)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  estimator.fit(X_train, y_train, **fit_params)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  estimator.fit(X_train, y_train, **fit_params)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  estimator.fit(X_train, y_train, **fit_params)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  estimator.fit(X_train, y_train, **fit_params)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  estimator.fit(X_train, y_train, **fit_params)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  estimator.fit(X_train, y_train, **fit_params)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  estimator.fit(X_t

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


MSE: 0.05712250050836184
Best estimator: ElasticNet(alpha=0.01, max_iter=10000)


In [44]:
best_elastic_model = ElasticNet(alpha=0.01, max_iter=10000)
best_elastic_model.fit(X_train_scaled, y_train)
cv_elastic_results = cross_validate(elastic_grid.best_estimator_, X_def_scaled, Y_def.values[:,0], cv=20, scoring='neg_mean_squared_error')
# Mean of scores
-cv_elastic_results['test_score'].mean()

0.060572360140475935

## Random Forest

In [46]:
rf_params = {
    'bootstrap': [True, False],
    'max_depth': [1, 5, 10, 20, 25, 30, 40, 50, None],
    'max_features': ['auto', 'sqrt'],
    'min_samples_leaf': [1, 2, 4],
    'min_samples_split': [2, 5, 10],
    'n_estimators': [5,10,15,20,30,40,50,100]}

rf_model = RandomForestRegressor()
grid = GridSearchCV(rf_model, rf_params, n_jobs=-1, cv=20, scoring='neg_mean_squared_error')

In [47]:
# grid.fit(X_train_scaled, y_train.values.ravel())

GridSearchCV(cv=20, estimator=RandomForestRegressor(), n_jobs=-1,
             param_grid={'bootstrap': [True, False],
                         'max_depth': [1, 5, 10, 20, 25, 30, 40, 50, None],
                         'max_features': ['auto', 'sqrt'],
                         'min_samples_leaf': [1, 2, 4],
                         'min_samples_split': [2, 5, 10],
                         'n_estimators': [5, 10, 15, 20, 30, 40, 50, 100]},
             scoring='neg_mean_squared_error')

In [196]:
print(grid.best_estimator_)
print(-grid.best_score_)

best_random_forest = RandomForestRegressor(max_depth=1, min_samples_leaf=4, min_samples_split=5,
                      n_estimators=15)
best_random_forest.fit(X_train_scaled,y_train.values.ravel())

cv_elastic_results = cross_validate(best_random_forest, X_train_scaled, y_train.values.ravel(), cv=20, scoring='neg_mean_squared_error')
print(-cv_elastic_results['test_score'].mean())

pd.DataFrame(best_random_forest.predict(X_test_scaled)).value_counts()

RandomForestRegressor(max_depth=1, min_samples_leaf=4, min_samples_split=5,
                      n_estimators=5)
0.05774205821619062
0.058600065193139816


0.839110    11
0.869869     8
0.899701     8
1.072877     6
0.855989     2
0.868942     2
0.914982     2
0.916581     2
1.042117     2
1.089756     2
0.884222     1
0.886749     1
1.018337     1
1.035217     1
dtype: int64

## XGBOOST

In [57]:
from xgboost import XGBRegressor

# scores = cross_val_score(XGBRegressor(), X, y, scoring='neg_mean_squared_error')
xgb_model = XGBRegressor()
cv_results = cross_validate(xgb_model, X_train_scaled, y_train, cv=10, scoring=['neg_mean_squared_error'])

# Mean of scores
print(f"MSE: {-cv_results['test_neg_mean_squared_error'].mean()}")

MSE: 0.07565880265932086


In [61]:
parameters = {'nthread':[4], 
              'objective':['reg:linear'],
              'learning_rate': [.03, 0.05, .07], 
              'max_depth': [5, 6, 7],
              'min_child_weight': [4],
              'silent': [1],
              'subsample': [0.7],
              'colsample_bytree': [0.7],
              'n_estimators': [500]}

xgb_grid = GridSearchCV(xgb_model,
                        parameters,
                        cv = 20,
                        n_jobs = -1,
                        scoring='neg_mean_squared_error')

In [62]:
xgb_grid.fit(X_train_scaled,
         y_train.values.ravel())

print(xgb_grid.best_score_)
print(xgb_grid.best_params_)

Parameters: { "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down 

Parameters: { "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down 

Parameters: { "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down 

Parameters: { "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down 

Parameters: { "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down 

Parameters: { "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down 

Parameters: { "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


-0.0653216844455994
{'colsample_bytree': 0.7, 'learning_rate': 0.05, 'max_depth': 7, 'min_child_weight': 4, 'n_estimators': 500, 'nthread': 4, 'objective': 'reg:linear', 'silent': 1, 'subsample': 0.7}
