In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error 
from sklearn.linear_model import ElasticNet
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import MinMaxScaler
from math import sqrt
from sklearn.model_selection import train_test_split
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_validate
%matplotlib inline

In [2]:
# Read the three CSV inputs in one pass; the order of the paths matches the
# order of the names being assigned on the left-hand side.
dataset, conf_rating_df, athletics_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../nba_forecast/data/dataset.csv",
        "../nba_forecast/data/conf_ratings.csv",
        "../nba_forecast/data/final_athletics.csv",
    )
)

In [3]:
def get_conf_rating(conf, season):
    """Look up the rating of a conference for a given season.

    Searches the module-level ``conf_rating_df`` table for the first row whose
    ``season`` and ``conference`` values equal ``str(season)`` and
    ``str(conf)`` respectively.

    Args:
        conf: Conference identifier; converted with ``str`` before comparing.
        season: Season identifier; converted with ``str`` before comparing.

    Returns:
        The ``rating`` value of the first matching row, or ``None`` when no
        row matches.
    """
    # Vectorised column comparisons replace the Python-level iterrows() scan,
    # which materialised a Series for every table row on every call (and this
    # function is called once per player row).
    matches = conf_rating_df.loc[
        (conf_rating_df['season'] == str(season))
        & (conf_rating_df['conference'] == str(conf)),
        'rating',
    ]
    # Keep the original contract: first match wins, None when nothing matches.
    return None if matches.empty else matches.iloc[0]

In [4]:
# Attach, for every player row, the rating of their conference in that season.
dataset["conf_rating"] = dataset.apply(
    lambda player: get_conf_rating(player['conf_abbr'], player['season']),
    axis=1,
)

# Share of games started (gs / g), rounded to two decimals. The raw counts are
# dropped once the ratio exists, so guard on their presence: without the
# guard, re-running this cell raises a KeyError because 'g' and 'gs' are
# already gone from `dataset`.
if {'g', 'gs'}.issubset(dataset.columns):
    dataset['gs_pct'] = (dataset['gs'] / dataset['g']).round(2)
    dataset = dataset.drop(columns=['g', 'gs'])

dataset

Unnamed: 0,player_name,season,school_name,conf_abbr,mp,per,ts_pct,efg_pct,fg3a_per_fga_pct,fta_per_fga_pct,...,hand_width,height_wo_shoes,height_w_shoes,standing_reach,weight,wingspan,ratio_off,ratio_def,conf_rating,gs_pct
0,Anthony Davis,2011-12,Kentucky,SEC,1281,35.1,0.654,0.628,0.059,0.602,...,216,2064,2096,2743,221.8,2273,1.08,0.88,0.562488,1.00
1,Michael Kidd-Gilchrist,2011-12,Kentucky,SEC,1245,21.2,0.570,0.511,0.156,0.589,...,260,1975,2019,2654,232.8,2134,0.98,0.96,0.562488,0.98
2,Bradley Beal,2011-12,Florida,SEC,1267,22.0,0.575,0.525,0.473,0.440,...,229,1911,1949,2540,201.8,2032,0.93,0.86,0.562488,1.00
3,Dion Waiters,2011-12,Syracuse,Big East,891,26.3,0.565,0.534,0.317,0.331,...,241,1892,1930,2489,221.0,2013,0.79,0.73,0.568068,0.00
4,Thomas Robinson,2011-12,Kansas,Big 12,1242,27.4,0.549,0.512,0.027,0.462,...,267,2026,2051,2692,244.2,2216,0.88,0.81,0.564916,1.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
240,Keita Bates-Diop,2017-18,Ohio State,Big Ten,1125,27.5,0.577,0.544,0.357,0.274,...,216,2013,2045,2705,223.8,2216,0.74,0.79,0.552393,1.00
241,Chimezie Metu,2017-18,USC,Pac-12,1053,23.5,0.574,0.538,0.102,0.388,...,235,2045,2070,2743,219.6,2146,0.99,0.72,0.550836,0.97
242,Alize Johnson,2017-18,Missouri State,MVC,1028,24.1,0.528,0.481,0.365,0.363,...,248,2013,2032,2616,216.6,2051,1.08,1.16,0.527264,1.00
243,Shake Milton,2017-18,SMU,AAC,800,24.2,0.606,0.551,0.471,0.405,...,241,1943,1968,2527,207.2,2153,0.94,0.84,0.475291,1.00


## Baseline

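La métrique d'erreur retenue est la MSE (erreur quadratique moyenne), car on cherche à minimiser l'écart entre nos prédictions et les ratios réels de conversion université/NBA. La baseline consiste à prédire, pour chaque joueur du jeu de test, la moyenne observée sur le jeu d'entraînement.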

In [5]:
# Targets are selected with a list of one column, so each is a single-column
# DataFrame rather than a Series.
Y_def = dataset[['ratio_def']]
Y_off = dataset[['ratio_off']]

# Both targets are split with the same test_size and random_state.
y_base_train, y_base_test = train_test_split(Y_def, test_size=0.2, random_state=0)
y_base_off_train, y_base_off_test = train_test_split(Y_off, test_size=0.2, random_state=0)

# Baseline: predict the training-set mean for every test row and take the MSE.
# `.mean()` on a one-column frame returns a Series keyed by the column name,
# so the scalar is extracted positionally with `.iloc[0]`; a plain `[0]`
# depends on the deprecated "integer key as position" fallback.
baseline_mse = ((y_base_test - y_base_train.mean()) ** 2).mean().iloc[0].round(3)
print(f'La MSE a battre par nos modèles pour le ratio def est {baseline_mse}')

baseline_mse_off = ((y_base_off_test - y_base_off_train.mean()) ** 2).mean().iloc[0].round(3)
print(f'La MSE a battre par nos modèles pour le ratio off est {baseline_mse_off}')

La MSE a battre par nos modèles pour le ratio def est 0.075
La MSE a battre par nos modèles pour le ratio off est 0.053


## Scaling des données

In [6]:
# Feature columns for the offensive side.
off_features = [
    'last_uni_age',
    'pos',
    'per',
    'ts_pct',
    'fg3a_per_fga_pct',
    'fta_per_fga_pct',
    'orb_pct',
    'ast_pct',
    'tov_pct',
    'usg_pct',
    'ows',
    'obpm',
]

# Feature columns for the defensive side.
def_features = [
    'last_uni_age',
    'pos',
    'stl_pct',
    'blk_pct',
    'dws',
    'drb_pct',
    'dbpm',
]

# Every athletics_df column except the first and the last one.
# NOTE(review): presumably those two are identifier / non-measurement
# columns - confirm against final_athletics.csv.
athletics_features = athletics_df.columns[1:-1].tolist()

In [8]:
# Display the column labels currently present in `dataset`.
dataset.columns

Index(['player_name', 'season', 'school_name', 'conf_abbr', 'mp', 'per',
       'ts_pct', 'efg_pct', 'fg3a_per_fga_pct', 'fta_per_fga_pct', 'pprod',
       'orb_pct', 'drb_pct', 'trb_pct', 'ast_pct', 'stl_pct', 'blk_pct',
       'tov_pct', 'usg_pct', 'ows', 'dws', 'ws', 'ws_per_40', 'obpm', 'dbpm',
       'bpm', 'years', 'player_id', 'pos', 'last_uni_age', 'position',
       'body_fat_pct', 'hand_length', 'hand_width', 'height_wo_shoes',
       'height_w_shoes', 'standing_reach', 'weight', 'wingspan', 'ratio_off',
       'ratio_def', 'conf_rating', 'gs_pct'],
      dtype='object')

In [14]:
# Defensive design matrix: defensive stats plus athletic measurements, with
# pd.get_dummies turning categorical columns into indicator columns.
X_def = pd.get_dummies(dataset[def_features + athletics_features])

# Show the resulting feature names.
X_def.columns

Index(['last_uni_age', 'stl_pct', 'blk_pct', 'dws', 'drb_pct', 'dbpm',
       'body_fat_pct', 'hand_length', 'hand_width', 'height_wo_shoes',
       'height_w_shoes', 'standing_reach', 'weight', 'wingspan', 'pos_C',
       'pos_PF', 'pos_PG', 'pos_SF', 'pos_SG'],
      dtype='object')

RobustScaler est choisi pour limiter l'impact des outliers (joueurs qui surperforment par rapport aux autres).

In [15]:
# Hold out 20% of the rows; fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_def, Y_def, test_size=0.2, random_state=0
)

rb_scaler = RobustScaler()

# Scaled copy of the complete feature matrix.
# NOTE(review): this fit uses every row, including the held-out ones, so
# X_def_scaled should not feed a model that is evaluated on the test split.
X_def_scaled = pd.DataFrame(rb_scaler.fit_transform(X_def))

# Refit the same scaler on the training rows only, then apply those training
# statistics to the test rows. Wrapping the returned arrays in pd.DataFrame
# without index/columns gives default integer row and column labels.
X_train_scaled = pd.DataFrame(rb_scaler.fit_transform(X_train))
X_test_scaled = pd.DataFrame(rb_scaler.transform(X_test))

## Linear Regression

In [16]:
linreg = LinearRegression()

# 10-fold cross-validation on the scaled training split. scikit-learn reports
# the negated MSE under the 'test_neg_mean_squared_error' key.
cv_results = cross_validate(
    linreg,
    X_train_scaled,
    y_train,
    cv=10,
    scoring=['neg_mean_squared_error'],
)

# Flip the sign back to obtain the mean MSE across folds.
mean_cv_mse = -cv_results['test_neg_mean_squared_error'].mean()
print(f"MSE: {mean_cv_mse}")

MSE: 0.06140413954202042


In [17]:
# Fit the regression on the whole scaled training split (cross_validate above
# works on clones, so `linreg` itself is still unfitted before this call).
linreg.fit(X=X_train_scaled, y=y_train)

LinearRegression()

In [19]:
# One row per feature of X_def. The target was fitted as a single-column
# frame, so the coefficients of that only target are taken with coef_[0].
coef_df = pd.DataFrame({'coef': linreg.coef_[0]}, index=X_def.columns)
coef_df['abs_coef'] = coef_df['coef'].abs()

# Largest coefficients (in absolute value) first.
coef_df.sort_values(by='abs_coef', ascending=False)

Unnamed: 0,coef,abs_coef
height_w_shoes,0.44808,0.44808
height_wo_shoes,-0.281,0.281
dws,-0.117246,0.117246
standing_reach,-0.109742,0.109742
blk_pct,-0.098053,0.098053
stl_pct,-0.075546,0.075546
hand_width,-0.07025,0.07025
weight,-0.070106,0.070106
pos_SG,-0.064318,0.064318
dbpm,0.062157,0.062157
