In [1]:
import pandas as pd
import numpy as np

In [2]:
df_21= pd.read_csv('data21nd.csv')
df_19 = pd.read_csv('data19nd.csv')

In [3]:
list(df_21.columns)

['and_ones',
 'assist_percentage',
 'assists',
 'assists_per_poss',
 'block_percentage',
 'blocking_fouls',
 'blocks',
 'blocks_per_poss',
 'box_plus_minus',
 'center_percentage',
 'defensive_box_plus_minus',
 'defensive_rebound_percentage',
 'defensive_rebounds',
 'defensive_rebounds_per_poss',
 'defensive_win_shares',
 'dunks',
 'effective_field_goal_percentage',
 'field_goal_attempts',
 'field_goal_attempts_per_poss',
 'field_goal_perc_sixteen_foot_plus_two_pointers',
 'field_goal_perc_ten_to_sixteen_feet',
 'field_goal_perc_three_to_ten_feet',
 'field_goal_perc_zero_to_three_feet',
 'field_goal_percentage',
 'field_goals',
 'field_goals_per_poss',
 'free_throw_attempt_rate',
 'free_throw_attempts',
 'free_throw_attempts_per_poss',
 'free_throw_percentage',
 'free_throws',
 'free_throws_per_poss',
 'games_played',
 'games_started',
 'half_court_heaves',
 'half_court_heaves_made',
 'height',
 'lost_ball_turnovers',
 'minutes_played',
 'nationality',
 'net_plus_minus',
 'offensive_box

Notre objectif est d'entraîner un modèle de régression sur les données de la saison NBA 2018-2019, pour ensuite prédire les salaires de la saison 2020-2021 (nous n'utilisons pas la saison 2019-2020 qui a été perturbée par la crise du COVID). Pour ce faire, nous devons d'abord normaliser les salaires à cause de l'inflation.

In [4]:
df_19['salary']=(df_19['salary']-df_19['salary'].mean())/df_19['salary'].std()
df_19['salary']

0      1.813458
1     -0.658233
2     -0.725268
3      2.735092
4     -0.024559
         ...   
525   -0.597703
526   -0.570333
527   -0.834806
528   -0.449331
529         NaN
Name: salary, Length: 530, dtype: float64

# Machine Learning with Scikit Learn

### Creation of X (feature Matrix)

Nous avons sélectionné les quelques variables explicatives présentes dans nos données qui nous semblaient être les plus pertinentes pour juger la valeur salariale d'un joueur. De plus, nombre de variables sont très corrélées entre elles, ainsi les utiliser ensemble n'aurait que peu d'intérêt. Nous ne considérons également que les joueurs ayant comptabilisé plus de 20 matchs sur la saison en question.

In [5]:
x = df_19[df_19['games_played']>=20][['points_per_poss','total_rebounds_per_poss','assists_per_poss','win_shares','player_efficiency_rating']]
x.dropna()
x

Unnamed: 0,points_per_poss,total_rebounds_per_poss,assists_per_poss,win_shares,player_efficiency_rating
0,23.2,10.7,5.4,5.1,15.1
1,22.3,5.1,6.6,0.9,11.9
2,16.2,7.8,1.3,0.9,8.8
3,22.6,11.2,6.9,7.5,20.2
4,16.1,12.9,2.2,5.8,13.2
...,...,...,...,...,...
520,18.3,4.8,6.0,1.9,12.7
522,18.3,11.6,2.5,2.9,13.5
523,33.3,6.6,6.3,2.8,18.7
525,15.0,14.9,5.1,2.3,13.7


### Creation of Y (target)

In [6]:
y = df_19[df_19['games_played']>=20]['salary']
y

0      1.813458
1     -0.658233
2     -0.725268
3      2.735092
4     -0.024559
         ...   
520   -0.521574
522   -0.442582
523    1.550838
525   -0.597703
528   -0.449331
Name: salary, Length: 423, dtype: float64

Ici nous identifions puis retirons de y et de x les joueurs pour lesquels le salaire n'est pas disponible.

In [7]:
index_with_nan = y.index[y.isna()]
print(index_with_nan)

Int64Index([7, 9, 86, 138, 230, 262, 263, 443], dtype='int64')


In [8]:
y = y.dropna()
y

0      1.813458
1     -0.658233
2     -0.725268
3      2.735092
4     -0.024559
         ...   
520   -0.521574
522   -0.442582
523    1.550838
525   -0.597703
528   -0.449331
Name: salary, Length: 415, dtype: float64

In [9]:
x = x.drop(index_with_nan)
x

Unnamed: 0,points_per_poss,total_rebounds_per_poss,assists_per_poss,win_shares,player_efficiency_rating
0,23.2,10.7,5.4,5.1,15.1
1,22.3,5.1,6.6,0.9,11.9
2,16.2,7.8,1.3,0.9,8.8
3,22.6,11.2,6.9,7.5,20.2
4,16.1,12.9,2.2,5.8,13.2
...,...,...,...,...,...
520,18.3,4.8,6.0,1.9,12.7
522,18.3,11.6,2.5,2.9,13.5
523,33.3,6.6,6.3,2.8,18.7
525,15.0,14.9,5.1,2.3,13.7


## Machine Learning Regressors

Nous essayons ici différents modèles de Machine Learning utilisant des méthodes de régression pour tenter de prédire les salaires des joueurs.

In [10]:
from sklearn.model_selection import train_test_split

x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2)

In [11]:
np.random.seed(53)

In [12]:
# 1
from sklearn import svm
reg1 = svm.SVR(gamma = 'scale')
reg1 = reg1.fit(x_train, y_train)

In [13]:
y_preds = reg1.predict(x_test)
y_preds

array([-0.0517241 ,  0.164605  , -0.41069101, -0.45244479, -0.48008987,
       -0.62185917,  0.74570391,  0.33689536, -0.20142989,  0.80387827,
       -0.55039835, -0.30656344, -0.45056225, -0.43879135,  0.09392269,
       -0.23046366, -0.19448564, -0.01693651, -0.42047196, -0.39427359,
        0.1204162 , -0.18777799, -0.66919625, -0.51786175, -0.45499398,
        0.92604117,  0.62589566,  0.30250918,  0.95082185, -0.28118499,
       -0.34490238, -0.62206143, -0.43945576, -0.13471027,  0.82139472,
       -0.6230668 , -0.37183702, -0.30772109,  0.52715268, -0.43387217,
       -0.59114813,  0.31817046,  0.74001036, -0.10794813, -0.52323925,
       -0.56381617, -0.18717155,  1.00583384, -0.61203353, -0.56665414,
       -0.29578785, -0.58794883,  0.54823466, -0.54900944,  1.01624274,
       -0.65487642, -0.36238577, -0.63750722,  0.21079168, -0.11682329,
        0.14208757, -0.16499071, -0.46894737, -0.40599129, -0.43686354,
       -0.6394125 ,  0.80868148, -0.02577455, -0.27095236, -0.52

In [14]:
reg1.score(x_test,y_test)

0.2801530687984334

In [15]:
np.random.seed(37)

In [16]:
# 2
from sklearn.linear_model import LinearRegression
reg2 = LinearRegression()
reg2 = reg2.fit(x_train,y_train)

In [17]:
y_preds2  = reg2.predict(x_test)
y_preds2

array([ 0.43500088,  0.56773576, -0.04597572, -0.1643885 , -0.14106468,
       -0.40257576,  1.0381317 ,  0.52022465,  0.09674933,  0.83847647,
       -0.26237192,  0.08088345, -0.23991255, -0.10062323,  0.3884655 ,
       -0.02885166,  0.25697488,  0.13258267, -0.13851796, -0.33598266,
        0.62827653,  0.05232813, -0.51377983, -0.3092819 , -0.16885615,
        1.15425908,  0.7485175 ,  0.61291356,  1.4196941 ,  0.11640208,
        0.01254454, -0.49832603, -0.13601096, -0.09112229,  0.99436441,
       -0.31369752, -0.09815495,  0.08037414,  0.96704557, -0.20633666,
       -0.43138841,  0.88454789,  1.30549485,  0.28294027, -0.22637732,
       -0.26656882,  0.18314568,  1.69765281, -0.68258101, -0.30621897,
        0.04890796, -0.95820327,  0.97388186, -0.26460104,  1.11026532,
       -0.53273329,  0.00537634, -0.4753016 ,  0.70151066,  0.41208401,
        0.32859423,  0.39024067, -0.18727981, -0.08578218, -0.17176392,
       -0.42892006,  0.89846652,  0.48965082,  0.05804036, -0.25

In [18]:
reg2.score(x_test, y_test)

0.3206247862612256

In [19]:
np.random.seed(42)

In [23]:
#3
from sklearn.ensemble import RandomForestRegressor
reg3 = RandomForestRegressor(n_estimators = 100)
reg3 = reg3.fit(x_train, y_train)

In [24]:
y_preds3 = reg3.predict(x_test)
y_preds3

array([-0.05995993,  0.92525346, -0.05098332, -0.40728664, -0.20015116,
       -0.43925543,  0.55329369,  0.09816212, -0.11284302,  1.11361472,
       -0.19720113,  0.46121622, -0.36772607, -0.3553628 ,  0.63881624,
        0.03725512,  0.06312455,  0.33734049,  0.18719552, -0.17059172,
       -0.08838994,  0.7352722 , -0.41264594, -0.44056856, -0.19274673,
        1.12851855,  0.98979945,  0.67256803,  1.87694973,  0.02509876,
       -0.01489442, -0.34413366,  0.01181848, -0.03392019,  0.69831653,
       -0.61774465,  0.12760808,  0.54551547,  1.05715303, -0.09353196,
       -0.52984582,  0.10914712,  1.04860758,  0.48342942, -0.14537859,
       -0.36432181,  0.38168046,  2.50761065, -0.12168264, -0.2283089 ,
       -0.2401908 ,  0.01776318,  0.77923798, -0.44464993,  0.48321638,
       -0.56902707,  0.29169016, -0.56891062, -0.17756972,  0.26627383,
        0.65868648, -0.06054521, -0.3949775 ,  0.09147116,  0.20991226,
       -0.31748821,  0.52652356, -0.01604545, -0.26259104, -0.23

In [25]:
reg3.score(x_test, y_test)

0.1450714873855986

Après ces premiers tests, nous nous sommes dit qu'il était très commun au basket de comparer les performances des joueurs en utilisant trois variables : les points par match, les passes par match et les rebonds par match. Nous avons alors créé ces trois nouvelles variables (modified data) qui n'étaient initialement pas dans notre base de données, et les avons utilisés pour nos modèles.

### Nouvelles variables

In [26]:
x = df_19[df_19['games_played']>=20][['points_per_game','reb_per_game','ast_per_game','win_shares','player_efficiency_rating']]
x.dropna()
x

Unnamed: 0,points_per_game,reb_per_game,ast_per_game,win_shares,player_efficiency_rating
0,15.974359,7.358974,3.705128,5.1,15.1
1,5.880000,1.340000,1.740000,0.9,11.9
2,3.950820,1.901639,0.327869,0.9,8.8
3,13.602941,6.735294,4.161765,7.5,20.2
4,9.382716,7.530864,1.283951,5.8,13.2
...,...,...,...,...,...
520,5.915493,1.535211,1.929577,1.9,12.7
522,6.649351,4.207792,0.922078,2.9,13.5
523,23.682540,4.666667,4.492063,2.8,18.7
525,3.926471,3.897059,1.338235,2.3,13.7


In [27]:
x = x.drop(index_with_nan)
x

Unnamed: 0,points_per_game,reb_per_game,ast_per_game,win_shares,player_efficiency_rating
0,15.974359,7.358974,3.705128,5.1,15.1
1,5.880000,1.340000,1.740000,0.9,11.9
2,3.950820,1.901639,0.327869,0.9,8.8
3,13.602941,6.735294,4.161765,7.5,20.2
4,9.382716,7.530864,1.283951,5.8,13.2
...,...,...,...,...,...
520,5.915493,1.535211,1.929577,1.9,12.7
522,6.649351,4.207792,0.922078,2.9,13.5
523,23.682540,4.666667,4.492063,2.8,18.7
525,3.926471,3.897059,1.338235,2.3,13.7


In [28]:
from sklearn.model_selection import train_test_split

x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2)

In [29]:
np.random.seed(53)

In [30]:
# 1
from sklearn import svm
reg4 = svm.SVR(gamma = 'scale')
reg4 = reg4.fit(x_train, y_train)

In [31]:
y_preds4 = reg4.predict(x_test)
y_preds4

array([ 0.41073195,  0.23320155, -0.33107522, -0.65693628,  1.48091468,
        2.5316821 ,  1.84094374,  0.17876154, -0.59159811, -0.59183222,
        0.87284608, -0.52797079,  0.1722785 ,  1.46406821, -0.59109836,
       -0.64193005, -0.42418092,  0.57127501,  0.35667495, -0.11203781,
       -0.53422385,  0.59019193, -0.43366006, -0.60081736, -0.01476049,
       -0.57899524, -0.59930225, -0.45585584,  0.42669326, -0.39957376,
       -0.3343981 , -0.59937327, -0.21681049, -0.53791944,  1.27962514,
       -0.30170982, -0.57174366, -0.07832434,  0.28027004, -0.49188747,
        0.38380613, -0.54754078, -0.48015928, -0.45774673, -0.69906712,
        0.03671409,  2.3406377 ,  0.00286863,  0.19770173, -0.07379335,
       -0.65022009, -0.58972411,  0.04348962, -0.38925836, -0.67780646,
       -0.60242981, -0.64342058,  1.14465334, -0.19650086, -0.50466189,
        1.36845208,  1.17448059,  0.50076079,  0.2572172 ,  0.93390522,
       -0.56978804, -0.03951739, -0.11950151, -0.49619317,  1.29

In [32]:
reg4.score(x_test,y_test)

0.2782638309536052

In [33]:
np.random.seed(37)

In [34]:
# 2
from sklearn.linear_model import LinearRegression
reg5 = LinearRegression()
reg5 = reg5.fit(x_train,y_train)

In [35]:
y_preds5  = reg5.predict(x_test)
y_preds5

array([ 0.70913937,  0.45018791,  0.12430459, -0.55879644,  1.26502526,
        1.91327366,  1.37901972,  0.47149188, -0.52531831, -0.52315835,
        0.82664271, -0.43219479,  0.63097257,  1.13289577, -0.55231656,
       -0.770504  , -0.05994038,  0.7519882 ,  0.67006058,  0.10649177,
       -0.25847793,  0.72098105, -0.24991652, -0.51807344,  0.45860196,
       -0.26749762, -0.49438662, -0.06738546,  0.73227381, -0.18218355,
       -0.09801994, -0.47018812,  0.12701181, -0.22507302,  1.07943774,
        0.21423069, -0.33940326,  0.20821451,  0.48190582, -0.1906311 ,
        0.90342083, -0.15504021, -0.29053854, -0.16134121, -0.6710242 ,
        0.40113972,  2.61379459,  0.23937467,  0.55891294,  0.27975893,
       -0.7082813 , -0.52565708,  0.31457894, -0.124299  , -0.64158771,
       -0.51367163, -0.48461655,  1.2091462 ,  0.08221139, -0.1277685 ,
        1.3924274 ,  1.11746036,  0.66792579,  0.72175828,  1.06390052,
       -0.29850298,  0.14687694,  0.31533871, -0.27464885,  1.04

In [36]:
reg5.score(x_test, y_test)

0.31799869209764764

In [37]:
np.random.seed(42)

In [38]:
# 3
from sklearn.ensemble import RandomForestRegressor
reg6 = RandomForestRegressor(n_estimators = 100)
reg6 = reg6.fit(x_train, y_train)

In [39]:
y_preds6 = reg6.predict(x_test)
y_preds6

array([ 7.73940502e-01,  4.81683828e-01, -7.97506313e-02, -4.04420489e-01,
        1.34019125e+00,  2.54613583e+00,  1.32245320e+00,  7.24145795e-01,
       -4.75330037e-01, -4.75622061e-01,  1.02418357e+00, -2.51477085e-01,
        3.87739298e-01,  1.14595890e+00,  6.87964526e-01, -6.03365810e-01,
       -1.68117195e-01, -4.12505375e-02,  5.29794376e-01,  4.86204983e-01,
       -9.88835495e-02,  6.64674105e-01, -3.12091528e-01, -1.97817882e-01,
        9.48624685e-02, -2.12400203e-01, -3.28687305e-01, -1.37854592e-01,
        3.89531683e-01, -2.01311350e-01, -2.41528088e-01, -3.92552768e-01,
        2.07123140e-01, -4.69271377e-02,  1.44415725e+00,  2.36032865e-01,
       -5.49300006e-01,  2.89844953e-01,  2.87478716e-01, -9.49849233e-02,
        5.29215410e-01, -3.17221966e-01, -4.46731942e-01, -3.08057120e-01,
       -6.28638210e-01,  5.23744781e-01,  2.76258453e+00,  2.14625799e-01,
        8.21719533e-01,  1.70369651e-01, -6.77134423e-01, -5.30422316e-01,
        2.59189985e-01,  

In [40]:
reg6.score(x_test, y_test)

0.1806250503309179

Après avoir essayer ces trois modèles, le plus efficace (lorsque l'on se base sur le R-squared de la régression) semble être le sklearn.linear_model.LinearRegression.

## Improving our model

In [42]:
reg5.get_params()

{'copy_X': True, 'fit_intercept': True, 'n_jobs': None, 'normalize': False}

In [45]:
from sklearn.linear_model import LinearRegression
reg7 = LinearRegression(normalize = True)
reg7 = reg7.fit(x_train,y_train)

In [46]:
reg7.score(x_test,y_test)

0.31799869209764775

In [47]:
from sklearn.linear_model import LinearRegression
reg8 = LinearRegression(fit_intercept = False)
reg8 = reg8.fit(x_train,y_train)

In [48]:
reg7.score(x_test,y_test)

0.31799869209764775

Visiblement, changer les paramètres par défaut du modèle ne modifie pas significativement sa performance.

In [49]:
# On enregistre le modèle entraîné pour le réutiliser dans le projet final

In [50]:
import pickle

In [51]:
pickle.dump(reg5, open('reg_linear_regression.pkl', 'wb'))

In [41]:
L = []
for i in range (10, 210, 10):
    np.random.seed(25)
    reg = RandomForestRegressor(n_estimators = i)
    reg = reg.fit(x_train, y_train)
    r2 = reg.score(x_test,y_test)
    print(i, "R squared =", r2)
    L.append(r2)
print("n_opti =",(L.index(max(L))+1)*10)

10 R squared = 0.17055384862602463
20 R squared = 0.16308237593163188
30 R squared = 0.14210582402991567
40 R squared = 0.1381060425041245
50 R squared = 0.15785418053296063
60 R squared = 0.15737908624142727
70 R squared = 0.17866185903841558
80 R squared = 0.1861789682434657
90 R squared = 0.1740758670515834
100 R squared = 0.16887289884499634
110 R squared = 0.1717284611157689
120 R squared = 0.17469902755891564
130 R squared = 0.17902669397501347
140 R squared = 0.1777247887310499
150 R squared = 0.1848099872932487
160 R squared = 0.18011473730929928
170 R squared = 0.17956816247938456
180 R squared = 0.17771836647898342
190 R squared = 0.1710043145146003
200 R squared = 0.1718975285905412
n_opti = 80


L'objectif est maintenant d'utiliser notre modèle entraîné avec les données de la saison 2018-2019 sur les données de la saison 2020-2021, pour pouvoir ensuite comparer les valeurs réelles des salaires avec les valeurs prédites, qu'on pourrait qualifiées de "théoriques".

In [None]:
df_new = df_21[df_21['games_played']>=20]

In [None]:
loaded_pickle_model = pickle.load(open('reg_linear_regression.pkl','rb'))

In [None]:
mn = df_new['salary'].mean()
st = df_new['salary'].std()

In [None]:
df_new['predicted salary'] = mn + st*loaded_pickle_model.predict(df_21[df_21['games_played']>=20][['points_per_game','reb_per_game','ast_per_game','win_shares','player_efficiency_rating']])

In [None]:
df_new

In [None]:
df_new['gap salaries']= df_new['salary'] - df_new['predicted salary']
df_new['error ratio']=df_new['gap salaries']/df_new['salary']
df_new

In [None]:
df_new.groupby('team')['gap salaries'].sum()