In [3]:
import pandas as pd

In [4]:
!pip install scikit-learn



In [5]:
nba_stats = pd.read_csv('nba_mvp_stats.csv')

In [6]:
del nba_stats['Unnamed: 0']

In [7]:
nba_stats.head()

Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,...,Pts Max,Share,Team,W,L,W/L%,GB,PS/G,PA/G,SRS
0,A.C. Green,SF,31,PHO,82,52,32.8,3.8,7.5,0.504,...,0.0,0.0,Phoenix Suns,59,23,0.72,0.0,110.6,106.8,3.86
1,Aaron Swinson,SF,24,PHO,9,0,5.7,1.1,2.0,0.556,...,0.0,0.0,Phoenix Suns,59,23,0.72,0.0,110.6,106.8,3.86
2,Antonio Lang,SF,22,PHO,12,0,4.4,0.3,0.8,0.4,...,0.0,0.0,Phoenix Suns,59,23,0.72,0.0,110.6,106.8,3.86
3,Charles Barkley,PF,31,PHO,68,66,35.0,8.1,16.8,0.486,...,1050.0,0.091,Phoenix Suns,59,23,0.72,0.0,110.6,106.8,3.86
4,Dan Majerle,SF,29,PHO,82,46,37.7,5.3,12.6,0.425,...,0.0,0.0,Phoenix Suns,59,23,0.72,0.0,110.6,106.8,3.86


In [8]:
pd.isnull(nba_stats).sum()

Player        0
Pos           0
Age           0
Tm            0
G             0
GS            0
MP            0
FG            0
FGA           0
FG%          61
3P            0
3PA           0
3P%        1898
2P            0
2PA           0
2P%         111
eFG%         61
FT            0
FTA           0
FT%         562
ORB           0
DRB           0
TRB           0
AST           0
STL           0
BLK           0
TOV           0
PF            0
PTS           0
Year          0
Pts Won       0
Pts Max       0
Share         0
Team          0
W             0
L             0
W/L%          0
GB            0
PS/G          0
PA/G          0
SRS           0
dtype: int64

In [9]:
nba_stats = nba_stats.fillna(0)

In [10]:
pd.isnull(nba_stats).sum()

Player     0
Pos        0
Age        0
Tm         0
G          0
GS         0
MP         0
FG         0
FGA        0
FG%        0
3P         0
3PA        0
3P%        0
2P         0
2PA        0
2P%        0
eFG%       0
FT         0
FTA        0
FT%        0
ORB        0
DRB        0
TRB        0
AST        0
STL        0
BLK        0
TOV        0
PF         0
PTS        0
Year       0
Pts Won    0
Pts Max    0
Share      0
Team       0
W          0
L          0
W/L%       0
GB         0
PS/G       0
PA/G       0
SRS        0
dtype: int64

In [11]:
nba_stats.columns

Index(['Player', 'Pos', 'Age', 'Tm', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG%', '3P',
       '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB',
       'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'Year',
       'Pts Won', 'Pts Max', 'Share', 'Team', 'W', 'L', 'W/L%', 'GB', 'PS/G',
       'PA/G', 'SRS'],
      dtype='object')

In [12]:
# the columns used to make the predictions - player, pos, tm, and team are not needed
# also remove coloumn from the mvp table coloumns - share, 
# the rest are predictors used to predict the share
predictors = ['Age', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG%', '3P',
              '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB',
              'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'Year',
              'W', 'L', 'W/L%', 'GB', 'PS/G','PA/G', 'SRS']

In [13]:
train = nba_stats[nba_stats['Year'] < 2023 ]
test = nba_stats[nba_stats['Year'] == 2023]

In [14]:
from sklearn.linear_model import Ridge

model = Ridge(alpha=0.1)

In [15]:
model.fit(train[predictors], train['Share'])

In [16]:
pred = model.predict(test[predictors])

In [17]:
pred = pd.DataFrame(pred , columns=['Predictions'], index=test.index)

In [18]:
pred

Unnamed: 0,Predictions
125,0.003323
126,0.030629
127,0.036789
128,0.223147
129,-0.000350
...,...
14109,-0.013690
14110,0.004279
14111,-0.012571
14112,0.021243


In [19]:
combine = pd.concat([test[['Player', 'Share']], pred], axis=1)

In [20]:
combine

Unnamed: 0,Player,Share,Predictions
125,A.J. Green,0.000,0.003323
126,Bobby Portis,0.000,0.030629
127,Brook Lopez,0.000,0.036789
128,Giannis Antetokounmpo,0.606,0.223147
129,Goran Dragić,0.000,-0.000350
...,...,...,...
14109,Mitchell Robinson,0.000,-0.013690
14110,Obi Toppin,0.000,0.004279
14111,Quentin Grimes,0.000,-0.012571
14112,RJ Barrett,0.000,0.021243


In [21]:
combine.sort_values('Share', ascending=False).head(15)

Unnamed: 0,Player,Share,Predictions
13904,Joel Embiid,0.915,0.195818
690,Nikola Jokić,0.674,0.17011
128,Giannis Antetokounmpo,0.606,0.223147
2850,Jayson Tatum,0.28,0.135242
1326,Shai Gilgeous-Alexander,0.046,0.145228
12817,Donovan Mitchell,0.03,0.083924
4153,Domantas Sabonis,0.027,0.093792
238,Luka Dončić,0.01,0.193217
6467,Stephen Curry,0.005,0.105812
10213,Jimmy Butler,0.003,0.108942


In [22]:
from sklearn.metrics import mean_squared_error

In [23]:
mean_squared_error(combine['Share'], combine['Predictions'])

0.002652907511493022

In [24]:
combine['Share'].value_counts()

Share
0.000    526
0.001      2
0.606      1
0.010      1
0.674      1
0.046      1
0.280      1
0.002      1
0.027      1
0.005      1
0.003      1
0.030      1
0.915      1
Name: count, dtype: int64

In [25]:
# add a rank
combine = combine.sort_values('Share', ascending=False)
combine['Rk'] = list(range(1, combine.shape[0] + 1))
combine.head(10)

Unnamed: 0,Player,Share,Predictions,Rk
13904,Joel Embiid,0.915,0.195818,1
690,Nikola Jokić,0.674,0.17011,2
128,Giannis Antetokounmpo,0.606,0.223147,3
2850,Jayson Tatum,0.28,0.135242,4
1326,Shai Gilgeous-Alexander,0.046,0.145228,5
12817,Donovan Mitchell,0.03,0.083924,6
4153,Domantas Sabonis,0.027,0.093792,7
238,Luka Dončić,0.01,0.193217,8
6467,Stephen Curry,0.005,0.105812,9
10213,Jimmy Butler,0.003,0.108942,10


In [26]:
combine = combine.sort_values('Predictions', ascending=False)
combine['Predicted Rk'] = list(range(1, combine.shape[0] + 1))
combine.head(10)

Unnamed: 0,Player,Share,Predictions,Rk,Predicted Rk
128,Giannis Antetokounmpo,0.606,0.223147,3,1
13904,Joel Embiid,0.915,0.195818,1,2
238,Luka Dončić,0.01,0.193217,8,3
690,Nikola Jokić,0.674,0.17011,2,4
1326,Shai Gilgeous-Alexander,0.046,0.145228,5,5
7746,Damian Lillard,0.0,0.137896,90,6
10803,Kevin Durant,0.0,0.137006,50,7
2850,Jayson Tatum,0.28,0.135242,4,8
8097,Anthony Davis,0.0,0.133679,139,9
8104,LeBron James,0.0,0.129874,132,10


In [27]:
# error metric
combine = combine.sort_values('Share', ascending=False)
combine.head(10)

Unnamed: 0,Player,Share,Predictions,Rk,Predicted Rk
13904,Joel Embiid,0.915,0.195818,1,2
690,Nikola Jokić,0.674,0.17011,2,4
128,Giannis Antetokounmpo,0.606,0.223147,3,1
2850,Jayson Tatum,0.28,0.135242,4,8
1326,Shai Gilgeous-Alexander,0.046,0.145228,5,5
12817,Donovan Mitchell,0.03,0.083924,6,24
4153,Domantas Sabonis,0.027,0.093792,7,17
238,Luka Dončić,0.01,0.193217,8,3
6467,Stephen Curry,0.005,0.105812,9,14
10213,Jimmy Butler,0.003,0.108942,10,13


In [28]:
def find_ap(combine):
    # sort by share and get the predicted top 5 players
    actual = combine.sort_values('Share', ascending=False).head(5)
    
    predicted = combine.sort_values('Predictions', ascending=False)
    ps = []
    found = 0
    seen = 1
    
    for index, row in predicted.iterrows():
        if row['Player'] in actual['Player'].values:
            found += 1
            ps.append(found/seen)
        seen += 1
    return sum(ps) / len(ps)

In [29]:
find_ap(combine)

0.835

In [30]:
# back testing 
years = list(range(1996, 2024))

In [None]:
# mean average precision
sum(aps)/len(aps)

In [32]:
def add_ranks(combine):
    combine = combine.sort_values('Share', ascending=False)
    combine['Rk'] = list(range(1, combine.shape[0] + 1))   
    
    combine = combine.sort_values('Predictions', ascending=False)
    combine['Predicted Rk'] = list(range(1, combine.shape[0] + 1))
    
    combine['diff'] = combine['Rk'] - combine['Predicted Rk']
    return combine

In [None]:
# Top 5 mvp vote getters 2002 and predicted difference
ranking = add_ranks(all_pred[1])
ranking[ranking['Rk']< 6].sort_values('diff', ascending=False)
# all_pred[1]

In [34]:
def backtest(nba_stats, model, year, predictors):
    aps = []
    all_pred = []
    # all predictions from 1999
    for year in years[5: ]:
        train = nba_stats[nba_stats['Year'] < year]
        test = nba_stats[nba_stats['Year'] == year]
        model.fit(train[predictors], train['Share'])

        pred = model.predict(test[predictors])
        # pred into data frame
        pred = pd.DataFrame(pred , columns=['Predictions'], index=test.index)
        # combine the predictions data frame with the test set
        combine = pd.concat([test[['Player', 'Share']], pred], axis=1)
        combine = add_ranks(combine)

        all_pred.append(combine)
        aps.append(find_ap(combine))
    return sum(aps)/len(aps), aps, pd.concat(all_pred)

In [35]:
mean_aps, aps, all_predictions = backtest(nba_stats, model, years[5: ], predictors)

In [36]:
mean_aps

0.7398047518007032

In [37]:
aps

[0.9028571428571428,
 0.72,
 0.9428571428571428,
 0.5232600732600733,
 0.504390243902439,
 0.3894428152492668,
 0.6803174603174603,
 0.821111111111111,
 0.9111111111111111,
 0.788888888888889,
 0.7392857142857142,
 0.4083333333333333,
 0.676923076923077,
 0.6477272727272727,
 0.9266666666666665,
 0.8333333333333333,
 0.8111111111111111,
 0.8,
 0.6375757575757576,
 0.9428571428571428,
 0.7136363636363636,
 0.8588235294117647,
 0.835]

In [38]:
all_predictions

Unnamed: 0,Player,Share,Predictions,Rk,Predicted Rk,diff
12412,Shaquille O'Neal,0.466,0.272036,3,1,2
9701,Chris Webber,0.420,0.157423,4,2,2
9152,Tim Duncan,0.569,0.137012,2,3,-1
12691,Karl Malone,0.017,0.134758,7,4,3
1091,Allen Iverson,0.904,0.123538,1,5,-4
...,...,...,...,...,...,...
4851,Dorian Finney-Smith,0.000,-0.042059,422,535,-113
7753,Justin Minaya,0.000,-0.045886,83,536,-453
7754,Justise Winslow,0.000,-0.047270,82,537,-455
11818,Jacob Gilyard,0.000,-0.048135,209,538,-329


In [39]:
# the most import mvp vote sharing is eFG%
pd.concat([pd.Series(model.coef_), pd.Series(predictors)], axis=1).sort_values(0, ascending=False)

Unnamed: 0,0,1
13,0.100185,eFG%
18,0.033148,DRB
29,0.025489,W/L%
17,0.021175,ORB
10,0.017229,2P
15,0.011335,FTA
21,0.009728,STL
12,0.008449,2P%
22,0.008302,BLK
25,0.007799,PTS


In [57]:
# add more predictors
# mean steal, blk, pts, ast per season per player
# ratios = nba_stats[['PTS','AST','STL','BLK', 'Year']].groupby('Year').apply(lambda x: x/x.mean())
ratios = nba_stats[["PTS", "AST", "STL", "BLK", "3P", "Year"]].groupby("Year").apply(lambda x: x/x.mean())

In [58]:
mean_ap, aps, all_predictions = backtest(nba_stats, reg, years[5:], predictors)

Unnamed: 0_level_0,Unnamed: 1_level_0,PTS,AST,STL,BLK,3P,Year
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1995,0,1.319103,0.758048,0.991605,0.931570,1.107162,1.0
1995,1,0.317998,0.151610,0.141658,0.000000,0.000000,1.0
1995,2,0.105999,0.050537,0.000000,0.465785,0.000000,1.0
1995,3,2.708873,2.071999,2.266527,1.630247,2.435757,1.0
1995,4,1.837322,2.071999,1.699895,1.164462,5.314379,1.0
...,...,...,...,...,...,...,...
2024,13861,0.448439,0.050287,0.351165,0.000000,0.212681,1.0
2024,13862,0.519246,0.402293,2.282573,1.527117,0.744385,1.0
2024,13863,0.519246,0.553153,1.229078,0.000000,1.382429,1.0
2024,13864,2.112386,1.659460,1.755825,1.272598,1.276088,1.0


In [None]:
nba_stats[["PTS_R", "AST_R", "STL_R", "BLK_R", "3P_R"]] = ratios[["PTS", "AST", "STL", "BLK", "3P"]]

In [None]:
predictors += ["PTS_R", "AST_R", "STL_R", "BLK_R", "3P_R"]