In [1]:
# Importing required libraries
import numpy as np
import pandas as pd

from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import mean_squared_error

In [2]:
# Reading the data file
stats = pd.read_csv("player_mvp_stats.csv")

In [3]:
stats

Unnamed: 0.1,Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,...,Pts Max,Share,Team,W,L,W/L%,GB,PS/G,PA/G,SRS
0,0,A.C. Green,PF,27,LAL,82,21,26.4,3.1,6.6,...,0.0,0.0,Los Angeles Lakers,58,24,0.707,5.0,106.3,99.6,6.73
1,1,Byron Scott,SG,29,LAL,82,82,32.1,6.1,12.8,...,0.0,0.0,Los Angeles Lakers,58,24,0.707,5.0,106.3,99.6,6.73
2,2,Elden Campbell,PF,22,LAL,52,0,7.3,1.1,2.4,...,0.0,0.0,Los Angeles Lakers,58,24,0.707,5.0,106.3,99.6,6.73
3,3,Irving Thomas,PF,25,LAL,26,0,4.2,0.7,1.9,...,0.0,0.0,Los Angeles Lakers,58,24,0.707,5.0,106.3,99.6,6.73
4,4,James Worthy,SF,29,LAL,78,74,38.6,9.2,18.7,...,0.0,0.0,Los Angeles Lakers,58,24,0.707,5.0,106.3,99.6,6.73
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14692,14692,Spencer Hawes,PF,28,MIL,54,1,14.8,2.5,5.1,...,0.0,0.0,Milwaukee Bucks,42,40,0.512,9.0,103.6,103.8,-0.45
14693,14693,Steve Novak,PF,33,MIL,8,0,2.8,0.3,0.9,...,0.0,0.0,Milwaukee Bucks,42,40,0.512,9.0,103.6,103.8,-0.45
14694,14694,Terrence Jones,PF,25,MIL,54,12,23.5,4.3,9.1,...,0.0,0.0,Milwaukee Bucks,42,40,0.512,9.0,103.6,103.8,-0.45
14695,14695,Thon Maker,C,19,MIL,57,34,9.9,1.5,3.2,...,0.0,0.0,Milwaukee Bucks,42,40,0.512,9.0,103.6,103.8,-0.45


# Data Cleaning

In [4]:
del stats["Unnamed: 0"]

In [5]:
# Checking for null values
pd.isnull(stats).sum()

Player        0
Pos           0
Age           0
Tm            0
G             0
GS            0
MP            0
FG            0
FGA           0
FG%          59
3P            0
3PA           0
3P%        2086
2P            0
2PA           0
2P%         100
eFG%         59
FT            0
FTA           0
FT%         521
ORB           0
DRB           0
TRB           0
AST           0
STL           0
BLK           0
TOV           0
PF            0
PTS           0
Year          0
Pts Won       0
Pts Max       0
Share         0
Team          0
W             0
L             0
W/L%          0
GB            0
PS/G          0
PA/G          0
SRS           0
dtype: int64

In [6]:
stats[pd.isnull(stats["3P%"])][["Player", "3PA"]]

Unnamed: 0,Player,3PA
2,Elden Campbell,0.0
3,Irving Thomas,0.0
18,Jack Haley,0.0
20,Keith Owens,0.0
30,Benoit Benjamin,0.0
...,...,...
14666,Evan Eschmeyer,0.0
14667,Gheorghe Mureșan,0.0
14669,Jim McIlvaine,0.0
14675,Mark Hendrickson,0.0


In [7]:
stats[pd.isnull(stats["FT%"])][["Player", "FTA"]]

Unnamed: 0,Player,FTA
77,John Coker,0.0
92,Jason Sasser,0.0
103,Adrian Caldwell,0.0
119,Bruno Šundov,0.0
158,Jamal Robinson,0.0
...,...,...
14556,Mark McNamara,0.0
14584,Luke Zeller,0.0
14637,Myron Brown,0.0
14659,Malcolm Lee,0.0


We can see that there are null values in the percentage columns. From the analysis we did above the conclusion is that those percentages are null because the player did not make any FT or 3PA etc. So, we will replace those null values with "0". Technically this is not right because if you did not attempt any 3P doesn't mean your percentage is 0, but in this case we can do it this way because if the player did not attempt any FT or 3P means there is no chance he is in the MVP race.

In [8]:
# Replacing null values with 0
stats = stats.fillna(0)

# Training Machine Learning Models

In [9]:
stats.columns

Index(['Player', 'Pos', 'Age', 'Tm', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG%', '3P',
       '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB',
       'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'Year',
       'Pts Won', 'Pts Max', 'Share', 'Team', 'W', 'L', 'W/L%', 'GB', 'PS/G',
       'PA/G', 'SRS'],
      dtype='object')

In [10]:
predictors = ['Age', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG%', '3P',
       '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB',
       'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'Year', 'W', 'L', 'W/L%', 'GB', 'PS/G',
       'PA/G', 'SRS']

In [11]:
train = stats[stats["Year"] < 2022]

In [12]:
test = stats[stats["Year"] == 2022]

In [13]:
# Ridge Regression Model
reg = Ridge(alpha=.1) # alpha controls just how much the coefficient is gonna be shrunk to prevent overfitting

In [14]:
reg.fit(train[predictors], train["Share"])

Ridge(alpha=0.1)

In [15]:
predictions = reg.predict(test[predictors])

In [16]:
# Converting predictions from numpy array to pandas dataframe
predictions = pd.DataFrame(predictions, columns=["predictions"], index=test.index)

In [17]:
predictions

Unnamed: 0,predictions
648,0.012934
649,-0.028142
650,-0.006163
651,0.016564
652,-0.004820
...,...
12508,-0.019380
12509,-0.010196
12510,0.003810
12511,0.001162


In [18]:
# Combining test data and predictions
combination = pd.concat([test[["Player", "Share"]], predictions], axis=1)

In [19]:
combination

Unnamed: 0,Player,Share,predictions
648,Aaron Gordon,0.0,0.012934
649,Austin Rivers,0.0,-0.028142
650,Bol Bol,0.0,-0.006163
651,Bones Hyland,0.0,0.016564
652,Bryn Forbes,0.0,-0.004820
...,...,...,...
12508,Micah Potter,0.0,-0.019380
12509,Rodney McGruder,0.0,-0.010196
12510,Saben Lee,0.0,0.003810
12511,Saddiq Bey,0.0,0.001162


In [20]:
combination.sort_values("Share", ascending=False).head(10)

Unnamed: 0,Player,Share,predictions
663,Nikola Jokić,0.875,0.190365
837,Joel Embiid,0.706,0.190462
11678,Giannis Antetokounmpo,0.595,0.21941
907,Devin Booker,0.216,0.091309
11469,Luka Dončić,0.146,0.157395
1179,Jayson Tatum,0.043,0.095902
12226,Ja Morant,0.01,0.120508
6398,Stephen Curry,0.004,0.093138
905,Chris Paul,0.002,0.078329
8241,LeBron James,0.001,0.157828


In [21]:
mean_squared_error(combination["Share"], combination["predictions"])

0.002240133917395923

We only care about the top 10 players or something because 99% of players don't get any mvp votes. So, the above error metric doesn't make any sense.

In [22]:
# Creating a rank column based on the actual data
combination = combination.sort_values("Share", ascending=False)
combination["Rk"] = list(range(1, combination.shape[0]+1))

In [23]:
combination.head(10)

Unnamed: 0,Player,Share,predictions,Rk
663,Nikola Jokić,0.875,0.190365,1
837,Joel Embiid,0.706,0.190462,2
11678,Giannis Antetokounmpo,0.595,0.21941,3
907,Devin Booker,0.216,0.091309,4
11469,Luka Dončić,0.146,0.157395,5
1179,Jayson Tatum,0.043,0.095902,6
12226,Ja Morant,0.01,0.120508,7
6398,Stephen Curry,0.004,0.093138,8
905,Chris Paul,0.002,0.078329,9
8241,LeBron James,0.001,0.157828,10


In [24]:
# Creating a rank column based on the predictions data
combination = combination.sort_values("predictions", ascending=False)
combination["Predicted_Rk"] = list(range(1, combination.shape[0]+1))

In [25]:
combination.head(10)

Unnamed: 0,Player,Share,predictions,Rk,Predicted_Rk
11678,Giannis Antetokounmpo,0.595,0.21941,3,1
837,Joel Embiid,0.706,0.190462,2,2
663,Nikola Jokić,0.875,0.190365,1,3
8241,LeBron James,0.001,0.157828,10,4
11469,Luka Dončić,0.146,0.157395,5,5
6185,Kevin Durant,0.001,0.140627,12,6
12226,Ja Morant,0.01,0.120508,7,7
11820,Trae Young,0.0,0.109246,289,8
8231,Anthony Davis,0.0,0.107306,112,9
836,James Harden,0.0,0.103584,393,10


So, the metric we care about is how accurately the model is predictiong the top 5 mvp ranking. So, we want to know how many of the top 5 did the model find and how many ranks did it have to go through to find them.

In [26]:
# Error metric we care about
def find_ap(combination):
    actual = combination.sort_values("Share", ascending=False).head(5)
    predicted = combination.sort_values("predictions", ascending=False)
    ps = []
    found = 0
    seen = 1
    for index, row in predicted.iterrows():
        if row["Player"] in actual["Player"].values:
            found += 1
            ps.append(found/seen)
        seen += 1
    return sum(ps) / len(ps)

In [27]:
find_ap(combination)

0.8188235294117646

# Predictions for most of the years

In [28]:
years = list(range(1991, 2023))

In [29]:
# Backtesting
aps = []
all_predictions = []
for year in years[5:]:
    train = stats[stats["Year"] < year]
    test = stats[stats["Year"] == year]
    reg.fit(train[predictors], train["Share"])
    predictions = reg.predict(test[predictors])
    predictions = pd.DataFrame(predictions, columns=["predictions"], index=test.index)
    combination = pd.concat([test[["Player", "Share"]], predictions], axis=1)
    all_predictions.append(combination)
    aps.append(find_ap(combination))

In [30]:
# Mean Average Precision
sum(aps) / len(aps)

0.7152712173135063

# Diagnosing the algorithm

In [31]:
def add_ranks(combination):
    combination = combination.sort_values("Share", ascending=False)
    combination["Rk"] = list(range(1, combination.shape[0]+1))
    combination = combination.sort_values("predictions", ascending=False)
    combination["Predicted_Rk"] = list(range(1, combination.shape[0]+1))
    combination["Diff"] = combination["Rk"] - combination["Predicted_Rk"]
    return combination

In [32]:
ranking = add_ranks(all_predictions[1])
ranking[ranking["Rk"] < 6].sort_values("Diff", ascending=False)

Unnamed: 0,Player,Share,predictions,Rk,Predicted_Rk,Diff
1710,Karl Malone,0.857,0.192318,1,2,-1
10976,Michael Jordan,0.832,0.167629,2,3,-1
970,Grant Hill,0.327,0.128646,3,6,-3
4912,Tim Hardaway,0.207,0.059984,4,20,-16
8642,Glen Rice,0.117,0.03311,5,53,-48


In [33]:
# making backtest accessible
def backtest(stats, model, year, predictors):
    aps = []
    all_predictions = []
    for year in years[5:]:
        train = stats[stats["Year"] < year]
        test = stats[stats["Year"] == year]
        model.fit(train[predictors], train["Share"])
        predictions = model.predict(test[predictors])
        predictions = pd.DataFrame(predictions, columns=["predictions"], index=test.index)
        combination = pd.concat([test[["Player", "Share"]], predictions], axis=1)
        combination = add_ranks(combination)
        all_predictions.append(combination)
        aps.append(find_ap(combination))
    return sum(aps) / len(aps), aps, pd.concat(all_predictions)

In [34]:
mean_ap, aps, all_predictions = backtest(stats, reg, years[5:], predictors)

In [35]:
mean_ap

0.7152712173135063

In [36]:
all_predictions[all_predictions["Rk"] < 6].sort_values("Diff").head(10)

Unnamed: 0,Player,Share,predictions,Rk,Predicted_Rk,Diff
1334,Jason Kidd,0.712,0.02821,2,52,-50
8642,Glen Rice,0.117,0.03311,5,53,-48
5420,Steve Nash,0.839,0.0341,1,45,-44
8910,Peja Stojaković,0.228,0.03627,4,38,-34
13331,Joakim Noah,0.258,0.046968,4,37,-33
5438,Steve Nash,0.739,0.054129,1,34,-33
3849,Chauncey Billups,0.344,0.052696,5,35,-30
1499,Chris Paul,0.138,0.072293,5,33,-28
5453,Steve Nash,0.785,0.074421,2,21,-19
4912,Tim Hardaway,0.207,0.059984,4,20,-16


Note to self: Look at why the algorithm is misrepresenting these players.

In [37]:
reg.coef_

array([ 2.93735224e-04,  9.32496060e-05,  4.36070215e-06, -4.06678753e-03,
        3.35298556e-03,  4.82731071e-03, -1.55936167e-01,  1.70143855e-03,
       -9.68145388e-03, -9.69457504e-03,  1.64556965e-02, -1.64638115e-02,
        7.05375427e-03,  8.78520556e-02, -6.34362495e-03,  1.04141393e-02,
       -4.78392544e-03,  2.09926485e-02,  3.38603376e-02, -2.65779402e-02,
        7.11287673e-03,  1.20701548e-02,  1.09007744e-02, -9.14249968e-03,
       -2.62455552e-03,  6.91794892e-03, -1.79966961e-04,  1.35851205e-04,
       -3.16068141e-04,  2.31975716e-02,  2.79407205e-04, -4.94261748e-04,
       -2.74860936e-04, -5.69253506e-04])

In [38]:
# Looking at the coefficients
pd.concat([pd.Series(reg.coef_), pd.Series(predictors)], axis=1).sort_values(0, ascending=False)

Unnamed: 0,0,1
13,0.087852,eFG%
18,0.03386,DRB
29,0.023198,W/L%
17,0.020993,ORB
10,0.016456,2P
21,0.01207,STL
22,0.010901,BLK
15,0.010414,FTA
20,0.007113,AST
12,0.007054,2P%


# Adding new predictors

In [39]:
# These new variables will let us know if a players stats are above or below average
stats_ratios = stats[["PTS", "AST", "STL", "BLK", "3P", "Year"]].groupby("Year").apply(lambda x: x/x.mean())

In [40]:
stats_ratios

Unnamed: 0,PTS,AST,STL,BLK,3P,Year
0,1.013334,0.420714,0.961127,0.673469,0.508587,1.0
1,1.614653,1.028412,1.647646,0.673469,4.577279,1.0
2,0.311795,0.093492,0.274608,1.571429,0.000000,1.0
3,0.200440,0.186984,0.274608,0.000000,0.000000,1.0
4,2.383005,1.636110,1.784950,0.897959,1.525760,1.0
...,...,...,...,...,...,...
14692,0.735752,0.819562,0.479763,1.528302,0.650951,1.0
14693,0.071202,0.000000,0.000000,0.000000,0.130190,1.0
14694,1.281633,0.601012,1.119447,2.547170,0.520761,1.0
14695,0.474679,0.218550,0.319842,1.273585,0.650951,1.0


In [41]:
stats[["PTS_R", "AST_R", "STL_R", "BLK_R", "3P_R"]] = stats_ratios[["PTS", "AST", "STL", "BLK", "3P"]]

In [42]:
stats.head()

Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,...,W/L%,GB,PS/G,PA/G,SRS,PTS_R,AST_R,STL_R,BLK_R,3P_R
0,A.C. Green,PF,27,LAL,82,21,26.4,3.1,6.6,0.476,...,0.707,5.0,106.3,99.6,6.73,1.013334,0.420714,0.961127,0.673469,0.508587
1,Byron Scott,SG,29,LAL,82,82,32.1,6.1,12.8,0.477,...,0.707,5.0,106.3,99.6,6.73,1.614653,1.028412,1.647646,0.673469,4.577279
2,Elden Campbell,PF,22,LAL,52,0,7.3,1.1,2.4,0.455,...,0.707,5.0,106.3,99.6,6.73,0.311795,0.093492,0.274608,1.571429,0.0
3,Irving Thomas,PF,25,LAL,26,0,4.2,0.7,1.9,0.34,...,0.707,5.0,106.3,99.6,6.73,0.20044,0.186984,0.274608,0.0,0.0
4,James Worthy,SF,29,LAL,78,74,38.6,9.2,18.7,0.492,...,0.707,5.0,106.3,99.6,6.73,2.383005,1.63611,1.78495,0.897959,1.52576


In [43]:
# Adding these new variables to the predictors
predictors += ["PTS_R", "AST_R", "STL_R", "BLK_R", "3P_R"]

In [44]:
mean_ap, aps, all_predictions = backtest(stats, reg, years[5:], predictors)

In [45]:
mean_ap

0.726619022474594

In [46]:
stats["Pos"].unique()

array(['PF', 'SG', 'SF', 'PG', 'C', 'PG-SG', 'PF-SF', 'SG-PG', 'PF-C',
       'SG-SF', 'SF-PF', 'SF-SG', 'C-PF', 'SG-PF', 'PG-SF', 'SF-C'],
      dtype=object)

In [47]:
# New variable with position as categorical codes
stats["NPos"] = stats["Pos"].astype("category").cat.codes

In [48]:
# New variable with team as categorical codes
stats["NTm"] = stats["Tm"].astype("category").cat.codes

In [49]:
# Adding the new variables to predictors
predictors += ["NPos", "NTm"]

In [50]:
stats.head()

Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,...,PS/G,PA/G,SRS,PTS_R,AST_R,STL_R,BLK_R,3P_R,NPos,NTm
0,A.C. Green,PF,27,LAL,82,21,26.4,3.1,6.6,0.476,...,106.3,99.6,6.73,1.013334,0.420714,0.961127,0.673469,0.508587,2,15
1,Byron Scott,SG,29,LAL,82,82,32.1,6.1,12.8,0.477,...,106.3,99.6,6.73,1.614653,1.028412,1.647646,0.673469,4.577279,12,15
2,Elden Campbell,PF,22,LAL,52,0,7.3,1.1,2.4,0.455,...,106.3,99.6,6.73,0.311795,0.093492,0.274608,1.571429,0.0,2,15
3,Irving Thomas,PF,25,LAL,26,0,4.2,0.7,1.9,0.34,...,106.3,99.6,6.73,0.20044,0.186984,0.274608,0.0,0.0,2,15
4,James Worthy,SF,29,LAL,78,74,38.6,9.2,18.7,0.492,...,106.3,99.6,6.73,2.383005,1.63611,1.78495,0.897959,1.52576,8,15


# Random Forest Model

In [51]:
rf = RandomForestRegressor(n_estimators=50, random_state=1, min_samples_split=5)

mean_ap, aps, all_predictions = backtest(stats, rf, years[5:], predictors)

In [52]:
mean_ap

0.7195289737193595