In [1]:
import pandas as pd

In [2]:
stats = pd.read_csv("player_mvp_stats.csv")

In [3]:
stats

Unnamed: 0.1,Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,...,Pts Max,Share,Team,W,L,W/L%,GB,PS/G,PA/G,SRS
0,0,A.C. Green,PF,27,LAL,82,21,26.4,3.1,6.6,...,0.0,0.0,Los Angeles Lakers,58,24,0.707,5.0,106.3,99.6,6.73
1,1,Byron Scott,SG,29,LAL,82,82,32.1,6.1,12.8,...,0.0,0.0,Los Angeles Lakers,58,24,0.707,5.0,106.3,99.6,6.73
2,2,Elden Campbell,PF,22,LAL,52,0,7.3,1.1,2.4,...,0.0,0.0,Los Angeles Lakers,58,24,0.707,5.0,106.3,99.6,6.73
3,3,Irving Thomas,PF,25,LAL,26,0,4.2,0.7,1.9,...,0.0,0.0,Los Angeles Lakers,58,24,0.707,5.0,106.3,99.6,6.73
4,4,James Worthy,SF,29,LAL,78,74,38.6,9.2,18.7,...,0.0,0.0,Los Angeles Lakers,58,24,0.707,5.0,106.3,99.6,6.73
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14087,14087,Spencer Hawes,PF,28,MIL,54,1,14.8,2.5,5.1,...,0.0,0.0,Milwaukee Bucks,42,40,0.512,9.0,103.6,103.8,-0.45
14088,14088,Steve Novak,PF,33,MIL,8,0,2.8,0.3,0.9,...,0.0,0.0,Milwaukee Bucks,42,40,0.512,9.0,103.6,103.8,-0.45
14089,14089,Terrence Jones,PF,25,MIL,54,12,23.5,4.3,9.1,...,0.0,0.0,Milwaukee Bucks,42,40,0.512,9.0,103.6,103.8,-0.45
14090,14090,Thon Maker,C,19,MIL,57,34,9.9,1.5,3.2,...,0.0,0.0,Milwaukee Bucks,42,40,0.512,9.0,103.6,103.8,-0.45


In [4]:
# Machine Learning algorithms don't like null values
pd.isnull(stats).sum()

Unnamed: 0       0
Player           0
Pos              0
Age              0
Tm               0
G                0
GS               0
MP               0
FG               0
FGA              0
FG%             50
3P               0
3PA              0
3P%           2042
2P               0
2PA              0
2P%             84
eFG%            50
FT               0
FTA              0
FT%            462
ORB              0
DRB              0
TRB              0
AST              0
STL              0
BLK              0
TOV              0
PF               0
PTS              0
Year             0
Pts Won          0
Pts Max          0
Share            0
Team             0
W                0
L                0
W/L%             0
GB               0
PS/G             0
PA/G             0
SRS              0
dtype: int64

In [5]:
del stats["Unnamed: 0"]

In [6]:
stats[pd.isnull(stats["3P%"])][["Player", "3PA"]].head()

Unnamed: 0,Player,3PA
2,Elden Campbell,0.0
3,Irving Thomas,0.0
18,Jack Haley,0.0
20,Keith Owens,0.0
30,Benoit Benjamin,0.0


In [7]:
# This means that these players attempted 0 3Ps

In [8]:
stats = stats.fillna(0)

In [9]:
# This will replace the NaN valeus with a 0. ie, if 0 3Ps are attempted, that means 0% 3P% 

In [10]:
stats.columns

Index(['Player', 'Pos', 'Age', 'Tm', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG%', '3P',
       '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB',
       'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'Year',
       'Pts Won', 'Pts Max', 'Share', 'Team', 'W', 'L', 'W/L%', 'GB', 'PS/G',
       'PA/G', 'SRS'],
      dtype='object')

In [11]:
# Get rid of all the columns which are target columns: like pts won, pts max and vote share
predictors = ["Age", "G", "GS", "MP", "FG", "FGA", 'FG%', '3P', '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'W', 'L', 'W/L%', 'GB', 'PS/G', 'PA/G', 'SRS']

In [12]:
train = stats[stats["Year"] < 2021]
test = stats[stats["Year"] == 2021]
# This is only for data till 2022, because the tutorial guy only has that data.

In [13]:
# This is time series data

In [14]:
from sklearn.linear_model import Ridge
# Form of linear regression which reduces overfitting. Shrinks the coefficients
reg = Ridge(alpha=.1)

In [15]:
reg.fit(train[predictors], train["Share"])
# This mean that we're training the data from predictors to predict the share column

In [16]:
predictions = reg.predict(test[predictors])

In [17]:
predictions = pd.DataFrame(predictions, columns=["predictions"], index=test.index)

In [18]:
combo = pd.concat([test[["Player", "Share"]],predictions], axis=1)

In [19]:
combo.sort_values("Share", ascending=False).head(10)

Unnamed: 0,Player,Share,predictions
641,Nikola Jokić,0.961,0.155155
8624,Joel Embiid,0.58,0.163637
3651,Stephen Curry,0.449,0.14249
9907,Giannis Antetokounmpo,0.345,0.206428
1389,Chris Paul,0.138,0.073984
10997,Luka Dončić,0.042,0.15026
7464,Damian Lillard,0.038,0.116646
3536,Julius Randle,0.02,0.089415
3531,Derrick Rose,0.01,0.035417
11358,Rudy Gobert,0.008,0.095294


In [20]:
# Need to pick an error metric: ie how did the algorithm do?
from sklearn.metrics import mean_squared_error

In [21]:
mean_squared_error(combo["Share"], combo["predictions"])

0.002666683168763711

In [22]:
combo = combo.sort_values("Share", ascending=False)

In [23]:
combo["Rk"] = list(range(1, combo.shape[0]+1))

In [24]:
combo.head(10)

Unnamed: 0,Player,Share,predictions,Rk
641,Nikola Jokić,0.961,0.155155,1
8624,Joel Embiid,0.58,0.163637,2
3651,Stephen Curry,0.449,0.14249,3
9907,Giannis Antetokounmpo,0.345,0.206428,4
1389,Chris Paul,0.138,0.073984,5
10997,Luka Dončić,0.042,0.15026,6
7464,Damian Lillard,0.038,0.116646,7
3536,Julius Randle,0.02,0.089415,8
3531,Derrick Rose,0.01,0.035417,9
11358,Rudy Gobert,0.008,0.095294,10


In [25]:
combo = combo.sort_values("predictions", ascending=False)
combo["Predicted_Rk"] = list(range(1, combo.shape[0]+1))

In [26]:
combo.head()

Unnamed: 0,Player,Share,predictions,Rk,Predicted_Rk
9907,Giannis Antetokounmpo,0.345,0.206428,4,1
8624,Joel Embiid,0.58,0.163637,2,2
641,Nikola Jokić,0.961,0.155155,1,3
10997,Luka Dončić,0.042,0.15026,6,4
3736,LeBron James,0.001,0.147459,15,5


Going to judge the model by the top 5<br>
If you got a top 5 candidate, you get a perfect score<br>
If you didn't get an actual top 5 candidate you check how far you have to go down in order to get to that player, and penalize based on that<br>

In [28]:
def find_ap(combo_df):
    actual = combo_df.sort_values("Share", ascending=False).head(5)
    predicted = combo_df.sort_values("predictions", ascending=False)
    ps = []
    found = 0
    seen = 1
    for index, row in predicted.iterrows():
        if row["Player"] in actual["Player"].values:
            found += 1
            ps.append(found/seen)
        seen += 1
    return sum(ps)/len(ps)

In [29]:
find_ap(combo)

0.7636363636363636

In [30]:
# This error metric is for how good the model is at predicting the top 5 mvp candidates and/or the winner

In [31]:
# Now we will implement backtesting to make predictions for all years

In [32]:
years = list(range(1991, 2022))

In [33]:
def add_ranks(predictions):
    predictions = predictions.sort_values("predictions", ascending=False)
    predictions["Predicted_Rk"] = list(range(1,predictions.shape[0]+1))
    predictions = predictions.sort_values("Share", ascending=False)
    predictions["Rk"] = list(range(1,predictions.shape[0]+1))
    predictions["Diff"] = (predictions["Rk"] - predictions["Predicted_Rk"])
    return predictions

In [34]:
aps = []
all_predictions = []
for year in years[5:]:
    train = stats[stats["Year"]<year]
    test = stats[stats["Year"]==year]
    reg.fit(train[predictors], train["Share"])
    predictions = reg.predict(test[predictors])
    predictions = pd.DataFrame(predictions, columns=["predictions"], index=test.index)
    combination = pd.concat([test[["Player", "Share"]], predictions], axis=1)
    combination = add_ranks(combination)
    all_predictions.append(combination)
    aps.append(find_ap(combination))

In [35]:
sum(aps)/len(aps)

0.7110668523458931

In [36]:
def backtest(stats, model, years, predictors):
    aps = []
    all_predictions = []
    for year in years:
        train = stats[stats["Year"] < year]
        test = stats[stats["Year"] == year]
        model.fit(train[predictors],train["Share"])
        predictions = model.predict(test[predictors])
        predictions = pd.DataFrame(predictions, columns=["predictions"], index=test.index)
        combination = pd.concat([test[["Player", "Share"]], predictions], axis=1)
        combination = add_ranks(combination)
        all_predictions.append(combination)
        aps.append(find_ap(combination))
    return sum(aps) / len(aps), aps, pd.concat(all_predictions)

In [37]:
# Lets check the coefficients of the regression to find out which variables the algorithm is looking at the most
pd.concat([pd.Series(reg.coef_), pd.Series(predictors)], axis=1).sort_values(0, ascending=False)

Unnamed: 0,0,1
13,0.065312,eFG%
18,0.034362,DRB
28,0.028501,W/L%
17,0.02179,ORB
10,0.015914,2P
21,0.012054,STL
15,0.011474,FTA
22,0.011175,BLK
20,0.007435,AST
25,0.00614,PTS


In [43]:
# This is regression model diagnosis. Now we can add some more predictors to give the model more information
stat_ratios = stats[["PTS", "AST", "STL", "BLK", "3P", "Year"]].groupby("Year").apply(lambda x: x/x.mean(), include_groups=False)

In [49]:
stat_ratios = stat_ratios.reset_index(drop=True)

In [51]:
stat_ratios

Unnamed: 0,PTS,AST,STL,BLK,3P
0,1.013334,0.420714,0.961127,0.673469,0.508587
1,1.614653,1.028412,1.647646,0.673469,4.577279
2,0.311795,0.093492,0.274608,1.571429,0.000000
3,0.200440,0.186984,0.274608,0.000000,0.000000
4,2.383005,1.636110,1.784950,0.897959,1.525760
...,...,...,...,...,...
14087,1.207728,1.207491,0.971223,0.000000,2.393794
14088,0.257201,0.201248,0.647482,0.240964,0.099741
14089,1.274824,0.704370,1.133094,1.445783,1.695604
14090,0.279567,0.553433,0.323741,0.000000,0.099741


In [53]:
stats[["PTS_R", "AST_R", "STL_R", "BLK_R", "3P_R"]] = stat_ratios[["PTS", "AST", "STL", "BLK", "3P"]]
# This will add the ratios instead of the actual values.

In [55]:
stats.head()

Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,...,W/L%,GB,PS/G,PA/G,SRS,PTS_R,AST_R,STL_R,BLK_R,3P_R
0,A.C. Green,PF,27,LAL,82,21,26.4,3.1,6.6,0.476,...,0.707,5.0,106.3,99.6,6.73,1.013334,0.420714,0.961127,0.673469,0.508587
1,Byron Scott,SG,29,LAL,82,82,32.1,6.1,12.8,0.477,...,0.707,5.0,106.3,99.6,6.73,1.614653,1.028412,1.647646,0.673469,4.577279
2,Elden Campbell,PF,22,LAL,52,0,7.3,1.1,2.4,0.455,...,0.707,5.0,106.3,99.6,6.73,0.311795,0.093492,0.274608,1.571429,0.0
3,Irving Thomas,PF,25,LAL,26,0,4.2,0.7,1.9,0.34,...,0.707,5.0,106.3,99.6,6.73,0.20044,0.186984,0.274608,0.0,0.0
4,James Worthy,SF,29,LAL,78,74,38.6,9.2,18.7,0.492,...,0.707,5.0,106.3,99.6,6.73,2.383005,1.63611,1.78495,0.897959,1.52576


In [57]:
predictors += ["PTS_R", "AST_R", "STL_R", "BLK_R", "3P_R"]


In [59]:
mean_ap, aps, all_predictions = backtest(stats, reg, years[5:], predictors)


In [61]:
mean_ap

0.7115657976281106

In [63]:
# A linear regression algorithm can't pick up on categorical relationships

In [65]:
# Using a random forest algorithm:
from sklearn.ensemble import RandomForestRegressor

In [67]:
rf = RandomForestRegressor(n_estimators=400, random_state=1, min_samples_split=5)

In [72]:
stats["NPos"] = stats["Pos"].astype("category").cat.codes
stats["NTm"] = stats["Tm"].astype("category").cat.codes

In [74]:
mean_ap, aps, all_predictions = backtest(stats, rf, years[28:], predictors + ["NPos", "NTm"])

In [75]:
mean_ap

0.8132834757834758