In [69]:
import pandas as pd

In [70]:
stats = pd.read_csv("player_mvp_stats.csv", index_col=0) # reading in the csv file
stats

Unnamed: 0,Player,Age,Tm,Pos,GP_x,G,A,PTS_x,+/-,PIM,...,PTS%,GF,GA,SRS,SOS,RPt%,ROW,RgRec,RgPt%,RW
0,A.J. Greer,20.0,COL,LW,5.0,0.0,1.0,1.0,-2.0,4.0,...,,,,,,,,,,
1,Anton Lindholm,22.0,COL,D,12.0,0.0,0.0,0.0,-8.0,2.0,...,,,,,,,,,,
2,Ben Smith,28.0,COL,RW,40.0,2.0,2.0,4.0,-7.0,4.0,...,,,,,,,,,,
3,Blake Comeau,30.0,COL,LW,77.0,8.0,12.0,20.0,-19.0,58.0,...,,,,,,,,,,
4,Carl Soderberg,31.0,COL,C,80.0,6.0,8.0,14.0,-26.0,22.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10035,,,,,,,,,,,...,0.348,207.0,313.0,-1.23,0.06,0.323,,18-50-14,0.305,18.0
10036,,,,,,,,,,,...,0.604,239.0,236.0,0.03,-0.01,0.537,,35-27-20,0.549,35.0
10037,,,,,,,,,,,...,0.573,266.0,248.0,0.21,-0.01,0.524,,34-31-17,0.518,34.0
10038,,,,,,,,,,,...,0.470,214.0,264.0,-0.58,0.03,0.390,,22-37-23,0.409,22.0


In [71]:
pd.isnull(stats).sum() # Are there any missing values in data?

Player     115
Age        163
Tm         163
Pos        163
GP_x       163
G          163
A          163
PTS_x      163
+/-        163
PIM        163
PS         163
EV         163
PP         163
SH         163
GW         163
EV.1       163
PP.1       163
SH.1       163
S          163
S%         437
TOI        163
ATOI       163
BLK        168
HIT        163
FOW        168
FOL        168
FO%       3907
Year         0
Votes      115
Vote%      115
Team      3077
GP_y      6996
W         6996
L         6996
OL        6996
PTS_y     6996
PTS%      6996
GF        6996
GA        6996
SRS       6996
SOS       6996
RPt%      6996
ROW       7473
RgRec     6996
RgPt%     6996
RW        9563
dtype: int64

In [101]:
# We are guessing the null values are when a player did not have an attempt
stats[pd.isnull(stats["G"])][["Player", "A"]].head()

Unnamed: 0,Player,A


In [73]:
stats = stats.fillna(0) # replace na values with zero

In [74]:
stats.columns # look at the columns to determine what we want to use for predictors

Index(['Player', 'Age', 'Tm', 'Pos', 'GP_x', 'G', 'A', 'PTS_x', '+/-', 'PIM',
       'PS', 'EV', 'PP', 'SH', 'GW', 'EV.1', 'PP.1', 'SH.1', 'S', 'S%', 'TOI',
       'ATOI', 'BLK', 'HIT', 'FOW', 'FOL', 'FO%', 'Year', 'Votes', 'Vote%',
       'Team', 'GP_y', 'W', 'L', 'OL', 'PTS_y', 'PTS%', 'GF', 'GA', 'SRS',
       'SOS', 'RPt%', 'ROW', 'RgRec', 'RgPt%', 'RW'],
      dtype='object')

In [75]:
predictors = ['Age', 'GP_x', 'G', 'A', 'PTS_x', '+/-', 'PIM',
       'PS', 'EV', 'PP', 'SH', 'GW', 'EV.1', 'PP.1', 'SH.1', 'S', 'S%', 'TOI',
       'BLK', 'HIT', 'FOW', 'FOL', 'FO%', 'Year', 'Votes', 'Vote%',
       'W', 'GF', 'GA', 'SRS','RPt%', 'ROW', 'RgPt%', 'RW']

In [82]:
# split data into test and training dataset
train = stats[~(stats["Year"] == 2014)] #
test = stats[stats["Year"] == 2014]

In [83]:
from sklearn.linear_model import Ridge

# using ridge regression
reg = Ridge(alpha=.05) #shrinks to 0.05 to avoid overfitting

In [84]:
reg.fit(train[predictors],train["Vote%"]) # fit the regression model to the training dataset

Ridge(alpha=0.05)

In [85]:
predictions = reg.predict(test[predictors]) # make predictions  
predictions = pd.DataFrame(predictions, columns=["predictions"], index=test.index) # writing predictions to pandas dataframe

In [86]:
combination = pd.concat([test[["Player", "Vote%"]], predictions], axis=1) # combining the player and share columns with the test data 

In [87]:
combination.sort_values("Vote%", ascending=False).head(10)

Unnamed: 0,Player,Vote%,predictions
5117,Sidney Crosby,97.81,97.809757
6161,Ryan Getzlaf,64.01,64.009843
4532,Claude Giroux,31.75,31.749924
3832,Semyon Varlamov,18.69,18.689953
4928,Patrice Bergeron,10.0,9.999976
3236,Tyler Seguin,6.13,6.129988
3682,Joe Pavelski,5.18,5.179986
3594,Anze Kopitar,4.23,4.229993
8730,Jonathan Toews,3.87,3.86999
3833,Ben Bishop,3.8,3.79999


In [88]:
combination.sort_values("predictions", ascending=False).head(10)

Unnamed: 0,Player,Vote%,predictions
5117,Sidney Crosby,97.81,97.809757
6161,Ryan Getzlaf,64.01,64.009843
4532,Claude Giroux,31.75,31.749924
3832,Semyon Varlamov,18.69,18.689953
4928,Patrice Bergeron,10.0,9.999976
3236,Tyler Seguin,6.13,6.129988
3682,Joe Pavelski,5.18,5.179986
3594,Anze Kopitar,4.23,4.229993
8730,Jonathan Toews,3.87,3.86999
3833,Ben Bishop,3.8,3.79999


In [89]:
from sklearn.metrics import mean_squared_error

mean_squared_error(combination["Vote%"], combination["predictions"])

# mean difference between predicted and actual value

1.0346014385257226e-10

In [91]:
combination["Vote%"].value_counts() # most of the share values are zero (received zero votes)

0.00     880
0.07       4
2.55       2
0.66       2
31.75      1
3.87       1
1.39       1
0.80       1
64.01      1
2.12       1
97.81      1
10.00      1
3.80       1
1.46       1
18.69      1
0.29       1
0.22       1
5.18       1
4.23       1
0.95       1
6.13       1
0.51       1
Name: Vote%, dtype: int64

In [92]:
# we need to consider the player's rank (don't necesssarily care about players not receiving votes)
actual = combination.sort_values("Vote%", ascending=False)
predicted = combination.sort_values("predictions", ascending=False) # sorting by predictions
actual["Rk"] = list(range(1,actual.shape[0]+1)) 
predicted["Predicted_Rk"] = list(range(1,predicted.shape[0]+1)) # assign a predicted rank column

In [93]:
actual.merge(predicted, on="Player").head(5) 

Unnamed: 0,Player,Vote%_x,predictions_x,Rk,Vote%_y,predictions_y,Predicted_Rk
0,Sidney Crosby,97.81,97.809757,1,97.81,97.809757,1
1,Ryan Getzlaf,64.01,64.009843,2,64.01,64.009843,2
2,Claude Giroux,31.75,31.749924,3,31.75,31.749924,3
3,Semyon Varlamov,18.69,18.689953,4,18.69,18.689953,4
4,Patrice Bergeron,10.0,9.999976,5,10.0,9.999976,5


In [94]:
def find_ap(combination):
    '''Determines how precise our predictions are. 
    Comparing actuals vs. predicted. If we correctly
    predict a player within the top 5 in mvp voting, we 
    earn a point. If we incorrectly predict a player, we 
    are penalized.'''
    
    actual = combination.sort_values("Vote%", ascending=False).head(5)
    predicted = combination.sort_values("predictions", ascending=False)
    ps = []
    found = 0
    seen = 1
    for index,row in predicted.iterrows():
        if row["Player"] in actual["Player"].values:
            found += 1
            ps.append(found / seen)
        seen += 1

    return sum(ps) / len(ps)

In [95]:
ap = find_ap(combination)

In [96]:
ap

1.0

In [97]:
years = list(range(2012,2021))

In [98]:
# Back testing for last 5 years
aps = []
all_predictions = []
for year in years[5:]:
    
    train = stats[stats["Year"] < year] # ridge regression from earlier
    test = stats[stats["Year"] == year] # ridge regression from earlier
    reg.fit(train[predictors],train["Vote%"]) # ridge regression from earlier
    predictions = reg.predict(test[predictors])
    predictions = pd.DataFrame(predictions, columns=["predictions"], index=test.index)
    combination = pd.concat([test[["Player", "Vote%"]], predictions], axis=1)
    all_predictions.append(combination)
    aps.append(find_ap(combination))

In [99]:
sum(aps) / len(aps) # mean average precision 

1.0

In [49]:
def add_ranks(predictions):
    '''Comparing predicted to actual rank. Sorting 
    by actual share of mvp votes.'''
    predictions = predictions.sort_values("predictions", ascending=False)
    predictions["Predicted_Rk"] = list(range(1,predictions.shape[0]+1))
    predictions = predictions.sort_values("Vote%", ascending=False)
    predictions["Rk"] = list(range(1,predictions.shape[0]+1))
    predictions["Diff"] = (predictions["Rk"] - predictions["Predicted_Rk"])
    return predictions

In [50]:
add_ranks(all_predictions[1]) # shows the difference between predicted and actual ranking

Unnamed: 0,Player,Vote%,predictions,Predicted_Rk,Rk,Diff
1765,Taylor Hall,77.07,7.707054e+01,1,1,0
50,Nathan MacKinnon,72.80,7.280053e+01,2,2,0
1541,Anze Kopitar,33.60,3.360026e+01,3,3,0
8575,Claude Giroux,33.29,3.329025e+01,4,4,0
4734,Connor McDavid,16.46,1.646013e+01,5,5,0
...,...,...,...,...,...,...
4385,Erik Gudbranson,0.00,4.761942e-08,319,904,585
5775,Alexander Wennberg,0.00,3.854665e-08,320,905,585
4139,Dillon Heatherington,0.00,3.756019e-08,321,906,585
1712,Nic Dowd,0.00,3.705987e-08,322,907,585


In [52]:
def backtest(stats, model, years, predictors):
    '''Building a function to perform the earlier code.'''
    aps = []
    all_predictions = []
    for year in years:
        train = stats[stats["Year"] < year]
        test = stats[stats["Year"] == year]
        model.fit(train[predictors],train["Vote%"])
        predictions = model.predict(test[predictors])
        predictions = pd.DataFrame(predictions, columns=["predictions"], index=test.index)
        combination = pd.concat([test[["Player", "Vote%"]], predictions], axis=1)
        combination = add_ranks(combination)
        all_predictions.append(combination)
        aps.append(find_ap(combination))
    return sum(aps) / len(aps), aps, pd.concat(all_predictions) # returns mean average precision

In [53]:
mean_ap, aps, all_predictions = backtest(stats, reg, years[5:], predictors) # running testing of regression model for last 10 years

In [54]:
mean_ap # mean average precision

1.0

In [55]:
all_predictions[all_predictions["Rk"] < 5].sort_values("Diff").head(10)

Unnamed: 0,Player,Vote%,predictions,Predicted_Rk,Rk,Diff
4707,Connor McDavid,96.05,96.051407,1,1,0
9723,Sidney Crosby,66.11,66.110967,2,2,0
1512,Sergei Bobrovsky,28.08,28.080398,3,3,0
1294,Brent Burns,16.35,16.350253,4,4,0
1765,Taylor Hall,77.07,77.070539,1,1,0
50,Nathan MacKinnon,72.8,72.800526,2,2,0
1541,Anze Kopitar,33.6,33.600259,3,3,0
8575,Claude Giroux,33.29,33.290248,4,4,0
2012,Nikita Kucherov,98.07,98.071234,1,1,0
4667,Sidney Crosby,43.22,43.220553,2,2,0


In [56]:
# joining two pandas series to determine most important predictors for mvp voting
pd.concat([pd.Series(reg.coef_), pd.Series(predictors)], axis=1).sort_values(0, ascending=False)

Unnamed: 0,0,1
25,0.9998612,Vote%
29,2.039293e-05,SRS
30,1.996242e-05,RPt%
24,8.678326e-06,Votes
7,3.203583e-06,PS
8,8.109897e-07,EV
11,7.586407e-07,GW
3,5.172643e-07,A
28,2.406889e-07,GA
1,6.687698e-08,GP_x


In [58]:
# Looking at several variables and determine their effect on precision

stat_ratios = stats[["G","S", "BLK", "HIT", "Year"]].groupby("Year").apply(lambda x: x/x.mean())

In [59]:
stats[["G_R","S_R", "BLK_R", "HIT_R"]] = stat_ratios[["G","S", "BLK", "HIT"]]

In [60]:
predictors += ["G_R","S_R", "BLK_R", "HIT_R"] # adding these values to our predictors

In [61]:
mean_ap, aps, all_predictions = backtest(stats, reg, years[5:], predictors)

In [62]:
mean_ap # 

1.0