In [58]:
import pandas as pd
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
import datetime
from sklearn.ensemble import RandomForestRegressor

In [3]:
stats = pd.read_csv("/Users/paramjaswal/Desktop/NBa/Player_mvp_stats.csv")
del stats["Unnamed: 0"]

In [4]:
stats.columns

Index(['Player', 'Pos', 'Age', 'Tm', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG%', '3P',
       '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB',
       'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'Year',
       'Pts Won', 'Pts Max', 'Share', 'Team', 'W', 'L', 'W/L%', 'GB', 'PS/G',
       'PA/G', 'SRS'],
      dtype='object')

In [5]:
predictors = ['Age',  'G', 'GS', 'MP', 'FG', 'FGA', 'FG%', '3P',
       '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB',
       'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'Year',
        'W', 'L', 'W/L%', 'GB', 'PS/G',
       'PA/G', 'SRS']

In [6]:
train = stats[stats["Year"] < 2022]
test = stats[stats["Year"] == 2022]

In [7]:
reg = Ridge(alpha = 0.1)

In [8]:
reg.fit(train[predictors], train['Share'])

In [9]:
predictions = reg.predict(test[predictors])
predictions = pd.DataFrame(predictions, columns=['Predictions'], index=test.index)

In [10]:
predictions

Unnamed: 0,Predictions
727,0.014073
728,-0.033466
729,-0.003374
730,0.015952
731,-0.009124
...,...
18717,-0.018172
18718,-0.011725
18719,0.005339
18720,0.004157


In [11]:
combinations = pd.concat([test[["Player", "Share"]], predictions], axis=1)
combinations

Unnamed: 0,Player,Share,Predictions
727,Aaron Gordon,0.0,0.014073
728,Austin Rivers,0.0,-0.033466
729,Bol Bol,0.0,-0.003374
730,Bones Hyland,0.0,0.015952
731,Bryn Forbes,0.0,-0.009124
...,...,...,...
18717,Micah Potter,0.0,-0.018172
18718,Rodney McGruder,0.0,-0.011725
18719,Saben Lee,0.0,0.005339
18720,Saddiq Bey,0.0,0.004157


In [12]:
combinations.sort_values("Share", ascending=False).head(40)

Unnamed: 0,Player,Share,Predictions
742,Nikola Jokić,0.875,0.205691
916,Joel Embiid,0.706,0.189934
17762,Giannis Antetokounmpo,0.595,0.211422
986,Devin Booker,0.216,0.087892
17541,Luka Dončić,0.146,0.166332
1325,Jayson Tatum,0.043,0.100802
18435,Ja Morant,0.01,0.118898
8626,Stephen Curry,0.004,0.10457
984,Chris Paul,0.002,0.091914
10787,LeBron James,0.001,0.160361


In [13]:
mean_squared_error(combinations["Share"], combinations["Predictions"])

0.002277897532206951

In [14]:
combinations["Share"].value_counts()

0.000    593
0.001      3
0.875      1
0.706      1
0.002      1
0.216      1
0.043      1
0.004      1
0.146      1
0.595      1
0.010      1
Name: Share, dtype: int64

In [20]:
combinations = combinations.sort_values("Share", ascending=False)
combinations['Rk'] = list(range(1, combinations.shape[0] + 1))
combinations.head(20)

Unnamed: 0,Player,Share,Predictions,Rk,Predicted Rk
742,Nikola Jokić,0.875,0.205691,1,2
916,Joel Embiid,0.706,0.189934,2,3
17762,Giannis Antetokounmpo,0.595,0.211422,3,1
986,Devin Booker,0.216,0.087892,4,19
17541,Luka Dončić,0.146,0.166332,5,4
1325,Jayson Tatum,0.043,0.100802,6,14
18435,Ja Morant,0.01,0.118898,7,8
8626,Stephen Curry,0.004,0.10457,8,11
984,Chris Paul,0.002,0.091914,9,17
8413,Kevin Durant,0.001,0.144556,10,6


In [21]:
combinations = combinations.sort_values("Predictions", ascending=False)
combinations['Predicted Rk'] = list(range(1, combinations.shape[0] + 1))
combinations.head(20)

Unnamed: 0,Player,Share,Predictions,Rk,Predicted Rk
17762,Giannis Antetokounmpo,0.595,0.211422,3,1
742,Nikola Jokić,0.875,0.205691,1,2
916,Joel Embiid,0.706,0.189934,2,3
17541,Luka Dončić,0.146,0.166332,5,4
10787,LeBron James,0.001,0.160361,12,5
8413,Kevin Durant,0.001,0.144556,10,6
915,James Harden,0.0,0.124531,406,7
18435,Ja Morant,0.01,0.118898,7,8
17932,Trae Young,0.0,0.11751,407,9
10777,Anthony Davis,0.0,0.105644,408,10


In [22]:
# Average Precision

In [23]:
combinations = combinations.sort_values("Share", ascending=False)
combinations.head(20)

Unnamed: 0,Player,Share,Predictions,Rk,Predicted Rk
742,Nikola Jokić,0.875,0.205691,1,2
916,Joel Embiid,0.706,0.189934,2,3
17762,Giannis Antetokounmpo,0.595,0.211422,3,1
986,Devin Booker,0.216,0.087892,4,19
17541,Luka Dončić,0.146,0.166332,5,4
1325,Jayson Tatum,0.043,0.100802,6,14
18435,Ja Morant,0.01,0.118898,7,8
8626,Stephen Curry,0.004,0.10457,8,11
984,Chris Paul,0.002,0.091914,9,17
8413,Kevin Durant,0.001,0.144556,10,6


In [25]:
def avg_precision(combinations):
    actual = combinations.sort_values("Share", ascending=False).head(5)
    predicted = combinations.sort_values("Predictions", ascending=False)

    ps = []
    found = 0
    seen = 1
    for index, row in predicted.iterrows():
        if row["Player"] in actual["Player"].values:
            found += 1
            ps.append(found/seen)

        seen += 1
    return sum(ps)/len(ps)

avg_precision(combinations)

0.8526315789473685

In [26]:
# Backtesting to predict years

In [28]:
current_year = datetime.datetime.now().year
years = list(range(1956, current_year))

In [37]:
avg_p = []
all_predictions = []
for year in years[5:]:
    train = stats[stats["Year"] < year]
    test = stats[stats["Year"] == year]
    reg.fit(train[predictors], train["Share"])
    predictions = reg.predict(test[predictors])
    predictions = pd.DataFrame(predictions, columns=['Predictions'], index=test.index)
    combinations = pd.concat([test[["Player", "Share"]], predictions], axis=1)
    all_predictions.append(combinations)
    avg_p.append(avg_precision(combinations))

In [38]:
total_avg_p = sum(avg_p)/len(avg_p)
total_avg_p

0.7510646413317402

In [44]:
def add_ranks(combinations):
    combinations = combinations.sort_values("Share", ascending=False)
    combinations['Rk'] = list(range(1, combinations.shape[0] + 1))
    combinations = combinations.sort_values("Predictions", ascending=False)
    combinations['Predicted Rk'] = list(range(1, combinations.shape[0] + 1))
    combinations["Difference"] = combinations["Predicted Rk"] - combinations["Rk"]
    return combinations


ranks = add_ranks(all_predictions[1])
ranks[ranks['Rk'] <= 5].sort_values("Difference", ascending=False)

Unnamed: 0,Player,Share,Predictions,Rk,Predicted Rk,Difference
15854,Bill Russell,0.699,0.275321,1,3,2
17330,Jerry West,0.141,0.165128,5,7,2
2786,Oscar Robertson,0.318,0.213218,3,4,1
2936,Wilt Chamberlain,0.358,0.440722,2,1,-1
17326,Elgin Baylor,0.193,0.279414,4,2,-2


In [51]:
def backtesting(stats, model, years, predictors):
    avg_p = []
    all_predictions = []
    for year in years[5:]:
        train = stats[stats["Year"] < year]
        test = stats[stats["Year"] == year]
        model.fit(train[predictors], train["Share"])
        predictions = reg.predict(test[predictors])
        predictions = pd.DataFrame(predictions, columns=['Predictions'], index=test.index)
        combinations = pd.concat([test[["Player", "Share"]], predictions], axis=1)
        combinations = add_ranks(combinations)
        all_predictions.append(combinations)
        avg_p.append(avg_precision(combinations))
    return sum(avg_p)/len(avg_p), avg_p, pd.concat(all_predictions)



mean_avg_p, avg_p, all_predictions = backtesting(stats, reg, years, predictors)

In [53]:
mean_avg_p

0.7483113014816261

In [55]:
all_predictions[all_predictions['Rk'] <= 5].sort_values("Difference", ascending=False).head(20)

Unnamed: 0,Player,Share,Predictions,Rk,Predicted Rk,Difference
21811,Dennis Johnson,0.009,0.015418,5,66,61
11220,Glen Rice,0.117,0.028179,5,57,52
14888,Bernard King,0.491,0.047402,2,31,29
1645,Chris Paul,0.138,0.079411,5,30,25
11488,Peja Stojaković,0.228,0.04157,4,28,24
1060,Pete Maravich,0.061,0.053195,3,23,20
6989,Sidney Moncrief,0.301,0.060797,4,23,19
15062,Sam Jones,0.04,0.047812,4,23,19
20046,Joakim Noah,0.258,0.064595,4,22,18
14187,Spencer Haywood,0.054,0.06873,5,23,18


In [57]:
pd.concat([pd.Series(reg.coef_), pd.Series(predictors)], axis=1).sort_values(0, ascending=False)

Unnamed: 0,0,1
13,0.058209,eFG%
29,0.034463,W/L%
10,0.021339,2P
22,0.012816,BLK
21,0.01137,STL
7,0.008545,3P
20,0.008512,AST
19,0.008161,TRB
25,0.008096,PTS
12,0.007978,2P%


In [59]:
# Random Forest Regressor

In [60]:
rf = RandomForestRegressor(n_estimators=50, random_state=1, min_samples_split=5)

In [61]:
mean_avg_p, avg_p, all_predictions = backtesting(stats, rf, years, predictors)

In [62]:
mean_avg_p

0.7756968936311986