In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

#engineering
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, PolynomialFeatures, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

#prediction
from xgboost import XGBRegressor

#validation
from sklearn.metrics import mean_squared_error

In [6]:
# Load the stats table that every later cell works from; columns such as
# 'Year', 'Player', 'Share' and 'Rank' are read from it further down.
# The path is relative to the notebook's working directory.
full = pd.read_csv('data/nba_player_stats.csv')

In [12]:
# Interaction features: each listed stat multiplied element-wise by 'W/L%'.
# Pairs are (new column name, source stat); their order fixes the order in
# which the new columns are appended to `full`.
win_pct = full['W/L%']
for weighted_col, base_col in [
    ('pts w/l', 'PTS'),
    ('usage w/l', 'USG%'),
    ('bpm w/l', 'BPM'),
    ('per w/l', 'PER'),
    ('ws w/l', 'WS'),
]:
    full[weighted_col] = full[base_col] * win_pct

In [13]:
# Candidate numeric predictors, including the 'W/L%'-weighted interaction
# columns created in the previous cell.
quant = ['G', 'GS', 'MP', 'FG', 'FGA', 'FG%', '3P',
         '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB',
         'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'W', 'L', 'W/L%',
         'PS/G', 'PA/G', 'SRS', 'PER', 'TS%', '3PAr', 'FTr',
         'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%', 'BLK%', 'TOV%', 'USG%', 'OWS',
         'DWS', 'WS', 'WS/48', 'OBPM', 'DBPM', 'BPM', 'VORP', 'pts w/l',
         'usage w/l', 'bpm w/l', 'per w/l', 'ws w/l']

# Correlation of every candidate column with the 'Share' target, as
# (column, correlation) pairs.
# NOTE(review): this uses every row of `full`, including whichever season
# get_train later holds out, so feature selection sees the test season --
# confirm that is acceptable.
corrs = [(col, full['Share'].corr(full[col])) for col in quant]

# A correlation can come back as NaN (for example when a column has no
# variance). NaN compares False against everything, which breaks the ordering
# sorted() relies on, so rank NaN below every real value instead.
corrs = sorted(
    corrs,
    key=lambda pair: -np.inf if pd.isna(pair[1]) else pair[1],
    reverse=True,
)

# Keep the 15 most positively correlated columns; a column whose correlation
# is undefined is never selected.
correlated = [col for col, r in corrs if not pd.isna(r)][:15]

In [8]:
def get_train(year, df, features=None):
    """Split ``df`` into a leave-one-season-out train/test pair.

    Parameters
    ----------
    year
        Value compared against ``df['Year']``; matching rows form the test
        split and all other rows form the training split.
    df : pandas.DataFrame
        Must contain 'Year', 'Share' and every feature column.
    features : iterable of str, optional
        Column names to use as model inputs. When omitted, the notebook-level
        ``correlated`` list is used, which is the original behaviour.

    Returns
    -------
    tuple
        ``(x_train, y_train, x_test, y_test)`` where the ``x_*`` items are
        DataFrames of the feature columns and the ``y_*`` items are the
        matching 'Share' Series.
    """
    # Resolve the global lazily so callers passing `features` never need it.
    feature_cols = correlated if features is None else list(features)

    training = df[df['Year'] != year]
    held_out = df[df['Year'] == year]

    return (
        training[feature_cols],
        training['Share'],
        held_out[feature_cols],
        held_out['Share'],
    )

In [9]:
def mvp_table(df, regressor):
    """Compare the predicted and the actual MVP for every season in ``df``.

    For each distinct value of ``df['Year']`` (ascending), ``predict_share``
    refits ``regressor`` with that season held out and ``name_mvp`` picks the
    player with the highest predicted share.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs 'Year', 'Rank' and 'Player', plus whatever ``predict_share``
        reads.
    regressor
        Estimator handed to ``predict_share``; it is refit once per season.

    Returns
    -------
    pandas.DataFrame
        Columns 'Year', 'Actual MVP', 'Predicted MVP' and 'Correct', with one
        row per row of ``df`` whose 'Rank' equals the string '1', ordered by
        'Year'.
    """
    # Sort only the distinct seasons rather than the whole frame.
    seasons = df['Year'].drop_duplicates().sort_values()
    predicted_by_year = {
        year: name_mvp(predict_share(year, df, regressor))
        for year in seasons
    }

    winners = df[df['Rank'] == '1'].reset_index().sort_values(by='Year')
    final = winners[['Year', 'Player']].rename({'Player': 'Actual MVP'}, axis=1)

    # Attach predictions by season, not by list position: a season with no
    # Rank '1' row, or with more than one, would otherwise raise on the
    # length mismatch or silently shift later predictions onto the wrong year.
    final['Predicted MVP'] = final['Year'].map(predicted_by_year)

    final['Correct'] = final['Actual MVP'] == final['Predicted MVP']
    return final

In [10]:
def name_mvp(df):
    """Return the 'Player' value of the row with the largest 'Predicted share'.

    ``df`` must contain the columns 'Predicted share' and 'Player'.
    """
    ranked = df.sort_values(by='Predicted share', ascending=False)
    ranked_players = ranked['Player']
    return ranked_players.iloc[0]

In [11]:
def predict_share(year, df, pred):
    """Fit ``pred`` with season ``year`` held out and predict that season.

    Parameters
    ----------
    year
        Season to hold out; passed straight to ``get_train``.
    df : pandas.DataFrame
        Needs 'Year' and 'Player', plus whatever ``get_train`` reads.
    pred
        Estimator exposing ``fit(X, y)`` and ``predict(X)``. It is fitted in
        place on the training split.

    Returns
    -------
    pandas.DataFrame
        Indexed like the held-out rows, with columns 'Actual share',
        'Predicted share' and 'Player', in that order.
    """
    X_train, Y_train, X_test, Y_test = get_train(year, df)

    pred.fit(X_train, Y_train)
    predictions = pred.predict(X_test)

    # get_train builds its test split from the rows where 'Year' == year, so
    # the same mask yields the player names in matching row order.
    players = df.loc[df['Year'] == year, 'Player']

    # Assemble the columns by position. Assigning the whole-frame 'Player'
    # Series would need index alignment, which fails when `df` carries
    # duplicate index labels, and copying X_test only to drop its columns is
    # wasted work.
    out = pd.DataFrame(index=X_test.index)
    out['Actual share'] = Y_test.to_numpy()
    out['Predicted share'] = predictions
    out['Player'] = players.to_numpy()
    return out
    

In [14]:
# Hyper-parameter values to sweep.
learn = [0.00275, 0.005, 0.02, 0.1, 0.3, 0.5, 0.8]
depth = [2, 4, 6, 8, 10, 12, 15, 20, 25]
estimators = [10, 15, 20, 30, 50, 70, 100, 200]

results = []

# Every (learning rate, depth, estimator count) combination, learning rate
# varying slowest and estimator count fastest.
grid = [(rate, d, n) for rate in learn for d in depth for n in estimators]

for rate, d, n in grid:
    model = XGBRegressor(
        learning_rate=rate,
        max_depth=d,
        n_estimators=n,
        subsample=1,
        colsample_bytree=1,
    )
    predicted = mvp_table(full, model)
    # Appended one at a time so partial results survive an interrupted run.
    results.append(f"learning rate {rate}, depth {d}, estimators {n}: {predicted['Correct'].mean()}")

KeyboardInterrupt: 

In [15]:
# Display the grid-search summary strings collected so far; each entry ends
# with the mean of the 'Correct' column for that parameter combination.
results

['learning rate 0.00275, depth 2, estimators 10: 0.5581395348837209',
 'learning rate 0.00275, depth 2, estimators 15: 0.5581395348837209',
 'learning rate 0.00275, depth 2, estimators 20: 0.5813953488372093',
 'learning rate 0.00275, depth 2, estimators 30: 0.6046511627906976',
 'learning rate 0.00275, depth 2, estimators 50: 0.5813953488372093',
 'learning rate 0.00275, depth 2, estimators 70: 0.6046511627906976',
 'learning rate 0.00275, depth 2, estimators 100: 0.5813953488372093',
 'learning rate 0.00275, depth 2, estimators 200: 0.627906976744186',
 'learning rate 0.00275, depth 4, estimators 10: 0.5813953488372093',
 'learning rate 0.00275, depth 4, estimators 15: 0.5813953488372093',
 'learning rate 0.00275, depth 4, estimators 20: 0.5813953488372093',
 'learning rate 0.00275, depth 4, estimators 30: 0.5813953488372093',
 'learning rate 0.00275, depth 4, estimators 50: 0.5581395348837209',
 'learning rate 0.00275, depth 4, estimators 70: 0.5813953488372093',
 'learning rate 0.0