In [32]:
import os
import time
import random
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import re
from scipy.spatial.distance import cdist
from datetime import date
from numpy import asarray
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error, r2_score, make_scorer
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
import xgboost as xgb

from nba_mvp_predictor_helper_functions import *

notebook_path = os.path.abspath("")
os.chdir(notebook_path)

pd.options.mode.chained_assignment = None  # default='warn'

** Create a more structured version of this
1. Scrape data from basketball references
    - Look at mvp voting
    - Pull individual counting and advanced stats, and team stats
    - scale stats accordingly
2. Feature selection and engineering
3. Train model off of previous year data
4. Predict MVP

## Load Previous MVP Data
- Scraped using NBA Data Scrape Notebook

In [2]:
mvp_df = pd.read_csv('Data/mvp_results_2013_2022.csv')
mvp_df = (mvp_df.groupby('Season')
                      .apply(lambda x: x.nlargest(5, 'Pts Won'))
                      .reset_index(drop=True))
mvp_df.head()

To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  .apply(lambda x: x.nlargest(5, 'Pts Won'))


Unnamed: 0,Player,Age,Season,Tm,First,Pts Won,Pts Max,Share,G,MP,PTS,TRB,AST,STL,BLK,FG%,3P%,FT%,WS,WS/48
0,LeBron James,28,2013,MIA,120.0,1207.0,1210.0,0.998,76.0,37.9,26.8,8.0,7.3,1.7,0.9,0.565,0.406,0.753,19.3,0.322
1,Carmelo Anthony,28,2013,NYK,1.0,475.0,1210.0,0.393,67.0,37.0,28.7,6.9,2.6,0.8,0.5,0.449,0.379,0.83,9.5,0.184
2,Kevin Durant,25,2014,OKC,119.0,1232.0,1250.0,0.986,81.0,38.5,32.0,7.4,5.5,1.3,0.7,0.503,0.391,0.873,19.2,0.295
3,LeBron James,29,2014,MIA,6.0,891.0,1250.0,0.713,77.0,37.7,27.1,6.9,6.3,1.6,0.3,0.567,0.379,0.75,15.9,0.264
4,Stephen Curry,26,2015,GSW,100.0,1198.0,1300.0,0.922,80.0,32.7,23.8,4.3,7.7,2.0,0.2,0.487,0.443,0.914,15.7,0.288


## Feature Selection & Engineering
- Add more features

In [5]:
#Stats_Df scraped using the NBA Scraper Notebook
stats_df = pd.read_csv('Data/season_stats_13_22.csv')
stats_df = df_transform(stats_df)
#Join to MVP DataFrame
mvp_train = join_dataframes(mvp_df, stats_df)
mvp_train.head()

Unnamed: 0,Player,Age,Season,Tm,Actual_Rank,First,Share,G,3P,3P%,...,TRB%,AST%,TOV%,USG%,OWS,DWS,OBPM,DBPM,Win_Contrib,Minutes
0,LeBron James,28,2013,MIA,1.0,120.0,0.998,0.93,0.26,0.41,...,13.1,36.4,12.4,30.2,14.6,4.7,9.3,2.4,14.6,0.79
1,Carmelo Anthony,28,2013,NYK,2.0,1.0,0.393,0.82,0.43,0.38,...,10.8,14.1,9.3,35.6,7.5,2.0,6.0,-1.7,12.1,0.77
2,Kevin Durant,25,2014,OKC,1.0,119.0,0.986,0.99,0.45,0.39,...,10.8,26.7,12.2,33.0,14.8,4.4,8.8,1.4,15.4,0.8
3,LeBron James,29,2014,MIA,2.0,6.0,0.713,0.94,0.28,0.38,...,11.5,32.0,14.4,31.0,12.3,3.7,7.8,1.1,12.3,0.79
4,Stephen Curry,26,2015,GSW,1.0,100.0,0.922,0.98,0.68,0.44,...,7.0,38.6,14.3,28.9,11.5,4.1,8.2,1.7,12.9,0.68


## Train New Model

In [84]:
def xy_split(df):
    y = df.Share

    non_feature_cols = df.iloc[:, :7].columns
    X = df.drop(columns = non_feature_cols)
    return X, y

In [85]:
def season_train_test_split(df, test_seasons, validation = False, random_state = 42):
    
    #Select random seasons as test data
    start = df.Season.min()
    end = df.Season.max()
    
    #set Random State
    if random_state:
        random.seed(random_state)
        
    #Select random seasons for test and validation set    
    sample = random.sample(range(start, end+1), test_seasons)
    
    
    if validation == False:
        #Split data
        test = df[df.Season.isin(sample)]
        train = df[~df.Season.isin(sample)]

        #Split by X and y
        X_train, y_train = xy_split(train)
        X_test, y_test = xy_split(test)
        return X_train, X_test, y_train, y_test, test.iloc[:, :7], train.iloc[:, :7]
    else:
        #Split the random seasons to validation 
        half = len(sample) // 2
        valid_half = sample[:half]
        test_half = sample[half:]
        
        #Split data into 3 sets
        test = df[df.Season.isin(test_half)]
        train = df[~df.Season.isin(sample)]
        valid = df[df.Season.isin(valid_half)]
        
        #Split X and Y
        X_train, y_train = xy_split(train)
        X_test, y_test = xy_split(test)
        X_valid, y_valid = xy_split(valid)
        
        return X_train, X_test, y_train, y_test, test.iloc[:, :7], train.iloc[:, :7]

In [83]:
season_train_test_split(mvp_train, 4, validation=True, random_state = 42)

([2014, 2013], [2017, 2022])

In [78]:
type(random_numbers(4))

list

In [60]:
y = mvp_train.Share

non_feature_cols = mvp_train.iloc[:, :7].columns
X = mvp_train.drop(columns = non_feature_cols)

In [51]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test, test, train = season_train_test_split(mvp_train, 3, random_state = 42)

In [57]:
def weighted_error(y_pred, y_true, ref_df):
    
    #Add Predicted and True Outputs into Reference df
    data = ref_df[['Player', 'Season']]
    data['y_pred'] = y_pred
    data['y_true'] = y_true
    
    # Calculate predicted and true ranks by season
    data['predicted_rank'] = data.groupby('Season')['y_pred'].rank(ascending=False)
    data['true_rank'] = data.groupby('Season')['y_true'].rank(ascending=False)

    # Calculate the absolute difference between predicted and true ranks
    data['rank_diff'] = np.abs(data['predicted_rank'] - data['true_rank'])

    # Assign weights based on true_rank
    data['weight'] = np.where(data['true_rank'] == 1, 3,
                              np.where(data['true_rank'] == 2, 2, 1))

    # Calculate the weighted error
    data['weighted_error'] = data['rank_diff'] * data['weight']

    # Return the sum of all weighted errors
    return data['weighted_error'].sum()

### 1. Linear Regression

In [69]:
# create a LinearRegression model
linear_reg = LinearRegression()

# train the LR model on the training data
linear_reg.fit(X_train, y_train)

# make predictions on the test set
y_pred = linear_reg.predict(X_test)

#Evaluate the model
rmse = mean_squared_error(y_test, y_pred, squared=False).round(2)
print('Linear Regression RMSE:', rmse)
lr_weightd_error = weighted_error(y_test, y_pred, test)
print(f"Weighted Error for Linear Regression model is {lr_weightd_error}")
r2 = r2_score(y_test, y_pred)
print("R-squared score: %.2f" % r2)

Linear Regression RMSE: 0.3
Weighted Error for Linear Regression model is 0.0
R-squared score: -0.55


### 2. Ridge Regression

In [75]:
# Initialize the model
ridge = Ridge(alpha=0.5)

# Fit the model on the training data
ridge.fit(X_train, y_train)

# Predict on the test data
y_pred = ridge.predict(X_test)

# Calculate RMSE
rmse = mean_squared_error(y_test, y_pred, squared=False).round(2)
print('Ridge Regression RMSE:', rmse)
r_weightd_error = weighted_error(y_test, y_pred, test)
print(f"Weighted Error for Ridge Regression model is {r_weightd_error}")
r2 = r2_score(y_test, y_pred)
print("R-squared score: %.2f" % r2)

Ridge Regression RMSE: 0.18
Weighted Error for Ridge Regression model is 0.0
R-squared score: 0.46


### 4. XgBoost

In [74]:
def custom_obj(y_true, y_pred):
    # Compute the weights based on true_rank
    true_rank = y_true.argsort().argsort() + 1
    weights = np.where(true_rank == 1, 3, np.where(true_rank == 2, 2, 1))

    # Compute the gradient and hessian
    grad = (y_pred - y_true) * weights
    hess = np.ones_like(y_true) * weights

    return grad, hess

def custom_eval(y_pred, dtrain):
    y_true = dtrain.get_label()
    
    # Assume ref_df is available as a global variable or accessible in the current scope
    # You may also modify this function to pass ref_df as an argument if needed
    error = weighted_error(y_pred, y_true, ref_df)
    return 'weighted_error', error

In [15]:
import xgboost as xgb

# Prepare the data
train_data = xgb.DMatrix(X_train, label=y_train)
valid_data = xgb.DMatrix(X_valid, label=y_valid)

# Set the parameters for the XGBoost model
params = {
    'learning_rate': 0.1,
    'max_depth': 3,
    'objective': custom_obj,  # Use the custom objective function
    'n_jobs': -1,
}

# Train the model
num_rounds = 1000
watchlist = [(train_data, 'train'), (valid_data, 'valid')]

model = xgb.train(
    params,
    train_data,
    num_rounds,
    watchlist,
    early_stopping_rounds=50,
    obj=custom_obj,  # Use the custom objective function
    feval=custom_eval,  # Use the custom evaluation metric
    maximize=False,
    verbose_eval=10,
)


Weighted Rank Error: 1.5


In [16]:
#Show Results of the Naive Model on the Trained Data
mvp2_xgb = mvp_train.copy()
mvp2_xgb['Pred_Share'] = xg_reg.predict(X)
mvp2_xgb['Predicted_Rank'] = mvp2_xgb.groupby('Season')['Pred_Share'].rank(ascending=False, method='dense')
mvp2_xgb = mvp2_xgb[['Player', 'Season', 'Actual_Rank', 'Predicted_Rank']]
mvp2_xgb.loc[mvp2_xgb.Actual_Rank != mvp2_xgb.Predicted_Rank]

Unnamed: 0,Player,Season,Actual_Rank,Predicted_Rank
10,Kawhi Leonard,2017,3.0,4.0
11,LeBron James,2017,4.0,3.0
16,Giannis Antetokounmpo,2020,1.0,2.0
17,LeBron James,2020,2.0,1.0
24,Joel Embiid,2022,2.0,3.0
25,Giannis Antetokounmpo,2022,3.0,2.0


In [None]:
param_grid = {
    'learning_rate': [0.01, 0.1],
    'max_depth': [3, 5, 7],
    'subsample': [0.5, 1.0],
    'colsample_bytree': [0.5, 1.0],
    'n_estimators': [100, 200],
    'reg_alpha': [0.0, 0.5],
    'reg_lambda': [1, 0.5],
    'gamma': [0.0, 0.5]
}

# Define the cross-validation object
cv = KFold(n_splits=5, shuffle=True, random_state=42)

# Define the model
model = xgb.XGBRegressor(objective='reg:squarederror', n_jobs=-1)

scorer = make_scorer(weighted_error, greater_is_better=False)

# Define the grid search object
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=cv, scoring=scorer, verbose=1)

# Fit the grid search object to the data
grid_search.fit(X_train, y_train)

# Print the best parameters and the corresponding score
print("Best parameters: ", grid_search.best_params_)
print("Best score: ", grid_search.best_score_)

In [17]:
# Train model with tuned paramters
best_params = {'colsample_bytree': 0.5, 
                   'gamma': 0.5, 
                   'learning_rate': 0.01, 
                   'max_depth': 3, 'n_estimators': 100,
                   'reg_alpha': 0.5, 'reg_lambda': 1, 
                   'subsample': 0.5}

xg_tuned = xgb.XGBRegressor(**best_params, evalmetric = weighted_error)
xg_tuned.fit(X_train, y_train)

# Predict on test set
y_pred = xg_tuned.predict(X_test)

# Evaluate model
test_error = weighted_error(y_test.values, y_pred)
print(f"Weighted error on test data: {test_error:.3f}")

Weighted error on test data: 3.000


In [18]:
#Show Results of the Tuned Model on the Trained Data
mvp2_xgb = mvp_train.copy()
mvp2_xgb['Pred_Share'] = xg_reg.predict(X)
mvp2_xgb['Predicted_Rank'] = mvp2_xgb.groupby('Season')['Pred_Share'].rank(ascending=False, method='dense')
mvp2_xgb = mvp2_xgb[['Player', 'Season', 'Actual_Rank', 'Predicted_Rank']]
mvp2_xgb.loc[mvp2_xgb.Actual_Rank != mvp2_xgb.Predicted_Rank]

Unnamed: 0,Player,Season,Actual_Rank,Predicted_Rank
10,Kawhi Leonard,2017,3.0,4.0
11,LeBron James,2017,4.0,3.0
16,Giannis Antetokounmpo,2020,1.0,2.0
17,LeBron James,2020,2.0,1.0
24,Joel Embiid,2022,2.0,3.0
25,Giannis Antetokounmpo,2022,3.0,2.0


## 2023 Prediction

In [19]:
df_23 = pd.read_csv('Data/2023_data.csv')
df_23 = df_transform(df_23)
df_23 = df_23.loc[(df_23.G > 0.5) & (df_23.Minutes > 0.5)]
non_feature_cols = df_23.iloc[:, :3].columns
X_23 = df_23.drop(columns = non_feature_cols)

In [26]:
#Linear Regression Prediction
lr_23 = df_23.copy()
lr_23['Pred_Share'] = linear_reg.predict(X_23).round(2)
lr_23 = lr_23.loc[(lr_23.Win_Contrib > 6) & (lr_23.G > .6)]
lr_23['Predicted_Rank'] = lr_23['Pred_Share'].rank(ascending=False, method='dense')
lr_23 = lr_23[['Player','Win_Contrib', 'PER', 'Pred_Share', 'Predicted_Rank']]
lr_23.sort_values(by = ['Predicted_Rank'], ascending = True).head(5)

Unnamed: 0,Player,Win_Contrib,PER,Pred_Share,Predicted_Rank
341,Luka Dončić,8.7,28.7,0.95,1.0
252,Joel Embiid,11.6,31.4,0.79,2.0
399,Nikola Jokić,8.5,31.5,0.66,3.0
401,Nikola Vučević,6.1,19.1,0.52,4.0
333,LeBron James,7.1,23.9,0.45,5.0


In [22]:
#Ridge Regression Prediction
rr_23 = df_23.copy()
rr_23['Pred_Share'] = ridge.predict(X_23).round(2)
rr_23 = rr_23.loc[rr_23.Win_Contrib > 0]
rr_23['Predicted_Rank'] = rr_23['Pred_Share'].rank(ascending=False, method='dense')
rr_23 = rr_23[['Player','Win_Contrib', 'PER', 'Pred_Share', 'Predicted_Rank']]
rr_23.sort_values(by = ['Predicted_Rank'], ascending = True).head(10)

Unnamed: 0,Player,Win_Contrib,PER,Pred_Share,Predicted_Rank
399,Nikola Jokić,8.5,31.5,0.85,1.0
252,Joel Embiid,11.6,31.4,0.65,2.0
341,Luka Dončić,8.7,28.7,0.6,3.0
238,Jayson Tatum,12.9,23.7,0.55,4.0
166,Giannis Antetokounmpo,11.6,29.0,0.51,5.0
220,James Harden,7.3,21.6,0.49,6.0
129,Domantas Sabonis,7.1,23.5,0.48,7.0
90,Damian Lillard,6.0,26.7,0.4,8.0
22,Anthony Davis,5.9,27.8,0.39,9.0
401,Nikola Vučević,6.1,19.1,0.38,10.0


In [23]:
#Lasso Regression Prediction
lasso_23 = df_23.copy()
lasso_23['Pred_Share'] = lasso.predict(X_23).round(2)
lasso_23 = lasso_23.loc[lasso_23.Win_Contrib > 5]
lasso_23['Predicted_Rank'] = lasso_23['Pred_Share'].rank(ascending=False, method='dense')
lasso_23 = lasso_23[['Player','Win_Contrib', 'PER', 'Pred_Share', 'Predicted_Rank']]
lasso_23.sort_values(by = ['Predicted_Rank'], ascending = True).head()

Unnamed: 0,Player,Win_Contrib,PER,Pred_Share,Predicted_Rank
399,Nikola Jokić,8.5,31.5,0.72,1.0
193,Ja Morant,8.8,23.3,0.71,2.0
220,James Harden,7.3,21.6,0.71,2.0
341,Luka Dončić,8.7,28.7,0.71,2.0
495,Trae Young,8.6,22.0,0.71,2.0


In [25]:
#xGboost Prediction
xgb_23 = df_23.copy()
xgb_23['Pred_Share'] = xg_tuned.predict(X_23).round(2)
xgb_23 = xgb_23.loc[(xgb_23.Win_Contrib > 5) & (xgb_23['USG%'] > 25)]
xgb_23['Predicted_Rank'] = xgb_23['Pred_Share'].rank(ascending=False, method='dense')
xgb_23 = xgb_23[['Player','Win_Contrib', 'PER', 'Pred_Share','Predicted_Rank']]
xgb_23.sort_values(by = ['Predicted_Rank'], ascending = True).head()

Unnamed: 0,Player,Win_Contrib,PER,Pred_Share,Predicted_Rank
238,Jayson Tatum,12.9,23.7,0.58,1.0
29,Bam Adebayo,7.3,20.1,0.58,1.0
333,LeBron James,7.1,23.9,0.58,1.0
471,Stephen Curry,6.7,24.1,0.58,1.0
123,Devin Booker,6.7,22.0,0.58,1.0
