In [51]:
import numpy as np
import pandas as pd
import os
import xgboost as xgb
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.metrics import log_loss
from scipy.interpolate import UnivariateSpline
import statsmodels.api as sm
import matplotlib.pyplot as plt
import collections
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

pd.set_option("display.max_column", 999)

# Preprocessing

In [52]:
df_mvp = pd.read_excel('../collect-data/src/files/nba_data.xlsx', sheet_name='MVP')
df_mvp.head()

Unnamed: 0.1,Unnamed: 0,RANK,FIRST NAME,LAST NAME,SUFFIX,TEAM,AGE,GP,W,L,MIN,PTS,FGM,FGA,FG%,3PM,3PA,3P%,FTM,FTA,FT%,OREB,DREB,REB,AST,TOV,STL,BLK,PF,FP,DD2,TD3,+/-,SEASON
0,12971,2,Karl,Malone,,UTA,33,82,64,18,2997.7,2249,864,1571,55.0,0,13,0.0,521,690,75.5,193,616,809,368,233,113,48,216,4022,43,1,767,1996-97
1,12531,1,Michael,Jordan,,CHI,35,82,62,20,3182.4,2357,881,1893,46.5,30,126,23.8,565,721,78.4,130,345,475,283,185,141,45,151,3725,5,0,598,1997-98
2,12093,3,Karl,Malone,,UTA,35,49,36,13,1831.6,1164,393,797,49.3,0,1,0.0,378,480,78.8,107,356,463,201,162,62,28,134,2129,21,1,354,1998-99
3,11652,1,Shaquille,O'Neal,,LAL,28,79,66,13,3165.2,2344,956,1665,57.4,0,1,0.0,432,824,52.4,336,742,1078,299,223,36,239,255,4688,63,0,705,1999-00
4,11212,2,Allen,Iverson,,PHI,26,71,50,21,2975.9,2207,762,1813,42.0,98,306,32.0,585,719,81.4,50,223,273,325,237,178,20,147,3379,4,0,371,2000-01


In [53]:
df_all = pd.read_excel('../collect-data/src/files/nba_data.xlsx', sheet_name='NBA Stats')
df_all.head()

Unnamed: 0.1,Unnamed: 0,RANK,FIRST NAME,LAST NAME,SUFFIX,TEAM,AGE,GP,W,L,MIN,PTS,FGM,FGA,FG%,3PM,3PA,3P%,FTM,FTA,FT%,OREB,DREB,REB,AST,TOV,STL,BLK,PF,FP,DD2,TD3,+/-,SEASON
0,0,1,Luka,Doncic,,DAL,25,63,40,23.0,2355.6,2137,727,1492.0,48.7,244,652.0,37.4,439,558.0,78.7,53,519,572,617,249,93,34,131,3881,43,19,222,2023-24
1,1,2,Shai,Gilgeous-Alexander,,OKC,25,70,50,20.0,2409.1,2131,756,1400.0,54.0,91,248.0,36.7,528,606.0,87.1,61,331,392,444,154,147,64,175,3746,8,0,553,2023-24
2,2,3,Giannis,Antetokounmpo,,MIL,29,68,43,25.0,2393.9,2087,783,1272.0,61.6,34,118.0,28.8,487,741.0,65.7,182,598,780,435,234,81,69,199,3892,54,9,314,2023-24
3,3,4,Jayson,Tatum,,BOS,26,68,53,15.0,2444.7,1858,627,1328.0,47.2,214,566.0,37.8,390,469.0,83.2,62,498,560,335,173,70,40,136,3190,24,0,599,2023-24
4,4,5,Jalen,Brunson,,NYK,27,67,43,24.0,2342.4,1834,656,1376.0,47.7,179,446.0,40.1,343,406.0,84.5,38,204,242,436,161,61,13,125,2839,5,0,408,2023-24


In [54]:
df_all['Is_MVP'] = df_all.apply(lambda row: 1 if any(
    (row['LAST NAME'] == mvp['LAST NAME'] and row['SEASON'] == mvp['SEASON']) 
    for _, mvp in df_mvp.iterrows()) else 0, axis=1)

# Drop the 'Unnamed: 0' column as it's likely an index column from Excel
df_all.drop('Unnamed: 0', axis=1, inplace=True)
df_all.head()

Unnamed: 0,RANK,FIRST NAME,LAST NAME,SUFFIX,TEAM,AGE,GP,W,L,MIN,PTS,FGM,FGA,FG%,3PM,3PA,3P%,FTM,FTA,FT%,OREB,DREB,REB,AST,TOV,STL,BLK,PF,FP,DD2,TD3,+/-,SEASON,Is_MVP
0,1,Luka,Doncic,,DAL,25,63,40,23.0,2355.6,2137,727,1492.0,48.7,244,652.0,37.4,439,558.0,78.7,53,519,572,617,249,93,34,131,3881,43,19,222,2023-24,0
1,2,Shai,Gilgeous-Alexander,,OKC,25,70,50,20.0,2409.1,2131,756,1400.0,54.0,91,248.0,36.7,528,606.0,87.1,61,331,392,444,154,147,64,175,3746,8,0,553,2023-24,0
2,3,Giannis,Antetokounmpo,,MIL,29,68,43,25.0,2393.9,2087,783,1272.0,61.6,34,118.0,28.8,487,741.0,65.7,182,598,780,435,234,81,69,199,3892,54,9,314,2023-24,0
3,4,Jayson,Tatum,,BOS,26,68,53,15.0,2444.7,1858,627,1328.0,47.2,214,566.0,37.8,390,469.0,83.2,62,498,560,335,173,70,40,136,3190,24,0,599,2023-24,0
4,5,Jalen,Brunson,,NYK,27,67,43,24.0,2342.4,1834,656,1376.0,47.7,179,446.0,40.1,343,406.0,84.5,38,204,242,436,161,61,13,125,2839,5,0,408,2023-24,0


In [55]:
# Filter the DataFrame for the 2023-24 season and apply game and minutes thresholds
current_season_stats = df_all[(df_all['SEASON'] == '2023-24') &
                                 (df_all['GP'] >= 55) &
                                 ((df_all['MIN'] / df_all['GP']) > 20)]

current_season_stats

Unnamed: 0,RANK,FIRST NAME,LAST NAME,SUFFIX,TEAM,AGE,GP,W,L,MIN,PTS,FGM,FGA,FG%,3PM,3PA,3P%,FTM,FTA,FT%,OREB,DREB,REB,AST,TOV,STL,BLK,PF,FP,DD2,TD3,+/-,SEASON,Is_MVP
0,1,Luka,Doncic,,DAL,25,63,40,23.0,2355.6,2137,727,1492.0,48.7,244,652.0,37.4,439,558.0,78.7,53,519,572,617,249,93,34,131,3881,43,19,222,2023-24,0
1,2,Shai,Gilgeous-Alexander,,OKC,25,70,50,20.0,2409.1,2131,756,1400.0,54.0,91,248.0,36.7,528,606.0,87.1,61,331,392,444,154,147,64,175,3746,8,0,553,2023-24,0
2,3,Giannis,Antetokounmpo,,MIL,29,68,43,25.0,2393.9,2087,783,1272.0,61.6,34,118.0,28.8,487,741.0,65.7,182,598,780,435,234,81,69,199,3892,54,9,314,2023-24,0
3,4,Jayson,Tatum,,BOS,26,68,53,15.0,2444.7,1858,627,1328.0,47.2,214,566.0,37.8,390,469.0,83.2,62,498,560,335,173,70,40,136,3190,24,0,599,2023-24,0
4,5,Jalen,Brunson,,NYK,27,67,43,24.0,2342.4,1834,656,1376.0,47.7,179,446.0,40.1,343,406.0,84.5,38,204,242,436,161,61,13,125,2839,5,0,408,2023-24,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
214,214,Kyle,Anderson,,MIN,30,69,49,20.0,1576.9,457,179,380.0,47.1,9,40.0,22.5,90,126.0,71.4,57,195,252,293,82,60,37,107,1408,0,0,163,2023-24,0
232,233,Ochai,Agbaji,,TOR,23,72,29,43.0,1457.7,417,164,396.0,41.4,61,200.0,30.5,28,42.0,66.7,66,128,194,73,55,42,38,102,944,0,0,-249,2023-24,0
242,243,Pat,Connaughton,,MIL,31,67,42,25.0,1457.2,377,134,292.0,45.9,75,201.0,37.3,34,45.0,75.6,42,168,210,133,42,34,16,81,937,0,0,0,2023-24,0
249,250,Matisse,Thybulle,,POR,27,64,18,46.0,1480.3,354,126,315.0,40.0,80,229.0,34.9,22,29.0,75.9,30,105,135,88,40,113,49,92,1094,0,0,-224,2023-24,0


# Execution
## Training

In [56]:
# Prepare the data for training
# Drop non-numeric columns and the 'Is_MVP' column to form the feature matrix X
X = df_all.drop(['FIRST NAME', 'LAST NAME', 'SUFFIX', 'TEAM', 'SEASON', 'Is_MVP'], axis=1)
y = df_all['Is_MVP']

In [57]:
# Rename problematic column correctly
X.rename(columns={'+/-': 'Plus_Minus'}, inplace=True)

# Ensure all data types are numeric or handled correctly
X = X.apply(pd.to_numeric, errors='coerce')

In [58]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Initialize XGBoost classifier
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')

In [59]:
# Train the model
xgb_model.fit(X_train, y_train)

# Get feature importances
feature_importances = xgb_model.feature_importances_
feature_importance_dict = {feature: importance for feature, importance in zip(X.columns, feature_importances)}

# Sort features by importance
sorted_features = sorted(feature_importance_dict.items(), key=lambda x: x[1], reverse=True)

# sorted_features

In [60]:
# Assuming 'sorted_features' is your list of features sorted by importance
threshold = 0.01
top_features = [feature for feature, importance in sorted_features if importance > threshold]


# Create new feature matrix with selected features
X_selected = X[top_features]

In [61]:
# Split the data again with the selected features
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.2, random_state=42, stratify=y)

# Retrain the model with selected features
xgb_model.fit(X_train, y_train)

In [62]:
# Evaluate and interpret the new model
new_feature_importances = xgb_model.feature_importances_
new_feature_importance_dict = {feature: importance for feature, importance in zip(X_selected.columns, new_feature_importances)}

# Sort features by importance
new_sorted_features = sorted(feature_importance_dict.items(), key=lambda x: x[1], reverse=True)

new_sorted_features

[('FP', 0.13696824),
 ('RANK', 0.12910928),
 ('L', 0.08078213),
 ('Plus_Minus', 0.064723305),
 ('REB', 0.044167574),
 ('3PM', 0.041965507),
 ('FGA', 0.034033734),
 ('TD3', 0.03348775),
 ('TOV', 0.032431714),
 ('FTM', 0.032327645),
 ('DREB', 0.028278053),
 ('W', 0.027096739),
 ('FTA', 0.02699002),
 ('DD2', 0.026715627),
 ('AST', 0.025307635),
 ('GP', 0.02469655),
 ('STL', 0.024289705),
 ('OREB', 0.023307825),
 ('FG%', 0.022936549),
 ('BLK', 0.02266042),
 ('FGM', 0.021185486),
 ('AGE', 0.018507442),
 ('FT%', 0.016893726),
 ('MIN', 0.0155581515),
 ('PF', 0.014349373),
 ('3PA', 0.013256316),
 ('3P%', 0.009280606),
 ('PTS', 0.008693003)]

In [63]:
# Split the data with the selected features
X_train_selected, X_test_selected, y_train, y_test = train_test_split(
    X_selected, y, test_size=0.2, random_state=42, stratify=y)

# Retrain the model with selected features
xgb_model_selected = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
xgb_model_selected.fit(X_train_selected, y_train)

In [64]:
# Define hyperparameter grid
param_grid = {
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'n_estimators': [100, 200, 300],
    'subsample': [0.8, 0.9, 1]
}

# Set up the grid search
grid_search = GridSearchCV(xgb_model_selected, param_grid, cv=3, scoring='accuracy', verbose=1)
grid_search.fit(X_train_selected, y_train)

# Best model
best_model = grid_search.best_estimator_


Fitting 3 folds for each of 81 candidates, totalling 243 fits


In [65]:
# Cross-validate the model
cv_scores = cross_val_score(best_model, X_selected, y, cv=5, scoring='accuracy')
print("CV Mean Score: ", cv_scores.mean())

CV Mean Score:  0.9970920005347559


In [66]:
# Predict on the test set
y_pred = best_model.predict(X_test_selected)

# Evaluation metrics
print("Accuracy: ", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy:  0.997018263138278
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      2675
           1       0.00      0.00      0.00         8

    accuracy                           1.00      2683
   macro avg       0.50      0.50      0.50      2683
weighted avg       0.99      1.00      1.00      2683

Confusion Matrix:
 [[2675    0]
 [   8    0]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# Prediction

In [67]:
# Assuming the encoding and scaling operations are already defined
current_season_features = pd.get_dummies(current_season_stats)
# Ensure it includes all necessary columns, even those that might not have data in the current season
for col in X_train_selected.columns:
    if col not in current_season_features.columns:
        current_season_features[col] = 0  # Adding missing columns with default value of 0

# Filter to only use top features, assuming 'top_features' has been previously determined
current_season_selected = current_season_features[top_features]


In [68]:
# Predict MVP probabilities
predicted_mvp_probabilities = xgb_model_selected.predict_proba(current_season_selected)[:, 1]

# Add predictions back to the data
current_season_stats['MVP_Probability'] = predicted_mvp_probabilities

# Sort by probability to find top candidates
top_mvp_candidates = current_season_stats.sort_values(by='MVP_Probability', ascending=False).head(5)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  current_season_stats['MVP_Probability'] = predicted_mvp_probabilities


In [69]:
print(top_mvp_candidates[['FIRST NAME', 'LAST NAME', 'TEAM', 'MVP_Probability']])

  FIRST NAME           LAST NAME TEAM  MVP_Probability
5     Nikola               Jokic  DEN         0.083813
0       Luka              Doncic  DAL         0.027128
2    Giannis       Antetokounmpo  MIL         0.008616
1       Shai  Gilgeous-Alexander  OKC         0.002338
3     Jayson               Tatum  BOS         0.001220
