# Machine learning and data exploration on NBA player data

This last part of the NBA project the data is to build a foundation to forecast the MVP's in NBA.

Overall this project lays a foundation for further exploration of this data. 

In [1]:
import pandas as pd

In [2]:
stats = pd.read_csv('player_mvp_stats.csv') ## Importing the full statistics file created/merged in previous notebook

In [3]:
stats.head()

Unnamed: 0.1,Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,...,Pts Max,Share,Team,W,L,W/L%,GB,PS/G,PA/G,SRS
0,0,A.C. Green,PF,26,LAL,82,82,33.0,4.7,9.8,...,920.0,0.0,Los Angeles Lakers,63,19,0.768,0.0,110.7,103.9,6.74
1,1,Byron Scott,SG,28,LAL,77,77,33.7,6.1,13.1,...,920.0,0.0,Los Angeles Lakers,63,19,0.768,0.0,110.7,103.9,6.74
2,2,James Worthy,SF,28,LAL,80,80,37.0,8.9,16.2,...,920.0,0.0,Los Angeles Lakers,63,19,0.768,0.0,110.7,103.9,6.74
3,3,Jawann Oldham,C,32,LAL,6,0,7.5,0.5,1.0,...,920.0,0.0,Los Angeles Lakers,63,19,0.768,0.0,110.7,103.9,6.74
4,4,Jay Vincent,SF,30,LAL,41,6,11.2,2.1,4.5,...,920.0,0.0,Los Angeles Lakers,63,19,0.768,0.0,110.7,103.9,6.74


## Final clean-up of data

In [4]:
## Removing the unwanted column.

del stats['Unnamed: 0']

In [5]:
pd.isnull(stats).sum() ## Checking the null stats for players

Player        0
Pos           0
Age           0
Tm            0
G             0
GS            0
MP            0
FG            0
FGA           0
FG%          62
3P            0
3PA           0
3P%        2162
2P            0
2PA           0
2P%         106
eFG%         62
FT            0
FTA           0
FT%         551
ORB           0
DRB           0
TRB           0
AST           0
STL           0
BLK           0
TOV           0
PF            0
PTS           0
Year          0
Pts Won       0
Pts Max       0
Share         0
Team          0
W             0
L             0
W/L%          0
GB            0
PS/G          0
PA/G          0
SRS           0
dtype: int64

In [6]:
## Seeing that 3P% has a lot of null stats, we will check do those players have attempts at 3P (3PA).

stats[pd.isnull(stats['3P%'])][['Player', '3PA']] 

Unnamed: 0,Player,3PA
3,Jawann Oldham,0.0
7,Mark McNamara,0.0
8,Mel McCants,0.0
10,Mychal Thompson,0.0
16,Elden Campbell,0.0
...,...,...
15584,Evan Eschmeyer,0.0
15585,Gheorghe Mureșan,0.0
15587,Jim McIlvaine,0.0
15593,Mark Hendrickson,0.0


In [7]:
## Filling the null stats with zero

stats = stats.fillna(0)

## Machine learning

This part of the project aims to build a model that can be used to predict the NBA MVP's based on historical data.

In [8]:
## Checking the current columns to determine the predictors

stats.columns

Index(['Player', 'Pos', 'Age', 'Tm', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG%', '3P',
       '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB',
       'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'Year',
       'Pts Won', 'Pts Max', 'Share', 'Team', 'W', 'L', 'W/L%', 'GB', 'PS/G',
       'PA/G', 'SRS'],
      dtype='object')

In [9]:
## Some columns are taken off, so only numerical values are left

predictors = ['Age', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG%', '3P',
       '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB',
       'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'Year',
       'W', 'L', 'W/L%', 'GB', 'PS/G', 'PA/G', 'SRS']

In [10]:
train = stats[stats['Year'] < 2023]

In [11]:
test = stats[stats['Year'] == 2023]

In [12]:
## A linear model is chosen to predict based on previous years

from sklearn.linear_model import Ridge

reg = Ridge(alpha=.1)

In [13]:
reg.fit(train[predictors], train['Share'])

In [14]:
predictions = reg.predict(test[predictors])

In [15]:
predictions = pd.DataFrame(predictions, columns = ['predictions'], index = test.index)

In [16]:
## Here we can see the predicted shares for the MVP award winners

predictions

Unnamed: 0,predictions
225,0.002324
226,0.032336
227,0.041438
228,0.226010
229,-0.002465
...,...
15490,-0.013661
15491,0.003761
15492,-0.015425
15493,0.017828


In [17]:
## Combining the predictions with the actual shares of votes

combination = pd.concat([test[['Player', 'Share']], predictions], axis = 1)

In [18]:
combination.sort_values('Share', ascending = False).head(10)

Unnamed: 0,Player,Share,predictions
15204,Joel Embiid,0.915,0.203343
750,Nikola Jokić,0.674,0.175211
228,Giannis Antetokounmpo,0.606,0.22601
2954,Jayson Tatum,0.28,0.139143
1336,Shai Gilgeous-Alexander,0.046,0.148992
14037,Donovan Mitchell,0.03,0.08748
4344,Domantas Sabonis,0.027,0.094655
320,Luka Dončić,0.01,0.19719
6804,Stephen Curry,0.005,0.107983
11077,Jimmy Butler,0.003,0.111396


In [19]:
## Calculating the mean squared error between the actual Share values and the predicted values.

from sklearn.metrics import mean_squared_error

mean_squared_error(combination['Share'], combination['predictions'])

0.002664727617642146

In [20]:
## Assigning Rank for each player based on the actual Share of votes the player had

combination = combination.sort_values('Share', ascending = False)
combination['Rk'] = list(range(1, combination.shape[0]+1))

In [21]:
combination.head(10)

Unnamed: 0,Player,Share,predictions,Rk
15204,Joel Embiid,0.915,0.203343,1
750,Nikola Jokić,0.674,0.175211,2
228,Giannis Antetokounmpo,0.606,0.22601,3
2954,Jayson Tatum,0.28,0.139143,4
1336,Shai Gilgeous-Alexander,0.046,0.148992,5
14037,Donovan Mitchell,0.03,0.08748,6
4344,Domantas Sabonis,0.027,0.094655,7
320,Luka Dončić,0.01,0.19719,8
6804,Stephen Curry,0.005,0.107983,9
11077,Jimmy Butler,0.003,0.111396,10


In [22]:
## Adding the rank based on what the players predicted share of votes was

combination = combination.sort_values('predictions', ascending = False)
combination['Predicted_Rk'] = list(range(1, combination.shape[0]+1))

In [23]:
combination.head(10)

Unnamed: 0,Player,Share,predictions,Rk,Predicted_Rk
228,Giannis Antetokounmpo,0.606,0.22601,3,1
15204,Joel Embiid,0.915,0.203343,1,2
320,Luka Dončić,0.01,0.19719,8,3
750,Nikola Jokić,0.674,0.175211,2,4
1336,Shai Gilgeous-Alexander,0.046,0.148992,5,5
11792,Kevin Durant,0.0,0.141267,50,6
8712,Anthony Davis,0.0,0.140545,139,7
8232,Damian Lillard,0.0,0.139651,90,8
2954,Jayson Tatum,0.28,0.139143,4,9
8719,LeBron James,0.0,0.134207,132,10


In [24]:
def find_ap(combination):
    
    ## Function to calculate the Average Precision (AP) for a given combination of actual and predicted values.
    ## Returns AP score, considers both the accuracy and the order of the predictions.    
    
    actual = combination.sort_values('Share', ascending = False).head(5) ## Top 5 players in actual MVP voting
    predicted = combination.sort_values('predictions', ascending = False) ## Predictions sorted by biggest predicted share
    
    ## Variables to track precision, found, and seen items
    ps = []
    found = 0
    seen = 1
    
    for index, row in predicted.iterrows():
        if row['Player'] in actual['Player'].values: ## Checking if player in among actual top 5
            found += 1
            ps.append(found/seen)
        seen += 1
    return sum(ps) / len(ps)

In [25]:
find_ap(combination)

0.821111111111111

In [26]:
years = list(range(1990,2024))

In [27]:
## Now doing the same process for all the years
 
aps = []
all_predictions = []

for year in years[5:]:
    
    ## Split the dataset into training and testing sets based on the current 'year'
    train = stats[stats['Year'] < year]
    test = stats[stats['Year'] == year]
    
    reg.fit(train[predictors], train['Share'])  ## Fit the regression model using training data
    predictions = reg.predict(test[predictors])  ## Generating predictions using the trained model on the test set
    predictions = pd.DataFrame(predictions, columns = ['predictions'], index = test.index)
    combination = pd.concat([test[['Player', 'Share']], predictions], axis = 1)
    
    all_predictions.append(combination)
    aps.append(find_ap(combination))

In [28]:
## Final AP score for the model

sum(aps) / len(aps)

0.7339562287464433

In [29]:
def add_ranks(combination):
    
    ## Adds ranks to the combination list along with the difference between actual ranks and predicted ranks
    
    combination = combination.sort_values('Share', ascending = False)
    combination['Rk'] = list(range(1, combination.shape[0]+1))
    combination = combination.sort_values('predictions', ascending = False)
    combination['Predicted_Rk'] = list(range(1, combination.shape[0]+1))
    combination['Difference'] = combination['Rk'] - combination['Predicted_Rk']
    return combination

In [30]:
add_ranks(all_predictions[1]).head()

Unnamed: 0,Player,Share,predictions,Rk,Predicted_Rk,Difference
10823,David Robinson,0.508,0.202143,2,1,1
8097,Shaquille O'Neal,0.056,0.196072,9,2,7
5553,Hakeem Olajuwon,0.211,0.185327,4,3,1
1821,Karl Malone,0.075,0.175327,7,4,3
11597,Michael Jordan,0.986,0.170441,1,5,-4


In [31]:
## Comparing top 5

ranking = add_ranks(all_predictions[1])
ranking[ranking['Rk'] < 6].sort_values('Difference', ascending = False)

Unnamed: 0,Player,Share,predictions,Rk,Predicted_Rk,Difference
10823,David Robinson,0.508,0.202143,2,1,1
5553,Hakeem Olajuwon,0.211,0.185327,4,3,1
11597,Michael Jordan,0.986,0.170441,1,5,-4
8083,Anfernee Hardaway,0.319,0.104194,3,11,-8
11600,Scottie Pippen,0.2,0.067388,5,19,-14


In [32]:
def backtest(stats, model, year, predictors):
    
    ## Function to perform a backtest using a regression model on the given dataset
    ## Takes into account the stats, the regression model and predictors

    ## Returns a tuple containing: Average Precision (AP) score across multiple years, list of AP score for each year and
    ## a DataFrame of all predictions and ranks.
    
    aps = []
    all_predictions = []

    for year in years[5:]:
        train = stats[stats['Year'] < year]
        test = stats[stats['Year'] == year]
        model.fit(train[predictors], train['Share'])
        predictions = reg.predict(test[predictors])
        predictions = pd.DataFrame(predictions, columns = ['predictions'], index = test.index)
        combination = pd.concat([test[['Player', 'Share']], predictions], axis = 1)
        combination = add_ranks(combination)
        all_predictions.append(combination)
        aps.append(find_ap(combination))
    return sum(aps) / len(aps), aps, pd.concat(all_predictions)

In [33]:
mean_ap, aps, all_predictions = backtest(stats, reg, years[5:], predictors)

In [34]:
## Mean AP score for the model

mean_ap

0.7339562287464433

In [35]:
all_predictions[all_predictions['Rk'] <= 5].sort_values('Difference').head(10)

Unnamed: 0,Player,Share,predictions,Rk,Predicted_Rk,Difference
9176,Glen Rice,0.117,0.031333,5,56,-51
1455,Jason Kidd,0.712,0.031768,2,44,-42
5792,Steve Nash,0.839,0.036178,1,41,-40
5810,Steve Nash,0.739,0.05548,1,34,-33
14155,Joakim Noah,0.258,0.048159,4,36,-32
9444,Peja Stojaković,0.228,0.038256,4,34,-30
1620,Chris Paul,0.138,0.072281,5,33,-28
4066,Chauncey Billups,0.344,0.056863,5,32,-27
5825,Steve Nash,0.785,0.075863,2,21,-19
5616,Jason Kidd,0.135,0.052672,5,20,-15


In [36]:
## Player stats that influence the ranking

pd.concat([pd.Series(reg.coef_), pd.Series(predictors)], axis=1).sort_values(0, ascending=False)

Unnamed: 0,0,1
13,0.099303,eFG%
18,0.030704,DRB
29,0.029337,W/L%
10,0.0208,2P
17,0.0182,ORB
21,0.012453,STL
22,0.010121,BLK
15,0.010015,FTA
12,0.007539,2P%
25,0.007146,PTS


In [37]:
## Checking the stat ratios: comparing to mean of every category

stat_ratios = stats[["PTS", "AST", "STL", "BLK", "3P", "Year"]].groupby("Year").apply(lambda x: x/x.mean())

stat_ratios.index = stat_ratios.index.droplevel()

In [38]:
stat_ratios

Unnamed: 0,PTS,AST,STL,BLK,3P,Year
0,1.446394,0.524403,1.101744,1.430189,1.129657,1.0
1,1.737915,1.716226,1.377180,0.953459,6.777943,1.0
2,2.365807,1.716226,1.652616,1.430189,1.129657,1.0
3,0.168185,0.095346,0.413154,1.191824,0.000000,1.0
4,0.583042,0.190692,0.550872,0.238365,0.000000,1.0
...,...,...,...,...,...,...
15490,0.811285,0.435028,1.476263,4.707424,0.000000,1.0
15491,0.811285,0.483365,0.492088,0.523047,1.312418,1.0
15492,1.238854,1.015066,1.148205,1.046094,2.221015,1.0
15493,2.148808,1.353421,0.656117,0.523047,1.716239,1.0


In [39]:
stats[["PTS_R", "AST_R", "STL_R", "BLK_R", "3P_R"]] = stat_ratios[["PTS", "AST", "STL", "BLK", "3P"]]

In [40]:
stats.head()

Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,...,W/L%,GB,PS/G,PA/G,SRS,PTS_R,AST_R,STL_R,BLK_R,3P_R
0,A.C. Green,PF,26,LAL,82,82,33.0,4.7,9.8,0.478,...,0.768,0.0,110.7,103.9,6.74,1.446394,0.524403,1.101744,1.430189,1.129657
1,Byron Scott,SG,28,LAL,77,77,33.7,6.1,13.1,0.47,...,0.768,0.0,110.7,103.9,6.74,1.737915,1.716226,1.37718,0.953459,6.777943
2,James Worthy,SF,28,LAL,80,80,37.0,8.9,16.2,0.548,...,0.768,0.0,110.7,103.9,6.74,2.365807,1.716226,1.652616,1.430189,1.129657
3,Jawann Oldham,C,32,LAL,6,0,7.5,0.5,1.0,0.5,...,0.768,0.0,110.7,103.9,6.74,0.168185,0.095346,0.413154,1.191824,0.0
4,Jay Vincent,SF,30,LAL,41,6,11.2,2.1,4.5,0.47,...,0.768,0.0,110.7,103.9,6.74,0.583042,0.190692,0.550872,0.238365,0.0


In [41]:
## Adding stat ratios to predictors

predictors += ["PTS_R", "AST_R", "STL_R", "BLK_R", "3P_R"]

In [42]:
mean_ap, aps, all_predictions = backtest(stats, reg, years[5:], predictors)

In [43]:
mean_ap

0.7374743980871056

In [44]:
## Adding positions and teams as numericals to the DataFrame.

stats["NPos"] = stats["Pos"].astype("category").cat.codes
stats["NTm"] = stats["Tm"].astype("category").cat.codes

In [45]:
stats.head(10)

Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,...,PS/G,PA/G,SRS,PTS_R,AST_R,STL_R,BLK_R,3P_R,NPos,NTm
0,A.C. Green,PF,26,LAL,82,82,33.0,4.7,9.8,0.478,...,110.7,103.9,6.74,1.446394,0.524403,1.101744,1.430189,1.129657,2,15
1,Byron Scott,SG,28,LAL,77,77,33.7,6.1,13.1,0.47,...,110.7,103.9,6.74,1.737915,1.716226,1.37718,0.953459,6.777943,12,15
2,James Worthy,SF,28,LAL,80,80,37.0,8.9,16.2,0.548,...,110.7,103.9,6.74,2.365807,1.716226,1.652616,1.430189,1.129657,8,15
3,Jawann Oldham,C,32,LAL,6,0,7.5,0.5,1.0,0.5,...,110.7,103.9,6.74,0.168185,0.095346,0.413154,1.191824,0.0,0,15
4,Jay Vincent,SF,30,LAL,41,6,11.2,2.1,4.5,0.47,...,110.7,103.9,6.74,0.583042,0.190692,0.550872,0.238365,0.0,8,15
5,Larry Drew,PG,31,LAL,80,3,16.7,2.1,4.8,0.444,...,110.7,103.9,6.74,0.583042,1.28717,0.826308,0.238365,2.259314,5,15
6,Magic Johnson,PG,30,LAL,79,79,37.2,6.9,14.4,0.48,...,110.7,103.9,6.74,2.500355,5.48239,2.341206,0.953459,7.342772,5,15
7,Mark McNamara,C,30,LAL,33,1,5.8,1.2,2.6,0.442,...,110.7,103.9,6.74,0.347583,0.047673,0.137718,0.0,0.0,0,15
8,Mel McCants,SF,22,LAL,13,0,5.0,0.6,2.0,0.308,...,110.7,103.9,6.74,0.19061,0.095346,0.275436,0.238365,0.0,8,15
9,Michael Cooper,SG,33,LAL,80,10,23.1,2.4,6.2,0.387,...,110.7,103.9,6.74,0.717591,1.28717,1.101744,1.191824,3.388972,12,15


### Random Forest model

Another model to test the 

In [46]:
## Forest model might get a little more accurate predictions

from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(n_estimators=50, random_state=1, min_samples_split=5)

mean_ap, aps, all_predictions = backtest(stats, rf, years[28:], predictors)


In [47]:
mean_ap

0.7551626910073486

In [48]:
## For comparison, the result with the linear model:

mean_ap, aps, all_predictions = backtest(stats, reg, years[28:], predictors)

In [49]:
## As seen, the mean AP score is a little higher with the Random Forest model
mean_ap

0.7374743980871056