## NBA Model TEST 1

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import shap
import os
import pickle

import warnings
warnings.filterwarnings('ignore')

from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from lightgbm import LGBMRegressor

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
# Seasons and models used throughout the notebook.
# Each season label is used verbatim to build the CSV file names read by get_data
# (e.g. '<label> Advanced.csv') and as the value of the 'Season' column.
# Order matters: func_modelos / temporadas_antigas walk this list from the front and
# stop after `n_seasons_to_test` entries, so the first label is the season held out
# when the models are tuned.
seasons = ['2021-22', '2024-25', '2023-24', '2022-23','2020-21','2019-20','2018-19','2017-18','2016-17','2015-16','2014-15','2013-14',
          '2012-13','2011-12','2010-11','2009-10','2008-09','2007-08','2006-07','2005-06', '2004-05', '2003-04','2002-03','2001-02'] 

# Model names; func_modelos matches on these exact strings and they are also used
# in the pickle file names and in the prediction column names.
modelos = ['SVM','Elastic Net','Random Forest','AdaBoost','Gradient Boosting','LGBM']

# Legacy season labels (underscore format), kept for reference only:
# seasons = ['2021_2022', '2001_2002', '2002_2003', '2003_2004', '2004_2005', '2005_2006', 
#           '2006_2007', '2007_2008', '2008_2009',  '2009_2010', '2010_2011', '2011_2012', '2012_2013', 
#           '2013_2014', '2014_2015', '2015_2016', '2016_2017', '2017_2018', '2018_2019', '2019_2020',   
#           '2020_2021']
# modelos = ['SVM','Elastic Net','Random Forest','AdaBoost','Gradient Boosting','LGBM']

# Folder holding the input CSVs; the tuned models (*.dat) and params.csv are written here too.
# It is concatenated directly with file names, so the trailing slash is required.
path_data = r'./data/'

# sep = r'/'

In [3]:
# _perGame = pd.DataFrame()
# _perGame = pd.read_csv('./data/2024-25 Advanced.csv')
# _perGame['Player'] = _perGame['Player'] + '\\' + _perGame['Player-additional']
# _perGame = _perGame.drop(columns=['Player-additional', 'Awards'])
# # Check whether any columns remain in _perGame after the drop
# if _perGame.shape[1] == 0:
#     print("All columns have been dropped from totals.")
# else:
#     print(f"Remaining columns: {_perGame.columns.tolist()}")
# _perGame.to_csv('./2024-25 Advanced.csv', index=False)


In [4]:
def get_data(seasons, data_dir=None):
    """Load the four per-season CSV tables and stack them across seasons.

    For every label in ``seasons`` the files ``<label> Advanced.csv``,
    ``<label> perGame.csv``, ``<label> Standings.csv`` and ``<label> Totals.csv``
    are read from ``data_dir`` and a ``Season`` column holding the label is set
    on each of them.

    Parameters
    ----------
    seasons : iterable of str
        Season labels, used verbatim in the file names.
    data_dir : str, optional
        Folder containing the CSV files. Defaults to the notebook-level
        ``path_data``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(advanced, perGame, standings, totals)``, each with a fresh 0..n-1
        index. All four are empty frames when ``seasons`` is empty.
    """
    if data_dir is None:
        data_dir = path_data

    # Same order as the returned tuple.
    tables = ('Advanced', 'perGame', 'Standings', 'Totals')
    parts = {table: [] for table in tables}

    for season in seasons:
        for table in tables:
            frame = pd.read_csv(os.path.join(data_dir, f'{season} {table}.csv'))
            frame['Season'] = season
            parts[table].append(frame)

    def _stack(frames):
        # Concatenate once at the end instead of re-copying a growing frame per season.
        return pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

    advanced, perGame, standings, totals = (_stack(parts[table]) for table in tables)
    return advanced, perGame, standings, totals



In [5]:
# for season in seasons:
#   _standings = pd.read_csv(path_data + "nba" + season + " Standings.csv", skiprows=1)
#   # _standings.drop(index=_standings.index[0], axis=0, inplace=True)
#   df = _standings[['Rk', 'Team', 'Overall']]
#   df.rename(columns={'Rk': 'Seed', 'Overall': 'Record'}, inplace=True)
#   print(df.columns)
#   df.to_csv(path_data + "nba" + season + " Standings.csv", index=False)


In [6]:
def treat_data(advanced, perGame, standings, totals, seasons):
    """Merge the raw tables into one modelling table of MVP-candidate player seasons.

    Steps, in order:

    1. Drop redundant columns and suffix every non-key statistic with its source
       table (``_perGame``, ``_advanced``, ``_totals``).
    2. Merge the three player tables on ``Player`` / ``Season`` / ``Tm``.
    3. Map full team names in ``standings`` to abbreviations, derive ``WIN%``
       from the ``Record`` string ("wins-losses") and merge onto the players.
    4. Merge the MVP voting file ``MVPs.csv`` (read from ``path_data``) and fill
       every remaining missing value with 0.
    5. Keep one row per player and season, then apply the candidate filters.

    Parameters
    ----------
    advanced, perGame, standings, totals : pandas.DataFrame
        Stacked tables as returned by ``get_data``.
    seasons : iterable of str
        Seasons to keep; rows of the result are ordered season by season in
        this order.

    Returns
    -------
    pandas.DataFrame
        Filtered table without the ``Tm``, ``Team`` and ``Record`` columns.
    """
    # Removing duplicate/empty columns
    perGame = perGame.drop(['Rk', 'Pos'], axis=1)
    totals = totals.drop(['Rk', 'Pos', 'Age', 'G', 'GS'], axis=1)
    advanced = advanced.drop(['Rk', 'Pos', 'Age', 'G', 'MP', 'Unnamed: 24', 'Unnamed: 19'], axis=1)
    # NOTE(review): 'GS' is not dropped from `advanced`, so when that table carries a
    # GS column the merge below produces 'GS_x' / 'GS_y' - confirm whether the
    # duplicate feature is intended.

    # Key / identity columns keep their names; everything else gets a source suffix.
    shared_cols = ['Player', 'Season', 'Pos', 'Age', 'Tm', 'G', 'GS']

    def _tag(frame, suffix):
        # One rename per table instead of one full-frame copy per column.
        return frame.rename(columns={col: col + suffix
                                     for col in frame.columns if col not in shared_cols})

    perGame = _tag(perGame, '_perGame')
    totals = _tag(totals, '_totals')
    advanced = _tag(advanced, '_advanced')

    # Merging the bases
    data = perGame.merge(advanced, on=['Player', 'Season', 'Tm'], how='left', validate='1:1')
    data = data.merge(totals, on=['Player', 'Season', 'Tm'], how='left', validate='1:1')

    dict_teams = {'Utah Jazz':'UTA','Phoenix Suns':'PHO',
                'Philadelphia 76ers':'PHI','Brooklyn Nets':'BRK',
                'Denver Nuggets':'DEN','Los Angeles Clippers':'LAC',
                'Milwaukee Bucks':'MIL','Dallas Mavericks':'DAL',
                'Los Angeles Lakers':'LAL','Portland Trail Blazers':'POR',
                'Atlanta Hawks':'ATL','New York Knicks':'NYK',
                'Miami Heat':'MIA','Golden State Warriors':'GSW',
                'Memphis Grizzlies':'MEM','Boston Celtics':'BOS',
                'Washington Wizards':'WAS','Indiana Pacers':'IND',
                'Charlotte Hornets':'CHO','Charlotte Bobcats':'CHA',
                'San Antonio Spurs':'SAS','Chicago Bulls':'CHI',
                'New Orleans Pelicans':'NOP','Sacramento Kings':'SAC',
                'Toronto Raptors':'TOR','Minnesota Timberwolves':'MIN',
                'Cleveland Cavaliers':'CLE','Oklahoma City Thunder':'OKC',
                'Orlando Magic':'ORL','Detroit Pistons':'DET',
                'Houston Rockets':'HOU','New Jersey Nets':'NJN',
                'New Orleans Hornets':'NOH','Seattle SuperSonics':'SEA'}

    teams = pd.DataFrame.from_dict(dict_teams, orient='index').reset_index()
    teams = teams.rename(columns={'index':'Team',0:'Tm'})
    standings = standings.merge(teams, on='Team', how='left', validate='m:1')

    # 'Record' is "wins-losses"; split it once and reuse both halves.
    record = standings['Record'].str.split('-', expand=True)
    wins = record[0].astype(int)
    losses = record[1].astype(int)
    standings['WIN%'] = wins / (wins + losses)

    data = data.merge(standings, on=['Tm','Season'], how='left', validate='m:1')

    # Literal asterisk, not a regular expression.
    data['Player'] = data['Player'].str.replace('*', '', regex=False)

    mvps = pd.read_csv(path_data + "MVPs.csv")
    # Players without an MVP vote get rank/share 0.
    # NOTE(review): fillna(0) also zeroes any other missing value, e.g. 'Seed' / 'WIN%'
    # for a team abbreviation that found no match in `standings`; such rows would then
    # pass the `Seed <= 16` filter below - confirm this is acceptable.
    data = data.merge(mvps, on=['Player','Season'], how='left', validate='m:1').fillna(0)

    # Keep only the part of 'Player' before the first backslash.
    data['Player'] = data['Player'].str.split('\\', expand=True)[0]

    # Removing duplicate lines from traded players: first row per player within each
    # season, seasons taken in the order given by `seasons`.
    per_season = [
        data[data['Season'] == season].drop_duplicates(subset=['Player'], keep='first')
        for season in seasons
    ]
    dataf = pd.concat(per_season, ignore_index=True) if per_season else pd.DataFrame()

    # Filtering the data
    # Base for the criteria:
    # Karl Malone was MVP in 98-99 with 49 games
    #   (a games-played floor, G > 48, existed in an earlier version and is currently disabled)
    # Wes Unseld was MVP at 68-69 with 13.8 PPG and with 10.9 FGA
    # Steve Nash was MVP at 04-05 with 3.3 REB
    # Moses Malone was MVP at 82-83 with 1.3 AST
    # Bob Cousy was MVP at 56-57 with 37.8% FG
    # Giannis Antetokounmpo was MVP in 19-20 with 30.4 min
    # Kareem Abdul-Jabbar was the only MVP not to make the playoffs in 1976 (40-42)
    # Dave Cowens was MVP at 72-73 with a PER of 18.1
    # Never has an MVP been traded in the middle of the season that he won the award
    is_candidate = (
        (dataf['PTS_perGame'] > 13.5) & (dataf['MP_perGame'] > 30)
        & (dataf['Seed'] <= 16) & (dataf['AST_perGame'] > 1) & (dataf['TRB_perGame'] > 3)
        & (dataf['Tm'] != 'TOT') & (dataf['FG%_perGame'] > 0.37) & (dataf['FGA_perGame'] > 10)
        & (dataf['PER_advanced'] > 18)
    )
    dataf = dataf[is_candidate].reset_index(drop=True)

    dataf = dataf.drop(['Tm','Team','Record'], axis=1)

    return dataf

In [7]:
# Read every season's Advanced / perGame / Standings / Totals CSVs into four stacked frames
advanced, perGame, standings, totals = get_data(seasons)

In [8]:
# Merge the four tables, attach the MVP voting results and apply the candidate filters
data = treat_data(advanced, perGame, standings, totals, seasons)


In [9]:
# Treated modelling table: one row per retained player season
data

Unnamed: 0,Player,Age,G,GS_x,MP_perGame,FG_perGame,FGA_perGame,FG%_perGame,3P_perGame,3PA_perGame,...,AST_totals,STL_totals,BLK_totals,TOV_totals,PF_totals,PTS_totals,Seed,WIN%,MVP Rank,MVP Votes Share
0,Bam Adebayo,24,56,56,32.6,7.3,13.0,0.557,0.0,0.1,...,190,80,44,148,171,1068,4.0,0.646341,0,0.000
1,Giannis Antetokounmpo,27,67,67,32.9,10.3,18.6,0.553,1.1,3.6,...,388,72,91,219,212,2002,7.0,0.621951,3,0.595
2,Devin Booker,25,68,68,34.5,9.7,20.9,0.466,2.7,7.0,...,329,77,26,162,180,1822,1.0,0.780488,4,0.216
3,Jaylen Brown,25,66,66,33.6,8.7,18.4,0.473,2.5,7.0,...,231,70,18,178,163,1559,6.0,0.621951,0,0.000
4,Jimmy Butler,32,57,57,33.9,7.0,14.5,0.480,0.5,2.0,...,312,94,27,121,88,1219,4.0,0.646341,0,0.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
580,Jerry Stackhouse,27,76,76,35.3,6.9,17.4,0.397,1.1,3.9,...,403,77,37,266,163,1629,6.0,0.609756,15,0.002
581,Peja Stojaković,24,71,71,37.3,7.7,15.9,0.484,1.8,4.4,...,175,81,14,140,120,1506,1.0,0.743902,16T,0.001
582,Rasheed Wallace,27,79,79,37.5,7.6,16.3,0.469,1.4,4.0,...,152,101,101,131,212,1521,9.0,0.597561,0,0.000
583,Chris Webber,28,54,54,38.4,9.9,19.9,0.495,0.1,0.4,...,258,90,76,158,181,1322,1.0,0.743902,7,0.029


In [10]:
data.columns # Variables: model features plus Player / Season / MVP Rank / MVP Votes Share


Index(['Player', 'Age', 'G', 'GS_x', 'MP_perGame', 'FG_perGame', 'FGA_perGame',
       'FG%_perGame', '3P_perGame', '3PA_perGame', '3P%_perGame', '2P_perGame',
       '2PA_perGame', '2P%_perGame', 'eFG%_perGame', 'FT_perGame',
       'FTA_perGame', 'FT%_perGame', 'ORB_perGame', 'DRB_perGame',
       'TRB_perGame', 'AST_perGame', 'STL_perGame', 'BLK_perGame',
       'TOV_perGame', 'PF_perGame', 'PTS_perGame', 'Season', 'PER_advanced',
       'TS%_advanced', '3PAr_advanced', 'FTr_advanced', 'ORB%_advanced',
       'DRB%_advanced', 'TRB%_advanced', 'AST%_advanced', 'STL%_advanced',
       'BLK%_advanced', 'TOV%_advanced', 'USG%_advanced', 'OWS_advanced',
       'DWS_advanced', 'WS_advanced', 'WS/48_advanced', 'OBPM_advanced',
       'DBPM_advanced', 'BPM_advanced', 'VORP_advanced', 'GS_y', 'MP_totals',
       'FG_totals', 'FGA_totals', 'FG%_totals', '3P_totals', '3PA_totals',
       '3P%_totals', '2P_totals', '2PA_totals', '2P%_totals', 'eFG%_totals',
       'FT_totals', 'FTA_totals', 'FT

In [11]:
data['Season'].value_counts() # Number of player rows kept per season after the filters


Season
2023-24    30
2020-21    29
2024-25    29
2007-08    28
2022-23    28
2006-07    27
2016-17    27
2010-11    26
2005-06    25
2021-22    25
2011-12    24
2015-16    24
2014-15    23
2013-14    23
2018-19    23
2009-10    23
2003-04    23
2004-05    22
2001-02    22
2019-20    22
2017-18    21
2012-13    21
2008-09    20
2002-03    20
Name: count, dtype: int64

In [12]:
# Function for metrics
def func_metricas(y_test, y_pred, metricas, modelo, season):
    """Append one row of evaluation metrics to ``metricas`` and return the result.

    The new row holds the model name, the season and the RMSE and R² of
    ``y_pred`` against ``y_test``, both rounded to three decimals. ``metricas``
    itself is not modified; the row index is not reset.
    """
    nova_linha = pd.DataFrame({
        'Modelo': [modelo],
        'Season': [season],
        # RMSE = square root of the mean squared error
        'RMSE': [round(np.sqrt(mean_squared_error(y_test, y_pred)), 3)],
        'R²': [round(r2_score(y_test, y_pred), 3)],
    })
    return pd.concat([metricas, nova_linha])


In [13]:
def _tuning_spec(modelo, random_state=None):
    """Return ``(estimator_class, fixed_kwargs, param_grid)`` for the model named ``modelo``.

    ``fixed_kwargs`` are passed to the estimator both during the grid search and
    for the final fit; ``param_grid`` is what GridSearchCV explores.
    Raises ValueError for an unknown name.
    """
    seeded = {'random_state': random_state}

    if modelo == 'SVM':
        return SVR, {}, {'C': [0.001,0.01,0.1,0.5,1,2,5,10],
                         'kernel': ['linear','rbf','poly'],
                         'gamma': ['scale','auto'],
                         'degree': [2,3,4],
                         'epsilon': [0.1,0.5,1]}

    if modelo == 'Elastic Net':
        return ElasticNet, {}, {'alpha':[0.01,0.1,1,5,10,50,100],
                                'l1_ratio':[0.01,0.1,0.5,0.7,0.95,0.99,1]}

    if modelo == 'Random Forest':
        shared = {'n_estimators': [15,25,50,64,100,150,200],
                  'max_features': [2,3,4,5]}
        # oob_score=True is only valid together with bootstrap=True, so the two
        # bootstrap settings get separate grids; in a single grid every
        # bootstrap=False candidate fails to fit and is scored as NaN.
        return RandomForestRegressor, seeded, [
            {**shared, 'bootstrap': [True], 'oob_score': [True]},
            {**shared, 'bootstrap': [False]},
        ]

    if modelo == 'AdaBoost':
        return AdaBoostRegressor, seeded, {'n_estimators': [5,10,20,30,40,50,100],
                                           'learning_rate': [0.01,0.05,0.1,0.2,0.5]}

    if modelo == 'Gradient Boosting':
        return GradientBoostingRegressor, seeded, {'n_estimators': [10,20,30,40,50],
                                                   'learning_rate': [0.01,0.05,0.1,0.2,0.5],
                                                   'max_depth': [3,4,5]}

    if modelo == 'LGBM':
        return LGBMRegressor, seeded, {'learning_rate':[0.01,0.1,0.2,0.3],
                                       'num_leaves':[5,10,20,30],
                                       'min_data_in_leaf':[10,25,50],
                                       'max_depth':[2,3,4],
                                       'feature_fraction':[0.6,0.7,0.8,0.9],
                                       'min_gain_to_split':[0,0.01,0.1,0.2],
                                       'verbose':[-1]}

    raise ValueError(f"Unknown model name: {modelo!r}")


def func_modelos(data, seasons, modelos, n_seasons_to_test, random_state=None):
    """Tune, fit and evaluate every model with one season held out at a time.

    For each season (in the order of ``seasons``, stopping once
    ``n_seasons_to_test`` seasons have been processed) all other seasons form the
    training set. Features are standardised with a scaler fitted on the training
    rows only. Each model's hyper-parameters are chosen with GridSearchCV (default
    cross-validation) on the training set; a fresh estimator with the best
    parameters is then fitted, pickled to ``path_data + modelo + '.dat'`` and
    used to predict the held-out season.

    Side effects: one ``.dat`` pickle per model (overwritten for every tested
    season, so the file left on disk comes from the last season processed) and
    ``params.csv`` with all best-parameter dicts, both under ``path_data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Treated table containing 'Player', 'Season', 'MVP Rank',
        'MVP Votes Share' and the feature columns.
    seasons : iterable of str
        Seasons to hold out, in processing order.
    modelos : iterable of str
        Model names: 'SVM', 'Elastic Net', 'Random Forest', 'AdaBoost',
        'Gradient Boosting' or 'LGBM'.
    n_seasons_to_test : int
        Stop after this many seasons. A value below 1 never matches the counter,
        so every season is processed.
    random_state : int, optional
        Seed forwarded to the estimators that accept one (Random Forest,
        AdaBoost, Gradient Boosting, LGBM). ``None`` keeps the library defaults.

    Returns
    -------
    final_results : pandas.DataFrame
        Per player and season: actual share/rank plus, for each model, the
        predicted share and the rank of that prediction within the season.
    metricas : pandas.DataFrame
        One row of RMSE / R² per model and season.
    best_params : list of dict
        Best grid-search parameters, in the order the models were tuned.

    Raises
    ------
    ValueError
        If ``modelos`` contains an unknown model name.
    """
    non_features = ['MVP Votes Share','MVP Rank','Player','Season']
    id_cols = ['Player','Season','MVP Votes Share','MVP Rank']

    season_results = []
    metricas = pd.DataFrame()
    best_params = []

    for i, season in enumerate(seasons, start=1):

        # Separating training and testing bases
        data_train = data[data['Season'] != season]
        data_test = data[data['Season'] == season]

        X_train = data_train.drop(non_features, axis=1)
        y_train = data_train['MVP Votes Share']

        X_test = data_test.drop(non_features, axis=1)
        y_test = data_test['MVP Votes Share']

        initial_results = data_test[id_cols]
        results = initial_results.copy()

        # Fit the scaler on the training rows only to avoid leaking test statistics.
        scaler = StandardScaler()
        scaled_X_train = scaler.fit_transform(X_train)
        scaled_X_test = scaler.transform(X_test)

        for modelo in modelos:
            estimator_cls, fixed_kwargs, param_grid = _tuning_spec(modelo, random_state)

            grid = GridSearchCV(estimator_cls(**fixed_kwargs), param_grid)
            grid.fit(scaled_X_train, y_train)
            best_params.append(grid.best_params_)

            model = estimator_cls(**fixed_kwargs, **grid.best_params_)
            model.fit(scaled_X_train, y_train)

            # Context manager so the file handle is closed even if pickling fails.
            with open(path_data + modelo + '.dat', 'wb') as model_file:
                pickle.dump(model, model_file)

            y_pred = model.predict(scaled_X_test)
            metricas = func_metricas(y_test, y_pred, metricas, modelo, season)

            pred_col = 'Predicted MVP Share ' + modelo
            apoio = initial_results.copy()
            apoio[pred_col] = np.asarray(y_pred)

            # Rank 1 = highest predicted share within the held-out season.
            results_sorted = apoio.sort_values(by=pred_col,
                                               ascending=False).reset_index(drop=True)
            results_sorted['MVP Rank ' + modelo] = results_sorted.index + 1

            results = results.merge(results_sorted, on=id_cols)

        season_results.append(results)

        if i == n_seasons_to_test:
            break

    final_results = (pd.concat(season_results, ignore_index=True)
                     if season_results else pd.DataFrame())

    np.savetxt(path_data+ 'params.csv', best_params, delimiter =', ', fmt ='% s')

    return final_results, metricas, best_params

In [14]:
def media_metricas(metricas):
    """Average RMSE and R² per model over all rows of ``metricas``.

    Returns one row per distinct 'Modelo' value, in order of first appearance,
    with the mean 'RMSE' and mean 'R²' rounded to three decimals.
    """
    rows = []
    for model_name in metricas['Modelo'].unique():
        of_model = metricas.loc[metricas['Modelo'] == model_name]
        rows.append({
            'Modelo': model_name,
            'RMSE': round(of_model['RMSE'].mean(), 3),
            'R²': round(of_model['R²'].mean(), 3),
        })
    # Build the summary in one go from the collected rows.
    return pd.DataFrame(rows)

In [15]:
# Tune all models with only the first season in `seasons` held out as the test set.
# Side effects: writes one '<model>.dat' pickle per model and 'params.csv' into `path_data`.
final_results, metricas, best_params = func_modelos(data, seasons, modelos, 1)
# Mean RMSE / R² per model (a single season here, so the mean equals that season's value)
final_metricas = media_metricas(metricas)
final_metricas

Unnamed: 0,Modelo,RMSE,R²
0,SVM,0.096,0.839
1,Elastic Net,0.146,0.623
2,Random Forest,0.096,0.838
3,AdaBoost,0.117,0.758
4,Gradient Boosting,0.099,0.826
5,LGBM,0.104,0.809


In [16]:
def create_rank(final_results, n_rank, model_names=None):
    """Build, per season, the actual top-``n_rank`` next to each model's predicted top-``n_rank``.

    Parameters
    ----------
    final_results : pandas.DataFrame
        Needs 'Player', 'Season', 'MVP Votes Share' and, per model, a
        'Predicted MVP Share <model>' column.
    n_rank : int
        Number of top players to keep per season.
    model_names : iterable of str, optional
        Models to include. Defaults to the notebook-level ``modelos`` list.
        Models without a prediction column in ``final_results`` are skipped.

    Returns
    -------
    pandas.DataFrame
        For every season up to ``n_rank`` rows with 'MVP Rank Real' /
        'MVP Share Real', then 'MVP Rank <model>' / 'MVP Share <model>'
        (share rounded to three decimals) for each model, and 'Season'.
    """
    if model_names is None:
        model_names = modelos

    season_ranks = []

    for season in final_results['Season'].unique():
        season_rows = final_results[final_results['Season'] == season]

        actual_top = (season_rows.sort_values(by='MVP Votes Share', ascending=False)[:n_rank]
                      .reset_index(drop=True))
        rank = pd.DataFrame()
        rank['MVP Rank Real'] = actual_top['Player']
        rank['MVP Share Real'] = actual_top['MVP Votes Share']

        for modelo in model_names:
            pred_col = 'Predicted MVP Share ' + modelo
            # Skip only the expected case (no predictions for this model); any other
            # error should surface instead of being silently swallowed.
            if pred_col not in season_rows.columns:
                continue
            predicted_top = (season_rows.sort_values(by=pred_col, ascending=False)[:n_rank]
                             .reset_index(drop=True))
            rank['MVP Rank ' + modelo] = predicted_top['Player']
            rank['MVP Share ' + modelo] = round(predicted_top[pred_col], 3)

        rank['Season'] = season
        season_ranks.append(rank)

    return pd.concat(season_ranks, ignore_index=True) if season_ranks else pd.DataFrame()

In [17]:
# Actual MVP vs. each model's top predicted player for the season(s) in final_results
rank_final = create_rank(final_results, 1)
rank_final

Unnamed: 0,MVP Rank Real,MVP Share Real,MVP Rank SVM,MVP Share SVM,MVP Rank Elastic Net,MVP Share Elastic Net,MVP Rank Random Forest,MVP Share Random Forest,MVP Rank AdaBoost,MVP Share AdaBoost,MVP Rank Gradient Boosting,MVP Share Gradient Boosting,MVP Rank LGBM,MVP Share LGBM,Season
0,Nikola Jokić,0.875,Giannis Antetokounmpo,0.734,Nikola Jokić,0.526,Giannis Antetokounmpo,0.609,Giannis Antetokounmpo,0.475,Giannis Antetokounmpo,0.628,Nikola Jokić,0.674,2021-22


In [18]:
def temporadas_antigas(data, seasons, modelos, n_seasons_to_test):
    """Re-fit the previously tuned (pickled) models with each season held out in turn.

    For each season (in the order of ``seasons``, stopping once
    ``n_seasons_to_test`` seasons have been processed) every model is loaded
    from ``path_data + modelo + '.dat'``, re-fitted on all other seasons with
    its stored hyper-parameters, and used to predict the held-out season.
    Features are standardised with a scaler fitted on the training rows only.

    Parameters
    ----------
    data : pandas.DataFrame
        Treated table containing 'Player', 'Season', 'MVP Rank',
        'MVP Votes Share' and the feature columns.
    seasons : iterable of str
        Seasons to hold out, in processing order.
    modelos : iterable of str
        Model names; each must have a matching ``.dat`` file in ``path_data``.
    n_seasons_to_test : int
        Stop after this many seasons. A value below 1 never matches the counter,
        so every season is processed.

    Returns
    -------
    final_results : pandas.DataFrame
        Per player and season: actual share/rank plus, for each model, the
        predicted share and the rank of that prediction within the season.
    metricas : pandas.DataFrame
        One row of RMSE / R² per model and season.
    """
    non_features = ['MVP Votes Share','MVP Rank','Player','Season']
    id_cols = ['Player','Season','MVP Votes Share','MVP Rank']

    season_results = []
    metricas = pd.DataFrame()

    for i, season in enumerate(seasons, start=1):

        # Separating training and testing bases
        data_train = data[data['Season'] != season]
        data_test = data[data['Season'] == season]

        X_train = data_train.drop(non_features, axis=1)
        y_train = data_train['MVP Votes Share']

        X_test = data_test.drop(non_features, axis=1)
        y_test = data_test['MVP Votes Share']

        initial_results = data_test[id_cols]
        results = initial_results.copy()

        scaler = StandardScaler()
        scaled_X_train = scaler.fit_transform(X_train)
        scaled_X_test = scaler.transform(X_test)

        for modelo in modelos:

            # Opening the models already created.
            # NOTE(security): pickle.load can execute arbitrary code from the file;
            # only load .dat files produced by this notebook.
            with open(path_data + modelo + '.dat', 'rb') as model_file:
                model = pickle.load(model_file)

            # Re-fit with the stored hyper-parameters on this split's training rows.
            model.fit(scaled_X_train, y_train)
            y_pred = model.predict(scaled_X_test)
            metricas = func_metricas(y_test, y_pred, metricas, modelo, season)

            pred_col = 'Predicted MVP Share ' + modelo
            apoio = initial_results.copy()
            apoio[pred_col] = np.asarray(y_pred)

            # Rank 1 = highest predicted share within the held-out season.
            results_sorted = apoio.sort_values(by=pred_col,
                                               ascending=False).reset_index(drop=True)
            results_sorted['MVP Rank ' + modelo] = results_sorted.index + 1

            results = results.merge(results_sorted, on=id_cols)

        season_results.append(results)

        if i == n_seasons_to_test:
            break

    final_results = (pd.concat(season_results, ignore_index=True)
                     if season_results else pd.DataFrame())

    return final_results, metricas

In [19]:
# Re-fit the pickled models with each of the first 21 entries of `seasons` held out in turn.
# NOTE: this rebinds final_results / metricas / final_metricas from the tuning cell above.
final_results, metricas = temporadas_antigas(data, seasons, modelos, 21)
# Mean RMSE / R² per model across the held-out seasons
final_metricas = media_metricas(metricas)
final_metricas

Unnamed: 0,Modelo,RMSE,R²
0,SVM,0.148,0.592
1,Elastic Net,0.161,0.505
2,Random Forest,0.142,0.608
3,AdaBoost,0.142,0.601
4,Gradient Boosting,0.141,0.605
5,LGBM,0.136,0.642


In [22]:
# Actual vs. predicted top 3 for every evaluated season
rank_final = create_rank(final_results, 3)

# Actual top 3 of a single season, kept for ad-hoc inspection
filtered_rank = rank_final.loc[rank_final['Season'] == "2007-08", ["MVP Rank Real", "Season"]]

rank_final.to_csv("mvppred2.csv", index=False)

# Only the last expression of a cell is rendered, so the table goes last;
# bare expressions placed before the to_csv call were never displayed.
rank_final