In [1]:
# Import packages
import pandas as pd
import numpy as np
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')
from sklearn.ensemble import RandomForestRegressor
from IPython.display import clear_output

# Set working path
path = '/Users/martinbogaert/Desktop/NBA Data Analysis/nba_ai_awards/'

print('MAKE SURE YOU CHANGE THE WEEK VARIABLE')

MAKE SURE YOU CHANGE THE WEEK VARIABLE


In [2]:
week = 10 # CHANGE THIS
date = datetime.now().strftime('%d %B %Y')
time = datetime.now().strftime('%H:%M:%S')
print(date + ' ; ' + time)

26 December 2022 ; 14:22:22


In [3]:
scrape_year = 2023

# Data preparation

In [4]:
# Load to-be-predicted data
data = pd.read_csv(path + f'Algorithm/weekly data/week_{week}.csv').groupby('roy').get_group(0) # Load up-to-date data, omit rookies
# Load big data file (to compute careers and comparison)
final_data = pd.read_csv(path + 'final_data.csv')
final_data = final_data[final_data['Year'] < scrape_year]
data.head()

Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,...,MOV,ORtg,DRtg,NRtg,MOV/A,ORtg/A,DRtg/A,NRtg/A,roy,smoy
1,Bobby Portis,PF,27,MIL,33,8,26.0,5.7,11.5,0.5,...,2.52,113.63,111.03,2.6,2.73,113.8,110.96,2.84,0,1
2,Brook Lopez,C,34,MIL,32,32,30.6,5.6,10.9,0.511,...,2.52,113.63,111.03,2.6,2.73,113.8,110.96,2.84,0,0
3,George Hill,PG,36,MIL,29,0,19.8,1.6,3.8,0.427,...,2.52,113.63,111.03,2.6,2.73,113.8,110.96,2.84,0,1
4,Giannis Antetokounmpo,PF,28,MIL,28,28,33.3,11.2,20.9,0.536,...,2.52,113.63,111.03,2.6,2.73,113.8,110.96,2.84,0,0
5,Grayson Allen,SG,27,MIL,30,28,27.4,3.3,7.3,0.457,...,2.52,113.63,111.03,2.6,2.73,113.8,110.96,2.84,0,0


In [5]:
# For each player, determine his career lifetime, his career high in points, assists, rebounds and minutes played
# as well as his previous playing year
def years_in_league(df):
    player = list(df['Player'])[0] # get player name
    temp = final_data.groupby('Player').get_group(player).sort_values('Year') # get player data sorted by year
    #if len(list(temp['Year'])) < 2 : # if it's his first year set dummy previous year (easy to cut)
      #  df['prev_year'] = [0]
    #else : # else save previous playing year
    df['prev_year'] = [list(temp['Year'])[-1]]

    df['lifetime'] = [len(temp) + 1] # compute career lifetime
    
    df_prev = temp[temp['Year'] < scrape_year] # get previous data
    # Compute career high
    df['PTS_high'] = [df_prev['PTS'].max()]
    df['AST_high'] = [df_prev['AST'].max()]
    df['TRB_high'] = [df_prev['TRB'].max()]
    df['MP_high'] = [df_prev['MP'].max()]

    return df

data = data.groupby('Player').apply(years_in_league)
data.head()

Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,...,DRtg/A,NRtg/A,roy,smoy,prev_year,lifetime,PTS_high,AST_high,TRB_high,MP_high
1,Bobby Portis,PF,27,MIL,33,8,26.0,5.7,11.5,0.5,...,110.96,2.84,0,1,2022,8,14.6,1.7,9.1,28.2
2,Brook Lopez,C,34,MIL,32,32,30.6,5.6,10.9,0.511,...,110.96,2.84,0,0,2022,15,20.7,2.3,8.6,36.9
3,George Hill,PG,36,MIL,29,0,19.8,1.6,3.8,0.427,...,110.96,2.84,0,1,2022,15,16.9,5.1,4.2,34.5
4,Giannis Antetokounmpo,PF,28,MIL,28,28,33.3,11.2,20.9,0.536,...,110.96,2.84,0,0,2022,10,29.9,5.9,13.6,36.7
5,Grayson Allen,SG,27,MIL,30,28,27.4,3.3,7.3,0.457,...,110.96,2.84,0,0,2022,5,11.1,2.2,3.4,27.3


In [6]:
# Only keep players having played in 2022 and merge with last year data : may be a case where MIP might be relative to a further year !!!
data = data.groupby('prev_year').get_group(scrape_year-1) 
data = data.merge(final_data.rename(columns = {'Year' : 'prev_year'}), on = ['Player', 'prev_year'], suffixes = ['', '_prev'], how = 'inner')

### FEATURES ####
# Calculate the difference in stats for all numerical features
features =  ['G','GS','MP','FG','FGA','FG%','3P','3PA','3P%','2P','2PA','2P%','eFG%','FT','FTA','FT%','ORB','DRB','TRB','AST',
             'STL','BLK','TOV','PF','PTS','PER','TS%','3PAr','FTr','ORB%','DRB%','TRB%','AST%','STL%','BLK%','TOV%','USG%',
             'OWS','DWS','WS','WS/48','OBPM','DBPM','BPM','VORP','W','L','W/L%','PS/G','PA/G','SRS','Seed','MOV','ORtg','DRtg',
             'NRtg','MOV/A','ORtg/A','DRtg/A','NRtg/A','ORtg/100','DRtg/100']
for f in features :
    data[f'{f}_diff'] = data[f] - data[f'{f}_prev']

# Additional features
data['PTS_rel'] = [(pts-prev)/prev if prev > 0 else 0 for pts, prev in zip(data['PTS'], data['PTS_prev'])] # Points relative to previous year
data['PER_rel'] = (data['PER'] - data['PER_prev']) / data['PER_prev'] # PER relative to previous year
data['PTS_high_diff'] = data['PTS'] - data['PTS_high'] # Points difference with career high

### CUTS ###
# Previous seasons requirements (minutes and game)
data = data[data['MP_prev'] >= 5] # Minimum 5 minutes played in previous season
data = data[data['G_prev'] >= 10] # Minimum 10 games played in previous season
# Eligibility
data = data[data['PTS'] > data['PTS_high']] # Career high in points
data = data[(data['AST'] >= data['AST_high']) | (data['TRB'] >= data['TRB_high']) | (data['MP'] >= data['MP_high'])] # At least a career high in minutes, assists or rebounds

data.head()

Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,...,NRtg_diff,MOV/A_diff,ORtg/A_diff,DRtg/A_diff,NRtg/A_diff,ORtg/100_diff,DRtg/100_diff,PTS_rel,PER_rel,PTS_high_diff
5,Jevon Carter,PG,27,MIL,33,25,25.2,2.8,6.9,0.414,...,-0.71,-0.49,-2.06,-1.71,-0.34,-33.0,-4.0,0.809524,0.145833,2.7
15,Anthony Edwards,SG,21,MIN,33,33,36.6,8.4,18.3,0.458,...,-3.42,-3.9,-2.4,1.37,-3.77,-1.0,1.0,0.089202,0.024242,1.9
19,Jaden McDaniels,PF,22,MIN,30,30,30.7,4.4,8.5,0.52,...,-3.42,-3.9,-2.4,1.37,-3.77,4.0,1.0,0.206522,0.116505,1.9
20,Jaylen Nowell,SG,23,MIN,33,0,19.9,4.5,10.6,0.429,...,-3.42,-3.9,-2.4,1.37,-3.77,-16.0,1.0,0.352941,-0.160714,2.5
31,Bogdan Bogdanović,SG,30,ATL,10,2,29.8,6.3,14.8,0.426,...,-1.93,-1.97,-3.92,-1.87,-2.05,-7.0,-1.0,0.172185,-0.077922,1.3


# Features, model

In [7]:
features = ['PTS_high','FG','VORP','FGA','OWS','WS','MP','PER','TRB','AST','OBPM','USG%']
features = [f + '_diff' for f in features] + ['PER_rel','lifetime']
model = RandomForestRegressor(n_estimators = 100, max_features = 'auto', min_samples_leaf = 2)

# Minimum minutes per game
mp = 10
# Propotion of maximum games played
gp = 1/3

In [8]:
# Load training data
train_data = pd.read_csv(path + 'Algorithm/mip/mip_data.csv')

# Load weekly info
with open(path + f'Algorithm/weekly data/week_{week}.txt', 'r') as file:
            date1 = file.readline().splitlines()[0][7:]
            date2 = file.readline().splitlines()[0][7:]
            n = int(file.readline()[12:])
            
season_progress = n / 15

In [9]:
# Set training data
X_train = train_data[features]
y_train = train_data['Share']

# Set minimum requirements (games and minutes played)
data = data[data['MP'] >= mp] # Minimum of minutes per game
data = data[data['G'] >= season_progress * gp] # At least 1/3rd of max games played

pred = []
for step in range(0, 10) :
    print('Model ' + str(step+1) + '/10 ...')
    clear_output(wait = True)
    # Fit Machine Learning model
    model.fit(X_train, y_train)

    # Predict shares of test data
    pred.append(model.predict(data[features]))

Model 10/10 ...


In [10]:
# Assemble results DataFrame
res = data.assign(Share = [np.array(pred)[:,i].mean() for i in range(0, len(data))])
res = res.sort_values('Share', ascending = False) # Sort values by predictions
res['Rank'] = list(range(1, len(res)+1)) # Add rank
res['week'] = len(res) * [week] # Add week

res = res.reset_index(drop = True)[['Player', 'Tm'] + features + ['Share']]
res.head(5)

Unnamed: 0,Player,Tm,PTS_high_diff,FG_diff,VORP_diff,FGA_diff,OWS_diff,WS_diff,MP_diff,PER_diff,TRB_diff,AST_diff,OBPM_diff,USG%_diff,PER_rel,lifetime,Share
0,Tyrese Haliburton,IND,5.4,1.8,3.512903,3.6,4.322581,4.903226,-1.4,5.7,-0.1,2.3,4.3,5.2,0.313187,3,0.397911
1,Shai Gilgeous-Alexander,OKC,7.1,2.0,3.86,2.2,6.766667,8.246667,1.2,6.3,-0.2,-0.2,2.7,2.6,0.301435,5,0.210808
2,Bol Bol,ORL,6.4,3.9,1.688235,6.5,2.170588,4.964706,20.4,4.2,5.7,0.6,1.0,0.4,0.311111,4,0.141699
3,Nick Richards,CHO,6.0,2.2,0.59697,3.7,3.872727,4.715152,11.5,4.3,4.7,0.3,3.1,1.5,0.27044,3,0.09616
4,Santi Aldama,MEM,5.8,1.8,2.416129,3.3,3.803226,6.312903,12.6,5.3,2.7,0.7,4.6,-3.1,0.519608,2,0.081434


In [11]:
res.to_csv(path + f'Results/mip/results_week_{week}.csv', index = None)