In [1]:
# Import packages
import pandas as pd
import numpy as np
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')
from sklearn.ensemble import RandomForestRegressor
from IPython.display import clear_output

# Set working path.
# The project root can be overridden through the NBA_AI_AWARDS_PATH environment variable so the
# notebook can run on another machine; when it is unset (or empty) the original location is used.
import os

path = os.environ.get('NBA_AI_AWARDS_PATH') or '/Users/martinbogaert/Desktop/NBA Data Analysis/nba_ai_awards/'
if not path.endswith('/'):
    path += '/' # later cells build file names by plain string concatenation (path + '...')

print('MAKE SURE YOU CHANGE THE WEEK VARIABLE')

MAKE SURE YOU CHANGE THE WEEK VARIABLE


In [2]:
week = 9 # CHANGE THIS
# Take one timestamp and format it twice, so the printed date and time always describe the
# same instant (two separate datetime.now() calls could fall on either side of midnight).
now = datetime.now()
date = now.strftime('%d %B %Y')
time = now.strftime('%H:%M:%S')
print(date + ' ; ' + time)

19 December 2022 ; 16:29:04


In [3]:
# Cut-off year: rows of the historical file with Year >= scrape_year are discarded, and
# scrape_year - 1 is the "previous year" a player must have played in to be kept.
scrape_year = 2023

# Data preparation

In [4]:
# Load to-be-predicted data
# Load up-to-date data and omit rookies (keep rows whose 'roy' flag is 0). A boolean mask is used
# instead of groupby('roy').get_group(0): it avoids building a groupby just to select one value
# and yields an empty frame (rather than a KeyError) if no row has roy == 0.
data = pd.read_csv(path + f'Algorithm/weekly data/week_{week}.csv').loc[lambda df: df['roy'] == 0]
# Load big data file (to compute careers and comparison)
final_data = pd.read_csv(path + 'final_data.csv')
final_data = final_data[final_data['Year'] < scrape_year] # keep only rows strictly before scrape_year
data.head()

Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,...,MOV,ORtg,DRtg,NRtg,MOV/A,ORtg/A,DRtg/A,NRtg/A,roy,smoy
1,Bobby Portis,PF,27,MIL,29,8,26.5,6.1,12.1,0.504,...,4.17,113.54,109.23,4.31,3.78,113.43,109.5,3.93,0,1
2,Brook Lopez,C,34,MIL,28,28,30.5,5.1,10.7,0.48,...,4.17,113.54,109.23,4.31,3.78,113.43,109.5,3.93,0,0
3,George Hill,PG,36,MIL,29,0,19.8,1.6,3.8,0.427,...,4.17,113.54,109.23,4.31,3.78,113.43,109.5,3.93,0,1
4,Giannis Antetokounmpo,PF,28,MIL,24,24,32.8,11.1,21.0,0.527,...,4.17,113.54,109.23,4.31,3.78,113.43,109.5,3.93,0,0
5,Grayson Allen,SG,27,MIL,26,24,27.2,3.5,7.5,0.464,...,4.17,113.54,109.23,4.31,3.78,113.43,109.5,3.93,0,0


In [5]:
def years_in_league(df, history = None, current_year = None):
    """Add career information to the rows of a single player.

    For the player named in ``df['Player']`` the function looks up his rows in the
    historical data and returns a copy of ``df`` with these extra columns:

    - ``prev_year`` : most recent year found in the historical data
    - ``lifetime``  : number of historical rows + 1 (the current season)
    - ``PTS_high``, ``AST_high``, ``TRB_high``, ``MP_high`` : maxima of the
      corresponding historical columns

    Parameters
    ----------
    df : pandas.DataFrame
        Current-season row(s) of one player; must contain a ``Player`` column.
        Designed to be used with ``data.groupby('Player').apply(years_in_league)``.
    history : pandas.DataFrame, optional
        Historical data with ``Player``, ``Year``, ``PTS``, ``AST``, ``TRB`` and ``MP``
        columns. Defaults to the notebook-level ``final_data``.
    current_year : int, optional
        Only historical rows with ``Year < current_year`` are used.
        Defaults to the notebook-level ``scrape_year``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` (same index) with the six columns above. The input is not modified.
        If the player has no historical row, ``prev_year`` is 0, ``lifetime`` is 1 and the
        career highs are NaN, so such rows are easy to cut afterwards.
    """
    if history is None:
        history = final_data
    if current_year is None:
        current_year = scrape_year

    out = df.copy() # never mutate the group handed over by groupby.apply
    player = out['Player'].iloc[0] # get player name

    # Previous seasons of this player, sorted by year (a direct mask avoids re-grouping the
    # whole history on every call and does not raise when the player is unknown)
    mask = (history['Player'] == player) & (history['Year'] < current_year)
    past = history[mask].sort_values('Year')

    stats = ('PTS', 'AST', 'TRB', 'MP')

    if past.empty:
        # No previous season on record: dummy previous year, first year in the league
        out['prev_year'] = 0
        out['lifetime'] = 1
        for stat in stats:
            out[f'{stat}_high'] = np.nan
        return out

    # Scalars are broadcast to every row, so groups with more than one row are handled too
    out['prev_year'] = past['Year'].iloc[-1] # previous playing year
    out['lifetime'] = len(past) + 1 # career lifetime, current season included
    for stat in stats:
        out[f'{stat}_high'] = past[stat].max() # career high

    return out

# Add the career columns player by player.
# group_keys = False keeps the original flat row index: since pandas 2.0 the group key ('Player')
# is otherwise prepended to the index even for like-indexed results, which makes 'Player' both an
# index level and a column and breaks a later merge on 'Player'.
data = data.groupby('Player', group_keys = False).apply(years_in_league)
data.head()

Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,...,DRtg/A,NRtg/A,roy,smoy,prev_year,lifetime,PTS_high,AST_high,TRB_high,MP_high
1,Bobby Portis,PF,27,MIL,29,8,26.5,6.1,12.1,0.504,...,109.5,3.93,0,1,2022,8,14.6,1.7,9.1,28.2
2,Brook Lopez,C,34,MIL,28,28,30.5,5.1,10.7,0.48,...,109.5,3.93,0,0,2022,15,20.7,2.3,8.6,36.9
3,George Hill,PG,36,MIL,29,0,19.8,1.6,3.8,0.427,...,109.5,3.93,0,1,2022,15,16.9,5.1,4.2,34.5
4,Giannis Antetokounmpo,PF,28,MIL,24,24,32.8,11.1,21.0,0.527,...,109.5,3.93,0,0,2022,10,29.9,5.9,13.6,36.7
5,Grayson Allen,SG,27,MIL,26,24,27.2,3.5,7.5,0.464,...,109.5,3.93,0,0,2022,5,11.1,2.2,3.4,27.3


In [6]:
# Only keep players having played in the previous season (scrape_year - 1) and merge with that season's data.
# NOTE: there may be cases where MIP should be judged relative to an earlier year !!!
# A boolean mask replaces groupby('prev_year').get_group(...), which raises KeyError when no player matches.
data = data[data['prev_year'] == scrape_year - 1]
data = data.merge(final_data.rename(columns = {'Year' : 'prev_year'}), on = ['Player', 'prev_year'], suffixes = ['', '_prev'], how = 'inner')

### FEATURES ####
# Calculate the difference in stats for all numerical features
features =  ['G','GS','MP','FG','FGA','FG%','3P','3PA','3P%','2P','2PA','2P%','eFG%','FT','FTA','FT%','ORB','DRB','TRB','AST',
             'STL','BLK','TOV','PF','PTS','PER','TS%','3PAr','FTr','ORB%','DRB%','TRB%','AST%','STL%','BLK%','TOV%','USG%',
             'OWS','DWS','WS','WS/48','OBPM','DBPM','BPM','VORP','W','L','W/L%','PS/G','PA/G','SRS','Seed','MOV','ORtg','DRtg',
             'NRtg','MOV/A','ORtg/A','DRtg/A','NRtg/A','ORtg/100','DRtg/100']
for f in features :
    data[f'{f}_diff'] = data[f] - data[f'{f}_prev'] # current value minus previous-season value

# Additional features
data['PTS_rel'] = [(pts-prev)/prev if prev > 0 else 0 for pts, prev in zip(data['PTS'], data['PTS_prev'])] # Points relative to previous year
# PER relative to previous year. A previous PER of exactly 0 gives 0 instead of +/-inf or NaN
# (same convention as PTS_rel), because non-finite values make the model's predict() fail.
data['PER_rel'] = np.where(data['PER_prev'] != 0, (data['PER'] - data['PER_prev']) / data['PER_prev'], 0)
data['PTS_high_diff'] = data['PTS'] - data['PTS_high'] # Points difference with career high

### CUTS ###
# Previous seasons requirements (minutes and game)
data = data[data['MP_prev'] >= 5] # Minimum 5 minutes played in previous season
data = data[data['G_prev'] >= 10] # Minimum 10 games played in previous season
# Eligibility
data = data[data['PTS'] > data['PTS_high']] # Career high in points
data = data[(data['AST'] >= data['AST_high']) | (data['TRB'] >= data['TRB_high']) | (data['MP'] >= data['MP_high'])] # At least a career high in minutes, assists or rebounds

data.head()

Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,...,NRtg_diff,MOV/A_diff,ORtg/A_diff,DRtg/A_diff,NRtg/A_diff,ORtg/100_diff,DRtg/100_diff,PTS_rel,PER_rel,PTS_high_diff
5,Jevon Carter,PG,27,MIL,29,24,25.8,2.9,6.8,0.429,...,1.0,0.56,-2.43,-3.17,0.75,-29.0,-6.0,0.857143,0.197917,2.9
14,Anthony Edwards,SG,21,MIN,30,30,36.4,8.2,18.0,0.456,...,-3.27,-4.08,-2.38,1.58,-3.96,-2.0,1.0,0.075117,0.0,1.6
18,Jaden McDaniels,PF,22,MIN,27,27,30.1,4.3,8.1,0.527,...,-3.27,-4.08,-2.38,1.58,-3.96,4.0,1.0,0.184783,0.116505,1.7
19,Jaylen Nowell,SG,23,MIN,30,0,19.5,4.5,10.3,0.439,...,-3.27,-4.08,-2.38,1.58,-3.96,-14.0,1.0,0.352941,-0.095238,2.5
30,Bogdan Bogdanović,SG,30,ATL,7,1,28.4,6.9,15.1,0.453,...,-2.74,-2.58,-4.33,-1.69,-2.64,-6.0,-1.0,0.278146,0.058442,2.9


# Features, model

In [7]:
# Model inputs: year-over-year differences of the stats below, plus relative PER change and career length
features = ['PTS_high','FG','VORP','FGA','OWS','WS','MP','PER','TRB','AST','OBPM','USG%']
features = [f + '_diff' for f in features] + ['PER_rel','lifetime']
# max_features = 1.0 (consider every feature at each split) is what 'auto' meant for a forest
# regressor; the 'auto' spelling was deprecated in scikit-learn 1.1 and removed in 1.3.
# A RandomState instance (rather than an int) makes a fresh Restart & Run All reproducible while
# still letting successive fit() calls on this model grow different forests.
model = RandomForestRegressor(n_estimators = 100, max_features = 1.0, min_samples_leaf = 2,
                              random_state = np.random.RandomState(0))

# Minimum minutes per game
mp = 10
# Proportion of maximum games played
gp = 1/3

In [8]:
# Training set used to fit the model
train_data = pd.read_csv(path + 'Algorithm/mip/mip_data.csv')

# Weekly info file: read its first three lines, then drop the leading characters of each
# (7 for the first two lines, 12 for the third -- presumably fixed labels; verify against the file)
week_info_path = path + f'Algorithm/weekly data/week_{week}.txt'
with open(week_info_path, 'r') as info:
    header = [info.readline() for _ in range(3)]

date1 = header[0].splitlines()[0][7:]
date2 = header[1].splitlines()[0][7:]
n = int(header[2][12:])

# Scale the count read from the info file; used below for the games-played threshold
season_progress = n / 15

In [9]:
# Training target and feature matrix
y_train = train_data['Share']
X_train = train_data[features]

# Minimum requirements, applied together: minutes per game and games played
# (at least the fraction gp of season_progress)
meets_minimum = (data['MP'] >= mp) & (data['G'] >= season_progress * gp)
data = data[meets_minimum]

# Fit the model ten times and store each run's predicted shares (averaged in a later cell)
pred = []
for step in range(10):
    print(f'Model {step + 1}/10 ...')
    clear_output(wait = True) # the next print replaces this progress line
    model.fit(X_train, y_train)
    pred.append(model.predict(data[features]))

Model 10/10 ...


In [10]:
# Assemble results DataFrame
# `pred` holds one array of predictions per model run; average over the runs (axis 0) in a single
# vectorised call instead of rebuilding np.array(pred) once per player.
res = data.assign(Share = np.mean(pred, axis = 0))
res = res.sort_values('Share', ascending = False) # Sort values by predictions

# NOTE: 'Rank' and 'week' columns used to be added here but were dropped again by the column
# selection below, so they never reached the output; the row order (best share first) is the rank.
res = res.reset_index(drop = True)[['Player', 'Tm'] + features + ['Share']]
res.head(5)

Unnamed: 0,Player,Tm,PTS_high_diff,FG_diff,VORP_diff,FGA_diff,OWS_diff,WS_diff,MP_diff,PER_diff,TRB_diff,AST_diff,OBPM_diff,USG%_diff,PER_rel,lifetime,Share
0,Shai Gilgeous-Alexander,OKC,6.7,1.9,3.677778,1.8,6.918519,8.155556,1.0,6.3,-0.3,-0.1,2.7,2.4,0.301435,5,0.19526
1,Tyrese Haliburton,IND,4.2,1.4,2.837931,3.2,3.282759,4.027586,-1.6,4.7,0.0,2.5,3.5,4.7,0.258242,3,0.193914
2,Desmond Bane,MEM,6.5,1.6,2.083333,3.3,3.7,3.733333,3.8,3.4,0.5,2.1,2.9,3.9,0.193182,3,0.160534
3,Bol Bol,ORL,6.7,4.0,1.851613,6.6,2.380645,5.190323,20.6,4.6,6.0,0.7,1.3,0.8,0.340741,4,0.147492
4,Devin Vassell,SAS,7.5,2.6,1.43913,5.2,1.908696,0.978261,4.4,3.7,-0.3,1.5,2.9,5.9,0.278195,3,0.137528


In [11]:
# Save this week's ranking without the row index (index = False is the documented way to omit it)
res.to_csv(path + f'Results/mip/results_week_{week}.csv', index = False)