In [1]:
# Import packages
import pandas as pd
import numpy as np
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')
from sklearn.ensemble import RandomForestRegressor
from IPython.display import clear_output

# Set working path
# The project root defaults to the original local folder and can be overridden on another machine
# through the NBA_AI_AWARDS_PATH environment variable.
import os
path = os.path.join(
    os.environ.get('NBA_AI_AWARDS_PATH', '/Users/martinbogaert/Desktop/NBA Data Analysis/nba_ai_awards/'),
    '' # guarantees a trailing separator: file names are appended to `path` with `+`
)

print('MAKE SURE YOU CHANGE THE WEEK VARIABLE')

MAKE SURE YOU CHANGE THE WEEK VARIABLE


In [2]:
week = 23 # CHANGE THIS
# Take a single timestamp so that `date` and `time` always describe the same instant
now = datetime.now()
date = now.strftime('%d %B %Y')
time = now.strftime('%H:%M:%S')
print(date + ' ; ' + time)

29 March 2023 ; 12:24:40


In [3]:
# Season year used as the cut-off below: `final_data` is restricted to years strictly before it,
# and only players whose previous season is `scrape_year - 1` are kept.
# NOTE(review): presumably the end year of the season being scored — confirm.
scrape_year = 2023

# Data preparation

In [4]:
# Weekly snapshot to be scored: read the file for the selected week and keep the roy == 0 group (non-rookies)
weekly_csv = path + f'Algorithm/weekly data/week_{week}.csv'
data = pd.read_csv(weekly_csv).groupby('roy').get_group(0)

# Historical seasons (used for careers and comparison), restricted to years before `scrape_year`
final_data = pd.read_csv(path + 'final_data.csv').loc[lambda seasons: seasons['Year'] < scrape_year]
data.head()

Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,...,MOV,ORtg,DRtg,NRtg,MOV/A,ORtg/A,DRtg/A,NRtg/A,roy,smoy
1,Bobby Portis,PF,27,MIL,64,20,26.0,5.6,11.3,0.496,...,4.36,115.92,111.72,4.2,4.14,115.78,111.79,4.0,0,1
2,Brook Lopez,C,34,MIL,73,73,30.5,6.0,11.5,0.524,...,4.36,115.92,111.72,4.2,4.14,115.78,111.79,4.0,0,0
3,Giannis Antetokounmpo,PF,28,MIL,59,59,32.2,11.2,20.3,0.55,...,4.36,115.92,111.72,4.2,4.14,115.78,111.79,4.0,0,0
4,Goran Dragić,PG,36,MIL,52,0,15.3,2.4,5.8,0.422,...,4.36,115.92,111.72,4.2,4.14,115.78,111.79,4.0,0,1
5,Grayson Allen,SG,27,MIL,69,67,27.6,3.5,7.9,0.442,...,4.36,115.92,111.72,4.2,4.14,115.78,111.79,4.0,0,0


In [5]:
# For each player, determine his career lifetime, his career high in points, assists, rebounds and minutes played
# as well as his previous playing year
def years_in_league(df, history = None, current_year = None):
    """Add career-context columns to the row(s) of a single player.

    Parameters
    ----------
    df : DataFrame
        Rows belonging to one player (one group of `groupby('Player')`); needs a 'Player' column.
    history : DataFrame, optional
        Past seasons with 'Player', 'Year', 'PTS', 'AST', 'TRB' and 'MP' columns.
        Defaults to the notebook-level `final_data`.
    current_year : int, optional
        Only seasons strictly before this year count towards the career highs.
        Defaults to the notebook-level `scrape_year`.

    Returns
    -------
    DataFrame
        A copy of `df` (same index) with these columns added:
        - `prev_year`: latest season found for the player in `history`, or 0 when there is none
          (such rows never match a real season and are easy to cut afterwards);
        - `lifetime`: number of seasons found in `history`, plus one;
        - `PTS_high`, `AST_high`, `TRB_high`, `MP_high`: maxima over the seasons before
          `current_year` (NaN when there is none).
    """
    if history is None:
        history = final_data
    if current_year is None:
        current_year = scrape_year

    player = df['Player'].iloc[0] # every row of the group shares the same player name
    career = history[history['Player'] == player].sort_values('Year')

    # Work on a copy so the group handed over by `groupby.apply` is never mutated;
    # scalar assignment broadcasts, so groups with several rows are handled too.
    out = df.copy()
    out['prev_year'] = career['Year'].iloc[-1] if len(career) else 0
    out['lifetime'] = len(career) + 1

    # Career highs over the seasons preceding `current_year`
    past = career[career['Year'] < current_year]
    for stat in ('PTS', 'AST', 'TRB', 'MP'):
        out[f'{stat}_high'] = past[stat].max()

    return out

# Add the career columns player by player.
# group_keys=False keeps the original row index: with the pandas >= 2.0 default (group_keys=True)
# 'Player' would also become an index level, which makes the later merge on 'Player' ambiguous.
data = data.groupby('Player', group_keys = False).apply(years_in_league)
data.head()

Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,...,DRtg/A,NRtg/A,roy,smoy,prev_year,lifetime,PTS_high,AST_high,TRB_high,MP_high
1,Bobby Portis,PF,27,MIL,64,20,26.0,5.6,11.3,0.496,...,111.79,4.0,0,1,2022,8,14.6,1.7,9.1,28.2
2,Brook Lopez,C,34,MIL,73,73,30.5,6.0,11.5,0.524,...,111.79,4.0,0,0,2022,15,20.7,2.3,8.6,36.9
3,Giannis Antetokounmpo,PF,28,MIL,59,59,32.2,11.2,20.3,0.55,...,111.79,4.0,0,0,2022,10,29.9,5.9,13.6,36.7
4,Goran Dragić,PG,36,MIL,52,0,15.3,2.4,5.8,0.422,...,111.79,4.0,0,1,2022,15,20.3,7.4,4.1,35.1
5,Grayson Allen,SG,27,MIL,69,67,27.6,3.5,7.9,0.442,...,111.79,4.0,0,0,2022,5,11.1,2.2,3.4,27.3


In [6]:
# Only keep players whose latest previous season is `scrape_year - 1` and merge with that season's data
# (its columns get the `_prev` suffix) : may be a case where MIP might be relative to a further year !!!
data = data[data['prev_year'] == scrape_year - 1]
data = data.merge(final_data.rename(columns = {'Year' : 'prev_year'}), on = ['Player', 'prev_year'], suffixes = ('', '_prev'), how = 'inner')

### FEATURES ####
# Calculate the difference in stats for all numerical features
features =  ['G','GS','MP','FG','FGA','FG%','3P','3PA','3P%','2P','2PA','2P%','eFG%','FT','FTA','FT%','ORB','DRB','TRB','AST',
             'STL','BLK','TOV','PF','PTS','PER','TS%','3PAr','FTr','ORB%','DRB%','TRB%','AST%','STL%','BLK%','TOV%','USG%',
             'OWS','DWS','WS','WS/48','OBPM','DBPM','BPM','VORP','W','L','W/L%','PS/G','PA/G','SRS','Seed','MOV','ORtg','DRtg',
             'NRtg','MOV/A','ORtg/A','DRtg/A','NRtg/A','ORtg/100','DRtg/100']
for f in features :
    data[f'{f}_diff'] = data[f] - data[f'{f}_prev']

# Additional features
# Relative changes fall back to 0 where the previous-season value cannot serve as a denominator
data['PTS_rel'] = np.where(data['PTS_prev'] > 0, (data['PTS'] - data['PTS_prev']) / data['PTS_prev'], 0.0) # Points relative to previous year
data['PER_rel'] = np.where(data['PER_prev'] != 0, (data['PER'] - data['PER_prev']) / data['PER_prev'], 0.0) # PER relative to previous year (a zero PER_prev would otherwise put inf/NaN in a model feature)
data['PTS_high_diff'] = data['PTS'] - data['PTS_high'] # Points difference with career high

### CUTS ###
# Previous seasons requirements (minutes and game)
data = data[data['MP_prev'] >= 5] # Minimum 5 minutes played in previous season
data = data[data['G_prev'] >= 10] # Minimum 10 games played in previous season
# Eligibility
data = data[data['PTS'] > data['PTS_high']] # Career high in points
data = data[(data['AST'] >= data['AST_high']) | (data['TRB'] >= data['TRB_high']) | (data['MP'] >= data['MP_high'])] # At least a career high in minutes, assists or rebounds

data.head()

Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,...,NRtg_diff,MOV/A_diff,ORtg/A_diff,DRtg/A_diff,NRtg/A_diff,ORtg/100_diff,DRtg/100_diff,PTS_rel,PER_rel,PTS_high_diff
6,Jevon Carter,PG,27,MIL,75,36,22.2,3.0,7.0,0.427,...,0.89,0.92,-0.08,-0.88,0.82,-29.0,-2.0,0.928571,0.145833,3.2
22,Josh Green,SG,22,DAL,56,21,25.5,3.4,6.4,0.535,...,-3.02,-2.77,3.77,6.81,-3.04,5.0,9.0,0.916667,-0.042017,4.4
26,Luka Dončić,PG,23,DAL,61,61,36.3,11.0,22.2,0.497,...,-3.02,-2.77,3.77,6.81,-3.04,10.0,6.0,0.158451,0.167331,4.1
37,De'Andre Hunter,SF,25,ATL,65,65,32.0,5.7,12.3,0.459,...,-1.38,-1.47,0.07,1.56,-1.49,6.0,2.0,0.149254,0.075472,0.4
40,Jalen Johnson,SF,21,ATL,64,6,14.5,2.0,4.3,0.48,...,-1.38,-1.47,0.07,1.56,-1.49,17.0,-1.0,1.166667,0.176991,2.8


# Features, model

In [7]:
# Model inputs: the `_diff` columns of the selected stats, plus relative PER change and career lifetime
features = ['PTS_high','FG','VORP','FGA','OWS','WS','MP','PER','TRB','AST','OBPM','USG%']
features = [f + '_diff' for f in features] + ['PER_rel','lifetime']
# max_features = 1.0 considers every feature at each split, which is what 'auto' meant for regressors;
# the 'auto' alias was deprecated in scikit-learn 1.1 and removed in 1.3.
model = RandomForestRegressor(n_estimators = 100, max_features = 1.0, min_samples_leaf = 2)

# Minimum minutes per game
mp = 10
# Proportion of maximum games played
gp = 1/3

In [8]:
# Training set
train_data = pd.read_csv(path + 'Algorithm/mip/mip_data.csv')

# Weekly info file: the first three lines are read; each value follows a fixed-width label
with open(path + f'Algorithm/weekly data/week_{week}.txt', 'r') as file:
    header = [file.readline() for _ in range(3)]

# Lines 1 and 2: drop the line break, then the first 7 characters
date1, date2 = (line.splitlines()[0][7:] for line in header[:2])
# Line 3: integer found after the first 12 characters
n = int(header[2][12:])

# NOTE(review): the meaning of `n` and of the divisor 15 is not visible here — confirm
season_progress = n / 15

In [9]:
# Set training data
X_train = train_data[features]
y_train = train_data['Share']

# Set minimum requirements (games and minutes played)
data = data[data['MP'] >= mp] # Minimum of minutes per game
data = data[data['G'] >= season_progress * gp] # At least 1/3rd of max games played

# Feature matrix of the players to score (identical for every fit, so built once)
X_test = data[features]

# Fit several forests and collect one prediction vector per fit (averaged afterwards).
# Each fit gets its own fixed seed: the forests still differ from one another,
# but re-running the notebook reproduces the same predictions.
n_models = 10
pred = []
for step in range(n_models) :
    print(f'Model {step + 1}/{n_models} ...')
    clear_output(wait = True)
    # Fit Machine Learning model
    model.set_params(random_state = step)
    model.fit(X_train, y_train)

    # Predict shares of test data
    pred.append(model.predict(X_test))

Model 10/10 ...


In [10]:
# Assemble results DataFrame
# Stack the per-model predictions once (one row per fitted model, one column per player)
# and average each player's column to get his final share.
stacked = np.array(pred)
res = data.assign(Share = [stacked[:, i].mean() for i in range(len(data))])
res = res.sort_values('Share', ascending = False) # Sort values by predictions

# Keep only the exported columns; after the index reset, the row position is the rank (0 = best).
# NOTE(review): rank and week are not part of the exported columns — add them to this list if the results file needs them.
res = res.reset_index(drop = True)[['Player', 'Tm'] + features + ['Share']]
res.head(5)

Unnamed: 0,Player,Tm,PTS_high_diff,FG_diff,VORP_diff,FGA_diff,OWS_diff,WS_diff,MP_diff,PER_diff,TRB_diff,AST_diff,OBPM_diff,USG%_diff,PER_rel,lifetime,Share
0,Tyrese Haliburton,IND,5.4,1.8,3.196429,3.2,4.171429,4.421429,-1.4,5.5,-0.3,2.2,4.2,4.9,0.302198,3,0.387322
1,Lauri Markkanen,UTA,7.0,3.6,3.063077,5.8,5.373846,5.344615,3.6,7.6,2.9,0.6,4.5,7.0,0.517007,6,0.293358
2,Shai Gilgeous-Alexander,OKC,6.8,1.9,4.090625,1.5,7.19375,8.98125,0.8,6.3,-0.2,-0.5,2.8,2.1,0.301435,5,0.221582
3,Jalen Brunson,NYK,7.5,2.1,2.263077,4.6,3.426154,3.096923,3.1,4.1,-0.3,1.4,3.4,5.1,0.239766,5,0.179819
4,Isaiah Joe,OKC,5.7,2.0,1.726471,3.6,3.255882,4.182353,7.5,7.6,1.4,0.6,5.1,1.9,0.987013,3,0.117014


In [11]:
# Write this week's ranking without the row index (`index` is documented as a bool)
res.to_csv(path + f'Results/mip/results_week_{week}.csv', index = False)