In [1]:
# Import packages
import pandas as pd
import numpy as np
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')
from sklearn.ensemble import RandomForestRegressor
from IPython.display import clear_output

# Set working path
# The project root can be overridden with the NBA_AI_AWARDS_PATH environment
# variable so the notebook is not tied to a single machine; the original
# location remains the default when the variable is unset or empty.
import os
path = os.path.join(
    os.environ.get('NBA_AI_AWARDS_PATH') or '/Users/martinbogaert/Desktop/NBA Data Analysis/nba_ai_awards/',
    ''  # joining with '' guarantees a trailing separator: later cells build file names with `path + ...`
)

print('MAKE SURE YOU CHANGE THE WEEK VARIABLE')

MAKE SURE YOU CHANGE THE WEEK VARIABLE


In [2]:
week = 6 # CHANGE THIS
# Take a single timestamp so the printed date and time always describe the
# same instant (two separate now() calls could fall on either side of midnight).
run_started = datetime.now()
date = run_started.strftime('%d %B %Y')
time = run_started.strftime('%H:%M:%S')
print(date + ' ; ' + time)

28 November 2022 ; 13:17:17


In [3]:
# Season being scored: history rows with Year >= scrape_year are dropped when
# final_data is loaded, and scrape_year - 1 is used as the "previous" season.
# NOTE(review): appears to be the calendar year in which the season ends — confirm.
scrape_year = 2023

# Data preparation

In [4]:
# Load to-be-predicted data (this week's up-to-date stats)
weekly_stats = pd.read_csv(path + f'Algorithm/weekly data/week_{week}.csv')
# Omit rookies: keep only the rows flagged roy == 0. A boolean mask selects them
# directly instead of building every 'roy' group just to pick one, and .copy()
# gives the following cells an independent frame to add columns to.
data = weekly_stats[weekly_stats['roy'] == 0].copy()
# Load big data file (to compute careers and comparison)
final_data = pd.read_csv(path + 'final_data.csv')
# Keep only the seasons that precede the one being predicted
final_data = final_data[final_data['Year'] < scrape_year]
data.head()

Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,...,MOV,ORtg,DRtg,NRtg,MOV/A,ORtg/A,DRtg/A,NRtg/A,roy,smoy
0,Precious Achiuwa,C,23,TOR,12,0,20.4,3.0,7.7,0.391,...,1.79,114.14,112.51,1.63,1.89,114.64,112.87,1.77,0,1
1,OG Anunoby,SF,25,TOR,19,19,36.6,7.1,15.3,0.466,...,1.79,114.14,112.51,1.63,1.89,114.64,112.87,1.77,0,0
2,Dalano Banton,PG,23,TOR,15,2,11.2,2.3,5.5,0.427,...,1.79,114.14,112.51,1.63,1.89,114.64,112.87,1.77,0,1
3,Scottie Barnes,PF,21,TOR,16,16,33.5,6.0,13.7,0.438,...,1.79,114.14,112.51,1.63,1.89,114.64,112.87,1.77,0,0
4,Khem Birch,C,30,TOR,9,0,8.0,1.0,1.6,0.643,...,1.79,114.14,112.51,1.63,1.89,114.64,112.87,1.77,0,1


In [5]:
# For each player, determine his career lifetime, his career high in points, assists, rebounds and minutes played
# as well as his previous playing year
def years_in_league(df, history=None, season=None):
    """Add career-history columns to one player's current-season rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Current-season rows of a single player (one group of
        ``groupby('Player')``). The player name is read from the first row.
        The frame may hold any number of rows; it is not modified.
    history : pandas.DataFrame, optional
        Past seasons, with at least the columns ``Player``, ``Year``, ``PTS``,
        ``AST``, ``TRB`` and ``MP``. Defaults to the notebook-level ``final_data``.
    season : int, optional
        Season being predicted; only history rows with ``Year < season`` are
        used. Defaults to the notebook-level ``scrape_year``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with these columns added (same value on every row):
        ``prev_year`` (latest past season played, or the dummy value 0 when the
        player has no past season), ``lifetime`` (number of past seasons + 1)
        and ``PTS_high``, ``AST_high``, ``TRB_high``, ``MP_high`` (maxima over
        the past seasons, NaN when there is none).
    """
    if history is None:
        history = final_data
    if season is None:
        season = scrape_year

    player = df['Player'].iloc[0] # get player name

    # Boolean selection instead of groupby().get_group(): the whole history is not
    # regrouped on every call and an unknown player yields an empty frame rather
    # than a KeyError. The season filter is applied once so every derived value
    # is based on the same set of past seasons.
    is_past_row = (history['Player'] == player) & (history['Year'] < season)
    past = history[is_past_row].sort_values('Year') # player data sorted by year

    # Dummy previous year 0 (easy to cut afterwards) when there is no past season
    prev_year = 0 if past.empty else past['Year'].iloc[-1]

    # Scalars are broadcast to every row, so players with several rows are handled;
    # assign() returns a new frame, leaving the group passed in by apply() untouched.
    return df.assign(
        prev_year = prev_year,
        lifetime = len(past) + 1, # compute career lifetime
        # Compute career high
        PTS_high = past['PTS'].max(),
        AST_high = past['AST'].max(),
        TRB_high = past['TRB'].max(),
        MP_high = past['MP'].max(),
    )

# Add the career columns player by player.
# group_keys = False keeps the original row index: since pandas 2.0 the default
# would prepend 'Player' as an index level, making 'Player' both an index level
# and a column, which is ambiguous for the later merge on 'Player'.
data = data.groupby('Player', group_keys = False).apply(years_in_league)
data.head()

Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,...,DRtg/A,NRtg/A,roy,smoy,prev_year,lifetime,PTS_high,AST_high,TRB_high,MP_high
0,Precious Achiuwa,C,23,TOR,12,0,20.4,3.0,7.7,0.391,...,112.87,1.77,0,1,2022,3,9.1,1.1,6.5,23.6
1,OG Anunoby,SF,25,TOR,19,19,36.6,7.1,15.3,0.466,...,112.87,1.77,0,0,2022,6,17.1,2.6,5.5,36.0
2,Dalano Banton,PG,23,TOR,15,2,11.2,2.3,5.5,0.427,...,112.87,1.77,0,1,2022,2,3.2,1.5,1.9,10.9
3,Scottie Barnes,PF,21,TOR,16,16,33.5,6.0,13.7,0.438,...,112.87,1.77,0,0,2022,2,15.3,3.5,7.5,35.4
4,Khem Birch,C,30,TOR,9,0,8.0,1.0,1.6,0.643,...,112.87,1.77,0,1,2022,6,7.2,1.3,5.8,22.8


In [6]:
# Only keep players having played in the previous season (scrape_year - 1) and merge with that season's data :
# may be a case where MIP might be relative to a further year !!!
# A boolean mask is used so an empty selection gives an empty frame instead of a KeyError.
data = data[data['prev_year'] == scrape_year - 1]
data = data.merge(final_data.rename(columns = {'Year' : 'prev_year'}), on = ['Player', 'prev_year'], suffixes = ['', '_prev'], how = 'inner')

### FEATURES ####
# Calculate the difference in stats for all numerical features
features =  ['G','GS','MP','FG','FGA','FG%','3P','3PA','3P%','2P','2PA','2P%','eFG%','FT','FTA','FT%','ORB','DRB','TRB','AST',
             'STL','BLK','TOV','PF','PTS','PER','TS%','3PAr','FTr','ORB%','DRB%','TRB%','AST%','STL%','BLK%','TOV%','USG%',
             'OWS','DWS','WS','WS/48','OBPM','DBPM','BPM','VORP','W','L','W/L%','PS/G','PA/G','SRS','Seed','MOV','ORtg','DRtg',
             'NRtg','MOV/A','ORtg/A','DRtg/A','NRtg/A','ORtg/100','DRtg/100']
for f in features :
    data[f'{f}_diff'] = data[f] - data[f'{f}_prev'] # current value minus previous-season value

# Additional features
# Points relative to previous year; 0 when the previous value is not strictly positive
pts_prev = data['PTS_prev']
data['PTS_rel'] = ((data['PTS'] - pts_prev) / pts_prev).where(pts_prev > 0, 0)
# PER relative to previous year; a zero denominator would give inf/NaN, which the
# model cannot score, so it falls back to 0 like PTS_rel does
per_prev = data['PER_prev']
data['PER_rel'] = ((data['PER'] - per_prev) / per_prev).where(per_prev != 0, 0)
data['PTS_high_diff'] = data['PTS'] - data['PTS_high'] # Points difference with career high

### CUTS ###
# Previous seasons requirements (minutes and game):
# minimum 5 minutes played and minimum 10 games played in previous season
played_enough_prev = (data['MP_prev'] >= 5) & (data['G_prev'] >= 10)
# Eligibility: career high in points ...
points_career_high = data['PTS'] > data['PTS_high']
# ... and at least a career high in minutes, assists or rebounds
other_career_high = (data['AST'] >= data['AST_high']) | (data['TRB'] >= data['TRB_high']) | (data['MP'] >= data['MP_high'])
data = data[played_enough_prev & points_career_high & other_career_high]

data.head()

Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,...,NRtg_diff,MOV/A_diff,ORtg/A_diff,DRtg/A_diff,NRtg/A_diff,ORtg/100_diff,DRtg/100_diff,PTS_rel,PER_rel,PTS_high_diff
1,OG Anunoby,SF,25,TOR,19,19,36.6,7.1,15.3,0.466,...,-0.75,-0.48,0.78,1.47,-0.69,-5.0,-2.0,0.105263,0.087838,1.8
2,Dalano Banton,PG,23,TOR,15,2,11.2,2.3,5.5,0.427,...,-0.75,-0.48,0.78,1.47,-0.69,14.0,-1.0,1.0,0.82,3.2
11,Pascal Siakam,C,28,TOR,9,9,35.7,8.9,18.6,0.479,...,-0.75,-0.48,0.78,1.47,-0.69,4.0,2.0,0.087719,0.192118,1.9
16,Santi Aldama,PF,22,MEM,19,15,25.8,3.6,8.0,0.447,...,-4.11,-4.03,-1.03,2.82,-3.85,18.0,0.0,1.341463,0.352941,5.5
17,Desmond Bane,SG,24,MEM,12,12,33.6,8.3,17.8,0.465,...,-4.11,-4.03,-1.03,2.82,-3.85,6.0,6.0,0.357143,0.215909,6.5


# Features, model

In [7]:
# Model inputs: year-over-year differences of the stats below, plus relative PER and career length
features = ['PTS_high','FG','VORP','FGA','OWS','WS','MP','PER','TRB','AST','OBPM','USG%']
features = [f + '_diff' for f in features] + ['PER_rel','lifetime']
# max_features = 1.0 makes every split consider all features, which is what the
# 'auto' alias meant for a regressor; 'auto' itself was deprecated in
# scikit-learn 1.1 and is rejected by later releases.
model = RandomForestRegressor(n_estimators = 100, max_features = 1.0, min_samples_leaf = 2)

# Minimum minutes per game
mp = 10
# Proportion of maximum games played
gp = 1/3

In [8]:
# Load training data
train_data = pd.read_csv(path + 'Algorithm/mip/mip_data.csv')

# Load weekly info: only the first three lines of this week's text file are used
weekly_info_path = path + f'Algorithm/weekly data/week_{week}.txt'
with open(weekly_info_path, 'r') as file:
    header_lines = [file.readline() for _ in range(3)]

# Lines 1 and 2: drop the line break, then skip a 7-character prefix
date1 = header_lines[0].splitlines()[0][7:]
date2 = header_lines[1].splitlines()[0][7:]
# Line 3: skip a 12-character prefix and parse the remainder as an integer
n = int(header_lines[2][12:])

# NOTE(review): 15 presumably turns a league-wide game count into games per team — confirm
season_progress = n / 15

In [9]:
# Set training data
X_train = train_data[features]
y_train = train_data['Share']

# Set minimum requirements (games and minutes played)
data = data[data['MP'] >= mp] # Minimum of minutes per game
data = data[data['G'] >= season_progress * gp] # At least 1/3rd of max games played

# The rows to score do not change between fits, so select them once
X_pred = data[features]

pred = []
for step in range(0, 10) :
    print('Model ' + str(step+1) + '/10 ...')
    clear_output(wait = True)
    # Give each forest its own fixed seed: the ten fits still differ from one
    # another (so averaging them stays meaningful) but a re-run of the notebook
    # reproduces the same predictions.
    model.set_params(random_state = step)
    # Fit Machine Learning model
    model.fit(X_train, y_train)

    # Predict shares of test data
    pred.append(model.predict(X_pred))

Model 10/10 ...


In [10]:
# Assemble results DataFrame
# Share = mean prediction of all fitted models for each player. Averaging along
# axis 0 does this in one pass instead of rebuilding the prediction array for every player.
res = data.assign(Share = np.mean(pred, axis = 0))
res = res.sort_values('Share', ascending = False) # Sort values by predictions

# Only Player, Tm, the model features and Share are kept; the rank is implied
# by the row order (index 0 is the top candidate).
res = res.reset_index(drop = True)[['Player', 'Tm'] + features + ['Share']]
res.head(5)

Unnamed: 0,Player,Tm,PTS_high_diff,FG_diff,VORP_diff,FGA_diff,OWS_diff,WS_diff,MP_diff,PER_diff,TRB_diff,AST_diff,OBPM_diff,USG%_diff,PER_rel,lifetime,Share
0,Tyrese Haliburton,IND,4.3,1.6,3.373684,3.2,3.863158,5.084211,-1.3,5.3,0.5,2.9,3.6,4.4,0.291209,3,0.33576
1,Jalen Brunson,NYK,5.5,1.4,2.2,3.1,4.12,2.75,1.3,4.8,-0.3,1.9,3.1,3.2,0.280702,5,0.233431
2,Shai Gilgeous-Alexander,OKC,6.6,2.4,4.205263,2.5,7.557895,8.778947,1.1,7.5,-0.2,0.3,3.5,1.9,0.358852,5,0.216727
3,Desmond Bane,MEM,6.5,1.6,2.083333,3.3,4.383333,3.05,3.8,3.8,0.5,2.1,2.9,3.9,0.215909,3,0.170869
4,Bol Bol,ORL,7.3,4.2,1.64,7.0,2.46,5.23,20.7,5.4,6.6,0.5,1.3,1.4,0.4,4,0.13361


In [11]:
# Save this week's ranking. index = False (the documented boolean, rather than None)
# leaves out the row index, which only repeats the row order.
res.to_csv(path + f'Results/mip/results_week_{week}.csv', index = False)