In [1]:
# Import packages
import os
import warnings
from datetime import datetime

warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
from IPython.display import clear_output
from sklearn.ensemble import RandomForestRegressor

# Set working path
# The project root defaults to the original local folder but can be overridden with
# the NBA_AWARDS_PATH environment variable, so the notebook can run on another
# machine without editing this cell.
# os.path.join(..., '') guarantees a trailing separator, because the other cells
# build file names by plain string concatenation (path + '...').
path = os.path.join(
    os.environ.get('NBA_AWARDS_PATH')
    or '/Users/martinbogaert/Desktop/NBA Data Analysis/2022-2023 Awards Project clean/',
    '',
)

print('MAKE SURE YOU CHANGE THE WEEK VARIABLE')

MAKE SURE YOU CHANGE THE WEEK VARIABLE


In [2]:
# Week index: selects the week_{week}.csv / week_{week}.txt inputs and names the results file
week = 2 # CHANGE THIS

# Take a single timestamp so that the date and time strings always describe the same
# instant (two separate now() calls could straddle midnight).
run_timestamp = datetime.now()
date = run_timestamp.strftime('%d %B %Y')
time = run_timestamp.strftime('%H:%M:%S')
print(date + ' ; ' + time)

31 October 2022 ; 11:09:49


In [3]:
# Year label of the season being processed: historical rows are restricted to
# Year < scrape_year, and only players whose previous season is scrape_year - 1 are kept.
# NOTE(review): presumably the ending year of the 2022-2023 season - confirm.
scrape_year = 2023

# Data preparation

In [4]:
# Current-week player table: read the weekly CSV and keep only the roy == 0 group
# (i.e. rookies are left out)
weekly_csv = path + f'Algorithm/weekly data/week_{week}.csv'
data = pd.read_csv(weekly_csv).groupby('roy').get_group(0)

# Historical table (used for career figures and the previous-season comparison),
# limited to the seasons before scrape_year
history_csv = path + 'final_data.csv'
final_data = pd.read_csv(history_csv).loc[lambda frame: frame['Year'] < scrape_year]

data.head()

Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,...,MOV,ORtg,DRtg,NRtg,MOV/A,ORtg/A,DRtg/A,NRtg/A,roy,smoy
0,Precious Achiuwa,C,23,TOR,6,0,23.2,3.0,8.5,0.353,...,-1.33,110.95,112.39,-1.44,0.2,111.25,110.99,0.26,0,1
1,OG Anunoby,SF,25,TOR,6,6,36.2,4.7,10.8,0.431,...,-1.33,110.95,112.39,-1.44,0.2,111.25,110.99,0.26,0,0
2,Dalano Banton,PG,23,TOR,6,0,8.2,1.0,2.5,0.4,...,-1.33,110.95,112.39,-1.44,0.2,111.25,110.99,0.26,0,1
3,Scottie Barnes,PF,21,TOR,5,5,28.6,5.8,11.0,0.527,...,-1.33,110.95,112.39,-1.44,0.2,111.25,110.99,0.26,0,0
4,Khem Birch,C,30,TOR,2,0,6.5,0.5,1.0,0.5,...,-1.33,110.95,112.39,-1.44,0.2,111.25,110.99,0.26,0,1


In [5]:
# For each player, determine his career lifetime, his career highs in points, assists,
# rebounds and minutes played, as well as his previous playing year
def years_in_league(df, history=None, current_year=None):
    """Attach career-history columns to one player's current-season rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows of a single player (one ``groupby('Player')`` group). The player name
        is read from the first row of the 'Player' column.
    history : pandas.DataFrame, optional
        Past seasons with at least the columns 'Player', 'Year', 'PTS', 'AST',
        'TRB' and 'MP'. Defaults to the notebook-level ``final_data``.
    current_year : int, optional
        Only seasons with ``Year < current_year`` are considered. Defaults to the
        notebook-level ``scrape_year``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` (the input is not modified) with these columns added:
        'prev_year' (latest past season), 'lifetime' (number of past rows + 1),
        and 'PTS_high', 'AST_high', 'TRB_high', 'MP_high' (maxima over past rows).

    Raises
    ------
    KeyError
        If the player has no row in ``history`` before ``current_year``.
    """
    if history is None:
        history = final_data
    if current_year is None:
        current_year = scrape_year

    player = df['Player'].iloc[0]
    is_past_row = (history['Player'] == player) & (history['Year'] < current_year)
    past = history[is_past_row].sort_values('Year')
    if past.empty:
        raise KeyError(player)

    # Scalars are broadcast to every row of the group, so a player appearing on
    # several rows no longer triggers a length-mismatch error.
    return df.assign(
        prev_year=past['Year'].iloc[-1],
        lifetime=len(past) + 1,
        PTS_high=past['PTS'].max(),
        AST_high=past['AST'].max(),
        TRB_high=past['TRB'].max(),
        MP_high=past['MP'].max(),
    )

# Add the career columns player by player.
# group_keys=False keeps the original flat row index: from pandas 2.0 on, the default
# prepends 'Player' as an index level, which then clashes with the 'Player' column
# when the frame is merged on 'Player' further down.
data = data.groupby('Player', group_keys=False).apply(years_in_league)
data.head()

Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,...,DRtg/A,NRtg/A,roy,smoy,prev_year,lifetime,PTS_high,AST_high,TRB_high,MP_high
0,Precious Achiuwa,C,23,TOR,6,0,23.2,3.0,8.5,0.353,...,110.99,0.26,0,1,2022,3,9.1,1.1,6.5,23.6
1,OG Anunoby,SF,25,TOR,6,6,36.2,4.7,10.8,0.431,...,110.99,0.26,0,0,2022,6,17.1,2.6,5.5,36.0
2,Dalano Banton,PG,23,TOR,6,0,8.2,1.0,2.5,0.4,...,110.99,0.26,0,1,2022,2,3.2,1.5,1.9,10.9
3,Scottie Barnes,PF,21,TOR,5,5,28.6,5.8,11.0,0.527,...,110.99,0.26,0,0,2022,2,15.3,3.5,7.5,35.4
4,Khem Birch,C,30,TOR,2,0,6.5,0.5,1.0,0.5,...,110.99,0.26,0,1,2022,6,7.2,1.3,5.8,22.8


In [6]:
# Keep only players whose latest previous season is scrape_year - 1, then attach that
# season's stats as '<column>_prev' columns.
# NOTE: the comparison is therefore always made against last season only; a MIP case
# that is relative to an earlier year is not handled.
data = data[data['prev_year'] == scrape_year - 1]
data = data.merge(final_data.rename(columns = {'Year' : 'prev_year'}), on = ['Player', 'prev_year'], suffixes = ['', '_prev'], how = 'inner')

### FEATURES ####
# Calculate the difference in stats for all numerical features
features =  ['G','GS','MP','FG','FGA','FG%','3P','3PA','3P%','2P','2PA','2P%','eFG%','FT','FTA','FT%','ORB','DRB','TRB','AST',
             'STL','BLK','TOV','PF','PTS','PER','TS%','3PAr','FTr','ORB%','DRB%','TRB%','AST%','STL%','BLK%','TOV%','USG%',
             'OWS','DWS','WS','WS/48','OBPM','DBPM','BPM','VORP','W','L','W/L%','PS/G','PA/G','SRS','Seed','MOV','ORtg','DRtg',
             'NRtg','MOV/A','ORtg/A','DRtg/A','NRtg/A','ORtg/100','DRtg/100']
for f in features :
    data[f'{f}_diff'] = data[f] - data[f'{f}_prev']

# Additional features
data['PTS_rel'] = [(pts-prev)/prev if prev > 0 else 0 for pts, prev in zip(data['PTS'], data['PTS_prev'])] # Points relative to previous year
# PER relative to previous year. A previous PER of exactly 0 would produce inf/NaN,
# which would break the model input later on, so those rows fall back to 0 like
# PTS_rel does. Non-zero (including negative) previous values are left as before.
per_prev = data['PER_prev']
data['PER_rel'] = np.where(per_prev != 0, (data['PER'] - per_prev) / per_prev, 0.0)
data['PTS_high_diff'] = data['PTS'] - data['PTS_high'] # Points difference with career high

### CUTS ###
# Previous seasons requirements (minutes and game)
data = data[data['MP_prev'] >= 5] # Minimum 5 minutes played in previous season
data = data[data['G_prev'] >= 10] # Minimum 10 games played in previous season
# Eligibility
data = data[data['PTS'] > data['PTS_high']] # Career high in points
data = data[(data['AST'] >= data['AST_high']) | (data['TRB'] >= data['TRB_high']) | (data['MP'] >= data['MP_high'])] # At least a career high in minutes, assists or rebounds

data.head()

Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,...,NRtg_diff,MOV/A_diff,ORtg/A_diff,DRtg/A_diff,NRtg/A_diff,ORtg/100_diff,DRtg/100_diff,PTS_rel,PER_rel,PTS_high_diff
9,Pascal Siakam,C,28,TOR,6,6,37.8,9.2,18.8,0.487,...,-3.82,-2.17,-2.61,-0.41,-2.2,-1.0,2.0,0.109649,0.103448,2.4
10,Gary Trent Jr.,SG,24,TOR,6,6,37.0,7.0,16.0,0.438,...,-3.82,-2.17,-2.61,-0.41,-2.2,-5.0,4.0,0.04918,-0.081633,0.9
14,Santi Aldama,PF,22,MEM,6,6,30.5,4.0,8.5,0.471,...,-6.7,-7.58,2.58,10.07,-7.49,19.0,8.0,1.634146,0.245098,6.7
15,Desmond Bane,SG,24,MEM,6,6,33.0,8.2,18.5,0.441,...,-6.7,-7.58,2.58,10.07,-7.49,3.0,11.0,0.32967,0.1875,6.0
18,Tyus Jones,PG,26,MEM,6,1,25.3,5.5,12.2,0.452,...,-6.7,-7.58,2.58,10.07,-7.49,-6.0,11.0,0.551724,0.018182,4.8


# Features, model

In [7]:
# Model inputs: the '<stat>_diff' columns of the selected stats, plus the relative
# PER change and the career lifetime
features = ['PTS_high','FG','VORP','FGA','OWS','WS','MP','PER','TRB','AST','OBPM','USG%']
features = [f + '_diff' for f in features] + ['PER_rel','lifetime']
# max_features = 1.0 (consider every feature at each split) is what 'auto' meant for a
# regressor; the 'auto' spelling was deprecated in scikit-learn 1.1 and removed in 1.3.
model = RandomForestRegressor(n_estimators = 100, max_features = 1.0, min_samples_leaf = 2)

# Minimum minutes per game
mp = 10
# Proportion of maximum games played
gp = 1/3

In [8]:
# Training set for the MIP model
train_data = pd.read_csv(path + 'Algorithm/mip/mip_data.csv')

# Weekly info file: read its first three lines in order
with open(path + f'Algorithm/weekly data/week_{week}.txt', 'r') as weekly_info:
    first_line = weekly_info.readline()
    second_line = weekly_info.readline()
    third_line = weekly_info.readline()

# Lines 1 and 2: the text after a 7-character prefix; line 3: the integer after a
# 12-character prefix
date1 = first_line.splitlines()[0][7:]
date2 = second_line.splitlines()[0][7:]
n = int(third_line[12:])

# NOTE(review): the divisor 15 is a magic number - presumably it converts a league-wide
# game count into games per team; confirm against the script that writes this file.
season_progress = n / 15

In [9]:
# Set training data
X_train = train_data[features]
y_train = train_data['Share']

# Set minimum requirements (games and minutes played):
# minimum of minutes per game, and at least 1/3rd of max games played
data = data[(data['MP'] >= mp) & (data['G'] >= season_progress * gp)]

# Fit the forest several times and keep every run's predictions (they are averaged
# when the results are assembled). Each run gets its own fixed seed, so the runs still
# differ from one another while the cell gives the same numbers on every execution.
n_models = 10
pred = []
for step in range(n_models) :
    print(f'Model {step + 1}/{n_models} ...')
    clear_output(wait = True)
    # Fit Machine Learning model
    model.set_params(random_state = step)
    model.fit(X_train, y_train)

    # Predict shares of test data
    pred.append(model.predict(data[features]))

Model 10/10 ...


In [10]:
# Assemble results DataFrame: each player's Share is the mean of his predictions over
# all fitted models (one vectorised mean instead of rebuilding the array for every row)
res = data.assign(Share = np.mean(pred, axis = 0))
res = res.sort_values('Share', ascending = False) # Sort values by predictions

# NOTE(review): this cell previously also built 'Rank' and 'week' columns, but the
# column selection below never included them, so they were discarded before display
# and export. The output columns are unchanged here; add 'Rank' / 'week' to the
# selection if the exported file is meant to carry them.
res = res.reset_index(drop = True)[['Player', 'Tm'] + features + ['Share']]
res.head(5)

Unnamed: 0,Player,Tm,PTS_high_diff,FG_diff,VORP_diff,FGA_diff,OWS_diff,WS_diff,MP_diff,PER_diff,TRB_diff,AST_diff,OBPM_diff,USG%_diff,PER_rel,lifetime,Share
0,Tyrese Haliburton,IND,8.1,2.3,2.757143,3.9,6.514286,5.885714,-1.6,7.7,0.1,1.8,5.2,6.4,0.423077,3,0.522461
1,Nick Richards,CHO,9.5,3.5,2.833333,5.4,8.966667,9.933333,13.9,10.1,5.8,0.5,7.4,2.4,0.63522,3,0.394506
2,Luka Dončić,DAL,7.9,2.9,5.033333,3.9,12.6,12.9,1.3,10.8,0.4,0.0,3.1,3.5,0.430279,5,0.29928
3,Trey Murphy III,NOP,9.6,3.5,2.533333,5.5,5.533333,7.666667,17.8,4.3,3.6,0.4,2.5,0.0,0.344,2,0.254805
4,Bol Bol,ORL,5.2,3.5,2.342857,5.1,3.514286,6.928571,14.5,10.1,5.3,-0.1,2.6,1.6,0.748148,4,0.216726


In [11]:
# Export this week's predictions; index = False (the documented boolean form) leaves
# the row index out of the file
res.to_csv(path + f'Results/mip/results_week_{week}.csv', index = False)