In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import requests as rqs
from bs4 import BeautifulSoup
from IPython.display import clear_output
from time import sleep
from datetime import datetime

In [10]:
def single_team(df) :
    if len(df) == 1: # If player played for a single team, do nothing
        return df
    else: # If player has moved, return total stats with last team
        row = df[df['Tm'] == 'TOT'].copy()
        row['Tm'] = [str(df['Tm'].iloc[-1])]
        return row

def SoRareScore(df):

    df = df.apply(pd.to_numeric, errors = 'ignore')
    scores = []
    for pts, reb, ast, blk, stl, tov, fg_3 in zip(df['PTS'], df['TRB'], df['AST'], df['BLK'], df['STL'], df['TOV'], df['3P']) :
        
        score = 0
        if pts not in ['Inactive', 'Did Not Play', 'Did Not Dress', 'Not With Team', 'Player Suspended']:
            
            pts, reb, ast, blk, stl, tov, fg_3 = int(pts), int(reb), int(ast), int(blk), int(stl), int(tov), int(fg_3)

            score = 1 * pts + 1.2 * reb + 1.5 * ast + 3 * blk + 3 * stl + 1 * fg_3 - 2 * tov

            doubles = 0 # Count double-digits stats
            if pts >= 10 :
                doubles += 1
            if reb >= 10 :
                doubles += 1
            if ast >= 10 :
                doubles += 1
            if blk >= 10 :
                doubles += 1
            if stl >= 10 :
                doubles += 1

            if doubles == 2 : # Double-double
                score += 1

            if doubles >= 3 : # Tripe-double
                score += 2
        
        else :
            score = 'dnp'

        scores.append(score)
    return scores

def mins_played(str) :
    min, sec = str.split(':')
    return round(float(min) + float(sec) / 60, 2)

In [26]:
# Engineer the training set
def game_logs(year) :

    # Get player data for looped year
    url = f'https://www.basketball-reference.com/leagues/NBA_{year}_per_game.html'
    page = rqs.get(url)
    soup = BeautifulSoup(page.content, 'html')
    while soup.find('tr', class_ = 'thead') is not None :
        soup.find('tr', class_ = 'thead').decompose()
    table = soup.find('table')
    data = pd.read_html(str(table))[0][['Player', 'Tm', 'MP', 'G']]
    # Get the list of hrefs of players of looped year
    hrefs = []
    for x in table.find_all('a', href = True):
        if x['href'].split('/')[1] == 'players' :
            hrefs.append(x['href'][:-5])
    data['href'] = hrefs
    data = data.groupby('Player').apply(single_team).reset_index(drop = True)
    data = data[(data['MP'] >= 10) & (data['G'] >= 25)]

    player_df = []
    count = 1
    for player, href in zip(data['Player'], data['href']) :
        sleep(2)
        print(f'{year}: {count}/{len(data)} ... {player}')
        # Get looped player game log
        url = f'https://www.basketball-reference.com{href}/gamelog/{year}'
        page = rqs.get(url)
        soup = BeautifulSoup(page.content, 'html')
        while soup.find('tr', class_ = 'thead') is not None :
            soup.find('tr', class_ = 'thead').decompose()
        table = soup.find('table', id = 'pgl_basic')
        df = pd.read_html(str(table))[0][['Date', 'Tm', 'G', 'MP', 'PTS', 'TRB', 'AST', 'BLK', 'STL', 'TOV', '3P']]

        df.insert(0, 'Player', [player] * len(df)) # Insert player name
        df = df.apply(pd.to_numeric, errors = 'ignore')
        df['SORARE'] = SoRareScore(df) # Calculate SoRare score
        # Append player game logs to list
        player_df.append(df)
        count += 1
        clear_output(wait = True)

    data_return = pd.concat(player_df)
    data_return['Year'] = len(data_return) * [year]
   # data_return['MP'] = data_return['MP'].apply(mins_played)

    return data_return

In [27]:
data_18 = game_logs(2018)

2018: 366/366 ... Álex Abrines


In [28]:
data_18.to_csv('game_logs_2018.csv', index = None)

In [29]:
data_train = []
for year in [2018, 2019, 2020, 2021, 2022]:
    data_train.append(pd.read_csv(f'game_logs_{year}.csv'))
data_train = pd.concat(data_train)
data_train['Date'] = data_train['Date'].apply(lambda x: datetime.strptime(x, '%Y-%m-%d'))
data_train.head()

Unnamed: 0,Player,Date,Tm,G,MP,PTS,TRB,AST,BLK,STL,TOV,3P,SORARE,Year
0,Aaron Gordon,2017-10-18,ORL,1.0,34:13,14,9,3,2,0,1,1,34.3,2018
1,Aaron Gordon,2017-10-20,ORL,,Inactive,Inactive,Inactive,Inactive,Inactive,Inactive,Inactive,Inactive,dnp,2018
2,Aaron Gordon,2017-10-21,ORL,,Inactive,Inactive,Inactive,Inactive,Inactive,Inactive,Inactive,Inactive,dnp,2018
3,Aaron Gordon,2017-10-24,ORL,2.0,36:38,41,14,2,0,0,1,5,64.8,2018
4,Aaron Gordon,2017-10-27,ORL,3.0,25:56,16,6,2,1,0,1,2,29.2,2018


In [30]:
def feature_engineering(df) :
    df = df.sort_values('Date')
    df['SORARE'] = df['SORARE'].replace('dnp', np.nan)
    df['PTS'] = df['PTS'].replace('Inactive', np.nan).replace('Did Not Play', np.nan).replace('Did Not Dress', np.nan).replace('Player Suspended', np.nan).replace('Not With Team', np.nan)

    df['form_1'] = [np.nan] + list(df['SORARE'].rolling(1).mean())[:-1]
    df['form_2'] = [np.nan] + list(df['SORARE'].rolling(2).mean())[:-1]
    df['form_5'] = [np.nan] + list(df['SORARE'].rolling(5, min_periods = 3).mean())[:-1]
    df['form_10'] = [np.nan] + list(df['SORARE'].rolling(10, min_periods = 5).mean())[:-1]
    df['mean_score'] = [np.nan] + list(df['SORARE'].rolling(25, min_periods = 1).mean())[:-1]
    df['Rest'] = [np.nan] + [(t1 - t0).days for t1, t0 in zip(df.Date[1:], df.Date[:-1])]

    return df

player_year_df = []
for x in data_train.groupby(['Player', 'Year']): # Apply feature engineering function to all player-season couples
    player_year_df.append(feature_engineering(x[1]))
data_train = pd.concat(player_year_df)
# Only keep entries which have a SoRare score
data_train = data_train[data_train['SORARE'].notna()]


data_train.head()

Unnamed: 0,Player,Date,Tm,G,MP,PTS,TRB,AST,BLK,STL,TOV,3P,SORARE,Year,form_1,form_2,form_5,form_10,mean_score,Rest
0,Aaron Gordon,2017-10-18,ORL,1.0,34:13,14,9,3,2,0,1,1,34.3,2018,,,,,,
3,Aaron Gordon,2017-10-24,ORL,2.0,36:38,41,14,2,0,0,1,5,64.8,2018,,,,,34.3,3.0
4,Aaron Gordon,2017-10-27,ORL,3.0,25:56,16,6,2,1,0,1,2,29.2,2018,64.8,,,,49.55,3.0
5,Aaron Gordon,2017-10-29,ORL,4.0,36:09,17,9,4,2,2,3,2,41.8,2018,29.2,47.0,42.766667,,42.766667,2.0
6,Aaron Gordon,2017-10-30,ORL,5.0,18:20,17,7,1,0,2,4,3,27.9,2018,41.8,35.5,45.266667,,42.525,1.0


In [31]:
# Only keep entries with non-null features
print(data_train.shape)
data_train = data_train[(data_train['form_1'].notna()) & (data_train['form_2'].notna()) & (data_train['form_5'].notna()) & (data_train['form_10'].notna()) & (data_train['mean_score'].notna())]
print(data_train.shape)
data_train.head()

(114041, 20)
(85047, 20)


Unnamed: 0,Player,Date,Tm,G,MP,PTS,TRB,AST,BLK,STL,TOV,3P,SORARE,Year,form_1,form_2,form_5,form_10,mean_score,Rest
7,Aaron Gordon,2017-11-01,ORL,6.0,34:24,19,7,2,1,2,1,2,39.4,2018,27.9,34.85,40.925,39.6,39.6,2.0
8,Aaron Gordon,2017-11-03,ORL,7.0,29:17,11,7,2,0,1,2,3,24.4,2018,39.4,33.65,40.62,39.566667,39.566667,2.0
9,Aaron Gordon,2017-11-05,ORL,8.0,34:16,18,12,2,1,1,1,1,41.4,2018,24.4,31.9,32.54,37.4,37.4,2.0
10,Aaron Gordon,2017-11-08,ORL,9.0,36:32,21,4,1,1,0,0,4,34.3,2018,41.4,32.9,34.98,37.9,37.9,3.0
11,Aaron Gordon,2017-11-10,ORL,10.0,30:43,22,7,3,1,1,1,2,40.9,2018,34.3,37.85,33.48,37.9,37.5,2.0


In [32]:
data_train['MP'] = data_train['MP'].apply(mins_played)

In [33]:
data_train.to_csv('training_data.csv', index = None)

In [64]:
data = pd.read_csv('game_logs_2022.csv')

In [65]:
def rolling_minutes(df):

    df = df.sort_values('Date')

    for inav in ['Inactive', 'Did Not Play', 'Did Not Dress', 'Not With Team', 'Player Suspended']:
        df['MP'] = df['MP'].replace(inav, 0)
    df['MP'] = df['MP'].apply(lambda x: float(x.split(':')[0]) + float(x.split(':')[1]) / 60 if x != 0 else 0)

    df['MP_2'] = [np.nan] + list(df['MP'].rolling(2, min_periods = 1).mean()[:-1])
    df['MP_3'] = [np.nan] + list(df['MP'].rolling(3, min_periods = 1).mean()[:-1])
    df['MP_5'] = [np.nan] + list(df['MP'].rolling(5, min_periods = 1).mean()[:-1])
    df['MP_10'] = [np.nan] + list(df['MP'].rolling(10, min_periods = 1).mean()[:-1])
    df['MP_25'] = [np.nan] + list(df['MP'].rolling(25, min_periods = 1).mean()[:-1])

    df['MP_med2'] = [np.nan] + list(df['MP'].rolling(2, min_periods = 1).median()[:-1])
    df['MP_med3'] = [np.nan] + list(df['MP'].rolling(3, min_periods = 1).median()[:-1])
    df['MP_med5'] = [np.nan] + list(df['MP'].rolling(5, min_periods = 1).median()[:-1])

    return df

In [66]:
data = data.groupby('Player').apply(rolling_minutes)
data = data[(data['MP_3'].notna()) & (data['MP_10'].notna()) & (data['MP_25'].notna())]

In [67]:
from sklearn.metrics import mean_squared_error

In [77]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

train, test = train_test_split(data, test_size = 0.2)

In [78]:
rf = RandomForestRegressor()
reg = LinearRegression()
rf.fit(train[['MP_2', 'MP_3', 'MP_5', 'MP_10', 'MP_25']], train['MP'])
reg.fit(train[['MP_2', 'MP_3', 'MP_5', 'MP_10', 'MP_25']], train['MP'])

LinearRegression()

In [80]:
test = test.assign(predrf  = rf.predict(test[['MP_2', 'MP_3', 'MP_5', 'MP_10', 'MP_25']]))
test = test.assign(predreg = reg.predict(test[['MP_2', 'MP_3', 'MP_5', 'MP_10', 'MP_25']]))

In [82]:
np.sqrt(mean_squared_error(test.MP, test.MP_2))

9.213126558660633

In [83]:
np.sqrt(mean_squared_error(test.MP, test.predreg))

8.73303140722739

In [84]:
np.sqrt(mean_squared_error(test.MP, test.predrf))

9.042584807868163

In [87]:
data = pd.read_csv('training_data.csv')

In [88]:
train, test = train_test_split(data, test_size = .2)

In [93]:
model = RandomForestRegressor()
model.fit(train[['mean_score', 'form_1', 'form_2', 'form_5', 'form_10', 'Rest']], train['SORARE'])


test = test.assign(pred = model.predict(test[['mean_score', 'form_1', 'form_2', 'form_5', 'form_10', 'Rest']]))

In [94]:
model.feature_importances_

array([0.49536368, 0.10267097, 0.10592114, 0.11021464, 0.16123722,
       0.02459235])