In [1]:
from datetime import datetime
from io import StringIO
from time import sleep

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests as rqs
from bs4 import BeautifulSoup
from IPython.display import clear_output

In [2]:
def single_team(df) :
    """Collapse one player's season rows into a single summary row.

    Parameters
    ----------
    df : pandas.DataFrame
        All rows sharing one 'Player' value; must contain a 'Tm' column.

    Returns
    -------
    pandas.DataFrame
        * ``df`` itself when it holds a single row.
        * Otherwise a copy of the 'TOT' (season total) row(s) with 'Tm'
          replaced by the team of the last row in ``df``.
        * ``df`` unchanged when several rows exist but none is a 'TOT' row,
          instead of failing on a length-mismatched assignment.
    """
    if len(df) == 1: # If player played for a single team, do nothing
        return df

    totals = df[df['Tm'] == 'TOT'].copy()
    if totals.empty:
        # Several rows but no aggregate row (presumably distinct players
        # sharing a name): nothing to collapse, keep every row as-is.
        return df

    # If player has moved, return total stats with last team.
    # Scalar assignment works for any number of selected rows.
    totals['Tm'] = str(df['Tm'].iloc[-1])
    return totals

def SoRareScore(df):
    """Compute the SoRare fantasy score for every game-log row of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'PTS', 'TRB', 'AST', 'BLK', 'STL', 'TOV'
        and '3P'. Values may be numbers or numeric strings; rows for games
        the player did not play carry a status text (e.g. 'Inactive',
        'Did Not Play') instead of numbers.

    Returns
    -------
    list
        One entry per row, in row order: the numeric score for a played
        game, or the string 'dnp' when 'PTS' does not hold a number.

    Raises
    ------
    ValueError
        If 'PTS' is numeric but another scored stat of the same row is not.
    """
    stat_columns = ['PTS', 'TRB', 'AST', 'BLK', 'STL', 'TOV', '3P']
    # Convert only the scored columns. errors='coerce' maps any status text to
    # NaN, replacing the deprecated errors='ignore' and the hard-coded status list.
    stats = df[stat_columns].apply(pd.to_numeric, errors = 'coerce')

    scores = []
    for row in stats.itertuples(index = False, name = None):

        if pd.isna(row[0]): # No points recorded: the player did not play
            scores.append('dnp')
            continue

        pts, reb, ast, blk, stl, tov, fg_3 = (int(value) for value in row)

        score = 1 * pts + 1.2 * reb + 1.5 * ast + 3 * blk + 3 * stl + 1 * fg_3 - 2 * tov

        # Count double-digit stats
        doubles = sum(stat >= 10 for stat in (pts, reb, ast, blk, stl))

        if doubles == 2 : # Double-double
            score += 1
        elif doubles >= 3 : # Triple-double
            score += 2

        scores.append(score)
    return scores

def mins_played(str) :
    """Convert a 'minutes:seconds' playing-time string into fractional minutes.

    The argument is split on ':' into exactly two parts; both are parsed as
    floats and the seconds part is divided by 60 before being added.
    """
    minutes_text, seconds_text = str.split(':')
    whole_minutes = float(minutes_text)
    leftover_seconds = float(seconds_text)
    return whole_minutes + leftover_seconds / 60

In [3]:
# Engineer the training set
def _numeric_where_possible(column):
    """Return ``column`` converted by pd.to_numeric, or unchanged if conversion fails."""
    try:
        return pd.to_numeric(column)
    except (ValueError, TypeError):
        # Same outcome the deprecated errors='ignore' option used to give
        return column


def _fetch_soup(url):
    """Download ``url`` and return its parsed HTML without 'thead'-class table rows."""
    page = rqs.get(url)
    # Fail with a clear HTTP error instead of parsing an error page as data
    page.raise_for_status()
    # NOTE(review): 'html' leaves the concrete parser choice to BeautifulSoup,
    # so the parser used may depend on what is installed - confirm.
    soup = BeautifulSoup(page.content, 'html')
    header_row = soup.find('tr', class_ = 'thead')
    while header_row is not None :
        header_row.decompose()
        header_row = soup.find('tr', class_ = 'thead')
    return soup


def game_logs(year) :
    """Scrape the game logs of all qualifying players for one season.

    The season's per-game table is downloaded from basketball-reference.com,
    reduced to one row per player via ``single_team`` and filtered to players
    with MP >= 10 and G >= 25. Each remaining player's game-log page is then
    fetched (pausing 1.5 seconds before every request) and scored with
    ``SoRareScore``. Progress is printed and the cell output cleared after
    each player.

    Parameters
    ----------
    year : int or str
        Season identifier interpolated into the page URLs.

    Returns
    -------
    pandas.DataFrame
        Concatenated game logs with the columns 'Player', 'Date', 'Tm', 'G',
        'MP', 'PTS', 'TRB', 'AST', 'BLK', 'STL', 'TOV', '3P' and 'SORARE'.

    Raises
    ------
    requests.HTTPError
        If any page request returns an error status.
    ValueError
        If an expected table is missing from a page, or no player qualifies.
    """
    # Get player data for looped year
    url = f'https://www.basketball-reference.com/leagues/NBA_{year}_per_game.html'
    soup = _fetch_soup(url)
    table = soup.find('table')
    if table is None :
        raise ValueError(f'No table found at {url}')
    # StringIO: passing literal HTML text to read_html is deprecated.
    # .copy(): a column is added below, so work on an independent frame.
    data = pd.read_html(StringIO(str(table)))[0][['Player', 'Tm', 'MP', 'G']].copy()
    # Get the list of hrefs of players of looped year ('.html' suffix removed)
    hrefs = [
        x['href'][:-5]
        for x in table.find_all('a', href = True)
        if x['href'].split('/')[1] == 'players'
    ]
    data['href'] = hrefs
    data = data.groupby('Player').apply(single_team).reset_index(drop = True)
    data = data[(data['MP'] >= 10) & (data['G'] >= 25)]

    player_df = []
    for count, (player, href) in enumerate(zip(data['Player'], data['href']), start = 1) :
        sleep(1.5)
        print(f'{year}: {count}/{len(data)} ... {player}')
        # Get looped player game log
        url = f'https://www.basketball-reference.com{href}/gamelog/{year}'
        soup = _fetch_soup(url)
        table = soup.find('table', id = 'pgl_basic')
        if table is None :
            raise ValueError(f'No game log table found at {url}')
        df = pd.read_html(StringIO(str(table)))[0][['Date', 'Tm', 'G', 'MP', 'PTS', 'TRB', 'AST', 'BLK', 'STL', 'TOV', '3P']].copy()

        df.insert(0, 'Player', [player] * len(df)) # Insert player name
        df = df.apply(_numeric_where_possible)
        df['SORARE'] = SoRareScore(df) # Calculate SoRare score
        # Append player game logs to list
        player_df.append(df)
        clear_output(wait = True)

    return pd.concat(player_df)

In [4]:
# Read the stored game logs of each season and stack them into one frame
data_train = pd.concat(
    [pd.read_csv(f'game_logs_{year}.csv') for year in [2021, 2022]]
)
# Parse the 'Date' strings (year-month-day) into datetime values
data_train['Date'] = data_train['Date'].map(
    lambda day: datetime.strptime(day, '%Y-%m-%d')
)
data_train.head()

Unnamed: 0,Player,Date,Tm,G,MP,PTS,TRB,AST,BLK,STL,TOV,3P,SORARE,Year
0,Aaron Gordon,2020-12-23,ORL,1.0,26:00,20,7,0,0,4,2,1,37.4,2021
1,Aaron Gordon,2020-12-26,ORL,2.0,29:39,15,9,3,0,0,0,1,31.3,2021
2,Aaron Gordon,2020-12-27,ORL,3.0,19:48,4,6,0,0,0,0,0,11.2,2021
3,Aaron Gordon,2020-12-29,ORL,4.0,22:10,12,5,2,1,1,3,0,21.0,2021
4,Aaron Gordon,2020-12-31,ORL,5.0,20:49,6,3,1,2,0,2,0,13.1,2021


In [5]:
def feature_engineering(df) :
    """Add form and rest-day features to one chronologically sortable game log.

    Parameters
    ----------
    df : pandas.DataFrame
        Game-log rows with a datetime 'Date' column, a 'SORARE' column (scores,
        possibly stored as text, with 'dnp' for games not played) and a 'PTS'
        column. The caller's frame is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy sorted by 'Date' with 'SORARE' as floats (NaN where no score),
        status texts in 'PTS' replaced by NaN, and these added columns, each
        describing only the games *before* the row's own game:
        'form_1', 'form_2', 'form_5', 'form_10' (rolling score means over 1, 2,
        5 and 10 games), 'mean_score' (rolling mean over up to 25 games) and
        'Rest' (days since the previous row's date).
    """
    df = df.sort_values('Date')
    # 'dnp' markers make the column load as text; convert explicitly so the
    # rolling means operate on real floats rather than on coerced strings.
    df['SORARE'] = pd.to_numeric(df['SORARE'], errors = 'coerce')
    df['PTS'] = df['PTS'].replace(
        ['Inactive', 'Did Not Play', 'Did Not Dress', 'Player Suspended', 'Not With Team'],
        np.nan,
    )

    scores = df['SORARE']
    # shift(1) moves every rolling mean one game down, so a row never sees its own score
    df['form_1'] = scores.rolling(1).mean().shift(1)
    df['form_2'] = scores.rolling(2).mean().shift(1)
    df['form_5'] = scores.rolling(5, min_periods = 3).mean().shift(1)
    df['form_10'] = scores.rolling(10, min_periods = 5).mean().shift(1)
    df['mean_score'] = scores.rolling(25, min_periods = 1).mean().shift(1)
    # Whole days between consecutive rows; NaN for the first row
    df['Rest'] = df['Date'].diff().dt.days

    return df

# Run the feature engineering separately on every (player, season) group
player_year_df = [
    feature_engineering(group)
    for key, group in data_train.groupby(['Player', 'Year'])
]
data_train = pd.concat(player_year_df)
# Drop the games for which no SoRare score exists
has_score = data_train['SORARE'].notna()
data_train = data_train[has_score]


data_train.head()

Unnamed: 0,Player,Date,Tm,G,MP,PTS,TRB,AST,BLK,STL,TOV,3P,SORARE,Year,form_1,form_2,form_5,form_10,mean_score,Rest
0,Aaron Gordon,2020-12-23,ORL,1.0,26:00,20,7,0,0,4,2,1,37.4,2021,,,,,,
1,Aaron Gordon,2020-12-26,ORL,2.0,29:39,15,9,3,0,0,0,1,31.3,2021,37.4,,,,37.4,3.0
2,Aaron Gordon,2020-12-27,ORL,3.0,19:48,4,6,0,0,0,0,0,11.2,2021,31.3,34.35,,,34.35,1.0
3,Aaron Gordon,2020-12-29,ORL,4.0,22:10,12,5,2,1,1,3,0,21.0,2021,11.2,21.25,26.633333,,26.633333,2.0
4,Aaron Gordon,2020-12-31,ORL,5.0,20:49,6,3,1,2,0,2,0,13.1,2021,21.0,16.1,25.225,,25.225,2.0


In [6]:
# Keep only the rows in which every form feature has a value;
# the shape is printed before and after to show how many rows are dropped
feature_columns = ['form_1', 'form_2', 'form_5', 'form_10', 'mean_score']
print(data_train.shape)
all_present = data_train[feature_columns].notna().all(axis = 1)
data_train = data_train[all_present]
print(data_train.shape)
data_train.head()

(46358, 20)
(32424, 20)


Unnamed: 0,Player,Date,Tm,G,MP,PTS,TRB,AST,BLK,STL,TOV,3P,SORARE,Year,form_1,form_2,form_5,form_10,mean_score,Rest
5,Aaron Gordon,2021-01-02,ORL,6.0,27:44,15,8,5,0,0,0,1,33.1,2021,13.1,17.05,22.8,22.8,22.8,2.0
6,Aaron Gordon,2021-01-04,ORL,7.0,27:55,24,11,1,0,1,2,6,44.7,2021,33.1,23.1,21.94,24.516667,24.516667,2.0
7,Aaron Gordon,2021-01-06,ORL,8.0,27:09,15,8,3,1,1,5,2,27.1,2021,44.7,38.9,24.62,27.4,27.4,2.0
11,Aaron Gordon,2021-01-15,ORL,11.0,28:43,17,2,3,0,0,2,3,22.9,2021,43.0,37.15,36.525,28.422222,29.32,4.0
12,Aaron Gordon,2021-01-16,ORL,12.0,30:55,11,6,6,0,2,3,0,27.200000000000003,2021,22.9,32.95,31.075,27.488889,28.736364,1.0


In [7]:
# Write the engineered training set to disk without the row index
# (index is documented as a boolean flag, so pass False rather than None)
data_train.to_csv('training_data.csv', index = False)