In [1]:
import requests
from bs4 import BeautifulSoup as bs
import pandas as pd
import time

In [2]:
players_df = pd.read_csv('Player_URL_Stats.csv')
odds_df = pd.read_csv('NBA_Odds.csv')

local_teams = pd.read_csv('local_teams.csv')
visit_teams = pd.read_csv('visit_teams.csv')

In [3]:
joint = odds_df.set_index('Name').join(players_df.set_index('Name'))
joint[joint['href'].isnull()]

Unnamed: 0_level_0,Bet,O,U,href
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1


In [4]:
print(f'# Rows: {joint.shape[0]}')
joint

# Rows: 17


Unnamed: 0_level_0,Bet,O,U,href
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Aaron Gordon,23.5,−115,−115,/players/g/gordoaa01.html
Al Horford,18.5,−105,−125,/players/h/horfoal01.html
Chris Paul,26.5,−110,−120,/players/p/paulch01.html
Deandre Ayton,26.5,−105,−125,/players/a/aytonde01.html
Derrick White,21.5,−115,−115,/players/w/whitede01.html
Devin Booker,41.5,−110,−120,/players/b/bookede01.html
Jamal Murray,36.5,−110,−120,/players/m/murraja01.html
James Harden,38.5,−115,−115,/players/h/hardeja01.html
Jaylen Brown,33.5,−120,−110,/players/b/brownja02.html
Jayson Tatum,42.5,−115,−115,/players/t/tatumja01.html


In [5]:
def get_time_sleep(idx, length):
    thr = length//10
    bounds = [thr*bound for bound in range(11)]
    
    bound = 0
    for i in range(1,len(bounds)):
        if idx in range(bounds[i-1], bounds[i]):
            bound = i
    
    return bound

In [6]:
def get_game_stats(game: dict) -> dict:
    local_team = local_teams[local_teams['local_team'] == game['team']].to_dict('records')[0]
    visit_team = visit_teams[visit_teams['visit_team'] == game['team_opp']].to_dict('records')[0]

    player = game | local_team | visit_team
    
    return player

In [11]:
def get_player_stats(idx, url):
    df = []
    
    time.sleep(get_time_sleep(idx, joint.shape[0]))
    
    req = requests.get(url)
    soup = bs(req.content, 'lxml')
    
    rows = soup.find('table', {'id': 'pgl_basic'}) \
                .find('tbody').find_all('tr')
    
    for row in rows:
        cond = row.find('td', {'data-stat': 'reason'})
        cond2 = row.find('td')
        
        if (not cond) and (cond2):
            game = {}
            
            game['date'] = row.find('td', {'data-stat': 'date_game'}) \
                              .find('a').text
            
            game['team'] = row.find('td', {'data-stat': 'team_id'}) \
                              .find('a').text
            game['team_href'] = row.find('td', {'data-stat': 'team_id'}) \
                              .find('a').get('href')
            
            game['team_opp'] = row.find('td', {'data-stat': 'opp_id'}) \
                                  .find('a').text
            game['team_opp_href'] = row.find('td', {'data-stat': 'opp_id'}) \
                                  .find('a').get('href')
            
            mp = row.find('td', {'data-stat': 'mp'}).text
            i = mp.index(':')
            game['mp'] = float(mp[:i])
            
            game['fg'] = int(row.find('td', {'data-stat': 'fg'}).text)
            
            try:
                game['fg_pct'] = float(row.find('td', {'data-stat': 'fg_pct'}).text)
            except ValueError:
                game['fg_pct'] = None
                
            game['fg3'] = int(row.find('td', {'data-stat': 'fg3'}).text)
            
            try:
                game['fg3_pct'] = float(row.find('td', {'data-stat': 'fg3_pct'}).text)
            except ValueError:
                game['fg3_pct'] = None
                
            game['ft'] = int(row.find('td', {'data-stat': 'ft'}).text)
            
            try:
                game['ft_pct'] = float(row.find('td', {'data-stat': 'ft_pct'}).text)
            except:
                game['ft_pct'] = None
                
            game['orb'] = int(row.find('td', {'data-stat': 'orb'}).text)
            game['drb'] = int(row.find('td', {'data-stat': 'drb'}).text)
            game['trb'] = int(row.find('td', {'data-stat': 'trb'}).text)
            game['ast'] = int(row.find('td', {'data-stat': 'ast'}).text)
            game['stl'] = int(row.find('td', {'data-stat': 'stl'}).text)
            game['blk'] = int(row.find('td', {'data-stat': 'blk'}).text)
            game['pts'] = int(row.find('td', {'data-stat': 'pts'}).text)
            try:
                game['+/-'] = int(row.find('td', {'data-stat': 'plus_minus'}).text)
            except ValueError:
                print('Error')
                game['+/-'] = 10
            game['pts+ast+trb'] = game['pts'] + game['trb'] + game['ast']
            
            df.append(get_game_stats(game))
            
    return pd.DataFrame(df)

In [12]:
players_df = []
for idx, player in enumerate(joint.index):
    print(f'{idx+1}: {player}')
    
    var_name = player.replace(' ', '_') + '_df'
    url = f'https://www.basketball-reference.com{joint["href"][idx][:-5]}/gamelog/2023'

    df = get_player_stats(idx, url)
    df['target'] = df['pts+ast+trb'].shift(-1)
    
    vars()[var_name] = df
    players_df.append(var_name)

1: Aaron Gordon
Error
2: Al Horford
3: Chris Paul
Error
4: Deandre Ayton
Error
5: Derrick White
6: Devin Booker
Error
7: Jamal Murray
8: James Harden
Error
9: Jaylen Brown
10: Jayson Tatum
11: Kevin Durant
Error
12: Malcolm Brogdon
13: Marcus Smart
14: Michael Porter Jr.
15: Nikola Jokic
16: Tobias Harris
17: Tyrese Maxey
Error


In [13]:
print(players_df[0])
df = vars()[players_df[0]]
df.head()

Aaron_Gordon_df


Unnamed: 0,date,team,team_href,team_opp,team_opp_href,mp,fg,fg_pct,fg3,fg3_pct,...,visit_LgRank_FTr,visit_LgRank_eFG%,visit_LgRank_TOV%,visit_LgRank_ORB%,visit_LgRank_FT/FGA,visit_LgRank_opp_eFG%,visit_LgRank_opp_TOV%,visit_LgRank_DRB%,visit_LgRank_opp_FT/FGA,target
0,2022-10-19,DEN,/teams/DEN/2023.html,UTA,/teams/UTA/2023.html,32.0,10,0.588,0,0.0,...,14.0,15.0,25.0,4.0,14.0,13.0,28.0,23.0,11.0,16.0
1,2022-10-21,DEN,/teams/DEN/2023.html,GSW,/teams/GSW/2023.html,28.0,5,0.417,0,0.0,...,30.0,3.0,30.0,14.0,29.0,10.0,16.0,16.0,20.0,23.0
2,2022-10-22,DEN,/teams/DEN/2023.html,OKC,/teams/OKC/2023.html,28.0,3,0.333,1,0.333,...,23.0,24.0,3.0,13.0,16.0,19.0,3.0,29.0,24.0,33.0
3,2022-10-24,DEN,/teams/DEN/2023.html,POR,/teams/POR/2023.html,27.0,12,0.75,0,0.0,...,6.0,14.0,23.0,24.0,3.0,26.0,19.0,25.0,21.0,20.0
4,2022-10-26,DEN,/teams/DEN/2023.html,LAL,/teams/LAL/2023.html,28.0,2,0.25,0,0.0,...,2.0,18.0,11.0,20.0,2.0,7.0,29.0,13.0,1.0,4.0


In [14]:
import numpy as np
import lightgbm as lgb
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.model_selection import TimeSeriesSplit, train_test_split

In [15]:
def rmsle(y_true, y_pred):
    assert len(y_true) == len(y_pred)
    return np.sqrt(np.mean(np.power(np.log1p(y_true + 1) - np.log1p(y_pred + 1), 2)))

In [16]:
model = lgb.LGBMRegressor()
split = TimeSeriesSplit(n_splits=3)
sfs = SequentialFeatureSelector(model, n_features_to_select=20,
                               direction='forward', cv=split)

In [17]:
no_predictors = ['date', 'team', 'team_href', 'team_opp', 
                 'team_opp_href', 'local_team', 'visit_team', 'target']
init_predictors = df.columns[~df.columns.isin(no_predictors)]

In [18]:
train = pd.DataFrame()
for player in players_df:
    df = vars()[player]
    train = pd.concat([train, df])

In [19]:
pred = train[pd.isnull(train['target'])]
train = train[~pd.isnull(train['target'])]

In [20]:
nulls = pd.isnull(train[init_predictors]).sum()
nulls = nulls[nulls > 0]

valid_cols = train[init_predictors].columns[~train[init_predictors].columns.isin(nulls.index)]
to_train = train[valid_cols].copy()

In [21]:
sfs.fit(to_train, train['target'])
predictors = list(valid_cols[sfs.get_support()])

In [22]:
all_preds = []
all_errors = []
all_rmse = []
for idx, player in enumerate(players_df):
    name = player[:-3].replace('_', ' ') 
    
    df = vars()[player]
    
    to_pred = df[pd.isnull(df['target'])]
    to_train = df[~pd.isnull(df['target'])]
    
    nulls = pd.isnull(to_train[predictors]).sum()
    nulls = nulls[nulls > 0]
    
    valid_cols = to_train[predictors].columns[~to_train[predictors].columns.isin(nulls.index)]
    train = to_train[valid_cols].copy()
    
    target = to_train['target']
    X_train, X_test, y_train, y_test = train_test_split(
        train, target, test_size=0.3, shuffle=False)
    
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    
    error = rmsle(y_test, preds)
    all_errors.append(error)
    norm_error = error/(max(y_test) - min(y_test))
    print(f'Player: {name}\nRMSE = {error}\nNorm RMSLE = {norm_error}')
    all_rmse.append(norm_error)
    
    pred = model.predict(to_pred[predictors])[0]
    all_preds.append(pred)

    print('='*35)
print('\nMean RMSLE = ' + str(np.mean(all_errors)) + ' +/- ' + str(np.std(all_errors)))
print('Mean Norm RMSLE = ' + str(np.mean(all_rmse)) + ' +/- ' + str(np.std(all_rmse)))

Player: Aaron Gordon
RMSE = 0.28965183326455
Norm RMSLE = 0.0074269700837064105
Player: Al Horford
RMSE = 0.4130959072480832
Norm RMSLE = 0.013769863574936108
Player: Chris Paul
RMSE = 0.17513088374987348
Norm RMSLE = 0.007614386249994499
Player: Deandre Ayton
RMSE = 0.3773089014543519
Norm RMSLE = 0.011790903170448497
Player: Derrick White
RMSE = 0.388992633862409
Norm RMSLE = 0.010513314428713757
Player: Devin Booker
RMSE = 0.19495485157301612
Norm RMSLE = 0.007220550058259856
Player: Jamal Murray
RMSE = 0.4173825756337645
Norm RMSLE = 0.011593960434271237
Player: James Harden
RMSE = 0.26063801185022617
Norm RMSLE = 0.006858895048690162
Player: Jaylen Brown
RMSE = 0.4298179839517388
Norm RMSLE = 0.008954541332327893
Player: Jayson Tatum
RMSE = 0.27767009307175394
Norm RMSLE = 0.007713058140882054
Player: Kevin Durant
RMSE = 0.2640335351564378
Norm RMSLE = 0.00910460466056682
Player: Malcolm Brogdon
RMSE = 0.42429937780937255
Norm RMSLE = 0.013687076703528147
Player: Marcus Smart
RMSE

In [23]:
joint['preds'] = all_preds
joint['norm_rmse'] = all_rmse
joint['rmse'] = all_errors
joint

Unnamed: 0_level_0,Bet,O,U,href,preds,norm_rmse,rmse
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Aaron Gordon,23.5,−115,−115,/players/g/gordoaa01.html,28.032792,0.007427,0.289652
Al Horford,18.5,−105,−125,/players/h/horfoal01.html,20.516116,0.01377,0.413096
Chris Paul,26.5,−110,−120,/players/p/paulch01.html,24.470286,0.007614,0.175131
Deandre Ayton,26.5,−105,−125,/players/a/aytonde01.html,31.412672,0.011791,0.377309
Derrick White,21.5,−115,−115,/players/w/whitede01.html,17.255522,0.010513,0.388993
Devin Booker,41.5,−110,−120,/players/b/bookede01.html,37.222222,0.007221,0.194955
Jamal Murray,36.5,−110,−120,/players/m/murraja01.html,28.670263,0.011594,0.417383
James Harden,38.5,−115,−115,/players/h/hardeja01.html,38.205128,0.006859,0.260638
Jaylen Brown,33.5,−120,−110,/players/b/brownja02.html,33.839246,0.008955,0.429818
Jayson Tatum,42.5,−115,−115,/players/t/tatumja01.html,41.593841,0.007713,0.27767


In [24]:
def american_to_decimal(player, line):
    l = player[line]
    new_l = []
    
    for num in l:
        american = int(num.replace('−', '-'))
        
        if american > 0:
            decimal = (american/100) + 1
        else:
            decimal = (100/abs(american)) + 1
         
        new_l.append(decimal)
    
    player['decimal_'+line] = new_l
    
    return player

In [25]:
joint_dec = american_to_decimal(joint, 'O')
joint_dec = american_to_decimal(joint_dec, 'U')

In [26]:
joint_dec

Unnamed: 0_level_0,Bet,O,U,href,preds,norm_rmse,rmse,decimal_O,decimal_U
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Aaron Gordon,23.5,−115,−115,/players/g/gordoaa01.html,28.032792,0.007427,0.289652,1.869565,1.869565
Al Horford,18.5,−105,−125,/players/h/horfoal01.html,20.516116,0.01377,0.413096,1.952381,1.8
Chris Paul,26.5,−110,−120,/players/p/paulch01.html,24.470286,0.007614,0.175131,1.909091,1.833333
Deandre Ayton,26.5,−105,−125,/players/a/aytonde01.html,31.412672,0.011791,0.377309,1.952381,1.8
Derrick White,21.5,−115,−115,/players/w/whitede01.html,17.255522,0.010513,0.388993,1.869565,1.869565
Devin Booker,41.5,−110,−120,/players/b/bookede01.html,37.222222,0.007221,0.194955,1.909091,1.833333
Jamal Murray,36.5,−110,−120,/players/m/murraja01.html,28.670263,0.011594,0.417383,1.909091,1.833333
James Harden,38.5,−115,−115,/players/h/hardeja01.html,38.205128,0.006859,0.260638,1.869565,1.869565
Jaylen Brown,33.5,−120,−110,/players/b/brownja02.html,33.839246,0.008955,0.429818,1.833333,1.909091
Jayson Tatum,42.5,−115,−115,/players/t/tatumja01.html,41.593841,0.007713,0.27767,1.869565,1.869565


In [27]:
def add_bet(player):
    bets = player['Bet']
    preds = player['preds']
    
    over = player['decimal_O']
    under = player['decimal_U']
    
    my_bet = []
    for idx, bet in enumerate(bets):
        if bet < preds[idx]:
            my_bet.append(over[idx])
        else:
            my_bet.append(under[idx])
            
    player['my_Bet'] = my_bet
    
    return player

In [28]:
final = add_bet(joint_dec)

In [29]:
thr = np.quantile(all_errors, .25, method='midpoint')
print(f'Threshold: {thr}')
all_errors = np.array(all_errors)
df_errors = final[['Bet', 'preds', 'my_Bet']][all_errors < thr]
print(f'# of Players: {df_errors.shape[0]}')
df_errors

Threshold: 0.2647312555474365
# of Players: 4


Unnamed: 0_level_0,Bet,preds,my_Bet
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Chris Paul,26.5,24.470286,1.833333
Devin Booker,41.5,37.222222,1.833333
James Harden,38.5,38.205128,1.869565
Kevin Durant,41.5,42.0625,1.869565


In [30]:
thr = np.quantile(all_rmse, .25, method='midpoint')
print(f'Threshold: {thr}')
all_rmse = np.array(all_rmse)
df_rmse = final[['Bet', 'preds', 'my_Bet']][all_rmse < thr]
print(f'# of Players: {df_rmse.shape[0]}')
df_rmse

Threshold: 0.007614386249994499
# of Players: 4


Unnamed: 0_level_0,Bet,preds,my_Bet
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Aaron Gordon,23.5,28.032792,1.869565
Devin Booker,41.5,37.222222,1.833333
James Harden,38.5,38.205128,1.869565
Nikola Jokic,47.5,50.532022,1.833333


In [31]:
in_both = []
for player in df_errors.index:
    if player in df_rmse.index:
        in_both.append(player)

In [32]:
print(f'# of Players: {len(in_both)}\nPlayers:')
for player in in_both:
    print(f'\t{player}')

# of Players: 2
Players:
	Devin Booker
	James Harden


In [33]:
data = pd.merge(df_errors, df_rmse, left_index=True, right_index=True)[['Bet_x', 'preds_x', 'my_Bet_x']]
data.rename(columns = {'Bet_x':'Bet', 'preds_x':'preds', 'my_Bet_x':'my_Bet'}, inplace = True)
data

Unnamed: 0_level_0,Bet,preds,my_Bet
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Devin Booker,41.5,37.222222,1.833333
James Harden,38.5,38.205128,1.869565


In [34]:
final[final['norm_rmse'] == min(final['norm_rmse'])]

Unnamed: 0_level_0,Bet,O,U,href,preds,norm_rmse,rmse,decimal_O,decimal_U,my_Bet
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
James Harden,38.5,−115,−115,/players/h/hardeja01.html,38.205128,0.006859,0.260638,1.869565,1.869565,1.869565


In [35]:
final[final['rmse'] == min(final['rmse'])]

Unnamed: 0_level_0,Bet,O,U,href,preds,norm_rmse,rmse,decimal_O,decimal_U,my_Bet
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Chris Paul,26.5,−110,−120,/players/p/paulch01.html,24.470286,0.007614,0.175131,1.909091,1.833333,1.833333


In [36]:
df_rmse.to_csv('my_preds.csv')