In [1]:
import requests
from bs4 import BeautifulSoup as bs
import pandas as pd
import time

In [2]:
players_df = pd.read_csv('Player_URL_Stats.csv')
odds_df = pd.read_csv('NBA_Odds.csv')

local_teams = pd.read_csv('local_teams.csv')
visit_teams = pd.read_csv('visit_teams.csv')

In [3]:
joint = odds_df.set_index('Name').join(players_df.set_index('Name'))
joint[joint['href'].isnull()]

Unnamed: 0_level_0,Bet,O,U,href
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1


In [4]:
print(f'# Rows: {joint.shape[0]}')
joint

# Rows: 27


Unnamed: 0_level_0,Bet,O,U,href
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Aaron Gordon,23.5,−110,−120,/players/g/gordoaa01.html
Andrew Wiggins,26.5,−105,−125,/players/w/wiggian01.html
Anthony Davis,41.5,−115,−115,/players/d/davisan02.html
Austin Reaves,24.5,−120,−110,/players/r/reaveau01.html
Bam Adebayo,29.5,−135,+105,/players/a/adebaba01.html
D'Angelo Russell,24.5,−120,−110,/players/r/russeda01.html
Derrick White,18.5,−120,−110,/players/w/whitede01.html
Draymond Green,22.5,−130,+100,/players/g/greendr01.html
Gabe Vincent,19.5,−125,−105,/players/v/vincega01.html
Jalen Brunson,34.5,−120,−110,/players/b/brunsja01.html


In [5]:
def get_time_sleep(idx, length):
    thr = length//10
    bounds = [thr*bound for bound in range(11)]
    
    bound = 0
    for i in range(1,len(bounds)):
        if idx in range(bounds[i-1], bounds[i]):
            bound = i
    
    return bound

In [6]:
def get_game_stats(game: dict) -> dict:
    local_team = local_teams[local_teams['local_team'] == game['team']].to_dict('records')[0]
    visit_team = visit_teams[visit_teams['visit_team'] == game['team_opp']].to_dict('records')[0]

    player = game | local_team | visit_team
    
    return player

In [7]:
def get_player_stats(idx, url):
    df = []
    
    time.sleep(get_time_sleep(idx, joint.shape[0]))
    
    req = requests.get(url)
    soup = bs(req.content, 'lxml')
    
    rows = soup.find('table', {'id': 'pgl_basic'}) \
                .find('tbody').find_all('tr')
    
    for row in rows:
        cond = row.find('td', {'data-stat': 'reason'})
        cond2 = row.find('td')
        
        if (not cond) and (cond2):
            game = {}
            
            game['date'] = row.find('td', {'data-stat': 'date_game'}) \
                              .find('a').text
            
            game['team'] = row.find('td', {'data-stat': 'team_id'}) \
                              .find('a').text
            game['team_href'] = row.find('td', {'data-stat': 'team_id'}) \
                              .find('a').get('href')
            
            game['team_opp'] = row.find('td', {'data-stat': 'opp_id'}) \
                                  .find('a').text
            game['team_opp_href'] = row.find('td', {'data-stat': 'opp_id'}) \
                                  .find('a').get('href')
            
            mp = row.find('td', {'data-stat': 'mp'}).text
            i = mp.index(':')
            game['mp'] = float(mp[:i])
            
            game['fg'] = int(row.find('td', {'data-stat': 'fg'}).text)
            
            try:
                game['fg_pct'] = float(row.find('td', {'data-stat': 'fg_pct'}).text)
            except ValueError:
                game['fg_pct'] = None
                
            game['fg3'] = int(row.find('td', {'data-stat': 'fg3'}).text)
            
            try:
                game['fg3_pct'] = float(row.find('td', {'data-stat': 'fg3_pct'}).text)
            except ValueError:
                game['fg3_pct'] = None
                
            game['ft'] = int(row.find('td', {'data-stat': 'ft'}).text)
            
            try:
                game['ft_pct'] = float(row.find('td', {'data-stat': 'ft_pct'}).text)
            except:
                game['ft_pct'] = None
                
            game['orb'] = int(row.find('td', {'data-stat': 'orb'}).text)
            game['drb'] = int(row.find('td', {'data-stat': 'drb'}).text)
            game['trb'] = int(row.find('td', {'data-stat': 'trb'}).text)
            game['ast'] = int(row.find('td', {'data-stat': 'ast'}).text)
            game['stl'] = int(row.find('td', {'data-stat': 'stl'}).text)
            game['blk'] = int(row.find('td', {'data-stat': 'blk'}).text)
            game['pts'] = int(row.find('td', {'data-stat': 'pts'}).text)
            try:
                game['+/-'] = int(row.find('td', {'data-stat': 'plus_minus'}).text)
            except ValueError:
                game['+/-'] = None
            game['pts+ast+trb'] = game['pts'] + game['trb'] + game['ast']
            
            df.append(get_game_stats(game))
            
    return pd.DataFrame(df)

In [8]:
players_df = []
for idx, player in enumerate(joint.index):
    print(f'{idx+1}: {player}')
    
    var_name = player.replace(' ', '_') + '_df'
    url = f'https://www.basketball-reference.com{joint["href"][idx][:-5]}/gamelog/2023'

    df = get_player_stats(idx, url)
    df['target'] = df['pts+ast+trb'].shift(-1)
    
    vars()[var_name] = df
    players_df.append(var_name)

1: Aaron Gordon
Error
2: Andrew Wiggins
3: Anthony Davis
Error
4: Austin Reaves
Error
5: Bam Adebayo
6: D'Angelo Russell
Error
7: Derrick White
8: Draymond Green
9: Gabe Vincent
10: Jalen Brunson
11: Jamal Murray
12: James Harden
Error
13: Jaylen Brown
14: Jayson Tatum
15: Josh Hart
16: Kevon Looney
17: Klay Thompson
18: Kyle Lowry
19: LeBron James
Error
20: Malcolm Brogdon
21: Marcus Smart
22: Mitchell Robinson
23: Nikola Jokic
24: RJ Barrett
25: Stephen Curry
26: Tobias Harris
27: Tyrese Maxey
Error


In [9]:
print(players_df[0])
df = vars()[players_df[0]]
df.head()

Aaron_Gordon_df


Unnamed: 0,date,team,team_href,team_opp,team_opp_href,mp,fg,fg_pct,fg3,fg3_pct,...,visit_LgRank_FTr,visit_LgRank_eFG%,visit_LgRank_TOV%,visit_LgRank_ORB%,visit_LgRank_FT/FGA,visit_LgRank_opp_eFG%,visit_LgRank_opp_TOV%,visit_LgRank_DRB%,visit_LgRank_opp_FT/FGA,target
0,2022-10-19,DEN,/teams/DEN/2023.html,UTA,/teams/UTA/2023.html,32.0,10,0.588,0,0.0,...,14.0,15.0,25.0,4.0,14.0,13.0,28.0,23.0,11.0,16.0
1,2022-10-21,DEN,/teams/DEN/2023.html,GSW,/teams/GSW/2023.html,28.0,5,0.417,0,0.0,...,30.0,3.0,30.0,14.0,29.0,10.0,16.0,16.0,20.0,23.0
2,2022-10-22,DEN,/teams/DEN/2023.html,OKC,/teams/OKC/2023.html,28.0,3,0.333,1,0.333,...,23.0,24.0,3.0,13.0,16.0,19.0,3.0,29.0,24.0,33.0
3,2022-10-24,DEN,/teams/DEN/2023.html,POR,/teams/POR/2023.html,27.0,12,0.75,0,0.0,...,6.0,14.0,23.0,24.0,3.0,26.0,19.0,25.0,21.0,20.0
4,2022-10-26,DEN,/teams/DEN/2023.html,LAL,/teams/LAL/2023.html,28.0,2,0.25,0,0.0,...,2.0,18.0,11.0,20.0,2.0,7.0,29.0,13.0,1.0,4.0


In [10]:
import numpy as np
import lightgbm as lgb
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.model_selection import TimeSeriesSplit, train_test_split

In [11]:
def rmsle(y_true, y_pred):
    assert len(y_true) == len(y_pred)
    return np.sqrt(np.mean(np.power(np.log1p(y_true + 1) - np.log1p(y_pred + 1), 2)))

In [12]:
model = lgb.LGBMRegressor()
split = TimeSeriesSplit(n_splits=3)
sfs = SequentialFeatureSelector(model, n_features_to_select=20,
                               direction='forward', cv=split)

In [13]:
no_predictors = ['date', 'team', 'team_href', 'team_opp', 
                 'team_opp_href', 'local_team', 'visit_team', 'target']
init_predictors = df.columns[~df.columns.isin(no_predictors)]

In [14]:
train = pd.DataFrame()
for player in players_df:
    df = vars()[player]
    train = pd.concat([train, df])

In [15]:
pred = train[pd.isnull(train['target'])]
train = train[~pd.isnull(train['target'])]

In [16]:
nulls = pd.isnull(train[init_predictors]).sum()
nulls = nulls[nulls > 0]

valid_cols = train[init_predictors].columns[~train[init_predictors].columns.isin(nulls.index)]
to_train = train[valid_cols].copy()

In [17]:
sfs.fit(to_train, train['target'])
predictors = list(valid_cols[sfs.get_support()])

In [18]:
all_preds = []
all_errors = []
all_rmse = []
for idx, player in enumerate(players_df):
    name = player[:-3].replace('_', ' ') 
    
    df = vars()[player]
    
    to_pred = df[pd.isnull(df['target'])]
    to_train = df[~pd.isnull(df['target'])]
    
    nulls = pd.isnull(to_train[predictors]).sum()
    nulls = nulls[nulls > 0]
    
    valid_cols = to_train[predictors].columns[~to_train[predictors].columns.isin(nulls.index)]
    train = to_train[valid_cols].copy()
    
    target = to_train['target']
    X_train, X_test, y_train, y_test = train_test_split(
        train, target, test_size=0.3, shuffle=False)
    
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    
    error = rmsle(y_test, preds)
    all_errors.append(error)
    norm_error = error/(max(y_test) - min(y_test))
    print(f'Player: {name}\nRMSE = {error}\nNorm RMSLE = {norm_error}')
    all_rmse.append(norm_error)
    
    pred = model.predict(to_pred[predictors])[0]
    all_preds.append(pred)

    print('='*35)
print('\nMean RMSLE = ' + str(np.mean(all_errors)) + ' +/- ' + str(np.std(all_errors)))
print('Mean Norm RMSLE = ' + str(np.mean(all_rmse)) + ' +/- ' + str(np.std(all_rmse)))

Player: Aaron Gordon
RMSE = 0.28721321302112435
Norm RMSLE = 0.007364441359516009
Player: Andrew Wiggins
RMSE = 0.45869678644226797
Norm RMSLE = 0.013899902619462666
Player: Anthony Davis
RMSE = 0.27365226006697707
Norm RMSLE = 0.007818636001913631
Player: Austin Reaves
RMSE = 0.4877665516926243
Norm RMSLE = 0.013936187191217836
Player: Bam Adebayo
RMSE = 0.42833496442149016
Norm RMSLE = 0.01127197274793395
Player: D'Angelo Russell
RMSE = 0.37022478015034205
Norm RMSLE = 0.010006075139198434
Player: Derrick White
RMSE = 0.41555045906981597
Norm RMSLE = 0.011231093488373405
Player: Draymond Green
RMSE = 0.2384699124863693
Norm RMSLE = 0.011355710118398539
Player: Gabe Vincent
RMSE = 0.5781088424689443
Norm RMSLE = 0.023124353698757774
Player: Jalen Brunson
RMSE = 0.35531548815181696
Norm RMSLE = 0.008075352003450385
Player: Jamal Murray
RMSE = 0.44672807188287855
Norm RMSLE = 0.012409113107857738
Player: James Harden
RMSE = 0.26063801185022617
Norm RMSLE = 0.006858895048690162
Player: J

In [19]:
joint['preds'] = all_preds
joint['norm_rmse'] = all_rmse
joint['rmse'] = all_errors
joint

Unnamed: 0_level_0,Bet,O,U,href,preds,norm_rmse,rmse
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Aaron Gordon,23.5,−110,−120,/players/g/gordoaa01.html,26.762533,0.007364,0.287213
Andrew Wiggins,26.5,−105,−125,/players/w/wiggian01.html,25.08,0.0139,0.458697
Anthony Davis,41.5,−115,−115,/players/d/davisan02.html,41.105263,0.007819,0.273652
Austin Reaves,24.5,−120,−110,/players/r/reaveau01.html,17.266165,0.013936,0.487767
Bam Adebayo,29.5,−135,+105,/players/a/adebaba01.html,35.792606,0.011272,0.428335
D'Angelo Russell,24.5,−120,−110,/players/r/russeda01.html,25.639682,0.010006,0.370225
Derrick White,18.5,−120,−110,/players/w/whitede01.html,16.56696,0.011231,0.41555
Draymond Green,22.5,−130,+100,/players/g/greendr01.html,23.944785,0.011356,0.23847
Gabe Vincent,19.5,−125,−105,/players/v/vincega01.html,11.701776,0.023124,0.578109
Jalen Brunson,34.5,−120,−110,/players/b/brunsja01.html,31.559084,0.008075,0.355315


In [20]:
def american_to_decimal(player, line):
    l = player[line]
    new_l = []
    
    for num in l:
        american = int(num.replace('−', '-'))
        
        if american > 0:
            decimal = (american/100) + 1
        else:
            decimal = (100/abs(american)) + 1
         
        new_l.append(decimal)
    
    player['decimal_'+line] = new_l
    
    return player

In [21]:
joint_dec = american_to_decimal(joint, 'O')
joint_dec = american_to_decimal(joint_dec, 'U')

In [22]:
joint_dec

Unnamed: 0_level_0,Bet,O,U,href,preds,norm_rmse,rmse,decimal_O,decimal_U
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Aaron Gordon,23.5,−110,−120,/players/g/gordoaa01.html,26.762533,0.007364,0.287213,1.909091,1.833333
Andrew Wiggins,26.5,−105,−125,/players/w/wiggian01.html,25.08,0.0139,0.458697,1.952381,1.8
Anthony Davis,41.5,−115,−115,/players/d/davisan02.html,41.105263,0.007819,0.273652,1.869565,1.869565
Austin Reaves,24.5,−120,−110,/players/r/reaveau01.html,17.266165,0.013936,0.487767,1.833333,1.909091
Bam Adebayo,29.5,−135,+105,/players/a/adebaba01.html,35.792606,0.011272,0.428335,1.740741,2.05
D'Angelo Russell,24.5,−120,−110,/players/r/russeda01.html,25.639682,0.010006,0.370225,1.833333,1.909091
Derrick White,18.5,−120,−110,/players/w/whitede01.html,16.56696,0.011231,0.41555,1.833333,1.909091
Draymond Green,22.5,−130,+100,/players/g/greendr01.html,23.944785,0.011356,0.23847,1.769231,2.0
Gabe Vincent,19.5,−125,−105,/players/v/vincega01.html,11.701776,0.023124,0.578109,1.8,1.952381
Jalen Brunson,34.5,−120,−110,/players/b/brunsja01.html,31.559084,0.008075,0.355315,1.833333,1.909091


In [23]:
def add_bet(player):
    bets = player['Bet']
    preds = player['preds']
    
    over = player['decimal_O']
    under = player['decimal_U']
    
    my_bet = []
    for idx, bet in enumerate(bets):
        if bet < preds[idx]:
            my_bet.append(over[idx])
        else:
            my_bet.append(under[idx])
            
    player['my_Bet'] = my_bet
    
    return player

In [24]:
final = add_bet(joint_dec)

In [25]:
thr = np.quantile(all_errors, .25, method='midpoint')
print(f'Threshold: {thr}')
all_errors = np.array(all_errors)
df_errors = final[['Bet', 'preds', 'my_Bet']][all_errors < thr]
print(f'# of Players: {df_errors.shape[0]}')
df_errors

Threshold: 0.27933402048883216
# of Players: 7


Unnamed: 0_level_0,Bet,preds,my_Bet
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Anthony Davis,41.5,41.105263,1.869565
Draymond Green,22.5,23.944785,1.769231
James Harden,37.5,38.205128,1.833333
LeBron James,42.5,45.378378,1.952381
Marcus Smart,19.5,18.605648,1.952381
Nikola Jokic,47.5,51.635754,1.833333
Stephen Curry,43.5,41.921053,1.769231


In [26]:
thr = np.quantile(all_rmse, .25, method='midpoint')
print(f'Threshold: {thr}')
all_rmse = np.array(all_rmse)
df_rmse = final[['Bet', 'preds', 'my_Bet']][all_rmse < thr]
print(f'# of Players: {df_rmse.shape[0]}')
df_rmse

Threshold: 0.007993062963081598
# of Players: 7


Unnamed: 0_level_0,Bet,preds,my_Bet
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Aaron Gordon,23.5,26.762533,1.909091
Anthony Davis,41.5,41.105263,1.869565
James Harden,37.5,38.205128,1.833333
Jayson Tatum,43.5,35.519399,1.909091
LeBron James,42.5,45.378378,1.952381
Nikola Jokic,47.5,51.635754,1.833333
Stephen Curry,43.5,41.921053,1.769231


In [27]:
in_both = []
for player in df_errors.index:
    if player in df_rmse.index:
        in_both.append(player)

In [28]:
print(f'# of Players: {len(in_both)}\nPlayers:')
for player in in_both:
    print(f'\t{player}')

# of Players: 5
Players:
	Anthony Davis
	James Harden
	LeBron James
	Nikola Jokic
	Stephen Curry


In [29]:
data = pd.merge(df_errors, df_rmse, left_index=True, right_index=True)[['Bet_x', 'preds_x', 'my_Bet_x']]
data.rename(columns = {'Bet_x':'Bet', 'preds_x':'preds', 'my_Bet_x':'my_Bet'}, inplace = True)
data

Unnamed: 0_level_0,Bet,preds,my_Bet
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Anthony Davis,41.5,41.105263,1.869565
James Harden,37.5,38.205128,1.833333
LeBron James,42.5,45.378378,1.952381
Nikola Jokic,47.5,51.635754,1.833333
Stephen Curry,43.5,41.921053,1.769231


In [30]:
final[final['norm_rmse'] == min(final['norm_rmse'])]

Unnamed: 0_level_0,Bet,O,U,href,preds,norm_rmse,rmse,decimal_O,decimal_U,my_Bet
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Stephen Curry,43.5,100,−130,/players/c/curryst01.html,41.921053,0.006189,0.185675,2.0,1.769231,1.769231


In [31]:
final[final['rmse'] == min(final['rmse'])]

Unnamed: 0_level_0,Bet,O,U,href,preds,norm_rmse,rmse,decimal_O,decimal_U,my_Bet
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Stephen Curry,43.5,100,−130,/players/c/curryst01.html,41.921053,0.006189,0.185675,2.0,1.769231,1.769231


In [32]:
df_rmse.to_csv('my_preds.csv')