In [1]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests
from datetime import datetime, timedelta
from zoneinfo import ZoneInfo
from time import sleep
import pickle

# from helpers.scrape import get_first_basket, get_roster
from helpers.preprocess import feature_engineering
from helpers.utils import getId
from helpers.email import send_email


# Yesterday's date in US Eastern time; the box-scores page below is requested for this date.
yst = datetime.now(ZoneInfo('America/New_York')) - timedelta(days = 1)


# Daily box-scores index page for that date
url = f'https://www.basketball-reference.com/boxscores/?month={yst.month}&day={yst.day}&year={yst.year}'
# requests has no default timeout: without one, a stalled connection blocks the cell indefinitely
page = requests.get(url, timeout = 30)
# Fail loudly on an HTTP error status (4xx/5xx) instead of parsing an error page into an empty game list
page.raise_for_status()
soup = BeautifulSoup(page.content, 'lxml')

def predict_first_basket(starting_lineups) :
    """
    Pick the highest-rated starter as the predicted first-basket scorer.

    Parameters
    ----------
    starting_lineups : sequence of two lists
        starting_lineups[0] and starting_lineups[1] are concatenated to form
        the pool of candidate player ids.

    Returns
    -------
    The `player_id` of the candidate with the largest `rating` in
    'data/player_metadata.csv'. Candidates with a missing rating are only
    returned when no candidate has a rating.

    Raises
    ------
    IndexError
        If none of the candidates appears in the metadata file.
    """
    players = (starting_lineups[0] + starting_lineups[1])
    ratings = pd.read_csv('data/player_metadata.csv')
    # Boolean selection already returns a new frame, so the full table does not need to be copied first
    candidates = ratings[ratings['player_id'].isin(players)]
    if candidates.empty :
        raise IndexError(f'None of the starters were found in data/player_metadata.csv: {players}')
    # sort_values places NaN last by default, which would make an unrated player the "best" pick;
    # na_position = 'first' keeps every rated player ahead of the unrated ones
    candidates = candidates.sort_values('rating', na_position = 'first')
    first_basket_pred = candidates['player_id'].values[-1]
    return first_basket_pred

def random_first_basket(starting_lineups) :
    """
    Pick a uniformly random player from the two starting lineups (baseline model).

    Parameters
    ----------
    starting_lineups : sequence of two lists
        starting_lineups[0] and starting_lineups[1] are concatenated to form
        the pool of candidate players.

    Returns
    -------
    One element of the concatenated lineups, chosen with np.random.randint.

    Raises
    ------
    IndexError
        If both lineups are empty.
    """
    players = (starting_lineups[0] + starting_lineups[1])
    if len(players) == 0 :
        raise IndexError('Cannot pick a random player from empty starting lineups')
    # Draw over the actual number of players instead of assuming exactly 10:
    # a fixed upper bound overruns shorter lists and never reaches players past index 9
    idx = np.random.randint(0, len(players))
    return players[idx]


# Keep only the anchors that link to a play-by-play page and convert each one with getId
game_ids = list(map(
    getId,
    filter(lambda anchor: 'boxscores/pbp' in anchor['href'], soup.find_all('a', href = True)),
))

# dfs = []
# for i, gameId in enumerate(game_ids) :
#     sleep(5)
#     print(f'[{i+1}/{len(game_ids)}] {gameId}')
#     df, starting_lineups = get_first_basket(gameId)
#     df.insert(1, 'Date', yst.date())
#     df.insert(2, 'Time', np.nan)
#     df.insert(5, 'season', 2025)
#     df.to_csv('data/first_basket_2025.csv', index = False, header = False, mode = 'a')
#     df['first_basket_rand'] = random_first_basket(starting_lineups)
#     df['first_basket_pred'] = predict_first_basket(starting_lineups)
#     dfs.append(df[['game_id', 'Date', 'Home', 'Away', 'first_basket', 'first_basket_tm', 'first_basket_rand', 'first_basket_pred']])
#     sleep(5)
#     roster = get_roster(gameId)
#     roster.to_csv('data/rosters.nosync/rosters_2025.csv', index = None, header = None, mode = 'a')

# first_basket_df = pd.concat(dfs).set_index('game_id')
# first_basket_df['correct_pred'] = (first_basket_df['first_basket'] == first_basket_df['first_basket_pred'])
# first_basket_df['correct_rand'] = (first_basket_df['first_basket'] == first_basket_df['first_basket_rand'])
# first_basket_df = first_basket_df[['Date', 'Home', 'Away', 'first_basket', 'first_basket_tm', 'first_basket_pred', 'correct_pred', 'first_basket_rand', 'correct_rand']]

# # Map id's to player names
# player_metadata = pd.read_csv('data/player_metadata.csv')
# playerId_map = dict(zip(player_metadata['player_id'], player_metadata['name']))
# for player_col in ['first_basket', 'first_basket_pred', 'first_basket_rand'] :
#     first_basket_df[player_col] = first_basket_df[player_col].map(playerId_map)

# acc_pred = first_basket_df['correct_pred'].mean()
# acc_rand = first_basket_df['correct_rand'].mean()

# print(f'\nAccuracy predicted : {round(100 * acc_pred, 1)}%  [{first_basket_df["correct_pred"].sum()}/{first_basket_df.shape[0]}]\n')
# print(f'\nAccuracy random    : {round(100 * acc_rand, 1)}%  [{first_basket_df["correct_rand"].sum()}/{first_basket_df.shape[0]}]\n\n')
# print(first_basket_df, '\n\n\n')

# date = yst.strftime("%d %b %Y")
# synopsis = f'[LeFirstBasket | {date}] Random model: {round(100 * acc_rand, 1)}% | Predicted model: {round(100 * acc_pred, 1)}%'

# send_email(first_basket_df,
#            receivers = ['martinbog19@gmail.com', 'lucas.leforestier@gmail.com'],
#            subject = synopsis)





# Send ML preds
# feature_engineering() returns the feature table and the list of feature column names
data, features = feature_engineering()

# Keep only the rows that belong to the games scraped above
# (boolean selection returns a new frame, so the full table is not copied first)
games = data[data['game_id'].isin(game_ids)].reset_index(drop = True)

if games.empty :
    # No rows to score (no scraped games, or none present in the feature table):
    # predicting on a zero-row matrix would fail, and there is nothing to email
    print('No games to predict - skipping ML predictions email.')
else :
    X = games[features].to_numpy()

    # Load the model from the .pkl file
    # NOTE: pickle.load can execute arbitrary code - only load model files from a trusted source
    with open('models/model_rf.pkl', "rb") as f:
        model = pickle.load(f)

    # Probability of the last class returned by predict_proba
    # (presumably the "scores the first basket" class - confirm against model.classes_)
    y_pred = model.predict_proba(X)[:, -1]

    # .copy() gives an independent frame, so the column assignments below do not
    # trigger pandas' SettingWithCopyWarning
    games = games[['game_id', 'player_id', 'Player']].copy()
    games['Pred. prob (%)'] = y_pred
    # Normalise within each game so that the players' probabilities sum to 1
    games['Pred. prob (%)'] = games['Pred. prob (%)'] / games.groupby('game_id')['Pred. prob (%)'].transform('sum')
    # Rank players from most to least likely using the unrounded probabilities, so that
    # players whose rounded odds tie are still ordered correctly
    games = games.sort_values(['game_id', 'Pred. prob (%)'], ascending = [True, False]).reset_index(drop = True)
    # Decimal odds = 1 / probability; a zero probability maps to infinite odds instead of raising ZeroDivisionError
    games['Pred. odds'] = games['Pred. prob (%)'].apply(lambda x: round(1/x, 1) if x != 0 else np.inf)
    games['Pred. prob (%)'] = games['Pred. prob (%)'].apply(lambda x: round(x * 100, 1))

    send_email(games,
               receivers = ['martinbog19@gmail.com', 'lucas.leforestier@gmail.com'],
               subject = '[LeFirstBasket] MACHINE LEARNING PREDICTIONS!')

KeyboardInterrupt: 

In [2]:
# Display the current `games` prediction table
games

Unnamed: 0,game_id,player_id,Player,Pred. prob (%),Pred. odds
0,202412100MIL,antetgi01,Giannis Antetokounmpo,17.0,5.9
1,202412100MIL,lillada01,Damian Lillard,12.8,7.8
2,202412100MIL,lopezbr01,Brook Lopez,11.1,9.0
3,202412100MIL,suggsja01,Jalen Suggs,11.0,9.1
4,202412100MIL,cartewe01,Wendell Carter Jr.,10.2,9.8
5,202412100MIL,caldwke01,Kentavious Caldwell-Pope,8.0,12.6
6,202412100MIL,princta02,Taurean Prince,7.8,12.8
7,202412100MIL,bitadgo01,Goga Bitadze,7.7,13.0
8,202412100MIL,dasiltr01,Tristan Da Silva,7.3,13.7
9,202412100MIL,jacksan01,Andre Jackson Jr.,7.1,14.0


In [4]:
# Show the last 10 rows of the `games` prediction table
games.tail(10)

Unnamed: 0,game_id,player_id,Player,Pred. prob (%),Pred. odds
10,202412100OKC,doncilu01,Luka Dončić,14.7,6.8
11,202412100OKC,gilgesh01,Shai Gilgeous-Alexander,12.2,8.2
12,202412100OKC,irvinky01,Kyrie Irving,11.8,8.5
13,202412100OKC,thompkl01,Klay Thompson,11.0,9.1
14,202412100OKC,willija06,Jalen Williams,10.9,9.1
15,202412100OKC,livelde01,Dereck Lively II,8.5,11.8
16,202412100OKC,harteis01,Isaiah Hartenstein,8.3,12.1
17,202412100OKC,dortlu01,Luguentz Dort,8.0,12.4
18,202412100OKC,grimequ01,Quentin Grimes,7.5,13.3
19,202412100OKC,wallaca01,Cason Wallace,7.0,14.2


In [5]:
# NOTE: no one-day offset here, so despite its name `yst` holds today's date (US Eastern time) in this cell
yst = datetime.now(ZoneInfo('America/New_York'))


# Daily box-scores index page for that date
url = f'https://www.basketball-reference.com/boxscores/?month={yst.month}&day={yst.day}&year={yst.year}'
# requests has no default timeout: without one, a stalled connection blocks the cell indefinitely
page = requests.get(url, timeout = 30)
# Fail loudly on an HTTP error status (4xx/5xx) instead of parsing an error page into an empty game list
page.raise_for_status()
soup = BeautifulSoup(page.content, 'lxml')

def predict_first_basket(starting_lineups) :
    """
    Pick the highest-rated starter as the predicted first-basket scorer.

    Parameters
    ----------
    starting_lineups : sequence of two lists
        starting_lineups[0] and starting_lineups[1] are concatenated to form
        the pool of candidate player ids.

    Returns
    -------
    The `player_id` of the candidate with the largest `rating` in
    'data/player_metadata.csv'. Candidates with a missing rating are only
    returned when no candidate has a rating.

    Raises
    ------
    IndexError
        If none of the candidates appears in the metadata file.
    """
    players = (starting_lineups[0] + starting_lineups[1])
    ratings = pd.read_csv('data/player_metadata.csv')
    # Boolean selection already returns a new frame, so the full table does not need to be copied first
    candidates = ratings[ratings['player_id'].isin(players)]
    if candidates.empty :
        raise IndexError(f'None of the starters were found in data/player_metadata.csv: {players}')
    # sort_values places NaN last by default, which would make an unrated player the "best" pick;
    # na_position = 'first' keeps every rated player ahead of the unrated ones
    candidates = candidates.sort_values('rating', na_position = 'first')
    first_basket_pred = candidates['player_id'].values[-1]
    return first_basket_pred

def random_first_basket(starting_lineups) :
    """
    Pick a uniformly random player from the two starting lineups (baseline model).

    Parameters
    ----------
    starting_lineups : sequence of two lists
        starting_lineups[0] and starting_lineups[1] are concatenated to form
        the pool of candidate players.

    Returns
    -------
    One element of the concatenated lineups, chosen with np.random.randint.

    Raises
    ------
    IndexError
        If both lineups are empty.
    """
    players = (starting_lineups[0] + starting_lineups[1])
    if len(players) == 0 :
        raise IndexError('Cannot pick a random player from empty starting lineups')
    # Draw over the actual number of players instead of assuming exactly 10:
    # a fixed upper bound overruns shorter lists and never reaches players past index 9
    idx = np.random.randint(0, len(players))
    return players[idx]


# Keep only the anchors that link to a play-by-play page and convert each one with getId
game_ids = list(map(
    getId,
    filter(lambda anchor: 'boxscores/pbp' in anchor['href'], soup.find_all('a', href = True)),
))

In [7]:
# Inspect the feature table returned by feature_engineering() (pandas truncates the display)
data

Unnamed: 0,game_id,Player,player_id,first_basket_scorer,PTS_avg,PTS_5,PTS_25,PTS_50,USG%_avg,USG%_5,...,VORP_50,FGA_avg,FGA_5,FGA_25,FGA_50,first_basket_avg,first_basket_5,first_basket_25,first_basket_50,rating
0,201410280LAL,Kobe Bryant,bryanko01,,,,,,,,...,,,,,,,,,,1.519156
1,201410280LAL,Jeremy Lin,linje01,,,,,,,,...,,,,,,,,,,-0.529144
2,201410280LAL,Carlos Boozer,boozeca01,,,,,,,,...,,,,,,,,,,-0.529144
3,201410280LAL,Patrick Beverley,beverpa01,,,,,,,,...,,,,,,,,,,-0.358452
4,201410280LAL,James Harden,hardeja01,,,,,,,,...,,,,,,,,,,1.348464
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
123396,202412100OKC,Isaiah Hartenstein,harteis01,0.0,-1.161721,-0.802020,-0.586224,-0.771751,-0.861324,-0.861471,...,0.156547,-1.231090,-0.852080,-0.797483,-0.944131,-1.279754,0.316226,0.211541,0.280368,-0.421214
123397,202412100OKC,Cason Wallace,wallaca01,0.0,-1.082273,-0.933859,-1.051481,-1.048371,-1.082985,-0.993698,...,-0.838664,-1.001942,-0.818665,-0.877975,-0.911172,-1.383861,-0.737861,-1.198735,-1.321737,-1.040647
123398,202412100OKC,Luguentz Dort,dortlu01,0.0,-0.389230,-0.867940,-0.754603,-0.643090,-0.346403,-1.056069,...,-1.007331,-0.219399,-0.685005,-0.655076,-0.587573,-0.209323,-0.737861,0.211541,0.280368,-0.545101
123399,202412100OKC,Jalen Williams,willija06,0.0,0.184566,0.692154,0.525962,0.441950,0.059699,0.847500,...,0.409913,0.165238,0.918910,0.583253,0.488096,-0.137044,0.316226,0.211541,-0.120158,0.198219
