In [2]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests
from datetime import datetime, timedelta
import json
from time import sleep

from scrape import get_first_basket, getId

In [3]:
yst = datetime.today() - timedelta(days = 2)

url = f'https://www.basketball-reference.com/boxscores/?month={yst.month}&day={yst.day}&year={yst.year}'
page = requests.get(url)
soup = BeautifulSoup(page.content, 'lxml')

In [4]:
def predict_first_basket() :
    ######### starting_lineups #########
    return np.random.randint(0, 10)

In [5]:
game_ids = [getId(x) for x in soup.find_all('a', href = True) if 'boxscores/pbp' in x['href']]
dfs = []
for gameId in game_ids :
    sleep(3)
    df, starting_lineups = get_first_basket(gameId, starting_lineups = True)
    idx = predict_first_basket()
    df['first_basket_pred'] = (starting_lineups[0] + starting_lineups[1])[idx]
    dfs.append(df[['game_id', 'Home', 'Away', 'first_basket', 'first_basket_tm', 'first_basket_pred']])

first_basket_df = pd.concat(dfs).set_index('game_id')
first_basket_df['Date'] = yst.date()
first_basket_df['correct_pred'] = (first_basket_df['first_basket'] == first_basket_df['first_basket_pred'])
first_basket_df[['Date', 'Home', 'Away', 'first_basket', 'first_basket_tm', 'first_basket_pred']]

Unnamed: 0_level_0,Date,Home,Away,first_basket,first_basket_tm,first_basket_pred
game_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
202411170CHI,2024-11-17,CHI,HOU,sengual01,HOU,giddejo01
202411170CLE,2024-11-17,CLE,CHO,jeromty01,CLE,mobleev01
202411170IND,2024-11-17,IND,MIA,mathube01,IND,loveke01
202411170LAC,2024-11-17,LAC,UTA,zubaciv01,LAC,georgke01
202411170MEM,2024-11-17,MEM,DEN,murraja01,DEN,watsope01
202411170MIN,2024-11-17,MIN,PHO,nurkiju01,PHO,jonesty01
202411170NYK,2024-11-17,NYK,BRK,bridgmi01,NYK,thomaca02
202411170OKC,2024-11-17,OKC,DAL,washipj01,DAL,dortlu01
202411170POR,2024-11-17,POR,ATL,youngtr01,ATL,youngtr01
202411170WAS,2024-11-17,WAS,DET,kuzmaky01,WAS,iveyja01


In [7]:
from send_email import send_email

In [9]:
send_email(first_basket_df, receivers = ['martinbog19@gmail.com'])

Email sent successfully!


In [6]:
first_basket_df = first_basket_df[['game_id', 'Date', 'Home', 'Away', 'first_basket', 'first_basket_tm', 'first_basket_pred', 'correct_pred']]

KeyError: "['game_id'] not in index"

In [230]:
acc = first_basket_df['correct_pred'].mean()
print(f'Accuracy: {round(100 * acc, 1)}%  [{first_basket_df["correct_pred"].sum()}/{first_basket_df.shape[0]}]')
first_basket_df

Accuracy: 8.3%  [1/12]


Unnamed: 0_level_0,Home,Away,first_basket,first_basket_tm,first_basket_pred,Date,correct_pred
game_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
202411150ATL,ATL,WAS,capelca01,ATL,poolejo01,2024-11-15,False
202411150CLE,CLE,CHI,garlada01,CLE,okorois01,2024-11-15,False
202411150GSW,GSW,MEM,wellsja01,MEM,wellsja01,2024-11-15,True
202411150HOU,HOU,LAC,smithja05,HOU,greenja05,2024-11-15,False
202411150IND,IND,MIA,highsha01,MIA,halibty01,2024-11-15,False
202411150NOP,NOP,DEN,murraja01,DEN,bostobr01,2024-11-15,False
202411150NYK,NYK,BRK,anunoog01,NYK,finnedo01,2024-11-15,False
202411150OKC,OKC,PHO,okogijo01,PHO,willija06,2024-11-15,False
202411150ORL,ORL,PHI,embiijo01,PHI,bitadgo01,2024-11-15,False
202411150SAC,SAC,MIN,goberru01,MIN,lenal01,2024-11-15,False


In [8]:
from unidecode import unidecode
import string
from fuzzywuzzy import process



In [123]:
def normalize_name(x) :

    for suffix in [' Jr.', ' Sr.', ' III', ' II', ' IV', ' Jr', ' Sr'] :
        x = x.replace(suffix, '')
    x = x.translate(str.maketrans('', '', string.punctuation))
    x = unidecode(x).lower()

    return x

In [126]:
def get_ratings(year) :
    url = f'https://hoopshype.com/nba2k/{year-1}-{year}/'
    page = requests.get(url)
    soup = BeautifulSoup(page.content, 'html')
    table = soup.find('table')
    ratings = pd.read_html(str(table))[0]
    ratings.columns = ['drop', 'name', 'rating']
    ratings = ratings.drop(columns = 'drop')
    ratings['name_norm'] = ratings['name'].apply(normalize_name)
    ratings = ratings[['name_norm', 'rating']]
    return ratings

In [127]:
ratings = get_ratings(2025)

In [128]:
def get_players(year) :

    url = f'https://www.basketball-reference.com/leagues/NBA_{year}_per_game.html'
    page = requests.get(url)
    soup = BeautifulSoup(page.content, 'lxml')
    table = soup.find('table')
    while table.find_all('tr', class_ = 'thead') :
        table.find('tr', class_ = 'thead')
    try :
        table.find('tr', class_ = 'norank').decompose()
    except :
        pass

    players = pd.read_html(str(table))[0].rename(columns = {'Player': 'name'})
    players['name_norm'] = players['name'].apply(normalize_name)
    players['player_id'] = [getId(x) for x in table.find_all('a', href = True) if 'players' in x['href']]
    players = players.copy()[players['GS'] > 0]
    players = players[['name', 'name_norm', 'player_id']].drop_duplicates().reset_index(drop = True)

    return players


In [130]:
merged = pd.merge(players, ratings, on = 'name_norm', how = 'left')

In [150]:
name_map = {'dennis schroeder': 'dennis schroder',
 'santiago aldama': 'santi aldama',
 'scottie pippen': 'scotty pippen',
 'nicolas claxton': 'nic claxton',
 'herb jones': 'herbert jones'}

In [131]:
merged[merged['rating'].isna()]

Unnamed: 0,name,name_norm,player_id,rating
36,Domantas Sabonis,domantas sabonis,sabondo01,
44,Dennis Schröder,dennis schroder,schrode01,
50,Desmond Bane,desmond bane,banede01,
105,Santi Aldama,santi aldama,aldamsa01,
111,Scotty Pippen Jr.,scotty pippen,pippesc02,
112,Brandon Boston Jr.,brandon boston,bostobr01,
149,Nic Claxton,nic claxton,claxtni01,
152,Alex Sarr,alex sarr,sarral01,
158,Al Horford,al horford,horfoal01,
183,Herbert Jones,herbert jones,joneshe01,


In [121]:
normalize_name('Trey Murphy III')

'trey murphyi'

In [143]:
for name_norm in merged[merged['rating'].isna()]['name_norm'] :

    closest_match, match_score = process.extractOne(name_norm, ratings['name_norm'].to_list())

    print(name_norm, closest_match, match_score)

domantas sabonis matas buzelis 62
dennis schroder dennis schroeder 97
desmond bane emoni bates 70
santi aldama santiago aldama 89
scotty pippen scottie pippen 89
brandon boston brandon ingram 64
nic claxton nicolas claxton 85
alex sarr alex caruso 70
al horford jalen hoodschifino 63
herbert jones herb jones 87
andre jackson jaren jackson 85
doug mcdermott josh minott 48


In [149]:
with open('utils/name_map_2k.json', 'w') as f:
    json.dump(name_map, f)

with open('utils/name_map_2k.json', 'r') as f:
    loaded_data = json.load(f)

In [4]:
import json
import pandas as pd
import requests
from bs4 import BeautifulSoup

from unidecode import unidecode
import string
from fuzzywuzzy import process
from scrape import getId

In [130]:
def normalize_name(x) :

    for suffix in [' Jr.', ' Sr.', ' III', ' II', ' IV', ' Jr', ' Sr'] :
        x = x.replace(suffix, '')
    x = x.translate(str.maketrans('', '', string.punctuation))

    return unidecode(x).lower()

def get_ratings(year) :

    url = f'https://hoopshype.com/nba2k/{year-1}-{year}/'
    page = requests.get(url)
    soup = BeautifulSoup(page.content, 'html')
    table = soup.find('table')
    ratings = pd.read_html(str(table))[0]
    ratings.columns = ['drop', 'name', 'rating']
    ratings = ratings.drop(columns = 'drop')
    ratings['name_norm'] = ratings['name'].apply(normalize_name)
    ratings = ratings[['name_norm', 'rating']]

    return ratings

def get_players(year) :

    url = f'https://www.basketball-reference.com/leagues/NBA_{year}_per_game.html'
    page = requests.get(url)
    soup = BeautifulSoup(page.content, 'lxml')
    table = soup.find('table')
    while table.find_all('tr', class_ = 'thead') :
        table.find('tr', class_ = 'thead')
    try :
        table.find('tr', class_ = 'norank').decompose()
    except :
        pass

    players = pd.read_html(str(table))[0].rename(columns = {'Player': 'name'})
    players['name_norm'] = players['name'].apply(normalize_name)
    players['player_id'] = [getId(x) for x in table.find_all('a', href = True) if 'players' in x['href']]
    players = players.copy()[players['GS'] > 0]
    players = players.drop_duplicates().reset_index(drop = True)

    return players

def manual_rating(row, hardcoded_players) :

    if np.isnan(row['rating']) and row['player_id'] in hardcoded_players:
        with open('utils/manual_2k.json', 'r') as f:
            manual_dict = json.load(f)
        return manual_dict[row['player_id']]
    else :
        return row['rating']

In [131]:
season = 2025

ratings = get_ratings(season)
players = get_players(season)

In [168]:
# Replace with name_map_2k
with open('utils/name_map_2k.json', 'r') as f:
    name_map = json.load(f)
ratings['name_norm'] = ratings['name_norm'].apply(lambda x: name_map[x] if x in name_map.keys() else x)

# Initial merge
players_ratings = pd.merge(players, ratings,
                  how = 'left',
                  on = 'name_norm')

# Check for duplicates
if players_ratings['player_id'].value_counts().max() > 1 :
    duplicated_ratings = players_ratings.copy()[players_ratings['player_id'].duplicated(keep = False)]
    duplicated_ids = duplicated_ratings['player_id'].unique().tolist()
    players_ratings['rating'].loc[duplicated_ratings.index] = np.nan
    print(f'!!!  playerId {", ".join(duplicated_ids)} duplicated, ratings set to NULL')

# Hard-coded ratings (the Sabonis)
with open('utils/manual_2k.json', 'r') as f:
    manual_dict = json.load(f)
hardcoded_players = list(manual_dict.keys())
players_ratings['rating'] = players_ratings.apply(manual_rating, axis = 1, args = hardcoded_players)

# Suggested additions to name_map_2k
null_ratings = players_ratings.copy()[players_ratings['rating'].isna()]
choices = ratings['name_norm'].tolist()
suggested_map = {name: process.extractOne(name, choices)[0] for name in null_ratings['name_norm']}
with open('utils/name_map_suggestions_2k.json', 'w') as f:
    json.dump(suggested_map, f)

In [169]:
with open('utils/knn_features.txt', 'r') as file:
    knn_features = [f.strip('\n') for f in file.readlines()]

from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler

In [171]:
train = players_ratings.copy()[players_ratings['rating'].notna()]
scaler = StandardScaler()
X_train = scaler.fit_transform(train[knn_features])

In [172]:
# Fit a kNN model with 10 neighbors to the data
knn = KNeighborsRegressor(n_neighbors = 10)
knn.fit(X_train, train['rating'])

In [179]:
X_null = scaler.transform(null_ratings[knn_features])
null_ratings['rating'] = knn.predict(X_null)

In [180]:
pd.concat([train, null_ratings])

Unnamed: 0,Rk,name,Age,Team,Pos,G,GS,MP,FG,FGA,...,STL,BLK,TOV,PF,PTS,Awards,name_norm,player_id,rating,pred
0,1,Giannis Antetokounmpo,30,MIL,PF,9,9,34.8,12.9,21.2,...,0.4,0.9,2.7,3.6,31.6,,giannis antetokounmpo,antetgi01,97.0,92.7
1,2,Anthony Davis,31,LAL,PF,9,9,35.1,10.8,18.7,...,1.3,2.0,2.2,1.2,31.2,,anthony davis,davisan02,94.0,92.2
2,3,Jayson Tatum,26,BOS,SF,11,11,36.0,9.5,20.5,...,1.6,0.5,2.9,2.5,30.5,,jayson tatum,tatumja01,95.0,91.1
3,4,Nikola Jokić,29,DEN,C,10,10,38.1,10.8,19.2,...,1.7,1.0,4.1,2.0,29.7,,nikola jokic,jokicni01,97.0,89.5
4,5,LaMelo Ball,23,CHO,PG,10,10,33.4,10.2,23.0,...,1.5,0.3,4.7,4.1,29.4,,lamelo ball,ballla01,87.0,88.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
152,180,Alex Sarr,19,WAS,PF,9,9,25.3,3.2,9.8,...,0.6,2.4,1.0,2.9,9.2,,alex sarr,sarral01,78.6,78.6
158,190,Al Horford,38,BOS,C,10,10,26.7,3.2,6.6,...,1.1,0.7,1.0,1.3,8.7,,al horford,horfoal01,75.6,75.6
183,229,Herbert Jones,26,NOP,SF,4,4,28.3,3.0,6.3,...,1.8,0.3,1.3,2.3,6.8,,herbert jones,joneshe01,75.3,75.3
218,335,Andre Jackson Jr.,23,MIL,SG,9,3,15.7,1.1,2.8,...,0.9,0.3,1.3,2.3,3.0,,andre jackson,jacksan01,73.0,73.0


In [181]:
players_ratings.columns

Index(['Rk', 'name', 'Age', 'Team', 'Pos', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG%',
       '3P', '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%',
       'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'Awards',
       'name_norm', 'player_id', 'rating'],
      dtype='object')

In [191]:
player_metadata = pd.read_csv('data/player_metadata.csv')

In [189]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests
from datetime import datetime, timedelta
from time import sleep

from scrape import get_first_basket, getId


yst = datetime.today() - timedelta(days = 1)

url = f'https://www.basketball-reference.com/boxscores/?month={yst.month}&day={yst.day}&year={yst.year}'
page = requests.get(url)
soup = BeautifulSoup(page.content, 'lxml')

def predict_first_basket(starting_lineups) :
    players = (starting_lineups[0] + starting_lineups[1])
    ratings = pd.read_csv('data/player_metadata.csv')
    ratings = ratings.copy()[ratings['player_id'].isin(players)].sort_values('rating')
    first_basket_pred = ratings['player_id'].values[-1]
    return first_basket_pred

def random_first_basket(starting_lineups) :
    players = (starting_lineups[0] + starting_lineups[1])
    idx = np.random.randint(0, 10)
    return players[idx]



game_ids = [getId(x) for x in soup.find_all('a', href = True) if 'boxscores/pbp' in x['href']]
dfs = []
for gameId in game_ids :
    sleep(10)
    df, starting_lineups = get_first_basket(gameId, starting_lineups = True)
    df['first_basket_rand'] = random_first_basket(starting_lineups)
    df['first_basket_pred'] = predict_first_basket(starting_lineups)
    dfs.append(df[['game_id', 'Home', 'Away', 'first_basket', 'first_basket_tm', 'first_basket_rand', 'first_basket_pred']])

first_basket_df = pd.concat(dfs).set_index('game_id')
first_basket_df['Date'] = yst.date()
first_basket_df['correct_pred'] = (first_basket_df['first_basket'] == first_basket_df['first_basket_pred'])
first_basket_df['correct_rand'] = (first_basket_df['first_basket'] == first_basket_df['first_basket_rand'])
first_basket_df = first_basket_df[['Date', 'Home', 'Away', 'first_basket', 'first_basket_tm', 'first_basket_pred', 'correct_pred', 'first_basket_rand', 'correct_rand']]




acc_pred = first_basket_df['correct_pred'].mean()
acc_rand = first_basket_df['correct_rand'].mean()

print(f'\nAccuracy predicted : {round(100 * acc_pred, 1)}%  [{first_basket_df["correct_pred"].sum()}/{first_basket_df.shape[0]}]\n')
print(f'\nAccuracy random    : {round(100 * acc_rand, 1)}%  [{first_basket_df["correct_rand"].sum()}/{first_basket_df.shape[0]}]\n\n')
print(first_basket_df, '\n\n\n')



Accuracy predicted : 12.5%  [1/8]


Accuracy random    : 12.5%  [1/8]


                    Date Home Away first_basket first_basket_tm  \
game_id                                                           
202411120BOS  2024-11-12  BOS  ATL    johnsja05             ATL   
202411120DET  2024-11-12  DET  MIA    harrito02             DET   
202411120GSW  2024-11-12  GSW  DAL    thompkl01             DAL   
202411120MIL  2024-11-12  MIL  TOR    lopezbr01             MIL   
202411120ORL  2024-11-12  ORL  CHO    wagnefr01             ORL   
202411120PHI  2024-11-12  PHI  NYK    georgpa01             PHI   
202411120POR  2024-11-12  POR  MIN    camarto01             POR   
202411120UTA  2024-11-12  UTA  PHO    bookede01             PHO   

             first_basket_pred  correct_pred first_basket_rand  correct_rand  
game_id                                                                       
202411120BOS         tatumja01         False         johnsja05          True  
202411120DET       

In [200]:
first_basket_df.merge(
    player_metadata[['player_id', 'name']]
    .rename(columns = {'player_id': 'first_basket'})
)

Unnamed: 0,Date,Home,Away,first_basket,first_basket_tm,first_basket_pred,correct_pred,first_basket_rand,correct_rand,name
0,2024-11-12,BOS,ATL,johnsja05,ATL,tatumja01,False,johnsja05,True,Jalen Johnson
1,2024-11-12,DET,MIA,harrito02,DET,adebaba01,False,herroty01,False,Tobias Harris
2,2024-11-12,GSW,DAL,thompkl01,DAL,doncilu01,False,jackstr02,False,Klay Thompson
3,2024-11-12,MIL,TOR,lopezbr01,MIL,antetgi01,False,rolliry01,False,Brook Lopez
4,2024-11-12,ORL,CHO,wagnefr01,ORL,ballla01,False,gibsota01,False,Franz Wagner
5,2024-11-12,PHI,NYK,georgpa01,PHI,brunsja01,False,townska01,False,Paul George
6,2024-11-12,POR,MIN,camarto01,POR,edwaran01,False,simonan01,False,Toumani Camara
7,2024-11-12,UTA,PHO,bookede01,PHO,bookede01,True,sextoco01,False,Devin Booker


In [207]:
player_metadata[['player_id', 'name']].T.to_dict()

{0: {'player_id': 'antetgi01', 'name': 'Giannis Antetokounmpo'},
 1: {'player_id': 'jokicni01', 'name': 'Nikola Jokić'},
 2: {'player_id': 'doncilu01', 'name': 'Luka Dončić'},
 3: {'player_id': 'gilgesh01', 'name': 'Shai Gilgeous-Alexander'},
 4: {'player_id': 'curryst01', 'name': 'Stephen Curry'},
 5: {'player_id': 'tatumja01', 'name': 'Jayson Tatum'},
 6: {'player_id': 'jamesle01', 'name': 'LeBron James'},
 7: {'player_id': 'davisan02', 'name': 'Anthony Davis'},
 8: {'player_id': 'duranke01', 'name': 'Kevin Durant'},
 9: {'player_id': 'edwaran01', 'name': 'Anthony Edwards'},
 10: {'player_id': 'bookede01', 'name': 'Devin Booker'},
 11: {'player_id': 'brunsja01', 'name': 'Jalen Brunson'},
 12: {'player_id': 'brownja02', 'name': 'Jaylen Brown'},
 13: {'player_id': 'irvinky01', 'name': 'Kyrie Irving'},
 14: {'player_id': 'mitchdo01', 'name': 'Donovan Mitchell'},
 15: {'player_id': 'wembavi01', 'name': 'Victor Wembanyama'},
 16: {'player_id': 'halibty01', 'name': 'Tyrese Haliburton'},
 1

In [211]:
playerId_map = dict(zip(player_metadata['player_id'], player_metadata['name']))

In [214]:
first_basket_df['first_basket'] = first_basket_df['first_basket'].map(playerId_map)

In [215]:
first_basket_df

Unnamed: 0_level_0,Date,Home,Away,first_basket,first_basket_tm,first_basket_pred,correct_pred,first_basket_rand,correct_rand
game_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
202411120BOS,2024-11-12,BOS,ATL,Jalen Johnson,ATL,tatumja01,False,johnsja05,True
202411120DET,2024-11-12,DET,MIA,Tobias Harris,DET,adebaba01,False,herroty01,False
202411120GSW,2024-11-12,GSW,DAL,Klay Thompson,DAL,doncilu01,False,jackstr02,False
202411120MIL,2024-11-12,MIL,TOR,Brook Lopez,MIL,antetgi01,False,rolliry01,False
202411120ORL,2024-11-12,ORL,CHO,Franz Wagner,ORL,ballla01,False,gibsota01,False
202411120PHI,2024-11-12,PHI,NYK,Paul George,PHI,brunsja01,False,townska01,False
202411120POR,2024-11-12,POR,MIN,Toumani Camara,POR,edwaran01,False,simonan01,False
202411120UTA,2024-11-12,UTA,PHO,Devin Booker,PHO,bookede01,True,sextoco01,False


In [220]:
from datetime import timezone
first_basket_df['time']  = datetime.now(timezone.utc)

In [1]:
first_basket_df

NameError: name 'first_basket_df' is not defined