In [1]:
import numpy as np
import pandas as pd
import tls_client
from bs4 import BeautifulSoup
from unidecode import unidecode
from betting_functions import get_url_soup
import time

In [2]:
requests = tls_client.Session(
    client_identifier="chrome112",
)

In [3]:
soup = get_url_soup(f'https://www.basketball-reference.com/leagues/NBA_2024_per_game.html')

# Find all <td> elements with the 'data-append-csv' attribute
td_elements = soup.find_all('td', {'data-append-csv': True})

# Create empty lists to store the extracted data
player_codes = []

# Loop through the <td> elements
for td_element in td_elements:
    data_append_csv = td_element['data-append-csv']
    player_name = td_element.find('a').text
    player_codes.append([player_name,data_append_csv])

# Print the extracted data
player_codes[0:5]

[['Precious Achiuwa', 'achiupr01'],
 ['Bam Adebayo', 'adebaba01'],
 ['Ochai Agbaji', 'agbajoc01'],
 ['Nickeil Alexander-Walker', 'alexani01'],
 ['Grayson Allen', 'allengr01']]

In [4]:
rows = soup.find_all('tr', class_='full_table')

# Create a list to store the 'pos' values
positions = []

# Loop through the rows and extract the 'pos' value for each row
for row in rows:
    pos_element = row.find('td', {'data-stat': 'pos'})
    if pos_element:
        position = pos_element.text
        positions.append(position)
code_df = pd.DataFrame(player_codes,columns=['name','code'])
code_df['pos'] = positions
code_df

Unnamed: 0,name,code,pos
0,Precious Achiuwa,achiupr01,C
1,Bam Adebayo,adebaba01,C
2,Ochai Agbaji,agbajoc01,SG
3,Nickeil Alexander-Walker,alexani01,SG
4,Grayson Allen,allengr01,SG
...,...,...,...
408,Delon Wright,wrighde01,PG
409,Trae Young,youngtr01,PG
410,Omer Yurtseven,yurtsom01,C
411,Cody Zeller,zelleco01,C


In [5]:
code = player_codes[0][1]
base = f'https://www.basketball-reference.com/players/c/{code}/gamelog/'
# NBA season we will be analyzing
year = '2024'
url = base+year
print(url)
# this is the HTML from the given URL
response1 = requests.get(url)
soup = BeautifulSoup(response1.content)

https://www.basketball-reference.com/players/c/achiupr01/gamelog/2024


In [6]:
def get_headers(soup):
    i = 0
    while i < 40:
        headers = [th.getText() for th in soup.findAll('tr', limit=40)[i].findAll('th')]
        i = i +1
        if headers:
            idx = i
            i = 41
    return headers[1:]


In [7]:
# avoid the first header row
def get_stats_df(soup,headers,player):

    rows = soup.findAll('tr')
    player_stats = [[td.getText() for td in rows[i].findAll('td')]
                for i in range(len(rows))]
    player_stats = [x for x in player_stats if len(x) > 4]
    stats = pd.DataFrame(player_stats, columns = headers)
    stats.insert(0, 'Player', player)
    stats.index = range(len(stats))
    return stats

soup = get_url_soup(url)
headers = get_headers(soup)
get_stats_df(soup,headers,code_df['code'].iloc[0])

Unnamed: 0,Player,G,Date,Age,Tm,Unnamed: 6,Opp,Unnamed: 8,GS,MP,...,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,GmSc,+/-
0,achiupr01,1.0,2023-10-25,24-036,TOR,,MIN,W (+3),0,24:17,...,5.0,8.0,0.0,0.0,0.0,2.0,1.0,8.0,4.5,-5.0
1,achiupr01,2.0,2023-10-27,24-038,TOR,@,CHI,L (-1),0,22:38,...,7.0,9.0,1.0,0.0,0.0,3.0,6.0,8.0,4.1,-22.0
2,achiupr01,3.0,2023-10-28,24-039,TOR,,PHI,L (-7),0,18:15,...,4.0,7.0,3.0,0.0,1.0,1.0,1.0,6.0,4.9,5.0
3,achiupr01,,2023-10-30,24-041,TOR,,POR,L (-8),Inactive,,...,,,,,,,,,,


In [8]:
def get_stats(num,df):
    for person in range(num):
        errors = []
        try:
            year = '2024'
            player_id = df['code'].iloc[person]
            player_name = df['name'].iloc[person]
            base = f'https://www.basketball-reference.com/players/c/{player_id}/gamelog/'
            url = base+year
            print(url)
            time.sleep(2.95)
            soup = get_url_soup(url)
            
            headers = get_headers(soup)
            stats = get_stats_df(soup,headers,player_name)
            stats['pos'] = df['pos'].iloc[person]
            stats['season'] = year
            if person == 0:
                data_core = stats
            else:
                data_core = pd.concat([data_core,stats])
            print(person,player_codes[person][0])
        except: 
            errors.append(player_id)
    print(errors)
    return data_core


In [9]:
test = get_stats(len(code_df),code_df)

https://www.basketball-reference.com/players/c/achiupr01/gamelog/2024
0 Precious Achiuwa
[]
https://www.basketball-reference.com/players/c/adebaba01/gamelog/2024
1 Bam Adebayo
[]
https://www.basketball-reference.com/players/c/agbajoc01/gamelog/2024
2 Ochai Agbaji
[]
https://www.basketball-reference.com/players/c/alexani01/gamelog/2024
3 Nickeil Alexander-Walker
[]
https://www.basketball-reference.com/players/c/allengr01/gamelog/2024
4 Grayson Allen
[]
https://www.basketball-reference.com/players/c/anderky01/gamelog/2024
5 Kyle Anderson
[]
https://www.basketball-reference.com/players/c/antetgi01/gamelog/2024
6 Giannis Antetokounmpo
[]
https://www.basketball-reference.com/players/c/antetth01/gamelog/2024
7 Thanasis Antetokounmpo
[]
https://www.basketball-reference.com/players/c/anthoco01/gamelog/2024
8 Cole Anthony
[]
https://www.basketball-reference.com/players/c/anunoog01/gamelog/2024
9 OG Anunoby
[]
https://www.basketball-reference.com/players/c/avdijde01/gamelog/2024
10 Deni Avdija
[

In [25]:
with pd.option_context('display.max_rows', None,
                       'display.precision', 3,
                       ):
    display(test.sample(5))

Unnamed: 0,Player,G,Date,Age,Tm,Unnamed: 6,Opp,Unnamed: 8,GS,MP,...,AST,STL,BLK,TOV,PF,PTS,GmSc,+/-,pos,season
0,Kevin Love,1.0,2023-10-25,35-048,MIA,,DET,W (+1),1,27:43,...,2.0,0.0,0.0,1.0,5.0,13.0,10.5,5.0,PF,2024
1,Max Christie,,2023-10-26,20-258,LAL,,PHO,W (+5),Did Not Play,,...,,,,,,,,,SG,2024
0,Mouhamed Gueye,,2023-10-25,20-350,ATL,@,CHO,L (-6),Did Not Play,,...,,,,,,,,,PF,2024
2,Patrick Baldwin Jr.,2.0,2023-10-30,20-346,WAS,,BOS,L (-19),0,5:31,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,SF,2024
1,Devin Booker,,2023-10-26,26-361,PHO,@,LAL,L (-5),Inactive,,...,,,,,,,,,SG,2024


In [26]:
save = test.copy(deep=True)
save.tail(6)

Unnamed: 0,Player,G,Date,Age,Tm,Unnamed: 6,Opp,Unnamed: 8,GS,MP,...,AST,STL,BLK,TOV,PF,PTS,GmSc,+/-,pos,season
0,Cody Zeller,,2023-10-25,31-020,NOP,@,MEM,W (+7),Did Not Play,,...,,,,,,,,,C,2024
1,Cody Zeller,1.0,2023-10-28,31-023,NOP,,NYK,W (+9),0,7:44,...,0.0,0.0,0.0,0.0,0.0,0.0,-0.4,-7.0,C,2024
2,Cody Zeller,2.0,2023-10-30,31-025,NOP,,GSW,L (-28),0,4:39,...,0.0,0.0,0.0,0.0,0.0,0.0,-1.2,0.0,C,2024
0,Ivica Zubac,1.0,2023-10-25,26-221,LAC,,POR,W (+12),1,25:36,...,0.0,0.0,4.0,2.0,3.0,20.0,21.0,25.0,C,2024
1,Ivica Zubac,2.0,2023-10-27,26-223,LAC,@,UTA,L (-2),1,19:26,...,0.0,0.0,2.0,1.0,2.0,6.0,2.4,1.0,C,2024
2,Ivica Zubac,3.0,2023-10-29,26-225,LAC,,SAS,W (+40),1,22:29,...,0.0,0.0,1.0,1.0,4.0,16.0,14.0,23.0,C,2024


In [27]:
test['Date'] = pd.to_datetime(test['Date'])
data = test
for i in range(len(data)):
    data.iloc[i,0] = unidecode(data.iloc[i,0]).replace('_',' ')
    #remove special characters

In [28]:
data.head(5)

Unnamed: 0,Player,G,Date,Age,Tm,Unnamed: 6,Opp,Unnamed: 8,GS,MP,...,AST,STL,BLK,TOV,PF,PTS,GmSc,+/-,pos,season
0,Precious Achiuwa,1.0,2023-10-25,24-036,TOR,,MIN,W (+3),0,24:17,...,0.0,0.0,0.0,2.0,1.0,8.0,4.5,-5.0,C,2024
1,Precious Achiuwa,2.0,2023-10-27,24-038,TOR,@,CHI,L (-1),0,22:38,...,1.0,0.0,0.0,3.0,6.0,8.0,4.1,-22.0,C,2024
2,Precious Achiuwa,3.0,2023-10-28,24-039,TOR,,PHI,L (-7),0,18:15,...,3.0,0.0,1.0,1.0,1.0,6.0,4.9,5.0,C,2024
3,Precious Achiuwa,,2023-10-30,24-041,TOR,,POR,L (-8),Inactive,,...,,,,,,,,,C,2024
0,Bam Adebayo,1.0,2023-10-25,26-099,MIA,,DET,W (+1),1,32:08,...,3.0,0.0,2.0,2.0,3.0,22.0,18.4,-4.0,C,2024


In [29]:
data = data.reset_index(drop=True)

In [34]:
data['pos'].value_counts()

pos
SG    349
SF    303
PF    266
C     244
PG    242
Name: count, dtype: int64

In [35]:
data.dropna(inplace=True)
minutes =  data['MP'].to_list()

for i in range(len(minutes)):
    new = minutes[i].split(':')
    res = float(new[0])+(float(new[1])/60)
    minutes[i] = res
data['MP'] = minutes

In [39]:
data.columns

Index(['Player', 'G', 'Date', 'Age', 'Tm', ' ', 'Opp', ' ', 'GS', 'MP', 'FG',
       'FGA', 'FG%', '3P', '3PA', '3P%', 'FT', 'FTA', 'FT%', 'ORB', 'DRB',
       'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'GmSc', '+/-', 'pos',
       'season'],
      dtype='object')

In [41]:
data.columns = ['player', 'G', 'date', 'age', 'team', 'H/A', 'Opp', 'W/L', 'GS', 'MP', 'FG',
       'FGA', 'FG%', '3P', '3PA', '3P%', 'FT', 'FTA', 'FT%', 'ORB', 'DRB',
       'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'GmSc', '+/-', 'pos','season']
home = data['H/A'].values
print(home[0:4])
for i in range(len(home)):
    if home[i] == '@':
        home[i] = 0
    else:
        home[i] = 1
home[0:10]


['' '@' '' '']


array([1, 0, 1, 1, 0, 0, 1, 1, 0, 0], dtype=object)

In [44]:
data

Unnamed: 0,player,G,date,age,team,H/A,Opp,W/L,GS,MP,...,AST,STL,BLK,TOV,PF,PTS,GmSc,+/-,pos,season
0,Precious Achiuwa,1,2023-10-25,24-036,TOR,1,MIN,W (+3),0,24.283333,...,0,0,0,2,1,8,4.5,-5,C,2024
1,Precious Achiuwa,2,2023-10-27,24-038,TOR,0,CHI,L (-1),0,22.633333,...,1,0,0,3,6,8,4.1,-22,C,2024
2,Precious Achiuwa,3,2023-10-28,24-039,TOR,1,PHI,L (-7),0,18.250000,...,3,0,1,1,1,6,4.9,+5,C,2024
4,Bam Adebayo,1,2023-10-25,26-099,MIA,1,DET,W (+1),1,32.133333,...,3,0,2,2,3,22,18.4,-4,C,2024
5,Bam Adebayo,2,2023-10-27,26-101,MIA,0,BOS,L (-8),1,35.016667,...,2,1,2,3,1,27,17.8,0,C,2024
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1399,Cody Zeller,1,2023-10-28,31-023,NOP,1,NYK,W (+9),0,7.733333,...,0,0,0,0,0,0,-0.4,-7,C,2024
1400,Cody Zeller,2,2023-10-30,31-025,NOP,1,GSW,L (-28),0,4.650000,...,0,0,0,0,0,0,-1.2,0,C,2024
1401,Ivica Zubac,1,2023-10-25,26-221,LAC,1,POR,W (+12),1,25.600000,...,0,0,4,2,3,20,21.0,+25,C,2024
1402,Ivica Zubac,2,2023-10-27,26-223,LAC,0,UTA,L (-2),1,19.433333,...,0,0,2,1,2,6,2.4,+1,C,2024


In [45]:
data['H/A'] = home
data = data.replace('CHO','CHA')
data = data.replace('PHO','PHX')
data= data.replace('BRK','BKN')

data['W/L'] = data['W/L'].str.extract(r"\(([-+]?\d+)\)").astype(int)
data

Unnamed: 0,player,G,date,age,team,H/A,Opp,W/L,GS,MP,...,AST,STL,BLK,TOV,PF,PTS,GmSc,+/-,pos,season
0,Precious Achiuwa,1,2023-10-25,24-036,TOR,1,MIN,3,0,24.283333,...,0,0,0,2,1,8,4.5,-5,C,2024
1,Precious Achiuwa,2,2023-10-27,24-038,TOR,0,CHI,-1,0,22.633333,...,1,0,0,3,6,8,4.1,-22,C,2024
2,Precious Achiuwa,3,2023-10-28,24-039,TOR,1,PHI,-7,0,18.250000,...,3,0,1,1,1,6,4.9,+5,C,2024
4,Bam Adebayo,1,2023-10-25,26-099,MIA,1,DET,1,1,32.133333,...,3,0,2,2,3,22,18.4,-4,C,2024
5,Bam Adebayo,2,2023-10-27,26-101,MIA,0,BOS,-8,1,35.016667,...,2,1,2,3,1,27,17.8,0,C,2024
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1399,Cody Zeller,1,2023-10-28,31-023,NOP,1,NYK,9,0,7.733333,...,0,0,0,0,0,0,-0.4,-7,C,2024
1400,Cody Zeller,2,2023-10-30,31-025,NOP,1,GSW,-28,0,4.650000,...,0,0,0,0,0,0,-1.2,0,C,2024
1401,Ivica Zubac,1,2023-10-25,26-221,LAC,1,POR,12,1,25.600000,...,0,0,4,2,3,20,21.0,+25,C,2024
1402,Ivica Zubac,2,2023-10-27,26-223,LAC,0,UTA,-2,1,19.433333,...,0,0,2,1,2,6,2.4,+1,C,2024


In [47]:
data['date'].max()

Timestamp('2023-10-30 00:00:00')

In [48]:
KM_vals = pd.read_csv('KM_vals.csv',index_col=0)

In [49]:
KM_dict = dict(zip(KM_vals.Player,KM_vals.KM))

In [53]:
temp = []
for p in data.player.values:
    if p[0] in KM_dict:
        temp.append(KM_dict[p[0]])
    else:
        temp.append(15)
data['KM'] = temp

In [54]:
data[['FG%','3P%','FT%']] = data[['FG%','3P%','FT%']].replace(5.00,0)

In [59]:

data.to_csv(f'data_{year}.csv',index=False)