In [82]:
import numpy as np
import pandas as pd
import tls_client
from bs4 import BeautifulSoup
from unidecode import unidecode
from betting_functions import get_url_soup
import time

In [83]:
# HTTP session from tls_client using the "chrome112" client identifier.
# It is bound to the name `requests` because a later cell calls `requests.get(url)`.
requests = tls_client.Session(client_identifier="chrome112")

# Season to scrape; kept as a string because it is spliced into URLs and the output file name.
year = '2018'

In [84]:
# Download the league-wide per-game page for the configured season.
soup = get_url_soup(f'https://www.basketball-reference.com/leagues/NBA_{year}_per_game.html')

# Player cells are the <td> tags that carry a 'data-append-csv' attribute:
# the attribute value is the player code and the nested <a> holds the display name.
td_elements = soup.find_all('td', attrs={'data-append-csv': True})

# One [name, code] pair per matching cell, in page order.
player_codes = [
    [cell.find('a').text, cell['data-append-csv']]
    for cell in td_elements
]

# Sanity check: the first ten pairs and the total number of pairs.
print(player_codes[:10])
print(len(player_codes))


[['Álex Abrines', 'abrinal01'], ['Quincy Acy', 'acyqu01'], ['Steven Adams', 'adamsst01'], ['Bam Adebayo', 'adebaba01'], ['Arron Afflalo', 'afflaar01'], ['Cole Aldrich', 'aldrico01'], ['LaMarcus Aldridge', 'aldrila01'], ['Jarrett Allen', 'allenja01'], ['Kadeem Allen', 'allenka01'], ['Tony Allen', 'allento01']]
664


In [85]:
# Table rows of the per-game page that carry one of the two row classes below.
rows = soup.find_all('tr', class_=['full_table', 'italic_text partial_table'])

# Text of the 'pos' cell of every row that has one, in page order.
positions = [
    pos_cell.text
    for pos_cell in (row.find('td', {'data-stat': 'pos'}) for row in rows)
    if pos_cell is not None
]
print(len(player_codes),len(positions))

# Names/codes and positions come from two separate passes over the page and are
# paired purely by order, so a length mismatch means the pairing cannot be trusted.
if len(player_codes) != len(positions):
    raise ValueError(
        f'scraped {len(player_codes)} player cells but {len(positions)} position cells; '
        'cannot pair them by order'
    )

code_df = pd.DataFrame(player_codes, columns=['name', 'code'])
code_df['pos'] = positions

# Keep exactly one row per player code (the first one on the page).
# De-duplicating on all three columns let rows for the same code that differ only
# in 'pos' survive, so that player's game log would be scraped more than once.
code_df = code_df.drop_duplicates(subset='code', keep='first').reset_index(drop=True)

664 664


In [86]:
# Preview the first ten rows of the player table.
code_df.head(10)

Unnamed: 0,name,code,pos
0,Álex Abrines,abrinal01,SG
1,Quincy Acy,acyqu01,PF
2,Steven Adams,adamsst01,C
3,Bam Adebayo,adebaba01,C
4,Arron Afflalo,afflaar01,SG
5,Cole Aldrich,aldrico01,C
6,LaMarcus Aldridge,aldrila01,C
7,Jarrett Allen,allenja01,C
8,Kadeem Allen,allenka01,PG
9,Tony Allen,allento01,SF


In [87]:
# Exploratory fetch of one game-log page, using the code of the first scraped player.
# NOTE: this rebinds `soup`, replacing the league-table soup that earlier cells read from.
code = player_codes[0][1]
# NOTE(review): the path segment after /players/ is always 'c', whatever the player code is;
# the recorded run shows such URLs being fetched - confirm this is what the site expects.
base = f'https://www.basketball-reference.com/players/c/{code}/gamelog/'
# NBA season we will be analyzing
url = base + year
print(url)
# this is the HTML from the given URL
response1 = requests.get(url)
# Name the parser explicitly: without it BeautifulSoup picks whichever parser happens to be
# installed (and warns), so the parsed tree could differ between environments.
soup = BeautifulSoup(response1.content, 'html.parser')

https://www.basketball-reference.com/players/c/abrinal01/gamelog/2018


In [88]:
def get_headers(soup):
    """Return the column names of the first table row that has header cells.

    Looks at no more than the first 40 ``<tr>`` elements of ``soup`` and uses the
    first one that contains any ``<th>`` cells. The text of those cells is returned
    with the leading cell dropped (presumably because the first column is not part
    of the ``<td>``-based rows this is later paired with - confirm against the page).

    Parameters
    ----------
    soup : object exposing ``findAll`` (e.g. a BeautifulSoup document).

    Returns
    -------
    list of str
        Header texts without the first one, or ``[]`` when none of the scanned
        rows has ``<th>`` cells. Pages with fewer than 40 rows are handled; the
        earlier index-based scan raised IndexError on them.
    """
    # Fetch the candidate rows once instead of re-querying the document per index.
    for row in soup.findAll('tr', limit=40):
        headers = [th.getText() for th in row.findAll('th')]
        if headers:
            return headers[1:]
    return []

In [89]:
# avoid the first header row
def get_stats_df(soup,headers,player):
    """Build a DataFrame from the ``<td>`` cells of every ``<tr>`` in ``soup``.

    Rows with four or fewer ``<td>`` cells are skipped. The remaining rows become
    the frame's records with ``headers`` as column names, and a leading ``player``
    column is added that holds ``player`` on every row.

    Parameters
    ----------
    soup : object exposing ``findAll`` (e.g. a BeautifulSoup document).
    headers : list of column names for the scraped cells.
    player : value written into the ``player`` column.

    Returns
    -------
    pandas.DataFrame indexed 0..n-1.
    """
    table_rows = []
    for tr in soup.findAll('tr'):
        cells = [td.getText() for td in tr.findAll('td')]
        # Only rows with more than four data cells are kept.
        if len(cells) > 4:
            table_rows.append(cells)

    stats = pd.DataFrame(table_rows, columns=headers)
    stats.insert(0, 'player', player)
    stats.index = pd.RangeIndex(len(stats))
    return stats

#soup = get_url_soup(url)
#headers = get_headers(soup)
#get_stats_df(soup,headers,code_df['code'].iloc[0])

In [90]:
def get_stats(num,df):
    """Scrape the game log of the first ``num`` players in ``df`` into one DataFrame.

    For each player the game-log page of the module-level season ``year`` is
    downloaded with ``get_url_soup``, parsed with ``get_headers`` and
    ``get_stats_df``, and tagged with the player's ``pos`` and the season.

    Parameters
    ----------
    num : int
        Number of leading rows of ``df`` to process.
    df : pandas.DataFrame
        Must provide the columns ``code``, ``name`` and ``pos``.

    Returns
    -------
    pandas.DataFrame
        The per-player frames concatenated in order, each keeping its own
        0..n-1 index labels. Empty when no player could be scraped.

    Notes
    -----
    Players whose page fails are skipped; their codes are collected over the
    whole run and printed once at the end. Only ``Exception`` subclasses are
    treated as a failed player, so a keyboard interrupt stops the run instead
    of being counted as a scraping error.
    """
    errors = []  # codes of every player that failed, across the whole run
    frames = []  # one frame per successfully scraped player
    for person in range(num):
        player_id = None  # stays None if even the code lookup fails
        try:
            player_id = df['code'].iloc[person]
            player_name = df['name'].iloc[person]
            # NOTE(review): the path segment after /players/ is always 'c', whatever the
            # player code is; the recorded run shows such URLs being fetched - confirm.
            base = f'https://www.basketball-reference.com/players/c/{player_id}/gamelog/'
            url = base+year
            print(url)
            # Pause before every request (presumably to stay under the site's rate limit).
            time.sleep(2.95)
            soup = get_url_soup(url)

            headers = get_headers(soup)
            stats = get_stats_df(soup,headers,player_name)
            stats['pos'] = df['pos'].iloc[person]
            stats['season'] = year
            # Collect and concatenate once at the end: a failure of the first player
            # no longer leaves the result undefined, and the frame is not re-copied
            # on every iteration.
            frames.append(stats)
            print(person,player_name)
        except Exception:
            errors.append(player_id)
    print(errors)
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames)


In [91]:
# Scrape the game log of every player in code_df; the player count is printed first.
print(len(code_df))
test = get_stats(num=len(code_df), df=code_df)

546
https://www.basketball-reference.com/players/c/abrinal01/gamelog/2018
0 Álex Abrines
https://www.basketball-reference.com/players/c/acyqu01/gamelog/2018
1 Quincy Acy
https://www.basketball-reference.com/players/c/adamsst01/gamelog/2018
2 Steven Adams
https://www.basketball-reference.com/players/c/adebaba01/gamelog/2018
3 Bam Adebayo
https://www.basketball-reference.com/players/c/afflaar01/gamelog/2018
4 Arron Afflalo
https://www.basketball-reference.com/players/c/aldrico01/gamelog/2018
5 Cole Aldrich
https://www.basketball-reference.com/players/c/aldrila01/gamelog/2018
6 LaMarcus Aldridge
https://www.basketball-reference.com/players/c/allenja01/gamelog/2018
7 Jarrett Allen
https://www.basketball-reference.com/players/c/allenka01/gamelog/2018
8 Kadeem Allen
https://www.basketball-reference.com/players/c/allento01/gamelog/2018
9 Tony Allen
https://www.basketball-reference.com/players/c/aminual01/gamelog/2018
10 Al-Farouq Aminu
https://www.basketball-reference.com/players/c/anderju01/

In [92]:
# Show five randomly chosen rows, with no row limit and 3-digit precision for this display only.
with pd.option_context('display.max_rows', None, 'display.precision', 3):
    display(test.sample(n=5))

Unnamed: 0,player,G,Date,Age,Tm,Unnamed: 6,Opp,Unnamed: 8,GS,MP,...,AST,STL,BLK,TOV,PF,PTS,GmSc,+/-,pos,season
63,Jerryd Bayless,,2018-03-08,29-200,PHI,@,MIA,L (-9),Did Not Play,,...,,,,,,,,,SG,2018
65,Paul Millsap,22.0,2018-03-09,33-027,DEN,,LAL,W (+9),1,30:19,...,1.0,0.0,3.0,2.0,3.0,21.0,17.2,12.0,PF,2018
53,Nikola Vučević,,2018-02-08,27-107,ORL,,ATL,W (+2),Inactive,,...,,,,,,,,,C,2018
2,Nerlens Noel,3.0,2017-10-21,23-194,DAL,@,HOU,L (-16),0,15:54,...,0.0,0.0,0.0,2.0,3.0,4.0,0.5,-17.0,C,2018
13,Raymond Felton,14.0,2017-11-15,33-142,OKC,,CHI,W (+13),0,19:27,...,3.0,0.0,0.0,1.0,1.0,5.0,3.6,-5.0,PG,2018


In [93]:
# Keep an independent (deep) copy of the scraped frame before the cleaning cells modify it.
save = test.copy()
# Last six rows of the copy.
save.tail(n=6)

Unnamed: 0,player,G,Date,Age,Tm,Unnamed: 6,Opp,Unnamed: 8,GS,MP,...,AST,STL,BLK,TOV,PF,PTS,GmSc,+/-,pos,season
76,Ivica Zubac,38,2018-04-03,21-016,LAL,@,UTA,L (-7),0,10:57,...,0,0,1,0,1,0,-0.1,12,C,2018
77,Ivica Zubac,39,2018-04-04,21-017,LAL,,SAS,W (+10),0,28:30,...,6,0,1,1,4,8,9.9,13,C,2018
78,Ivica Zubac,40,2018-04-06,21-019,LAL,,MIN,L (-17),0,21:52,...,0,0,0,0,3,14,12.9,0,C,2018
79,Ivica Zubac,41,2018-04-08,21-021,LAL,,UTA,L (-15),0,13:54,...,1,0,1,2,0,2,1.6,-8,C,2018
80,Ivica Zubac,42,2018-04-10,21-023,LAL,,HOU,L (-6),0,20:06,...,1,1,0,2,3,2,0.7,-3,C,2018
81,Ivica Zubac,43,2018-04-11,21-024,LAL,@,LAC,W (+15),0,18:15,...,2,1,3,1,2,6,8.3,2,C,2018


In [94]:
# Parse the scraped date strings into datetimes.
test['Date'] = pd.to_datetime(test['Date'])
# `data` is a second name for the same frame as `test`, not a copy.
data = test
# Remove special characters: transliterate each player name to ASCII and turn
# underscores into spaces, rewriting the first column ('player') in one assignment.
data['player'] = [unidecode(name).replace('_',' ') for name in data['player']]

In [95]:
# First five rows after the name clean-up.
data.head()

Unnamed: 0,player,G,Date,Age,Tm,Unnamed: 6,Opp,Unnamed: 8,GS,MP,...,AST,STL,BLK,TOV,PF,PTS,GmSc,+/-,pos,season
0,Alex Abrines,1,2017-10-19,24-079,OKC,,NYK,W (+21),0,24:15,...,0,1,0,2,3,3,-1.4,23,SG,2018
1,Alex Abrines,2,2017-10-21,24-081,OKC,@,UTA,L (-9),0,29:04,...,1,1,0,0,1,7,6.9,6,SG,2018
2,Alex Abrines,3,2017-10-22,24-082,OKC,,MIN,L (-2),0,14:20,...,0,1,0,0,4,4,2.8,13,SG,2018
3,Alex Abrines,4,2017-10-25,24-085,OKC,,IND,W (+18),0,13:26,...,1,1,0,0,3,5,3.8,5,SG,2018
4,Alex Abrines,5,2017-10-27,24-087,OKC,@,MIN,L (-3),0,8:27,...,0,0,0,0,3,0,-1.9,9,SG,2018


In [96]:
# Replace the index with a fresh 0..n-1 range and discard the old labels (drop=True).
# The old labels restart for every player because the per-player frames were
# concatenated without re-indexing.
data = data.reset_index(drop=True)

In [97]:
# Number of game rows per value of the 'pos' column.
data['pos'].value_counts()

pos
SG       9173
C        8378
PG       8133
PF       7815
SF       6613
SF-SG     162
PG-SG      29
Name: count, dtype: int64

In [98]:
# Discard, in place, every row that has at least one missing value.
data.dropna(inplace=True)

# Convert 'MP' from 'minutes:seconds' text to minutes as a float
# (first part plus second part divided by 60).
minutes = [
    float(parts[0]) + (float(parts[1]) / 60)
    for parts in (clock.split(':') for clock in data['MP'].to_list())
]
data['MP'] = minutes

In [99]:
# Inspect the current column labels before they are renamed.
data.columns

Index(['player', 'G', 'Date', 'Age', 'Tm', ' ', 'Opp', ' ', 'GS', 'MP', 'FG',
       'FGA', 'FG%', '3P', '3PA', '3P%', 'FT', 'FTA', 'FT%', 'ORB', 'DRB',
       'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'GmSc', '+/-', 'pos',
       'season'],
      dtype='object')

In [100]:
# Give every column an explicit name; the two blank scraped headers become 'H/A' and 'W/L'.
data.columns = ['player', 'G', 'date', 'age', 'team', 'H/A', 'Opp', 'W/L', 'GS', 'MP', 'FG',
       'FGA', 'FG%', '3P', '3PA', '3P%', 'FT', 'FTA', 'FT%', 'ORB', 'DRB',
       'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'GmSc', '+/-', 'pos','season']

# Venue markers as currently stored ('@' marks the rows encoded as 0 below).
print(data['H/A'].to_numpy()[0:4])

# Encode the venue as 0 for '@' rows and 1 for everything else.
# The flag is built as a new array instead of writing into `data['H/A'].values`:
# that relied on the array being a writable view of the column, which fails when
# pandas hands out a read-only array. Treating an existing 0 as "away" as well keeps
# the cell idempotent - re-running it no longer turns every game into a home game.
home = np.where(data['H/A'].isin(['@', 0]), 0, 1)
# Write the flag back so the column shows 0/1 from here on.
data['H/A'] = home
home[0:10]


['' '@' '' '']


array([1, 0, 1, 1, 0, 0, 0, 1, 0, 0], dtype=object)

In [101]:
# Display the frame (pandas truncates the middle rows) to check the renamed columns and 'H/A'.
data

Unnamed: 0,player,G,date,age,team,H/A,Opp,W/L,GS,MP,...,AST,STL,BLK,TOV,PF,PTS,GmSc,+/-,pos,season
0,Alex Abrines,1,2017-10-19,24-079,OKC,1,NYK,W (+21),0,24.250000,...,0,1,0,2,3,3,-1.4,+23,SG,2018
1,Alex Abrines,2,2017-10-21,24-081,OKC,0,UTA,L (-9),0,29.066667,...,1,1,0,0,1,7,6.9,+6,SG,2018
2,Alex Abrines,3,2017-10-22,24-082,OKC,1,MIN,L (-2),0,14.333333,...,0,1,0,0,4,4,2.8,+13,SG,2018
3,Alex Abrines,4,2017-10-25,24-085,OKC,1,IND,W (+18),0,13.433333,...,1,1,0,0,3,5,3.8,+5,SG,2018
4,Alex Abrines,5,2017-10-27,24-087,OKC,0,MIN,L (-3),0,8.450000,...,0,0,0,0,3,0,-1.9,+9,SG,2018
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40298,Ivica Zubac,39,2018-04-04,21-017,LAL,1,SAS,W (+10),0,28.500000,...,6,0,1,1,4,8,9.9,+13,C,2018
40299,Ivica Zubac,40,2018-04-06,21-019,LAL,1,MIN,L (-17),0,21.866667,...,0,0,0,0,3,14,12.9,0,C,2018
40300,Ivica Zubac,41,2018-04-08,21-021,LAL,1,UTA,L (-15),0,13.900000,...,1,0,1,2,0,2,1.6,-8,C,2018
40301,Ivica Zubac,42,2018-04-10,21-023,LAL,1,HOU,L (-6),0,20.100000,...,1,1,0,2,3,2,0.7,-3,C,2018


In [102]:
# Store the 0/1 venue flag computed in the previous step.
data['H/A'] = home

# Rename three team abbreviations wherever one of them is the whole value of a cell.
data = data.replace({'CHO': 'CHA', 'PHO': 'PHX', 'BRK': 'BKN'})

# Keep only the signed number inside the parentheses of 'W/L' (e.g. 'W (+21)' -> 21).
data['W/L'] = data['W/L'].str.extract(r"\(([-+]?\d+)\)", expand=False).astype(int)
data

Unnamed: 0,player,G,date,age,team,H/A,Opp,W/L,GS,MP,...,AST,STL,BLK,TOV,PF,PTS,GmSc,+/-,pos,season
0,Alex Abrines,1,2017-10-19,24-079,OKC,1,NYK,21,0,24.250000,...,0,1,0,2,3,3,-1.4,+23,SG,2018
1,Alex Abrines,2,2017-10-21,24-081,OKC,0,UTA,-9,0,29.066667,...,1,1,0,0,1,7,6.9,+6,SG,2018
2,Alex Abrines,3,2017-10-22,24-082,OKC,1,MIN,-2,0,14.333333,...,0,1,0,0,4,4,2.8,+13,SG,2018
3,Alex Abrines,4,2017-10-25,24-085,OKC,1,IND,18,0,13.433333,...,1,1,0,0,3,5,3.8,+5,SG,2018
4,Alex Abrines,5,2017-10-27,24-087,OKC,0,MIN,-3,0,8.450000,...,0,0,0,0,3,0,-1.9,+9,SG,2018
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40298,Ivica Zubac,39,2018-04-04,21-017,LAL,1,SAS,10,0,28.500000,...,6,0,1,1,4,8,9.9,+13,C,2018
40299,Ivica Zubac,40,2018-04-06,21-019,LAL,1,MIN,-17,0,21.866667,...,0,0,0,0,3,14,12.9,0,C,2018
40300,Ivica Zubac,41,2018-04-08,21-021,LAL,1,UTA,-15,0,13.900000,...,1,0,1,2,0,2,1.6,-8,C,2018
40301,Ivica Zubac,42,2018-04-10,21-023,LAL,1,HOU,-6,0,20.100000,...,1,1,0,2,3,2,0.7,-3,C,2018


In [103]:
# Latest game date present in the data.
data['date'].max()

Timestamp('2018-04-11 00:00:00')

In [104]:
# Load KM_vals.csv from the working directory, using its first column as the index.
# The next cells read its 'Player' and 'KM' columns.
KM_vals = pd.read_csv('KM_vals.csv',index_col=0)

In [105]:
# Map each 'Player' value to its 'KM' value; a later duplicate player overrides an earlier one.
KM_dict = {player: km for player, km in zip(KM_vals.Player, KM_vals.KM)}

In [106]:
# Look up the KM value of every row by player name; players missing from KM_dict get 15.
# The key is the whole name `p`. Indexing it as `p[0]` used only the first character
# of the name, so unless KM_dict has one-letter keys every row received the fallback.
# Assumes KM_dict is keyed by player name as it appears in `data` - confirm.
temp = [KM_dict.get(p, 15) for p in data.player.values]
data['KM'] = temp

In [107]:
# Replace cells equal to the number 5.00 with 0 in the three percentage columns.
# NOTE(review): no earlier cell converts these columns from the scraped text to numbers,
# so a numeric 5.00 may never match and this could be a no-op - confirm the intent.
data[['FG%','3P%','FT%']] = data[['FG%','3P%','FT%']].replace(5.00,0)

In [108]:

# Write the cleaned frame to data_<year>.csv in the working directory, without the index column.
data.to_csv(f'data_{year}.csv',index=False)