In [1]:
# required libraries
from urllib.request import urlopen
from bs4 import BeautifulSoup
import pandas as pd
import time
import random
import pickle

# create a function to scrape team performance for multiple years
def scrape_NBA_team_data(years = [2017, 2018]):
    """Scrape NBA team standings for the given seasons and save them to CSV.

    For every year in ``years`` the basketball-reference standings page is
    downloaded and parsed into one row per team.  Two derived columns are
    added to each season's table:

    * ``Playoffs``      -- "Y" when the team name contains ``*``, else "N"
      (the ``*`` is then stripped from the name).
    * ``Losing_season`` -- "Y" when ``W/L%`` is below .5, else "N".

    All seasons are combined, a summary is printed with ``DataFrame.info()``
    and the result is written to ``nba_team_data.csv`` in the current
    working directory.

    Parameters
    ----------
    years : list of int
        Seasons to scrape; each value is interpolated into the page URL.
        The default list is only read, never mutated.

    Returns
    -------
    None

    Raises
    ------
    urllib.error.URLError
        If a standings page cannot be downloaded.
    ValueError
        If an expected label (e.g. the "SRS" header or the
        "Western Conference" row title) is missing from the page.
    """
    column_names = ["Year", "Team", "W", "L",
                    "W/L%", "GB", "PS/G", "PA/G",
                    "SRS", "Playoffs",
                    "Losing_season"]
    # row titles that are section labels rather than teams
    divisions = ["Atlantic Division", "Central Division",
                 "Southeast Division", "Northwest Division",
                 "Pacific Division", "Southwest Division",
                 "Midwest Division"]

    # one DataFrame per season; they are combined once after the loop
    # (DataFrame.append was removed in pandas 2.0)
    season_frames = []

    # loop through each year
    for year in years:
        # URL to scrape, notice f string:
        url = f"https://www.basketball-reference.com/leagues/NBA_{year}_standings.html"

        # download and parse inside `with` so the HTTP response is closed
        with urlopen(url) as html:
            soup = BeautifulSoup(html, features="lxml")

        # text of every <th> found under the first <tr>
        titles = [th.getText() for th in soup.findAll('tr', limit=2)[0].findAll('th')]

        # first, find only column headers (everything after the first cell, up to "SRS")
        srs_end = titles.index("SRS") + 1
        headers = titles[1:srs_end]

        # then, exclude first set of column headers (duplicated)
        titles = titles[srs_end:]

        # next, row titles (ex: Boston Celtics, Toronto Raptors)
        try:
            row_titles = titles[0:titles.index("Eastern Conference")]
        except ValueError:
            # no "Eastern Conference" label on this page: keep everything
            row_titles = titles

        # remove the non-teams from this list
        for header in headers:
            row_titles.remove(header)
        row_titles.remove("Western Conference")
        for d in divisions:
            try:
                row_titles.remove(d)
            except ValueError:
                # not every season has every division
                print("no division:", d)

        # next, grab all data from rows (avoid first row)
        rows = soup.findAll('tr')[1:]
        team_stats = [[td.getText() for td in row.findAll('td')] for row in rows]
        # remove empty elements
        team_stats = [cells for cells in team_stats if cells != []]
        # only keep needed rows
        team_stats = team_stats[0:len(row_titles)]

        # prefix every stats row with the season and the team name
        team_stats = [[year, team_name] + cells
                      for team_name, cells in zip(row_titles, team_stats)]

        # create a dataframe with all acquired info
        year_standings = pd.DataFrame(team_stats, columns = ["Year", "Team"] + headers)

        # add a column to dataframe to indicate playoff appearance
        year_standings["Playoffs"] = ["Y" if "*" in ele else "N" for ele in year_standings["Team"]]
        # remove * from team names
        year_standings["Team"] = [ele.replace('*', '') for ele in year_standings["Team"]]
        # add losing season indicator (win % < .5)
        year_standings["Losing_season"] = ["Y" if float(ele) < .5 else "N" for ele in year_standings["W/L%"]]

        season_frames.append(year_standings)

    if season_frames:
        combined = pd.concat(season_frames)
        # keep the documented column order first, then any extra scraped columns
        extra_columns = [c for c in combined.columns if c not in column_names]
        final_df = combined.reindex(columns = column_names + extra_columns)
    else:
        final_df = pd.DataFrame(columns = column_names)

    # print a summary of final_df (info() prints by itself)
    final_df.info()
    # export to csv
    final_df.to_csv("nba_team_data.csv", index=False)

# Scrape the standings for every season from 2012 through 2020 (inclusive).
scrape_NBA_team_data(years = list(range(2012, 2021)))

To get rosters:

Start with the year:

https://www.basketball-reference.com/leagues/NBA_YEAR_ratings.html

loop through each team

https://www.basketball-reference.com/teams/THIS_TEAM/2014.html

Get the players from the roster.
Note: PER is not available per game, so let's use PER from the previous season.

The Player column of the roster
contains a link to each player's profile, e.g.
https://www.basketball-reference.com/players/p/pendeje02.html

<a href="/players/p/pendeje02.html">Jeff Ayres</a>





In [2]:
def team_roster_base(years = [2015]):
    """Collect the name and team-page link of every team for each season.

    For every year the basketball-reference ratings page is downloaded and
    the table with id ``ratings`` is read: for each body row, the text of
    the first ``<td>`` is taken as the team name and the ``href`` of its
    link as the team directory.  The function pauses 5 seconds after each
    season's request.

    The result is written to ``year_team_link.csv`` in the current working
    directory and also returned.

    Parameters
    ----------
    years : list of int
        Seasons to scrape.  A year is the season's end year, so 2015 is the
        2014-15 season.  The default list is only read, never mutated.

    Returns
    -------
    pandas.DataFrame
        Columns ``Year``, ``Team``, ``team_dir``; one row per team and season.

    Raises
    ------
    urllib.error.URLError
        If a ratings page cannot be downloaded.
    AttributeError
        If the page has no ``ratings`` table (``soup.find`` returns None).
    """
    columns = ["Year", "Team", "team_dir"]
    # rows are gathered in a plain list and turned into a DataFrame once;
    # DataFrame.append was removed in pandas 2.0 and was quadratic in a loop
    records = []

    # loop through each year
    for year in years:
        url = f"https://www.basketball-reference.com/leagues/NBA_{year}_ratings.html"
        # parse inside `with` so the HTTP response is closed
        with urlopen(url) as html:
            soup = BeautifulSoup(html, features="lxml")
        table = soup.find('table', attrs={'id':'ratings'})
        # get team names and links - would get stats here as well
        for team in table.tbody.findAll("tr"):
            records.append({"Year": year,
                            "Team": team.td.string,
                            "team_dir": team.td.a.get('href')})
        time.sleep(5)

    roster_base = pd.DataFrame(records, columns = columns)
    # export to csv
    roster_base.to_csv("year_team_link.csv", index=False)
    return roster_base

def get_rosters(team_roster_base):
    """Scrape the roster of every team/season listed in ``team_roster_base``.

    For each input row the team page (``team_dir`` appended to the site's
    base URL) is downloaded after a random 3-9 second pause, and every body
    row of the table with id ``roster`` contributes one output row: the
    text of the row's first ``<td>`` as the player name and the ``href`` of
    its link as the player directory.

    Progress is checkpointed to ``df.pickle`` whenever at least 60 players
    have been added since the last checkpoint (checked once per team), and
    the running row count is printed.  At the end the full table is written
    to ``df.pickle`` and ``season_rosters.csv`` in the current working
    directory.

    Parameters
    ----------
    team_roster_base : pandas.DataFrame
        Must provide the columns ``Year``, ``Team`` and ``team_dir``.

    Returns
    -------
    pandas.DataFrame
        Columns ``Year``, ``Team``, ``team_dir``, ``Player``, ``player_dir``;
        every value is stored as ``str``.

    Raises
    ------
    urllib.error.URLError
        If a team page cannot be downloaded.
    AttributeError
        If a page has no ``roster`` table or a roster row has no link.
    """
    save_int = 60
    columns = ["Year", "Team", "team_dir", "Player", "player_dir"]
    base_url = "https://www.basketball-reference.com"

    # rows are collected in a list and converted to a DataFrame only when
    # needed; concatenating one row at a time copies the whole table per player
    records = []
    unsaved = 0

    for _, row in team_roster_base.iterrows():
        roster_url = base_url + str(row['team_dir'])
        # random pause between requests
        time.sleep(random.randint(3, 9))
        # parse inside `with` so the HTTP response is closed
        with urlopen(roster_url) as html_team:
            soup = BeautifulSoup(html_team, features="lxml")
        roster_table = soup.find('table', attrs={'id':'roster'})

        year = str(row['Year'])
        team = str(row['Team'])
        team_dir = str(row['team_dir'])
        for player in roster_table.tbody.findAll("tr"):
            records.append({"Year": year,
                            "Team": team,
                            "team_dir": team_dir,
                            "Player": str(player.td.string),
                            "player_dir": str(player.td.a.get('href'))})
            unsaved += 1

        # checkpoint so a failed request does not lose everything scraped so far
        if unsaved >= save_int:
            unsaved = 0
            print(len(records))
            with open("df.pickle", "wb") as file:
                pickle.dump(pd.DataFrame(records, columns = columns), file)

    yearly_rosters = pd.DataFrame(records, columns = columns)
    with open("df.pickle", "wb") as file:
        pickle.dump(yearly_rosters, file)
    yearly_rosters.to_csv("season_rosters.csv", index=False)
    return yearly_rosters

# team_roster_base() writes its result to year_team_link.csv, so the scrape
# below is left commented out and the saved CSV is loaded instead.
# seasons = team_roster_base(years = [2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022])
seasons = pd.read_csv("year_team_link.csv")
# Scrape the roster of every team/season row in `seasons`; get_rosters()
# prints the running row count each time it checkpoints.
team_rosters = get_rosters(seasons)

68
145
214
275
356
434
505
575
643
711
781
848
928
999
1072
1138
1203
1276
1346
1414
1491
1568
1628
1689
1750
1820
1896
1974
2037
2105
2168
2231
2294
2357
2428
2509
2583
2645
2732
2801
2875
2950
3027
3087
3159
3242
3308
3370
3445
3505
3567
3635
3708
3786
3849
3910
3972
4043
4121
4185
4257
4333
4399
4466
4536
4605
4688
4763


In [46]:
def get_pers(season_rosters):
    """Scrape the PER of every season for each distinct player in a roster table.

    Rows of ``season_rosters`` are visited in order.  A ``player_dir`` that
    was already seen is skipped; otherwise the row is printed, the player's
    page is downloaded after a random 3-9 second pause, and every body row
    of the table with id ``advanced`` contributes one output row:

    * ``season``     -- text of the row's ``<th>``
    * ``team_id``    -- text of the row's second ``<td>``
    * ``player_dir`` -- the roster row's ``player_dir``
    * ``per``        -- text of the row's seventh ``<td>`` (None if the cell
      has no single string)

    After every 5 fetched players, and once more at the end, the table is
    written to ``PER_table_error.parquet.gzip`` (gzip-compressed parquet) in
    the current working directory.

    Parameters
    ----------
    season_rosters : pandas.DataFrame
        Must provide a ``player_dir`` column.  Other columns are only shown
        in the printed progress output.

    Returns
    -------
    pandas.DataFrame
        Columns ``season``, ``team_id``, ``player_dir``, ``per``.

    Raises
    ------
    urllib.error.URLError
        If a player page cannot be downloaded.
    AttributeError
        If a page has no ``advanced`` table (``soup.find`` returns None).
    IndexError
        If a table row has fewer than seven ``<td>`` cells.
    """
    base_url = "https://www.basketball-reference.com"
    columns = ["season", "team_id", "player_dir", "per"]
    # changing filenames for second run
    out_path = 'PER_table_error.parquet.gzip'
    save_int = 5

    # rows are collected in a list and converted to a DataFrame only when
    # saving; concatenating per player copies the whole table every time
    records = []
    # set membership keeps the duplicate check O(1) per roster row
    seen_players = set()
    unsaved = 0

    for progress, (_, row) in enumerate(season_rosters.iterrows(), start=1):
        player_dir = row['player_dir']
        if player_dir in seen_players:
            continue
        print(row)
        seen_players.add(player_dir)

        # random pause between requests
        time.sleep(random.randint(3, 9))
        # parse inside `with` so the HTTP response is closed
        with urlopen(base_url + str(player_dir)) as html_player:
            soup = BeautifulSoup(html_player, features="lxml")
        adv_table = soup.find('table', attrs={'id':'advanced'})

        for season_row in adv_table.tbody.findAll("tr"):
            # th is season td is all other stats
            other_stats = season_row.findAll('td')
            per_value = other_stats[6].string
            records.append({
                "season": str(season_row.th.string),
                "team_id": str(other_stats[1].string),
                "player_dir": player_dir,
                # plain str instead of a NavigableString, which would keep
                # the whole parsed page alive for as long as it is stored
                "per": None if per_value is None else str(per_value),
            })

        unsaved += 1
        if progress % 480 == 0:
            # fraction of roster rows visited so far
            print(progress/len(season_rosters))
        if unsaved >= save_int:
            # checkpoint so a failed request does not lose everything
            unsaved = 0
            per_table = pd.DataFrame(records, columns = columns)
            print(len(per_table), end = " ")
            per_table.to_parquet(out_path, compression='gzip')

    # https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_parquet.html
    per_table = pd.DataFrame(records, columns = columns)
    per_table.to_parquet(out_path, compression='gzip')

    # per_table.to_csv("PER_table.csv", index=False)
    return per_table

In [49]:
# Load the roster table from df.pickle (the file get_rosters() writes).
# NOTE: only unpickle files you created yourself; loading an untrusted
# pickle can execute arbitrary code.
season_rosters = pd.read_pickle("df.pickle", compression='infer')
# pers = get_pers(season_rosters)
# Display one row per player_dir (keeping each player's last roster row).
# The result is not assigned, so season_rosters itself is left unchanged.
season_rosters.drop_duplicates(subset = ["player_dir"], keep = "last")

Unnamed: 0,Year,Team,team_dir,Player,player_dir
8,2015,Golden State Warriors,/teams/GSW/2015.html,Ognjen Kuzmić,/players/k/kuzmiog01.html
19,2015,Los Angeles Clippers,/teams/LAC/2015.html,Glen Davis,/players/d/davisgl01.html
20,2015,Los Angeles Clippers,/teams/LAC/2015.html,Chris Douglas-Roberts,/players/d/douglch01.html
25,2015,Los Angeles Clippers,/teams/LAC/2015.html,Lester Hudson,/players/h/hudsole01.html
32,2015,Los Angeles Clippers,/teams/LAC/2015.html,Hedo Türkoğlu,/players/t/turkohe01.html
...,...,...,...,...,...
4804,2022,Portland Trail Blazers,/teams/POR/2022.html,Tony Snell,/players/s/snellto01.html
4805,2022,Portland Trail Blazers,/teams/POR/2022.html,Trendon Watford,/players/w/watfotr01.html
4806,2022,Portland Trail Blazers,/teams/POR/2022.html,Brandon Williams,/players/w/willibr03.html
4807,2022,Portland Trail Blazers,/teams/POR/2022.html,Justise Winslow,/players/w/winslju01.html


In [18]:
# Load a previously saved PER table and display it.
# NOTE(review): get_pers() as written saves to 'PER_table_error.parquet.gzip',
# not to this file name -- presumably this file comes from an earlier run; confirm.
season_PER = pd.read_parquet('PER_table.parquet.gzip') 
# season_PER.drop_duplicates()
season_PER

Unnamed: 0,season,team_id,player_dir,per
15,2016-17,PHO,/players/b/barbole01.html,11.5
29,2023-24,SAC,/players/b/barneha02.html,11.8
45,2018-19,GSW,/players/b/bogutan01.html,13.4
60,,GSW,/players/c/curryst01.html,20.9
63,2015-16,GSW,/players/e/ezelife01.html,17.7
...,...,...,...,...
8868,2023-24,BRK,/players/b/brookar01.html,8.1
8871,2022-23,GSW,/players/l/lamban01.html,10.9
8877,2023-24,PHI,/players/m/martike04.html,9.6
8879,2021-22,ATL,/players/o/oliveca01.html,21.4


In [54]:
# One row per player: keep each player_dir's last roster entry.
season_indiv = season_rosters.drop_duplicates(subset = ["player_dir"], keep = "last")
# Index label of the row for player /players/o/oliveca01.html.
# NOTE(review): this looks like the last player present in the saved PER
# table, i.e. the point to resume scraping from -- confirm.
last_index = season_indiv[season_indiv['player_dir'] == "/players/o/oliveca01.html"].index[0]
# Keep only the rows from that label onward (label-based slice, inclusive).
season_indiv = season_indiv.loc[last_index:]
# Spot-check a single row by its index label.
season_indiv.loc[4589]

Year                               2022
Team                 Los Angeles Lakers
team_dir           /teams/LAL/2022.html
Player                  Wayne Ellington
player_dir    /players/e/ellinwa01.html
Name: 4589, dtype: object

In [48]:
# Scrape PER rows for the players left in season_indiv.  The returned
# DataFrame is not assigned to a name; get_pers() also saves its results
# to a parquet file as it goes.
get_pers(season_indiv)

Year                               2022
Team                      Atlanta Hawks
team_dir           /teams/ATL/2022.html
Player                   Cameron Oliver
player_dir    /players/o/oliveca01.html
Name: 4417, dtype: object
Year                               2022
Team                      Atlanta Hawks
team_dir           /teams/ATL/2022.html
Player                     Lou Williams
player_dir    /players/w/willilo02.html
Name: 4420, dtype: object
Year                               2022
Team                      Atlanta Hawks
team_dir           /teams/ATL/2022.html
Player                     Delon Wright
player_dir    /players/w/wrighde01.html
Name: 4421, dtype: object
Year                               2022
Team                      Atlanta Hawks
team_dir           /teams/ATL/2022.html
Player                       Trae Young
player_dir    /players/y/youngtr01.html
Name: 4422, dtype: object
Year                               2022
Team                      Brooklyn Nets
team_dir        

URLError: <urlopen error [Errno 8] nodename nor servname provided, or not known>

In [None]:
#missing 240 people 

In [None]:
# Scratch cell: fetch a single player's page and look up a table on it.
# NOTE(review): `base_url` and `player_dir` are not defined at notebook level
# in the visible cells (they only exist inside the functions above), so they
# must be assigned before this cell can run.
player_url = base_url+player_dir
# Download the page first: BeautifulSoup parses markup, it does not fetch
# URLs, so it must be given the response rather than the URL string.
with urlopen(player_url) as player_html:
    player_soup = BeautifulSoup(player_html, features="lxml")
# Search the soup parsed in this cell.
# NOTE(review): 'roster' is the table id get_rosters() looks up on team
# pages; confirm that it is the intended table for a player page.
roster_table = player_soup.find('table', attrs={'id':'roster'})
INCOMPLETE: it doesn't make sense to get PER every time, so when should we?