In [1]:
# required libraries
from urllib.request import urlopen
from bs4 import BeautifulSoup
import pandas as pd
import time
import random
import pickle

# create a function to scrape team performance for multiple years
def scrape_NBA_team_data(years = [2017, 2018]):
    """Scrape season standings from basketball-reference.com into one CSV file.

    For every entry in ``years`` the standings page for that season is
    downloaded and parsed, a per-season DataFrame is built (with ``Playoffs``
    and ``Losing_season`` indicator columns added), and all seasons are
    combined and written to ``nba_team_data.csv`` in the working directory.

    Args:
        years: iterable of values substituted into the standings URL
            (``NBA_{year}_standings.html``). The default list is only read,
            never mutated.

    Returns:
        None. A summary of the combined frame is printed and the CSV written.

    Raises:
        urllib.error.URLError: if a page cannot be downloaded.
        ValueError: if an expected header ("SRS", "Western Conference", ...)
            is missing from the parsed page.
    """
    base_columns = ["Year", "Team", "W", "L",
                    "W/L%", "GB", "PS/G", "PA/G",
                    "SRS", "Playoffs",
                    "Losing_season"]
    divisions = ["Atlantic Division", "Central Division",
                 "Southeast Division", "Northwest Division",
                 "Pacific Division", "Southwest Division",
                 "Midwest Division"]

    # one DataFrame per season; combined once after the loop
    # (DataFrame.append was removed in pandas 2.0)
    yearly_frames = []

    for year in years:
        url = f"https://www.basketball-reference.com/leagues/NBA_{year}_standings.html"

        # BeautifulSoup reads the whole response while parsing, so the
        # connection can be closed as soon as the soup exists
        with urlopen(url) as html:
            soup = BeautifulSoup(html, features="lxml")

        # text of every <th> found under the first table row
        titles = [th.getText() for th in soup.findAll('tr', limit=2)[0].findAll('th')]

        # column headers run from the second title up to and including "SRS"
        srs_end = titles.index("SRS") + 1
        headers = titles[1:srs_end]

        # everything after the first header set (which is duplicated later)
        titles = titles[srs_end:]

        # row titles (ex: Boston Celtics, Toronto Raptors)
        try:
            row_titles = titles[0:titles.index("Eastern Conference")]
        except ValueError:
            # no "Eastern Conference" marker on this page: keep every title
            row_titles = titles

        # remove the non-teams from this list
        for header in headers:
            row_titles.remove(header)
        row_titles.remove("Western Conference")
        for d in divisions:
            try:
                row_titles.remove(d)
            except ValueError:
                print("no division:", d)

        # grab all data cells from every row except the first
        team_stats = [[td.getText() for td in tr.findAll('td')]
                      for tr in soup.findAll('tr')[1:]]
        # drop rows without any <td> cells
        team_stats = [cells for cells in team_stats if cells]
        # only keep as many rows as there are team names
        team_stats = team_stats[0:len(row_titles)]

        # prefix each stats row with the season and the team name
        for cells, team_name in zip(team_stats, row_titles):
            cells.insert(0, team_name)
            cells.insert(0, year)

        headers = ["Year", "Team"] + headers

        year_standings = pd.DataFrame(team_stats, columns = headers)

        # a "*" in the team name marks a playoff appearance
        year_standings["Playoffs"] = ["Y" if "*" in ele else "N" for ele in year_standings["Team"]]
        # remove * from team names
        year_standings["Team"] = [ele.replace('*', '') for ele in year_standings["Team"]]
        # losing season indicator (win % < .5)
        year_standings["Losing_season"] = ["Y" if float(ele) < .5 else "N" for ele in year_standings["W/L%"]]

        yearly_frames.append(year_standings)

    if yearly_frames:
        final_df = pd.concat(yearly_frames)
        # expected columns first, then any extra column a page introduced
        extra_columns = [c for c in final_df.columns if c not in base_columns]
        final_df = final_df.reindex(columns = base_columns + extra_columns)
    else:
        final_df = pd.DataFrame(columns = base_columns)

    # print a summary of final_df (info() writes to stdout itself)
    final_df.info()
    # export to csv
    final_df.to_csv("nba_team_data.csv", index=False)

# scrape the standings for every season from 2012 through 2020 (inclusive)
scrape_NBA_team_data(years = list(range(2012, 2021)))

to get rosters

start with the year -

https://www.basketball-reference.com/leagues/NBA_YEAR_ratings.html

loop through each team

https://www.basketball-reference.com/teams/THIS_TEAM/2014.html

get players from roster -
note: there is no per-game PER, so let's use each player's PER from the previous season

the Player column of the roster table,
which contains a link to the player's profile
https://www.basketball-reference.com/players/p/pendeje02.html

<a href="/players/p/pendeje02.html">Jeff Ayres</a>





In [2]:
def team_roster_base(years = [2015]):
    """Collect each team's name and team-page link for the given seasons.

    For every entry in ``years`` the league ratings page is downloaded, the
    table with id ``ratings`` is located, and one record per body row is
    collected from the row's first ``<td>`` cell (its text and the ``href`` of
    the link inside it). The result is written to ``year_team_link.csv`` and
    returned.

    Args:
        years: iterable of season-ending years (2015 is the 2014-15 season).
            The default list is only read, never mutated.

    Returns:
        pandas.DataFrame with columns ``Year``, ``Team`` and ``team_dir``.

    Raises:
        urllib.error.URLError: if a page cannot be downloaded.
        AttributeError: if the ratings table, or the cell/link inside a row,
            is missing from the parsed page.
    """
    columns = ["Year", "Team", "team_dir"]
    # rows are gathered in a plain list and turned into a DataFrame once;
    # DataFrame.append was removed in pandas 2.0
    records = []

    for year in years:
        url = f"https://www.basketball-reference.com/leagues/NBA_{year}_ratings.html"
        # the page is fully read while parsing, so the response can be closed here
        with urlopen(url) as html:
            soup = BeautifulSoup(html, features="lxml")
        table = soup.find('table', attrs={'id':'ratings'})

        for team in table.tbody.findAll("tr"):  # team names and links
            first_cell = team.td
            team_name = first_cell.string
            records.append({
                "Year": year,
                # store a plain str so the frame does not keep the parse tree alive
                "Team": None if team_name is None else str(team_name),
                "team_dir": first_cell.a.get('href'),
            })

        # pause between seasons to avoid hammering the site
        time.sleep(5)

    roster_base = pd.DataFrame(records, columns = columns)
    # export to csv
    roster_base.to_csv("year_team_link.csv", index=False)
    return roster_base

def get_rosters(team_roster_base):
    """Scrape the roster of every team/season listed in ``team_roster_base``.

    Each row's ``team_dir`` is appended to the site root and downloaded after
    a random 3-9 second pause. One record is collected per body row of the
    table with id ``roster``, taking the player's name and profile link from
    the row's first ``<td>`` cell. All values are stored as strings.

    Progress is checkpointed to ``df.pickle`` once at least ``save_int``
    players have been collected since the previous checkpoint; the final table
    is written to ``df.pickle`` and ``season_rosters.csv``.

    Args:
        team_roster_base: DataFrame with ``Year``, ``Team`` and ``team_dir``
            columns (the shape produced by ``team_roster_base()``).

    Returns:
        pandas.DataFrame with columns ``Year``, ``Team``, ``team_dir``,
        ``Player`` and ``player_dir``.

    Raises:
        urllib.error.URLError: if a team page cannot be downloaded.
        AttributeError: if the roster table, or the cell/link inside a row,
            is missing from the parsed page.
    """
    save_int = 60
    columns = ["Year", "Team", "team_dir", "Player", "player_dir"]
    base_url = "https://www.basketball-reference.com"

    # records are accumulated in a list and converted to a DataFrame only when
    # needed; concatenating one-row frames in the loop copies the whole table
    # for every player
    records = []
    unsaved = 0

    for _, row in team_roster_base.iterrows():
        roster_url = base_url + str(row['team_dir'])
        # randomised delay between requests
        time.sleep(random.randint(3, 9))
        # the page is fully read while parsing, so the response can be closed here
        with urlopen(roster_url) as html_team:
            soup = BeautifulSoup(html_team, features="lxml")
        roster_table = soup.find('table', attrs={'id':'roster'})

        year = str(row['Year'])
        team = str(row['Team'])
        team_dir = str(row['team_dir'])

        for player in roster_table.tbody.findAll("tr"):
            records.append({"Year": year,
                            "Team": team,
                            "team_dir": team_dir,
                            "Player": str(player.td.string),
                            "player_dir": str(player.td.a.get('href'))})
            unsaved += 1

        # checkpoint after a team once enough new players have accumulated
        if unsaved >= save_int:
            unsaved = 0
            checkpoint = pd.DataFrame(records, columns = columns)
            print(len(checkpoint))
            with open("df.pickle", "wb") as file:
                pickle.dump(checkpoint, file)

    yearly_rosters = pd.DataFrame(records, columns = columns)
    with open("df.pickle", "wb") as file:
        pickle.dump(yearly_rosters, file)
    yearly_rosters.to_csv("season_rosters.csv", index=False)
    return yearly_rosters

# The team-link scrape is left commented out; instead the links are reloaded
# from year_team_link.csv, the file team_roster_base() writes.
# seasons = team_roster_base(years = [2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022])
seasons = pd.read_csv("year_team_link.csv")
# Scrape every roster listed in `seasons` (sleeps 3-9 s before each request).
team_rosters = get_rosters(seasons)

KeyboardInterrupt: 

In [2]:
def get_pers(season_rosters, filename):
    """Scrape per-season player efficiency ratings (PER) for roster players.

    Each distinct ``player_dir`` in ``season_rosters`` is downloaded once
    (after a random 3-9 second pause). One record is collected per body row of
    the table with id ``advanced`` on the player's page. A player whose page
    fails to download or parse is skipped as a whole and the roster row is
    printed so it can be handled by hand.

    Progress is checkpointed to ``{filename}.parquet.gzip`` after every
    ``save_int`` successfully scraped players, and the final table is written
    to the same file.

    Args:
        season_rosters: DataFrame with at least a ``player_dir`` column; the
            first column of each row is printed as a progress marker.
        filename: output path without the ``.parquet.gzip`` suffix.

    Returns:
        pandas.DataFrame with columns ``season``, ``team_id``, ``player_dir``
        and ``per``.
    """
    save_int = 50
    columns = ["season", "team_id", "player_dir", "per"]
    base_url = "https://www.basketball-reference.com"

    records = []            # one dict per player-season, converted to a frame on save
    seen_players = set()    # player_dir values already requested
    unsaved = 0             # players scraped since the last checkpoint

    for _, row in season_rosters.iterrows():
        player_dir = row['player_dir']
        if player_dir in seen_players:
            continue
        # progress marker: first column of the roster row
        print(row.iloc[0])
        seen_players.add(player_dir)
        player_url = base_url + str(player_dir)

        try:
            time.sleep(random.randint(3, 9))
            # the page is fully read while parsing, so the response can be closed here
            with urlopen(player_url) as html_player:
                soup = BeautifulSoup(html_player, features="lxml")
            adv_table = soup.find('table', attrs={'id':'advanced'})

            # collected separately so a failure part-way through a page
            # discards that player entirely instead of leaving partial rows
            player_records = []
            for season_row in adv_table.tbody.findAll("tr"):
                # th is season, td is all other stats
                other_stats = season_row.findAll('td')
                # NOTE(review): cells 1 and 6 are taken as team id and PER;
                # this depends on the site's column order - confirm
                per_value = other_stats[6].string
                player_records.append({
                    "season": str(season_row.th.string),
                    "team_id": str(other_stats[1].string),
                    "player_dir": player_dir,
                    # plain str (or None) rather than a parse-tree string object
                    "per": None if per_value is None else str(per_value),
                })
            records.extend(player_records)
            unsaved += 1
        except Exception:
            # Exception rather than a bare except: KeyboardInterrupt must still
            # be able to stop a long scrape
            print(row)

        if unsaved >= save_int:
            unsaved = 0
            per_table = pd.DataFrame(records, columns = columns)
            print(len(per_table), end = " ")
            per_table.to_parquet(f'{filename}.parquet.gzip',
                                 compression='gzip')

    # https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_parquet.html
    per_table = pd.DataFrame(records, columns = columns)
    per_table.to_parquet(f'{filename}.parquet.gzip',
                         compression='gzip')

    return per_table

In [3]:
# Players on our rosters: reload the roster table from df.pickle (the file
# get_rosters() writes) and keep the first row for each distinct player_dir.
# NOTE: read_pickle executes pickle data; only load files produced locally.
season_rosters = pd.read_pickle("df.pickle", compression='infer')
# pers = get_pers(season_rosters)
season_rosters_players = season_rosters.drop_duplicates(subset = ["player_dir"], keep = "first")
players_all = season_rosters_players["player_dir"]
# number of distinct players (displayed as the cell output)
len(players_all)

1240

In [4]:
# PER rows scraped for every player up to the error, read from
# PER_table.parquet.gzip; season_PER is then reduced in place to one row per
# player_dir, so after this cell it no longer holds the per-season rows.
season_PER = pd.read_parquet('PER_table.parquet.gzip') 
season_PER= season_PER.drop_duplicates(subset = ["player_dir"], keep = "first")
players_have = season_PER["player_dir"]
players_have

0       /players/b/barbole01.html
16      /players/b/barneha02.html
30      /players/b/bogutan01.html
46      /players/c/curryst01.html
61      /players/e/ezelife01.html
                  ...            
8864    /players/b/brookar01.html
8869     /players/l/lamban01.html
8872    /players/m/martike04.html
8878    /players/o/oliveca01.html
8880     /players/t/tateja01.html
Name: player_dir, Length: 1100, dtype: object

In [5]:
# players on the rosters that have no scraped PER rows because of the error
players_needed = set(players_all) - set(players_have)
len(players_needed)

140

In [6]:
# Select the roster rows of the players we do not have PER data for yet.
missing_players = season_rosters_players[season_rosters_players["player_dir"].isin(players_needed)]
# NOTE: this length is computed but not shown; only the last expression of a
# cell is displayed.
len(missing_players)
missing_players

Unnamed: 0,Year,Team,team_dir,Player,player_dir
4012,2021,Houston Rockets,/teams/HOU/2021.html,Brodric Thomas,/players/t/thomabr01.html
4034,2021,Cleveland Cavaliers,/teams/CLE/2021.html,Isaac Okoro,/players/o/okorois01.html
4038,2021,Cleveland Cavaliers,/teams/CLE/2021.html,Lamar Stevens,/players/s/stevela01.html
4042,2021,Cleveland Cavaliers,/teams/CLE/2021.html,Dylan Windler,/players/w/windldy01.html
4044,2021,Orlando Magic,/teams/ORL/2021.html,Cole Anthony,/players/a/anthoco01.html
...,...,...,...,...,...
4783,2022,Portland Trail Blazers,/teams/POR/2022.html,Greg Brown III,/players/b/browngr01.html
4785,2022,Portland Trail Blazers,/teams/POR/2022.html,Jarron Cumberland,/players/c/cumbeja01.html
4796,2022,Portland Trail Blazers,/teams/POR/2022.html,Cameron McGriff,/players/m/mcgrica01.html
4805,2022,Portland Trail Blazers,/teams/POR/2022.html,Trendon Watford,/players/w/watfotr01.html


In [7]:
# Scrape PER history for the players still missing. get_pers() writes its
# checkpoints and final table to missing_players.parquet.gzip and prints the
# first column of each roster row as it goes (plus the full row on a failure).
missing_players_df = get_pers(missing_players, "missing_players")

2021
2021
2021
2021
2021
2021
2021
2021
2021
2021
2021
2021
2021
2022
2022
2022
2022
2022
2022
2022
2022
2022
2022
2022
2022
2022
2022
2022
2022
2022
2022
2022
2022
2022
2022
2022
2022
2022
2022
2022
Year                               2022
Team                    Milwaukee Bucks
team_dir           /teams/MIL/2022.html
Player                     Luca Vildoza
player_dir    /players/v/vildolu01.html
Name: 4285, dtype: object
2022
2022
2022
2022
2022
2022
2022
2022
2022
2022
2022
150 2022
2022
2022
2022
2022
2022
2022
2022
2022
2022
2022
2022
2022
2022
2022
2022
2022
2022
2022
2022
2022
2022
2022
2022
2022
2022
2022
2022
2022
2022
2022
2022
2022
2022
2022
2022
2022
2022
2022
2022
2022
2022
2022
2022
2022
2022
2022
2022
2022
2022
275 2022
2022
2022
2022
2022
2022
2022
2022
2022
2022
2022
2022
2022
2022
2022
2022
2022
2022
2022
2022
2022
2022
2022
2022
2022
2022
2022
2022
2022
2022
2022
2022
2022
2022
2022
2022
2022
2022
2022


In [12]:
# Players recovered by the second scraping attempt: read the saved table and
# keep one row per player_dir.
mp = pd.read_parquet('missing_players.parquet.gzip') 
mp= mp.drop_duplicates(subset = ["player_dir"], keep = "first")
players_mp = mp["player_dir"]
# NOTE: not the last expression of the cell, so nothing is displayed here.
players_mp
# Merge with the players already scraped.
# NOTE: players_have is overwritten with its own extension, so re-running
# this cell appends players_mp again.
players_have = pd.concat([players_have, players_mp], ignore_index = True)

In [13]:
# players that are still without PER rows after the second scrape
players_needed = set(players_all) - set(players_have)
len(players_needed)

1

In [14]:
# Select the roster row(s) of the player(s) we still do not have PER data for.
missing_players = season_rosters_players[season_rosters_players["player_dir"].isin(players_needed)]
# NOTE: this length is computed but not shown; only the last expression of a
# cell is displayed.
len(missing_players)
missing_players

Unnamed: 0,Year,Team,team_dir,Player,player_dir
4285,2022,Milwaukee Bucks,/teams/MIL/2022.html,Luca Vildoza,/players/v/vildolu01.html


In [30]:
# Luca Vildoza is the one player get_pers() could not scrape, so his row is
# entered by hand. The season must be the quoted label "2021-22": unquoted,
# 2021-22 is integer subtraction (1999) and the row would never match the
# season filter below. PER is entered as text to match the scraped rows,
# whose values are taken from the page as strings.
luca_vildoza = pd.DataFrame({"season": ["2021-22"],
                             "team_id": ["MIL"],
                             "player_dir": ["/players/v/vildolu01.html"],
                             "per": ["17.9"]})

# Combine the first scrape, the re-scrape of the missing players and the
# hand-entered row into one table.
season_player_per = pd.concat([pd.read_parquet('PER_table.parquet.gzip'),
                               pd.read_parquet('missing_players.parquet.gzip'),
                               luca_vildoza],
                              ignore_index = True)
# season_player_per.drop_duplicates(subset = ["player_dir"], keep = "first")
season_player_per.to_csv("seasons_per_s.csv", index=False)

# Seasons before 2014-15 still need to be removed; the filter below is only
# displayed, it is not assigned and was not applied to the CSV written above.
season_player_per[season_player_per["season"].isin(["2014-15", "2015-16", "2016-17", "2017-18", "2018-19", "2019-20", "2020-21", "2021-22"])]
#this list is larger because of trades/transfers etc

Unnamed: 0,season,team_id,player_dir,per
13,2014-15,GSW,/players/b/barbole01.html,15.3
14,2015-16,GSW,/players/b/barbole01.html,11.7
15,2016-17,PHO,/players/b/barbole01.html,11.5
18,2014-15,GSW,/players/b/barneha02.html,13.4
19,2015-16,GSW,/players/b/barneha02.html,12.3
...,...,...,...,...
9244,2021-22,POR,/players/b/browngr01.html,11.0
9247,2021-22,POR,/players/c/cumbeja01.html,8.4
9248,2021-22,POR,/players/m/mcgrica01.html,12.5
9249,2021-22,POR,/players/w/watfotr01.html,15.8


In [31]:
# Reload the full roster table from df.pickle (written by get_rosters()) and
# display it. NOTE: read_pickle executes pickle data; only load local files.
season_rosters = pd.read_pickle("df.pickle", compression='infer')
season_rosters

Unnamed: 0,Year,Team,team_dir,Player,player_dir
0,2015,Golden State Warriors,/teams/GSW/2015.html,Leandro Barbosa,/players/b/barbole01.html
1,2015,Golden State Warriors,/teams/GSW/2015.html,Harrison Barnes,/players/b/barneha02.html
2,2015,Golden State Warriors,/teams/GSW/2015.html,Andrew Bogut,/players/b/bogutan01.html
3,2015,Golden State Warriors,/teams/GSW/2015.html,Stephen Curry,/players/c/curryst01.html
4,2015,Golden State Warriors,/teams/GSW/2015.html,Festus Ezeli,/players/e/ezelife01.html
...,...,...,...,...,...
4804,2022,Portland Trail Blazers,/teams/POR/2022.html,Tony Snell,/players/s/snellto01.html
4805,2022,Portland Trail Blazers,/teams/POR/2022.html,Trendon Watford,/players/w/watfotr01.html
4806,2022,Portland Trail Blazers,/teams/POR/2022.html,Brandon Williams,/players/w/willibr03.html
4807,2022,Portland Trail Blazers,/teams/POR/2022.html,Justise Winslow,/players/w/winslju01.html


In [None]:
# next step: get team stats - how are we defining team stats?
# we need team stats for each game played over the regular season by each team.
# should be able to repurpose the team_roster_base() function.
# dataframe would need to contain the team the season/final year of the 