In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

# Scraping standings

In [122]:
# This function scrapes the division standings and outputs a dataframe of the standings for an input year
def scrape_standings(year):


    url = f'https://www.basketball-reference.com/leagues/NBA_{year}_standings.html'
    page = requests.get(url)
    soup = BeautifulSoup(page.content, 'html')
    while soup.find(class_ = 'thead') is not None:
        soup.find(class_ = 'thead').decompose()
    east_table, west_table = [t for t in soup.find_all('table') if 'divs_standings' in t.get('id')]

    east = pd.read_html(str(east_table))[0].rename(columns = {'Eastern Conference': 'Team'})
    east.insert(1, 'Tm', [str(x).split('/')[2] for x in east_table.find_all('a', href = True)])
    west = pd.read_html(str(west_table))[0].rename(columns = {'Western Conference': 'Team'})
    west.insert(1, 'Tm', [str(x).split('/')[2] for x in west_table.find_all('a', href = True)])

    standings = pd.concat([east, west])
    standings['Team'] = standings['Team'].str.replace('*', '', regex = False)
    standings.insert(2, 'Year', len(standings) * [year])

    return standings.sort_values('W/L%', ascending = False).drop(columns = ['GB']).reset_index(drop = True)

In [121]:
scrape_standings(1980)

Unnamed: 0,Team,Tm,Year,W,L,W/L%,PS/G,PA/G,SRS
0,Boston Celtics,BOS,1980,61,21,0.744,113.5,105.7,7.37
1,Los Angeles Lakers,LAL,1980,60,22,0.732,115.1,109.2,5.4
2,Philadelphia 76ers,PHI,1980,59,23,0.72,109.1,104.9,4.04
3,Seattle SuperSonics,SEA,1980,56,26,0.683,108.5,103.8,4.24
4,Phoenix Suns,PHO,1980,55,27,0.671,111.1,107.5,3.25
5,Atlanta Hawks,ATL,1980,50,32,0.61,104.5,101.6,2.83
6,Milwaukee Bucks,MIL,1980,49,33,0.598,110.1,106.1,3.57
7,Kansas City Kings,KCK,1980,47,35,0.573,108.0,104.9,2.82
8,San Antonio Spurs,SAS,1980,41,41,0.5,119.4,119.7,-0.24
9,Houston Rockets,HOU,1980,41,41,0.5,110.8,110.6,0.27


In [123]:
scrape_standings(2023)

Unnamed: 0,Team,Tm,Year,W,L,W/L%,PS/G,PA/G,SRS
0,Milwaukee Bucks,MIL,2023,58,24,0.707,116.9,113.3,3.61
1,Boston Celtics,BOS,2023,57,25,0.695,117.9,111.4,6.38
2,Philadelphia 76ers,PHI,2023,54,28,0.659,115.2,110.9,4.37
3,Denver Nuggets,DEN,2023,53,29,0.646,115.8,112.5,3.04
4,Cleveland Cavaliers,CLE,2023,51,31,0.622,112.3,106.9,5.23
5,Memphis Grizzlies,MEM,2023,51,31,0.622,116.9,113.0,3.6
6,Sacramento Kings,SAC,2023,48,34,0.585,120.7,118.1,2.3
7,New York Knicks,NYK,2023,47,35,0.573,116.0,113.1,2.99
8,Phoenix Suns,PHO,2023,45,37,0.549,113.6,111.6,2.08
9,Brooklyn Nets,BRK,2023,45,37,0.549,113.4,112.5,1.03


# Scraping player stats (basic & advanced)

In [103]:
def single_team(df) :
    # For an input player, this function returns only a row with total stats and the latest team of the player
    if len(df) > 1:
        row = df[df['Tm'] == 'TOT']
        row['Tm'] = df['Tm'].values[-1]
        return row
    else :
        return df

def scrape_player_stats(year) :

    # Scrape per game stats -- PTS, TRB, AST, ...
    url = f'https://www.basketball-reference.com/leagues/NBA_{year}_per_game.html'
    page = requests.get(url)
    soup = BeautifulSoup(page.content, 'html')
    table = soup.find('table')
    while table.find(class_ = 'thead') is not None :
        table.find(class_ = 'thead').decompose()
    data_pg = pd.read_html(str(table))[0].drop(columns = ['Rk'])
    data_pg.insert(1, 'href', [str(x).split('.html')[0].split('/')[-1] for x in table.find_all('a', href = True) if 'players' in str(x)])
    data_pg = data_pg.groupby('href').apply(single_team).reset_index(drop = True)
    data_pg['Player'] = data_pg['Player'].str.replace('*', '', regex = False)

    # Scrape advanced stats -- PER, BPM, WS, VORP, ...
    url = f'https://www.basketball-reference.com/leagues/NBA_{year}_advanced.html'
    page = requests.get(url)
    soup = BeautifulSoup(page.content, 'html')
    table = soup.find('table')
    while table.find(class_ = 'thead') is not None :
        table.find(class_ = 'thead').decompose()
    data_adv = pd.read_html(str(table))[0]
    data_adv = data_adv.drop(columns = ['Rk'] + [x for x in data_adv.columns if 'Unnamed' in x])
    data_adv.insert(1, 'href', [str(x).split('.html')[0].split('/')[-1] for x in table.find_all('a', href = True) if 'players' in str(x)])
    data_adv = data_adv.groupby('href').apply(single_team).reset_index(drop = True)
    data_adv['Player'] = data_adv['Player'].str.replace('*', '', regex = False)

    # Merge per game and advanced stats together
    data = data_pg.merge(data_adv, on = ['Player', 'href'], suffixes = ('', '_y'))
    data = data.drop(columns = [col for col in data.columns if '_y' in col]).reset_index(drop = True) # Delete duplicated columns
    data.insert(2, 'Year', len(data) * [year])

    if len(data) != len(data_pg) or len(data) != len(data_adv):
        Warning('Merge between per game and advanced stats is not 1:1 !!')

    return data

In [104]:
scrape_player_stats(2023)

Unnamed: 0,Player,href,Year,Pos,Age,Tm,G,GS,MP,FG,...,TOV%,USG%,OWS,DWS,WS,WS/48,OBPM,DBPM,BPM,VORP
0,Precious Achiuwa,achiupr01,2023,C,23,TOR,55,12,20.7,3.6,...,11.4,19.4,0.8,1.4,2.2,0.093,-1.4,-0.8,-2.3,-0.1
1,Steven Adams,adamsst01,2023,C,29,MEM,42,42,27.0,3.7,...,19.8,14.6,1.3,2.1,3.4,0.144,-0.3,0.9,0.6,0.7
2,Bam Adebayo,adebaba01,2023,C,25,MIA,75,75,34.6,8.0,...,12.7,25.2,3.6,3.8,7.4,0.137,0.8,0.8,1.5,2.3
3,Ochai Agbaji,agbajoc01,2023,SG,22,UTA,59,22,20.5,2.8,...,9.0,15.8,0.9,0.4,1.3,0.053,-1.7,-1.4,-3.0,-0.3
4,Santi Aldama,aldamsa01,2023,PF,22,MEM,77,20,21.8,3.2,...,9.3,16.0,2.1,2.4,4.6,0.130,-0.3,0.8,0.5,1.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
534,Thaddeus Young,youngth01,2023,PF,34,TOR,54,9,14.7,2.0,...,16.7,13.5,0.7,1.1,1.8,0.109,-1.8,1.9,0.1,0.4
535,Trae Young,youngtr01,2023,PG,24,ATL,73,73,34.8,8.2,...,15.2,32.6,5.3,1.4,6.7,0.126,5.3,-2.0,3.3,3.4
536,Omer Yurtseven,yurtsom01,2023,C,24,MIA,9,0,9.2,1.8,...,11.9,18.0,0.2,0.1,0.3,0.159,-2.5,-1.5,-3.9,0.0
537,Cody Zeller,zelleco01,2023,C,30,MIA,15,2,14.5,2.5,...,15.8,18.1,0.4,0.3,0.7,0.147,-2.0,-0.7,-2.8,0.0


In [105]:
scrape_player_stats(1980)

Unnamed: 0,Player,href,Year,Pos,Age,Tm,G,GS,MP,FG,...,TOV%,USG%,OWS,DWS,WS,WS/48,OBPM,DBPM,BPM,VORP
0,Kareem Abdul-Jabbar,abdulka01,1980,C,32,LAL,82,,38.3,10.2,...,15.7,24.1,9.5,5.3,14.8,0.227,4.8,2.4,7.2,7.3
1,Tom Abernethy,abernto01,1980,PF,25,GSW,67,,18.2,2.3,...,9.9,13.3,1.2,0.8,2.0,0.080,-1.0,-0.2,-1.2,0.2
2,Alvan Adams,adamsal01,1980,C,25,PHO,75,,28.9,6.2,...,18.2,21.9,3.1,3.9,7.0,0.155,1.7,1.9,3.6,3.1
3,Tiny Archibald,architi01,1980,PG,31,BOS,80,80.0,35.8,4.8,...,19.7,17.0,5.9,2.9,8.9,0.148,1.4,-0.3,1.1,2.3
4,Dennis Awtrey,awtrede01,1980,C,31,CHI,26,,21.5,1.0,...,24.8,7.9,0.1,0.5,0.6,0.053,-2.3,0.9,-1.4,0.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
282,Bubba Wilson,wilsobu01,1980,SG,24,GSW,16,,8.9,0.4,...,22.4,10.3,-0.2,0.0,-0.2,-0.064,-4.4,-0.9,-5.3,-0.2
283,Rick Wilson,wilsori01,1980,SG,23,ATL,5,,11.8,0.4,...,32.5,17.1,-0.3,0.1,-0.2,-0.146,-4.1,0.0,-4.1,0.0
284,Brian Winters,wintebr01,1980,SG,27,MIL,80,,32.8,6.7,...,13.3,21.1,3.6,2.6,6.2,0.113,0.3,-0.2,0.2,1.4
285,Larry Wright,wrighla01,1980,PG,25,WSB,76,,16.9,3.0,...,16.5,20.1,0.8,1.0,1.8,0.069,-1.1,0.1,-1.0,0.3


# Opening day roster

In [90]:
# This function creates a dictionary for any input year, where the key is the name of the team and the item is a list of players on the opening day roster
def OpeningDayRoster(year) :

    url = f'https://basketball.realgm.com/nba/teams/{year}'
    page = requests.get(url)
    soup = BeautifulSoup(page.content, 'html')
    teams = sorted([x['href'].split('/')[3] for x in soup.find_all('a', href = True) if '/teams/' in str(x) and 'Rosters' in str(x) and str(2022) in str(x)])

    dict = {}
    for i, tm in enumerate(teams[:2]) :

        url = f'https://basketball.realgm.com/nba/teams/{tm}/{i+1}/Rosters/Opening_Day/{year}'
        page = requests.get(url)
        soup = BeautifulSoup(page.content, 'html')
        table = soup.find_all('table')[4]
        dict[' '.join(tm.split('-'))] = pd.read_html(str(table))[0]['Player'].to_list()

    return dict

In [91]:
OpeningDayRoster(2022)

{'Atlanta Hawks': ['Bogdan Bogdanovic',
  'John Collins',
  'Sharife Cooper',
  'Gorgui Dieng',
  'Danilo Gallinari',
  'Solomon Hill',
  'Kevin Huerter',
  "De'Andre Hunter",
  'Jalen Johnson',
  'Timothe Luwawu-Cabarrot',
  'Skylar Mays',
  'Clint Capela',
  'Onyeka Okongwu',
  'Cam Reddish',
  'Lou Williams',
  'Delon Wright',
  'Trae Young'],
 'Boston Celtics': ['Jaylen Brown',
  'Bruno Fernando',
  'Sam Hauser',
  'Juancho Hernangomez',
  'Al Horford',
  'Enes Freedom',
  'Romeo Langford',
  'Aaron Nesmith',
  'Jabari Parker',
  'Payton Pritchard',
  'Josh Richardson',
  'Dennis Schroder',
  'Marcus Smart',
  'Jayson Tatum',
  'Brodric Thomas',
  'Grant Williams',
  'Robert Williams']}