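This notebook scrapes NBA 2022-2023 season data from basketball-reference.com and is meant to be run once a week. Each run writes the scraped data to a CSV file, plus a TXT file recording the scrape date, the scrape time and the number of games played so far.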

In [1]:
# Import packages
import os
import warnings
from datetime import datetime
from io import StringIO
from time import sleep

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from IPython.display import clear_output

warnings.filterwarnings('ignore')

# Set directory path
# The default is the original author's local folder; set the NBA_AI_AWARDS_PATH
# environment variable to run the notebook on another machine without editing this cell.
path = os.environ.get('NBA_AI_AWARDS_PATH') or '/Users/martinbogaert/Desktop/NBA Data Analysis/nba_ai_awards/'
if not path.endswith(('/', os.sep)):
    path += '/' # Later cells build file names with path + '...', so a trailing separator is required

print('MAKE SURE THAT YOU CHANGE THE WEEK VARIABLE BELOW')

MAKE SURE THAT YOU CHANGE THE WEEK VARIABLE BELOW


In [2]:
scrape_year = 2023 # Year used in the scraped URLs (2023 for the 2022-2023 season)
# Date and time of scraping
week = 17 # CHANGE THIS
now = datetime.now() # Take one timestamp so the date and time strings always agree
date1 = now.strftime('%d/%m')
date2 = now.strftime('%d %B %Y')
time = now.strftime('%H:%M:%S')
print(date2 + ', ' + time)

13 February 2023, 18:04:44


In [3]:
# Define function which deals with players having played on multiple teams
def single_player(df):
    """Reduce the rows of one player name to a single season row where possible.

    Parameters
    ----------
    df : pandas.DataFrame
        All rows sharing one player name. Must contain a 'Tm' column.

    Returns
    -------
    pandas.DataFrame
        - the input unchanged if it has a single row;
        - otherwise a copy of the 'TOT' (season total) row whose 'Tm' is
          replaced by the 'Tm' of the last row of the input;
        - the input unchanged if it has several rows but no 'TOT' row, so
          that no data is silently dropped.
    """
    if len(df) == 1: # if only one team, keep it
        return df

    # multiple teams: keep total (TOT) stats and replace team with last played for
    total = df[df['Tm'] == 'TOT'].copy()
    if total.empty:
        # Several rows without a combined row (possibly different players sharing
        # a name): keep them all rather than returning an empty frame.
        return df
    total['Tm'] = str(df['Tm'].iloc[-1])
    return total
        
# Team abbreviation dictionary: full team name -> three-letter code used in the 'Tm' column.
# 'Charlotte Hornets' appears twice on purpose: the second key ends with a
# non-breaking space (\xa0).
team_dic = {
    'Sacramento Kings': 'SAC',
    'Los Angeles Clippers': 'LAC',
    'Washington Wizards': 'WAS',
    'Phoenix Suns': 'PHO',
    'Boston Celtics': 'BOS',
    'Golden State Warriors': 'GSW',
    'Denver Nuggets': 'DEN',
    'Orlando Magic': 'ORL',
    'Chicago Bulls': 'CHI',
    'Utah Jazz': 'UTA',
    'Toronto Raptors': 'TOR',
    'Los Angeles Lakers': 'LAL',
    'Portland Trail Blazers': 'POR',
    'Memphis Grizzlies': 'MEM',
    'Miami Heat': 'MIA',
    'Atlanta Hawks': 'ATL',
    'Oklahoma City Thunder': 'OKC',
    'Milwaukee Bucks': 'MIL',
    'San Antonio Spurs': 'SAS',
    'Charlotte Hornets': 'CHO',
    'Brooklyn Nets': 'BRK',
    'Cleveland Cavaliers': 'CLE',
    'Philadelphia 76ers': 'PHI',
    'Houston Rockets': 'HOU',
    'Dallas Mavericks': 'DAL',
    'New York Knicks': 'NYK',
    'Indiana Pacers': 'IND',
    'Detroit Pistons': 'DET',
    'Minnesota Timberwolves': 'MIN',
    'New Orleans Pelicans': 'NOP',
    'Charlotte Hornets\xa0': 'CHO',
}


# Player stats

In [4]:
# Scrape player per game and advanced stats

SEASON_GAMES = 82 # Season length used to project cumulative stats to a full season

def to_numeric_if_possible(column):
    """Return `column` converted to a numeric dtype, or unchanged if conversion fails.

    Stands in for `pd.to_numeric(..., errors='ignore')`, which recent pandas
    versions deprecate.
    """
    try:
        return pd.to_numeric(column)
    except (ValueError, TypeError):
        return column

dfs = []
for Type in ['per_game', 'advanced']: # Loop twice: per game, advanced stats

    # Scrape webpage table
    url = f'https://www.basketball-reference.com/leagues/NBA_{scrape_year}_{Type}.html'
    page = requests.get(url)
    page.raise_for_status() # Stop on an HTTP error instead of failing later on a missing table
    soup = BeautifulSoup(page.content, 'html')
    table = soup.find('table')
    # Transform table into DataFrame (StringIO: read_html expects a file-like object, not literal HTML)
    stats = pd.read_html(StringIO(str(table)))[0]
    stats = stats[stats['Rk'] != 'Rk'].reset_index(drop = True) # Drop header rows repeated inside the table
    del stats['Rk'] # Drop rank column

    dfs.append(stats) # Store both DataFrames

# Find unique columns, deal with repeating columns
merge_keys = ['Player', 'Tm'] # Merge columns (Player & team)
col_pg = [col for col in dfs[0] if col not in merge_keys]
unique_cols = [col for col in dfs[1] if col not in col_pg] # Keep advanced columns if they are not already in per game columns

# Merge per game and advanced data
data = pd.merge(dfs[0], dfs[1][unique_cols], on = merge_keys)
data['Year'] = scrape_year # Add year
# Drop the header-less columns by name pattern rather than by hard-coded position
unnamed_cols = [col for col in data.columns if str(col).startswith('Unnamed')]
data = data.drop(columns = unnamed_cols)

data['Player'] = data['Player'].str.replace('*', '', regex = False) # Delete * in some names
# Keep one entry for multiple teams players. Groups are iterated explicitly so each
# group keeps its 'Player' column regardless of how groupby.apply treats grouping columns.
data = pd.concat([single_player(group) for _, group in data.groupby('Player')], ignore_index = True)
data = data.apply(to_numeric_if_possible)
# Calculate projected win shares: current value plus the per-game rate over the remaining games
for stat in ['OWS', 'DWS', 'WS', 'VORP'] :
    data[stat] = data[stat] + data[stat] / data['G'] * (SEASON_GAMES - data['G'])

data.head()

Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,...,USG%,OWS,DWS,WS,WS/48,OBPM,DBPM,BPM,VORP,Year
0,A.J. Green,SG,23,MIL,25,0,8.8,1.5,3.4,0.44,...,17.6,0.984,0.656,1.64,0.116,-0.4,-0.6,-1.0,0.328,2023
1,A.J. Lawson,SG,22,DAL,12,0,4.2,0.8,1.7,0.45,...,20.2,0.0,0.0,0.0,0.031,-3.4,-2.9,-6.3,-0.683333,2023
2,AJ Griffin,SF,19,ATL,53,11,20.5,3.7,7.7,0.479,...,17.6,1.54717,1.237736,2.939623,0.083,-0.2,0.0,-0.2,0.773585,2023
3,Aaron Gordon,PF,27,DEN,49,49,30.3,6.6,11.2,0.587,...,21.5,6.359184,2.677551,9.204082,0.177,3.2,-0.3,2.9,3.012245,2023
4,Aaron Holiday,PG,26,ATL,49,5,14.7,1.6,3.8,0.424,...,13.3,0.334694,1.004082,1.338776,0.055,-3.0,0.8,-2.1,0.0,2023


In [5]:
# Scrape individual ORtg and DRtg (can only be found in team data)

# Opening tag that precedes the per-100-possessions table in the raw HTML of a team page
PER_POSS_MARKER = '<div class="table_container current" id="div_per_poss">'

team_codes = sorted(data['Tm'].dropna().unique()) # Fixed order, computed once
dfs_rtg = []
for i, tm in enumerate(team_codes) :

    prog = round(100 * (i+1) / len(team_codes), 2) # Compute and print loop progress
    print(f'{prog}% {tm} ...')
    clear_output(wait = True)

    # Scrape webpage table
    url = f'https://www.basketball-reference.com/teams/{tm}/{scrape_year}.html'
    page = requests.get(url)
    page.raise_for_status() # Stop on an HTTP error instead of failing on the split below
    if PER_POSS_MARKER not in page.text:
        raise ValueError(f'Per 100 possessions table not found for {tm} ({url})')
    table = page.text.split(PER_POSS_MARKER)[1].split('</table>')[0] + '</table>'
    soup = BeautifulSoup(table, 'html')
    # Transform table into DataFrame (StringIO: read_html expects a file-like object, not literal HTML)
    team_rtg = pd.read_html(StringIO(str(soup)))[0]
    team_rtg['Tm'] = tm # Add team and year
    team_rtg['Year'] = scrape_year

    team_rtg = team_rtg.rename(columns = {'Unnamed: 1' : 'Player'})[['Player','Tm','Year','ORtg','DRtg']]
    dfs_rtg.append(team_rtg)

    sleep(2) # Pause between consecutive requests

# Merge data with individual ORtg and DRtg
data_rtg = pd.concat(dfs_rtg).rename(columns = {'ORtg':'ORtg/100', 'DRtg':'DRtg/100'})
data_rtg['Player'] = data_rtg['Player'].str.replace('*', '', regex = False) # Clean player name column
# Inner merge: players without a matching (Player, Year, Tm) rating row are dropped
data = data.merge(data_rtg, on = ['Player','Year','Tm'], how = 'inner') # Merge ratings with data
data.head()

Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,...,DWS,WS,WS/48,OBPM,DBPM,BPM,VORP,Year,ORtg/100,DRtg/100
0,A.J. Green,SG,23,MIL,25,0,8.8,1.5,3.4,0.44,...,0.656,1.64,0.116,-0.4,-0.6,-1.0,0.328,2023,120.0,114.0
1,A.J. Lawson,SG,22,DAL,12,0,4.2,0.8,1.7,0.45,...,0.0,0.0,0.031,-3.4,-2.9,-6.3,-0.683333,2023,104.0,120.0
2,AJ Griffin,SF,19,ATL,53,11,20.5,3.7,7.7,0.479,...,1.237736,2.939623,0.083,-0.2,0.0,-0.2,0.773585,2023,115.0,117.0
3,Aaron Gordon,PF,27,DEN,49,49,30.3,6.6,11.2,0.587,...,2.677551,9.204082,0.177,3.2,-0.3,2.9,3.012245,2023,127.0,114.0
4,Aaron Holiday,PG,26,ATL,49,5,14.7,1.6,3.8,0.424,...,1.004082,1.338776,0.055,-3.0,0.8,-2.1,0.0,2023,110.0,116.0


# Team stats

In [6]:
# Scrape team stats
url = f'https://www.basketball-reference.com/leagues/NBA_{scrape_year}.html'
page = requests.get(url)
page.raise_for_status() # Stop on an HTTP error instead of failing later on a missing table

soup = BeautifulSoup(page.content, 'html')

# Remove every <tr class="thead"> row so it is not parsed as a data row
for header_row in soup.find_all('tr', class_ = 'thead'):
    header_row.decompose()

# StringIO: read_html expects a file-like object, not literal HTML
table_E = soup.find('table', id = 'divs_standings_E')
teams_E = pd.read_html(StringIO(str(table_E)))[0]
teams_E = teams_E.rename(columns = {'Eastern Conference': 'Team'})

table_W = soup.find('table', id = 'divs_standings_W')
teams_W = pd.read_html(StringIO(str(table_W)))[0]
teams_W = teams_W.rename(columns = {'Western Conference': 'Team'})

# Seed = 1 + number of teams in the same conference with a strictly higher W/L% (ties share a seed)
teams_E['Seed'] = [(teams_E['W/L%'] > wl).sum() + 1 for wl in teams_E['W/L%']]
teams_W['Seed'] = [(teams_W['W/L%'] > wl).sum() + 1 for wl in teams_W['W/L%']]

teams = pd.concat([teams_E, teams_W])

del teams['GB']
# Clean team names: drop '*', drop a trailing '(N)' token only when one is present
# (so a name without it keeps its last word), then collapse any whitespace,
# including non-breaking spaces (\xa0), to single spaces.
teams['Team'] = (teams['Team']
                 .str.replace('*', '', regex = False)
                 .str.replace(r'\s*\(\d+\)\s*$', '', regex = True)
                 .str.replace(r'\s+', ' ', regex = True)
                 .str.strip())

url = f'https://www.basketball-reference.com/leagues/NBA_{scrape_year}_ratings.html'
page = requests.get(url)
page.raise_for_status()
soup = BeautifulSoup(page.content, 'html')
over_header = soup.find('tr', class_='over_header')
if over_header is not None: # Only remove the grouped header row if the page has one
    over_header.decompose()
table = soup.find('table')
rating_cols = ['Team', 'MOV', 'ORtg', 'DRtg', 'NRtg', 'MOV/A', 'ORtg/A', 'DRtg/A', 'NRtg/A']
ratings = pd.read_html(StringIO(str(table)))[0][rating_cols].copy() # copy: 'Team' is modified below
# Same whitespace clean-up for every team, not only 'Charlotte Hornets\xa0'
ratings['Team'] = ratings['Team'].str.replace(r'\s+', ' ', regex = True).str.strip()

teams = teams.merge(ratings, on = 'Team', how = 'outer')
teams = teams.sort_values('W/L%', ascending = False).reset_index(drop = True)
teams['Tm'] = [team_dic.get(tm) for tm in teams['Team']]

# Report team names missing from team_dic: their rows get no 'Tm' and are
# left out of the inner merge below.
unmapped = teams.loc[teams['Tm'].isna(), 'Team'].tolist()
if unmapped:
    print(f'WARNING: no abbreviation in team_dic for {unmapped}; these rows are left out of the merge')

# Merge individual and team stats
stats = data.merge(teams, how = 'inner', on = 'Tm')
stats.head()

Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,...,SRS,Seed,MOV,ORtg,DRtg,NRtg,MOV/A,ORtg/A,DRtg/A,NRtg/A
0,A.J. Green,SG,23,MIL,25,0,8.8,1.5,3.4,0.44,...,2.82,2,3.02,114.35,111.43,2.92,2.82,114.33,111.58,2.74
1,Bobby Portis,PF,27,MIL,47,14,26.8,6.0,12.0,0.499,...,2.82,2,3.02,114.35,111.43,2.92,2.82,114.33,111.58,2.74
2,Brook Lopez,C,34,MIL,55,55,30.4,5.5,10.9,0.503,...,2.82,2,3.02,114.35,111.43,2.92,2.82,114.33,111.58,2.74
3,George Hill,PG,36,MIL,35,0,19.1,1.7,3.8,0.447,...,2.82,2,3.02,114.35,111.43,2.92,2.82,114.33,111.58,2.74
4,Giannis Antetokounmpo,PF,28,MIL,45,45,33.3,11.5,21.3,0.542,...,2.82,2,3.02,114.35,111.43,2.92,2.82,114.33,111.58,2.74


In [7]:
# Determine players eligible for ROY and SMOY
url = f'https://www.basketball-reference.com/leagues/NBA_{scrape_year}_rookies.html' # Scrape rookie list
page = requests.get(url)
page.raise_for_status() # Stop on an HTTP error instead of failing later on a missing table
soup = BeautifulSoup(page.content, 'html')

# Remove the header rows of each listed class so they are not parsed as data rows
for overhead in ['over_header', 'over_header thead', 'thead']:
    for header_row in soup.find_all('tr', class_ = overhead):
        header_row.decompose()

table = soup.find('table')
# StringIO: read_html expects a file-like object, not literal HTML
temp = pd.read_html(StringIO(str(table)))[0]
rookies = list(temp['Player'])

# flag eligible rookies: 1 if the player's name is in the scraped rookie list
stats['roy'] = stats['Player'].isin(rookies).astype(int)
# flag eligible sixth men: 1 if the player started at most half of the games played
stats['smoy'] = (stats['GS'] / stats['G'] <= 0.5).astype(int)

stats = stats.fillna(0) # replace NaNs with zeros



stats.sort_values('PTS', ascending = False).head()

Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,...,MOV,ORtg,DRtg,NRtg,MOV/A,ORtg/A,DRtg/A,NRtg/A,roy,smoy
479,Joel Embiid,C,28,PHI,43,43,34.9,11.1,20.7,0.539,...,3.58,116.98,113.41,3.57,3.2,116.63,113.44,3.19,0,0
29,Luka Dončić,PG,23,DAL,48,48,36.6,11.3,22.3,0.505,...,0.71,117.19,116.48,0.71,0.83,117.07,116.26,0.82,0,0
4,Giannis Antetokounmpo,PF,28,MIL,45,45,33.3,11.5,21.3,0.542,...,3.02,114.35,111.43,2.92,2.82,114.33,111.58,2.74,0,0
98,Shai Gilgeous-Alexander,PG,24,OKC,51,51,35.5,10.3,20.2,0.511,...,1.09,115.99,114.92,1.07,1.29,116.19,114.91,1.28,0,0
279,Damian Lillard,PG,32,POR,44,44,36.2,9.3,20.0,0.465,...,0.07,118.01,118.09,-0.08,-0.09,117.94,118.18,-0.24,0,0


In [8]:
# Export data to csv
out_dir = path + 'Algorithm/weekly data/'
os.makedirs(out_dir, exist_ok = True) # Create the output folder if it does not exist yet
stats.to_csv(out_dir + f'week_{week}.csv', index = False)

# Export weekly info to txt
n = teams['W'].sum() # Total wins over all teams, written below as the number of games
# The with-block closes the file on exit, so no explicit close() is needed
with open(out_dir + f'week_{week}.txt', 'w') as file:
    file.write(f'Date : {date1}\n')
    file.write(f'Date : {date2}\n')
    file.write(f'No. games : {n}\n')
    file.write(f'Time : {time}\n')

In [9]:
# Sanity check: number of rows and columns in the exported table
n_rows, n_cols = stats.shape
print((n_rows, n_cols))

(505, 70)
