This notebook scrapes NBA 2022-2023 season data from Basketball-Reference once a week and writes it to a CSV file, together with a TXT file recording the scrape date, the scrape time and the number of games played so far.

In [14]:
# Import packages
from bs4 import BeautifulSoup
import requests
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
from datetime import datetime
import numpy as np
from IPython.display import clear_output
from time import sleep

# Set directory path
# The project root can be overridden through the NBA_AWARDS_PATH environment variable so the
# notebook is not tied to one machine; the original location remains the default.
import os
path = os.environ.get('NBA_AWARDS_PATH') or '/Users/martinbogaert/Desktop/NBA Data Analysis/nba_ai_awards/'
# Output file names are built with `path + '...'`, so the root must end with a separator.
if not path.endswith('/'):
    path += '/'

print('MAKE SURE THAT YOU CHANGE THE WEEK VARIABLE BELOW')

MAKE SURE THAT YOU CHANGE THE WEEK VARIABLE BELOW


In [15]:
scrape_year = 2023 # season end year used in the scraped URLs (2023 = the 2022-2023 season)
week = 4 # CHANGE THIS every week: it names the exported csv/txt files
# Date and time of scraping, formatted from a single clock reading so the three
# strings can never disagree (e.g. when the cell happens to run right at midnight)
now = datetime.now()
date1 = now.strftime('%d/%m')
date2 = now.strftime('%d %B %Y')
time = now.strftime('%H:%M:%S')
print(date2 + ', ' + time)

14 November 2022, 20:57:42


In [16]:
# Define function which deals with players having played on multiple teams
def single_player(df):
    """Collapse the season rows of one player into a single entry.

    Parameters
    ----------
    df : pandas.DataFrame
        All rows belonging to one player name; must contain a 'Tm' column.

    Returns
    -------
    pandas.DataFrame
        * `df` itself when it holds a single row.
        * Otherwise a copy of the aggregate row(s) ('TOT', or an 'NTM'-style
          label such as '2TM') with 'Tm' replaced by the last real team listed.
        * `df` unchanged when several rows exist but none is an aggregate, so
          that no player is silently dropped.
    """
    if len(df) == 1: # if only one team, keep it
        return df

    teams = df['Tm'].astype(str)
    # Aggregate rows carry the season totals over every team the player appeared for
    is_total = (teams == 'TOT') | teams.str.match(r'^\d+TM$')
    if not is_total.any():
        # Several rows but no aggregate: keep everything rather than returning an empty frame
        return df

    # Last team played for = last row that is not itself an aggregate label
    real_teams = teams[~is_total]
    last_team = real_teams.iloc[-1] if len(real_teams) else teams.iloc[-1]

    row = df[is_total].copy() # copy so the caller's frame is never modified
    row['Tm'] = str(last_team)
    return row
        
# Team abbreviation dictionary: full team name -> three-letter team code.
# The last entry maps a Charlotte spelling that carries a trailing non-breaking space (\xa0).
team_dic = {
    'Sacramento Kings': 'SAC',
    'Los Angeles Clippers': 'LAC',
    'Washington Wizards': 'WAS',
    'Phoenix Suns': 'PHO',
    'Boston Celtics': 'BOS',
    'Golden State Warriors': 'GSW',
    'Denver Nuggets': 'DEN',
    'Orlando Magic': 'ORL',
    'Chicago Bulls': 'CHI',
    'Utah Jazz': 'UTA',
    'Toronto Raptors': 'TOR',
    'Los Angeles Lakers': 'LAL',
    'Portland Trail Blazers': 'POR',
    'Memphis Grizzlies': 'MEM',
    'Miami Heat': 'MIA',
    'Atlanta Hawks': 'ATL',
    'Oklahoma City Thunder': 'OKC',
    'Milwaukee Bucks': 'MIL',
    'San Antonio Spurs': 'SAS',
    'Charlotte Hornets': 'CHO',
    'Brooklyn Nets': 'BRK',
    'Cleveland Cavaliers': 'CLE',
    'Philadelphia 76ers': 'PHI',
    'Houston Rockets': 'HOU',
    'Dallas Mavericks': 'DAL',
    'New York Knicks': 'NYK',
    'Indiana Pacers': 'IND',
    'Detroit Pistons': 'DET',
    'Minnesota Timberwolves': 'MIN',
    'New Orleans Pelicans': 'NOP',
    'Charlotte Hornets\xa0': 'CHO',
}


# Player stats

In [17]:
# Scrape player per game and advanced stats

SEASON_GAMES = 82 # games in a full regular season, used for the pace projections below

def _numeric_if_possible(column):
    """Return `column` converted to a numeric dtype, or unchanged if a value cannot be parsed."""
    try:
        return pd.to_numeric(column)
    except (ValueError, TypeError):
        return column

dfs = []
for table_kind in ['per_game', 'advanced']: # Loop twice: per game, advanced stats

    # Scrape webpage table
    url = f'https://www.basketball-reference.com/leagues/NBA_{scrape_year}_{table_kind}.html'
    page = requests.get(url, timeout = 30)
    page.raise_for_status() # fail loudly on an HTTP error instead of parsing an error page
    soup = BeautifulSoup(page.content, 'html')
    table = soup.find('table')
    if table is None:
        raise RuntimeError(f'No stats table found at {url}')
    # Transform table into DataFrame
    stats = pd.read_html(str(table))[0]
    stats = stats[stats['Rk'] != 'Rk'].reset_index(drop = True) # drop header rows repeated inside the table body
    del stats['Rk'] # Drop rank column

    dfs.append(stats) # Store both DataFrames

per_game, advanced = dfs

# Find unique columns, deal with repeating columns
merge_keys = ['Player', 'Tm'] # columns both tables are joined on
col_pg = [col for col in per_game.columns if col not in merge_keys]
unique_cols = [col for col in advanced.columns if col not in col_pg] # Keep advanced columns if they are not already in per game columns

# Merge per game and advanced data
data = pd.merge(per_game, advanced[unique_cols], on = merge_keys)
data['Year'] = scrape_year # Add year
# Drop every header-less column ('Unnamed: N') instead of relying on two fixed positions
data = data.loc[:, ~data.columns.astype(str).str.startswith('Unnamed')]

data['Player'] = data['Player'].str.replace('*', '', regex = False) # Delete * in some names
# Keep one entry for multiple teams players. Iterating over the groups hands single_player the
# full rows (including 'Player') regardless of how groupby.apply treats grouping columns.
data = pd.concat([single_player(group) for _, group in data.groupby('Player')])
data = data.reset_index(drop = True)
data = data.apply(_numeric_if_possible) # numeric where every value parses, text columns left as they are
# Project cumulative stats to a full season at the player's current per-game pace
for stat in ['OWS', 'DWS', 'WS', 'VORP'] :
    data[stat] = data[stat] + data[stat] / data['G'] * (SEASON_GAMES - data['G'])

data.head()

Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,...,USG%,OWS,DWS,WS,WS/48,OBPM,DBPM,BPM,VORP,Year
0,Precious Achiuwa,C,23,TOR,12,0,20.4,3.0,7.7,0.391,...,20.9,0.683333,1.366667,2.05,0.062,-1.9,-1.9,-3.8,-0.683333,2023
1,Steven Adams,C,29,MEM,13,13,27.1,3.1,5.0,0.615,...,12.7,1.892308,3.153846,5.676923,0.117,-1.0,0.5,-0.5,0.630769,2023
2,Bam Adebayo,C,25,MIA,13,13,34.9,7.3,13.7,0.534,...,24.2,1.261538,4.415385,5.046154,0.087,-1.7,1.0,-0.7,0.630769,2023
3,Ochai Agbaji,SG,22,UTA,6,0,11.2,1.3,3.2,0.421,...,13.7,0.0,0.0,1.366667,0.039,-4.7,-3.6,-8.3,-1.366667,2023
4,Santi Aldama,PF,22,MEM,14,14,27.7,3.8,8.1,0.465,...,14.3,2.928571,2.928571,5.857143,0.118,-0.9,0.8,-0.1,1.171429,2023


In [18]:
# Scrape individual ORtg and DRtg (can only be found in team data)

# Pause between team-page requests.
# NOTE(review): Sports Reference reportedly limits clients to about 20 requests per minute;
# confirm against their current bot-traffic policy before lowering this.
REQUEST_DELAY = 3.1
# Opening tag of the container that holds the per-possession table in the raw page HTML
per_poss_marker = '<div class="table_container current" id="div_per_poss">'

team_codes = sorted(set(data['Tm'])) # computed once; sorted so the scraping order is reproducible
dfs_rtg = []
for i, tm in enumerate(team_codes) :

    prog = round(100 * (i+1) / len(team_codes), 2) # Compute and print loop progress
    print(f'{prog}% {tm} ...')
    clear_output(wait = True)

    # Scrape webpage table
    url = f'https://www.basketball-reference.com/teams/{tm}/{scrape_year}.html'
    page = requests.get(url, timeout = 30)
    page.raise_for_status() # fail loudly on an HTTP error instead of parsing an error page
    parts = page.text.split(per_poss_marker)
    if len(parts) < 2:
        raise RuntimeError(f'Per-possession table not found for {tm} ({url})')
    # Keep only the markup from the container start up to the end of its first table
    table = parts[1].split('</table>')[0] + '</table>'
    soup = BeautifulSoup(table, 'html')
    # Transform table into DataFrame
    data_rtg = pd.read_html(str(soup))[0]
    data_rtg['Tm'] = tm # Add team and year
    data_rtg['Year'] = scrape_year

    data_rtg = data_rtg.rename(columns = {'Unnamed: 1' : 'Player'})[['Player','Tm','Year','ORtg','DRtg']]
    dfs_rtg.append(data_rtg)

    sleep(REQUEST_DELAY)

# Merge data with individual ORtg and DRtg
data_rtg = pd.concat(dfs_rtg).rename(columns = {'ORtg':'ORtg/100', 'DRtg':'DRtg/100'})
data_rtg['Player'] = data_rtg['Player'].str.replace('*', '', regex = False) # Clean player name column
# Inner join: a player with no matching (Player, Year, Tm) row on a team page is dropped from data
data = data.merge(data_rtg, on = ['Player','Year','Tm'], how = 'inner') # Merge ratings with data
data.head()

Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,...,DWS,WS,WS/48,OBPM,DBPM,BPM,VORP,Year,ORtg/100,DRtg/100
0,Precious Achiuwa,C,23,TOR,12,0,20.4,3.0,7.7,0.391,...,1.366667,2.05,0.062,-1.9,-1.9,-3.8,-0.683333,2023,106.0,113
1,Steven Adams,C,29,MEM,13,13,27.1,3.1,5.0,0.615,...,3.153846,5.676923,0.117,-1.0,0.5,-0.5,0.630769,2023,114.0,109
2,Bam Adebayo,C,25,MIA,13,13,34.9,7.3,13.7,0.534,...,4.415385,5.046154,0.087,-1.7,1.0,-0.7,0.630769,2023,106.0,109
3,Ochai Agbaji,SG,22,UTA,6,0,11.2,1.3,3.2,0.421,...,0.0,1.366667,0.039,-4.7,-3.6,-8.3,-1.366667,2023,108.0,117
4,Santi Aldama,PF,22,MEM,14,14,27.7,3.8,8.1,0.465,...,2.928571,5.857143,0.118,-0.9,0.8,-0.1,1.171429,2023,118.0,110


# Team stats

In [19]:
# Scrape team stats

def _fetch_soup(url):
    """Download `url` and return the parsed page; raises on an HTTP error status."""
    page = requests.get(url, timeout = 30)
    page.raise_for_status()
    return BeautifulSoup(page.content, 'html')

def _conference_standings(soup, conference):
    """Return the standings of one conference ('E' or 'W') with 'Team' and 'Seed' columns.

    Seed is 1 + the number of teams in the conference with a strictly higher W/L%,
    so teams tied on W/L% share the same seed.
    """
    header = {'E': 'Eastern Conference', 'W': 'Western Conference'}[conference]
    table = soup.find('table', id = f'divs_standings_{conference}')
    if table is None:
        raise RuntimeError(f'Standings table divs_standings_{conference} not found')
    standings = pd.read_html(str(table))[0].rename(columns = {header: 'Team'})
    win_pct = standings['W/L%']
    standings['Seed'] = [int((win_pct > wl).sum()) + 1 for wl in win_pct]
    return standings

url = f'https://www.basketball-reference.com/leagues/NBA_{scrape_year}.html'
soup = _fetch_soup(url)

# Remove the sub-header rows inside the tables so read_html only sees team rows
for header_row in soup.find_all('tr', class_ = 'thead'):
    header_row.decompose()

teams_E = _conference_standings(soup, 'E')
teams_W = _conference_standings(soup, 'W')

teams = pd.concat([teams_E, teams_W])

del teams['GB']
teams['Team'] = teams['Team'].str.replace('*','', regex = False)
teams['Team'] = teams['Team'].apply(lambda x: ' '.join(x.split()[:-1])) # drop the last whitespace-separated token of each name

url = f'https://www.basketball-reference.com/leagues/NBA_{scrape_year}_ratings.html'
soup = _fetch_soup(url)
over_header = soup.find('tr', class_='over_header')
if over_header is not None: # only remove the grouping header row when the page has one
    over_header.decompose()
table = soup.find('table')
# .copy() so the column assignment below works on an independent frame, not a slice
ratings = pd.read_html(str(table))[0][['Team', 'MOV', 'ORtg', 'DRtg', 'NRtg', 'MOV/A', 'ORtg/A', 'DRtg/A', 'NRtg/A']].copy()
ratings['Team'] = ratings['Team'].str.replace('Charlotte Hornets\xa0', 'Charlotte Hornets', regex = False)
teams['Team'] = teams['Team'].str.replace('Charlotte Hornets\xa0', 'Charlotte Hornets', regex = False)

teams = teams.merge(ratings, on = 'Team', how = 'outer')
teams = teams.sort_values('W/L%', ascending = False).reset_index(drop = True)
teams['Tm'] = [team_dic.get(tm) for tm in teams['Team']]

# A team name missing from team_dic gets Tm = None and its players vanish in the inner merge below
unmapped = [tm for tm in teams['Team'] if tm not in team_dic]
if unmapped:
    print(f'WARNING: no abbreviation found for {unmapped}; their players will be dropped')

# Merge individual and team stats
stats = data.merge(teams, how = 'inner', on = 'Tm')
stats.head()

Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,...,SRS,Seed,MOV,ORtg,DRtg,NRtg,MOV/A,ORtg/A,DRtg/A,NRtg/A
0,Precious Achiuwa,C,23,TOR,12,0,20.4,3.0,7.7,0.391,...,3.24,6,2.36,114.37,112.28,2.09,3.24,114.9,111.9,3.0
1,OG Anunoby,SF,25,TOR,14,14,35.4,6.4,13.6,0.474,...,3.24,6,2.36,114.37,112.28,2.09,3.24,114.9,111.9,3.0
2,Dalano Banton,PG,23,TOR,13,0,9.5,1.8,4.3,0.429,...,3.24,6,2.36,114.37,112.28,2.09,3.24,114.9,111.9,3.0
3,Scottie Barnes,PF,21,TOR,13,13,31.9,5.5,12.8,0.431,...,3.24,6,2.36,114.37,112.28,2.09,3.24,114.9,111.9,3.0
4,Khem Birch,C,30,TOR,4,0,8.5,0.8,1.3,0.6,...,3.24,6,2.36,114.37,112.28,2.09,3.24,114.9,111.9,3.0


In [20]:
# Determine players eligible for ROY and SMOY
url = f'https://www.basketball-reference.com/leagues/NBA_{scrape_year}_rookies.html' # Scrape rookie list
page = requests.get(url, timeout = 30)
page.raise_for_status() # fail loudly on an HTTP error instead of parsing an error page
soup = BeautifulSoup(page.content, 'html')

# Strip grouping and repeated header rows so read_html produces one flat header
for overhead in ['over_header', 'over_header thead', 'thead']:
    for header_row in soup.find_all('tr', class_ = overhead):
        header_row.decompose()

table = soup.find('table')
temp = pd.read_html(str(table))[0]
rookies = list(temp['Player'])

stats['roy'] = stats['Player'].isin(rookies).astype(int) # flag eligible rookies
# flag eligible sixth-mans: started at most half of the games played
# NOTE(review): a player who started exactly half of his games is flagged eligible - confirm this is intended
stats['smoy'] = (stats['GS'] / stats['G'] <= 0.5).astype(int)

stats = stats.fillna(0) # replace NaNs with zeros



stats.sort_values('PTS', ascending = False).head()

Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,...,MOV,ORtg,DRtg,NRtg,MOV/A,ORtg/A,DRtg/A,NRtg/A,roy,smoy
382,Luka Dončić,PG,23,DAL,12,12,36.9,11.7,23.5,0.496,...,3.33,115.25,111.81,3.44,4.96,116.15,111.1,5.06,0,0
196,Stephen Curry,PG,34,GSW,12,12,35.2,11.0,20.9,0.526,...,-3.0,113.15,116.19,-3.04,-3.7,112.29,116.03,-3.74,0,0
361,Jayson Tatum,PF,24,BOS,13,13,37.5,10.4,20.8,0.5,...,6.0,121.15,114.81,6.34,5.94,121.43,115.16,6.26,0,0
440,Joel Embiid,C,28,PHI,10,10,35.2,10.8,20.1,0.537,...,2.0,112.94,110.81,2.13,3.54,113.69,110.04,3.65,0,0
62,Giannis Antetokounmpo,PF,28,MIL,9,9,32.8,11.4,21.1,0.542,...,6.08,110.83,104.8,6.03,3.67,109.04,105.43,3.61,0,0


In [21]:
# Write the merged player/team table for this week to csv (no index column)
csv_path = path + f'Algorithm/weekly data/week_{week}.csv'
stats.to_csv(csv_path, index = None)

# Write the companion txt file: both date formats, games played so far, scrape time
n = teams['W'].sum() # every game has exactly one winner, so total wins = number of games
info_lines = [
    f'Date : {date1}\n',
    f'Date : {date2}\n',
    f'No. games : {n}\n',
    f'Time : {time}\n',
]
txt_path = path + f'Algorithm/weekly data/week_{week}.txt'
with open(txt_path, 'w+') as file: # the with-block closes the file on exit
    for line in info_lines:
        file.write(line)

In [22]:
# Sanity check: (rows, columns) of the final table that was exported to csv
print(stats.shape)

(454, 70)
