This notebook scrapes 2022-2023 NBA season data from Basketball-Reference each week and saves it as a CSV file, together with a TXT file recording the scrape date and time and the number of games played.

In [1]:
# Import packages
import os
import warnings
from datetime import datetime
from io import StringIO
from xml.sax.handler import DTDHandler  # NOTE(review): appears unused; kept until confirmed safe to delete

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from IPython.display import clear_output

warnings.filterwarnings('ignore')

# Set directory path.
# The project root can be overridden with the NBA_AWARDS_PATH environment
# variable so the notebook also runs on other machines; the original location
# is kept as the default.
path = os.environ.get(
    'NBA_AWARDS_PATH',
    '/Users/martinbogaert/Desktop/NBA Data Analysis/2022-2023 Awards Project clean/',
)
# Output file names are built with `path + ...`, so guarantee a trailing separator.
path = os.path.join(path, '')

print('MAKE SURE THAT YOU CHANGE THE WEEK VARIABLE BELOW')

MAKE SURE THAT YOU CHANGE THE WEEK VARIABLE BELOW


In [2]:
scrape_year = 2023  # season identifier used in the Basketball-Reference URLs
week = 2  # CHANGE THIS

# Date and time of scraping.
# Take a single timestamp so the three strings below always describe the same
# instant (separate now() calls could straddle a second or midnight boundary).
scrape_moment = datetime.now()
date1 = scrape_moment.strftime('%d/%m')
date2 = scrape_moment.strftime('%d %B %Y')
time = scrape_moment.strftime('%H:%M:%S')  # a formatted string, not the stdlib `time` module
print(date2 + ', ' + time)

31 October 2022, 10:56:12


In [3]:
# Define function which deals with players having played on multiple teams
def single_player(df):
    """Collapse the season rows of one player name into a single entry.

    Parameters
    ----------
    df : pandas.DataFrame
        All rows sharing one player name; must contain a 'Tm' column.

    Returns
    -------
    pandas.DataFrame
        * One input row: that row, unchanged.
        * Several rows including a 'TOT' (season total) row: a copy of the
          'TOT' row whose 'Tm' is set to the last non-'TOT' team listed.
        * Several rows and no 'TOT' row (e.g. two different players sharing a
          name): the rows unchanged, rather than silently dropping them.
    """
    if len(df) == 1:  # only one team, keep it
        return df

    totals = df[df['Tm'] == 'TOT'].copy()
    if totals.empty:
        # Nothing to collapse onto; returning the empty selection would
        # remove these players from the data set altogether.
        return df

    # Replace the 'TOT' placeholder with the team last played for.
    teams_played = df.loc[df['Tm'] != 'TOT', 'Tm']
    if not teams_played.empty:
        totals['Tm'] = str(teams_played.iloc[-1])
    return totals
        
# Full team name -> abbreviation used in the 'Tm' column.
# The last key is a variant of 'Charlotte Hornets' with a trailing
# non-breaking space (\xa0), mapped to the same code.
team_dic = {
    'Sacramento Kings': 'SAC',
    'Los Angeles Clippers': 'LAC',
    'Washington Wizards': 'WAS',
    'Phoenix Suns': 'PHO',
    'Boston Celtics': 'BOS',
    'Golden State Warriors': 'GSW',
    'Denver Nuggets': 'DEN',
    'Orlando Magic': 'ORL',
    'Chicago Bulls': 'CHI',
    'Utah Jazz': 'UTA',
    'Toronto Raptors': 'TOR',
    'Los Angeles Lakers': 'LAL',
    'Portland Trail Blazers': 'POR',
    'Memphis Grizzlies': 'MEM',
    'Miami Heat': 'MIA',
    'Atlanta Hawks': 'ATL',
    'Oklahoma City Thunder': 'OKC',
    'Milwaukee Bucks': 'MIL',
    'San Antonio Spurs': 'SAS',
    'Charlotte Hornets': 'CHO',
    'Brooklyn Nets': 'BRK',
    'Cleveland Cavaliers': 'CLE',
    'Philadelphia 76ers': 'PHI',
    'Houston Rockets': 'HOU',
    'Dallas Mavericks': 'DAL',
    'New York Knicks': 'NYK',
    'Indiana Pacers': 'IND',
    'Detroit Pistons': 'DET',
    'Minnesota Timberwolves': 'MIN',
    'New Orleans Pelicans': 'NOP',
    'Charlotte Hornets\xa0': 'CHO',
}


# Player stats

In [4]:
# Scrape player per game and advanced stats
def _to_numeric_if_possible(column):
    """Return `column` converted by pd.to_numeric, or unchanged if it cannot be parsed.

    Same result as the deprecated `pd.to_numeric(column, errors='ignore')`.
    """
    try:
        return pd.to_numeric(column)
    except (ValueError, TypeError):
        return column


SEASON_GAMES = 82  # number of games the cumulative stats are projected to

dfs = []
for table_kind in ['per_game', 'advanced']:  # Loop twice: per game, advanced stats

    # Scrape webpage table
    url = f'https://www.basketball-reference.com/leagues/NBA_{scrape_year}_{table_kind}.html'
    page = requests.get(url)
    page.raise_for_status()  # stop on an HTTP error instead of parsing an error page
    soup = BeautifulSoup(page.content, 'html')
    table = soup.find('table')
    if table is None:
        raise ValueError(f'No table found at {url}')

    # Transform table into DataFrame (StringIO: literal HTML strings are deprecated in read_html)
    table_df = pd.read_html(StringIO(str(table)))[0]
    table_df = table_df[table_df['Rk'] != 'Rk'].reset_index(drop=True)  # drop repeated header rows
    table_df = table_df.drop(columns='Rk')  # Drop rank column

    dfs.append(table_df)  # Store both DataFrames

# Find unique columns, deal with repeating columns
per_game_cols = list(dfs[0])
per_game_cols.remove('Player')  # Player and Tm are the merge keys, so they must
per_game_cols.remove('Tm')      # stay in the advanced-table selection below
unique_cols = [col for col in list(dfs[1]) if col not in per_game_cols]

# Merge per game and advanced data
data = pd.merge(dfs[0], dfs[1][unique_cols], on=['Player', 'Tm'])
data['Year'] = scrape_year  # Add year
# Drop the unnamed columns by prefix rather than by position-dependent name
# ('Unnamed: 19', 'Unnamed: 24'), which breaks when the table layout shifts.
data = data.drop(columns=[col for col in data.columns if str(col).startswith('Unnamed: ')])

data['Player'] = data['Player'].str.replace('*', '', regex=False)  # Delete * in some names

# Keep one entry for multiple teams players. Iterating over the groups keeps
# the 'Player' column in every pandas version (groupby.apply on the grouping
# column is deprecated).
data = pd.concat(
    [single_player(player_rows) for _, player_rows in data.groupby('Player')],
    ignore_index=True,
)
data = data.apply(_to_numeric_if_possible)

# Project cumulative stats to SEASON_GAMES games at the current per-game rate
for stat in ['OWS', 'DWS', 'WS', 'VORP']:
    data[stat] = data[stat] + data[stat] / data['G'] * (SEASON_GAMES - data['G'])

data.head()

Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,...,USG%,OWS,DWS,WS,WS/48,OBPM,DBPM,BPM,VORP,Year
0,Precious Achiuwa,C,23,TOR,6,0,23.2,3.0,8.5,0.353,...,20.8,-2.733333,1.366667,0.0,-0.015,-4.2,-2.8,-6.9,-2.733333,2023
1,Steven Adams,C,29,MEM,6,6,27.0,2.5,4.2,0.6,...,11.4,1.366667,1.366667,2.733333,0.053,-3.1,-0.5,-3.6,-1.366667,2023
2,Bam Adebayo,C,25,MIA,7,7,35.1,6.6,12.3,0.535,...,22.6,1.171429,3.514286,4.685714,0.074,-1.9,0.0,-1.9,0.0,2023
3,Ochai Agbaji,SG,22,UTA,3,0,12.7,2.0,4.3,0.462,...,17.4,0.0,0.0,2.733333,0.072,-4.0,-4.6,-8.6,-2.733333,2023
4,Santi Aldama,PF,22,MEM,6,6,30.5,4.0,8.5,0.471,...,14.2,2.733333,1.366667,4.1,0.077,-1.9,-0.9,-2.7,0.0,2023


In [5]:
# Scrape individual ORtg and DRtg (can only be found in team data)
# Marker that precedes the per-possession table in the raw page text.
PER_POSS_MARKER = '<div class="table_container current" id="div_per_poss">'

# Compute the team list once; sorting makes the request order deterministic.
team_codes = sorted(data['Tm'].dropna().unique())
# NOTE(review): one request per team is sent back to back; if the site
# throttles scrapers, a pause between requests may be needed - confirm.

dfs_rtg = []
for i, tm in enumerate(team_codes):

    prog = round(100 * (i + 1) / len(team_codes), 2)  # Compute and print loop progress
    print(f'{prog}% {tm} ...')
    clear_output(wait=True)

    # Scrape webpage table
    url = f'https://www.basketball-reference.com/teams/{tm}/{scrape_year}.html'
    page = requests.get(url)
    page.raise_for_status()  # stop on an HTTP error instead of parsing an error page
    sections = page.text.split(PER_POSS_MARKER)
    if len(sections) < 2:
        raise ValueError(f'Per-possession table not found at {url}')
    table = sections[1].split('</table>')[0] + '</table>'
    soup = BeautifulSoup(table, 'html')

    # Transform table into DataFrame
    team_rtg = pd.read_html(StringIO(str(soup)))[0]
    team_rtg['Tm'] = tm  # Add team and year
    team_rtg['Year'] = scrape_year

    # The player-name column has no header in this table and is read as 'Unnamed: 1'
    team_rtg = team_rtg.rename(columns={'Unnamed: 1': 'Player'})[['Player', 'Tm', 'Year', 'ORtg', 'DRtg']]
    dfs_rtg.append(team_rtg)

# Merge data with individual ORtg and DRtg
data_rtg = pd.concat(dfs_rtg).rename(columns={'ORtg': 'ORtg/100', 'DRtg': 'DRtg/100'})
data_rtg['Player'] = data_rtg['Player'].str.replace('*', '', regex=False)  # Clean player name column
data = data.merge(data_rtg, on=['Player', 'Year', 'Tm'], how='inner')  # Merge ratings with data
data.head()

Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,...,DWS,WS,WS/48,OBPM,DBPM,BPM,VORP,Year,ORtg/100,DRtg/100
0,Precious Achiuwa,C,23,TOR,6,0,23.2,3.0,8.5,0.353,...,1.366667,0.0,-0.015,-4.2,-2.8,-6.9,-2.733333,2023,93.0,114.0
1,Steven Adams,C,29,MEM,6,6,27.0,2.5,4.2,0.6,...,1.366667,2.733333,0.053,-3.1,-0.5,-3.6,-1.366667,2023,108.0,115.0
2,Bam Adebayo,C,25,MIA,7,7,35.1,6.6,12.3,0.535,...,3.514286,4.685714,0.074,-1.9,0.0,-1.9,0.0,2023,107.0,112.0
3,Ochai Agbaji,SG,22,UTA,3,0,12.7,2.0,4.3,0.462,...,0.0,2.733333,0.072,-4.0,-4.6,-8.6,-2.733333,2023,117.0,119.0
4,Santi Aldama,PF,22,MEM,6,6,30.5,4.0,8.5,0.471,...,1.366667,4.1,0.077,-1.9,-0.9,-2.7,0.0,2023,120.0,119.0


# Team stats

In [6]:
# Scrape team stats
def _conference_standings(soup, conference):
    """Read one conference's standings table and add a 'Seed' column.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed league page, with the 'thead' rows already removed.
    conference : str
        'E' or 'W'; selects the table with id 'divs_standings_<conference>'.

    Returns
    -------
    pandas.DataFrame
        The standings with the conference-named column renamed to 'Team' and
        a 'Seed' column equal to 1 + the number of teams in the same table
        with a strictly higher 'W/L%' (tied teams share a seed).
    """
    header = {'E': 'Eastern Conference', 'W': 'Western Conference'}[conference]
    table = soup.find('table', id=f'divs_standings_{conference}')
    standings = pd.read_html(StringIO(str(table)))[0]
    standings = standings.rename(columns={header: 'Team'})
    standings['Seed'] = [int((standings['W/L%'] > wl).sum()) + 1 for wl in standings['W/L%']]
    return standings


url = f'https://www.basketball-reference.com/leagues/NBA_{scrape_year}.html'
page = requests.get(url)
page.raise_for_status()  # stop on an HTTP error instead of parsing an error page

soup = BeautifulSoup(page.content, 'html')

# Remove every row with class 'thead' so read_html only sees team rows
for header_row in soup.find_all('tr', class_='thead'):
    header_row.decompose()

teams = pd.concat([_conference_standings(soup, 'E'), _conference_standings(soup, 'W')])
teams = teams.drop(columns='GB')

teams['Team'] = (
    teams['Team']
    .str.replace('*', '', regex=False)
    .str.split().str.join(' ')                     # collapse whitespace, incl. non-breaking spaces
    .str.replace(r'\s*\(\d+\)$', '', regex=True)   # strip a trailing "(seed)" only if present
)

url = f'https://www.basketball-reference.com/leagues/NBA_{scrape_year}_ratings.html'
page = requests.get(url)
page.raise_for_status()
soup = BeautifulSoup(page.content, 'html')
over_header = soup.find('tr', class_='over_header')
if over_header is not None:  # drop the grouped header row when the page has one
    over_header.decompose()
table = soup.find('table')
rating_cols = ['Team', 'MOV', 'ORtg', 'DRtg', 'NRtg', 'MOV/A', 'ORtg/A', 'DRtg/A', 'NRtg/A']
ratings = pd.read_html(StringIO(str(table)))[0][rating_cols].copy()  # copy: assigned to below
ratings['Team'] = ratings['Team'].str.replace('Charlotte Hornets\xa0', 'Charlotte Hornets', regex=False)
teams['Team'] = teams['Team'].str.replace('Charlotte Hornets\xa0', 'Charlotte Hornets', regex=False)

teams = teams.merge(ratings, on='Team', how='outer')
teams = teams.sort_values('W/L%', ascending=False).reset_index(drop=True)
teams['Tm'] = teams['Team'].map(team_dic)  # names missing from team_dic become null

# Merge individual and team stats
stats = data.merge(teams, how='inner', on='Tm')
stats.head()

Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,...,SRS,Seed,MOV,ORtg,DRtg,NRtg,MOV/A,ORtg/A,DRtg/A,NRtg/A
0,Precious Achiuwa,C,23,TOR,6,0,23.2,3.0,8.5,0.353,...,0.2,5,-1.33,110.95,112.39,-1.44,0.2,111.25,110.99,0.26
1,OG Anunoby,SF,25,TOR,6,6,36.2,4.7,10.8,0.431,...,0.2,5,-1.33,110.95,112.39,-1.44,0.2,111.25,110.99,0.26
2,Dalano Banton,PG,23,TOR,6,0,8.2,1.0,2.5,0.4,...,0.2,5,-1.33,110.95,112.39,-1.44,0.2,111.25,110.99,0.26
3,Scottie Barnes,PF,21,TOR,5,5,28.6,5.8,11.0,0.527,...,0.2,5,-1.33,110.95,112.39,-1.44,0.2,111.25,110.99,0.26
4,Khem Birch,C,30,TOR,2,0,6.5,0.5,1.0,0.5,...,0.2,5,-1.33,110.95,112.39,-1.44,0.2,111.25,110.99,0.26


In [7]:
# Determine players eligible for ROY and SMOY
url = f'https://www.basketball-reference.com/leagues/NBA_{scrape_year}_rookies.html'  # Scrape rookie list
page = requests.get(url)
page.raise_for_status()  # stop on an HTTP error instead of parsing an error page
soup = BeautifulSoup(page.content, 'html')

# Remove all header rows so read_html returns a flat table
for header_class in ['over_header', 'over_header thead', 'thead']:
    for header_row in soup.find_all('tr', class_=header_class):
        header_row.decompose()

table = soup.find('table')
rookie_table = pd.read_html(StringIO(str(table)))[0]
rookies = list(rookie_table['Player'])

# flag eligible rookies: the player's name appears in the rookie table
stats['roy'] = stats['Player'].isin(rookies).astype(int)
# flag eligible sixth-mans: started at most half of the games played
stats['smoy'] = (stats['GS'] / stats['G'] <= 0.5).astype(int)

stats = stats.fillna(0)  # replace NaNs with zeros

stats.sort_values('PTS', ascending=False).head()

Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,...,MOV,ORtg,DRtg,NRtg,MOV/A,ORtg/A,DRtg/A,NRtg/A,roy,smoy
366,Luka Dončić,PG,23,DAL,6,6,36.7,12.8,25.5,0.503,...,7.33,120.18,112.69,7.49,9.39,120.38,110.83,9.55,0,0
58,Giannis Antetokounmpo,PF,28,MIL,5,5,34.6,12.8,21.2,0.604,...,10.4,114.17,103.8,10.37,10.27,112.21,101.96,10.26,0,0
24,Ja Morant,PG,23,MEM,5,5,32.8,10.8,20.4,0.529,...,-1.17,119.23,120.37,-1.14,-2.21,117.96,120.21,-2.25,0,0
77,Donovan Mitchell,SG,26,CLE,6,6,39.0,11.0,22.2,0.496,...,12.0,118.42,106.24,12.17,13.74,120.68,106.74,13.94,0,0
410,Kevin Durant,PF,34,BRK,6,6,37.5,10.5,20.5,0.512,...,-8.67,114.97,123.55,-8.58,-4.68,115.8,120.35,-4.55,0,0


In [8]:
# Export data to csv
out_prefix = path + f'Algorithm/weekly data/week_{week}'
stats.to_csv(out_prefix + '.csv', index=False)

# Export weekly info to txt
n = teams['W'].sum()  # summed wins over all teams, written below as the number of games
# The `with` block closes the file itself, so no explicit close() is needed;
# plain 'w' is enough because the file is never read back here.
with open(out_prefix + '.txt', 'w') as file:
    file.write(f'Date : {date1}\n')
    file.write(f'Date : {date2}\n')
    file.write(f'No. games : {n}\n')
    file.write(f'Time : {time}\n')

In [9]:
# Sanity check: (rows, columns) of the final merged player/team table
print(stats.shape)

(436, 70)
