In [2]:
# Import packages
import warnings
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup
from IPython.display import clear_output
warnings.filterwarnings('ignore')

# Set working path
# NOTE: absolute, machine-specific location. Every file name in this notebook is built as
# `path + '...'` (plain string concatenation), so the value must keep its trailing '/'.
path = '/Users/martinbogaert/Desktop/NBA Data Analysis/2022-2023_Awards_Project_clean/'

## A. Awards scraping

In [2]:
# Function scrapes the voting history of a given award over a range of seasons
def award_history(aw, start, end = 2022):
    """Scrape the yearly voting table of one award from basketball-reference.com.

    Parameters
    ----------
    aw : str
        Award identifier as used in the page's table container id
        (``div_{aw}``); the notebook calls it with 'mvp', 'roy', 'dpoy',
        'smoy' and 'mip'.
    start : int
        First year to scrape (inclusive).
    end : int, default 2022
        Last year to scrape (inclusive).

    Returns
    -------
    pandas.DataFrame
        The yearly tables stacked with ``pd.concat`` plus a 'Year' column.
        The index is NOT reset, so it restarts at 0 for every year.

    Raises
    ------
    requests.HTTPError
        If a page request comes back with an error status.
    IndexError
        If the ``div_{aw}`` container is not found in a page.
    ValueError
        If the year range is empty (nothing to concatenate).
    """
    # NOTE(review): an earlier, commented-out variant used the container id
    # `div_nba_{aw}` for 1972-1975; that case is not handled here.
    dfs = []
    for yr in range(start, end + 1):

        url = f'https://www.basketball-reference.com/awards/awards_{yr}.html'
        page = requests.get(url)
        # Surface HTTP errors (e.g. rate limiting) directly instead of as an
        # obscure IndexError from the string split below
        page.raise_for_status()

        # Cut the award's <table> out of the raw page text
        table = page.text.split(f'<div class="table_container" id="div_{aw}">')[1].split('</table>')[0] + '</table>'

        soup = BeautifulSoup(table, 'html')
        # Drop the 'over_header' row when present; the unguarded call used to
        # raise AttributeError whenever that row was missing
        over_header = soup.find('tr', class_ = 'over_header')
        if over_header is not None:
            over_header.decompose()

        # Wrap the markup in StringIO: passing literal HTML to read_html is deprecated
        year_df = pd.read_html(StringIO(str(soup)))[0]
        year_df['Year'] = yr

        dfs.append(year_df)

        # Progress indicator; the total now follows `end` instead of a hard-coded 2022
        clear_output(wait = True)
        print(f'{yr} / {end}')

    return pd.concat(dfs)

### MVP history scraping

In [3]:
# Scrape Most Valuable Player voting from 1978 onwards, keep the player / year / share columns, save to CSV
mvp = (
    award_history('mvp', 1978)
    .reset_index(drop = True)
    [['Player', 'Year', 'Share']]
)
mvp.to_csv(path + 'Scrapping/mvp_history.csv', index = None)

2022 / 2022


### ROY history scraping

In [4]:
# Scrape Rookie of the Year voting from 1978 onwards, keep the player / year / share columns, save to CSV
roy = (
    award_history('roy', 1978)
    .reset_index(drop = True)
    [['Player', 'Year', 'Share']]
)
roy.to_csv(path + 'Scrapping/roy_history.csv', index = None)

2022 / 2022


### DPOY history scraping

In [5]:
# Scrape the 'dpoy' award voting from 1983 onwards, keep the player / year / share columns, save to CSV
dpoy = (
    award_history('dpoy', 1983)
    .reset_index(drop = True)
    [['Player', 'Year', 'Share']]
)
dpoy.to_csv(path + 'Scrapping/dpoy_history.csv', index = None)

2022 / 2022


### SMOY history scraping

In [6]:
# 6th Man of the Year votes scrapping
smoy = award_history('smoy', 1984)
smoy = smoy.reset_index(drop = True)[['Player', 'Year', 'Share']]
smoy.to_csv(path + 'Scrapping/smoy_history.csv', index = None)

2022 / 2022


### MIP history scraping

In [7]:
# Most Improved Player votes scrapping
mip = award_history('mip', 1988)
mip = mip.reset_index(drop = True)[['Player', 'Year', 'Share']]
mip.to_csv(path + 'Scrapping/mip_history.csv', index = None)

2022 / 2022


## B. Players scraping

In [8]:
# Function to replace a traded player's rows with a single row; team is the last one he played on
def single_player(df):
    """Collapse the season rows of one player into a single row when he changed teams.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows sharing one 'Player' value (one group of a groupby). Must have a
        'Tm' column; a traded player has an aggregate row with Tm == 'TOT'
        next to his per-team rows.

    Returns
    -------
    pandas.DataFrame
        * ``df`` unchanged when it holds a single row;
        * the 'TOT' row(s), relabelled with the last listed non-'TOT' team,
          when an aggregate row exists;
        * ``df`` unchanged when there are several rows but no 'TOT' row.
          The grouping is by name only, so this happens when two different
          players share a name; previously both were silently dropped.
    """
    if len(df) == 1: # Single team, nothing to collapse
        return df

    totals = df[df['Tm'] == 'TOT'].copy()
    if totals.empty:
        # No aggregate row: keep the rows instead of returning an empty frame
        return df

    # Take the last listed real team; ignoring 'TOT' keeps the label meaningful
    # even if the aggregate row happens to be listed last.
    # Assumes per-team rows are in chronological order - TODO confirm against the source table.
    teams = df.loc[df['Tm'] != 'TOT', 'Tm']
    if not teams.empty:
        totals['Tm'] = str(teams.iloc[-1])
    return totals

# Function to scrape players stats (per game & advanced)
def stat_year(yr):
    """Scrape per-game and advanced player stats for one season and merge them.

    For each of the two stat types the league page is downloaded, archived
    under the working path and parsed with ``pd.read_html``.

    Parameters
    ----------
    yr : int
        Season identifier used in the page URL (``NBA_{yr}_{type}.html``).

    Returns
    -------
    pandas.DataFrame
        Per-game columns plus the advanced columns not already present,
        merged on ['Player', 'Tm'], with a 'Year' column, after
        ``single_player`` has been applied to each group of rows sharing a
        'Player' value. The index is reset.

    Raises
    ------
    requests.HTTPError
        If a page request comes back with an error status.
    """
    Types = ['per_game', 'advanced']
    dfs = []
    for Type in Types: # loop for both types of stats

        url = f'https://www.basketball-reference.com/leagues/NBA_{yr}_{Type}.html'
        page = requests.get(url)
        # Fail early on HTTP errors (e.g. rate limiting) rather than while parsing
        page.raise_for_status()

        # Archive the raw page. The stat type is part of the file name so that the
        # 'advanced' page no longer overwrites the 'per_game' one, and the directory
        # is spelled 'Scrapping' like everywhere else in the notebook.
        with open(path + f'Scrapping/player yearly data/{yr}_{Type}.html', 'w', encoding = 'utf-8') as file:
            file.write(page.text)

        soup = BeautifulSoup(page.content, 'html')
        table = soup.find('table')

        # Wrap the markup in StringIO: passing literal HTML to read_html is deprecated
        stats = pd.read_html(StringIO(str(table)))[0]
        stats = stats[stats['Rk'] != 'Rk'].reset_index(drop = True) # Drop intra-data headers
        del stats['Rk']

        dfs.append(stats)

    per_game, advanced = dfs

    # Columns present in both tables are taken from the per-game data only;
    # 'Player' and 'Tm' are kept on both sides because they are the merge keys
    shared = set(per_game.columns) - {'Player', 'Tm'}
    unique_cols = [col for col in advanced.columns if col not in shared]

    # Merge on player and team
    DF = pd.merge(per_game, advanced[unique_cols], on = ['Player', 'Tm'])
    DF['Year'] = yr # Keep track of year
    # Drop the two unnamed columns of the advanced table; tolerate their absence
    # so that a layout change does not raise KeyError
    DF = DF.drop(columns = ['Unnamed: 19', 'Unnamed: 24'], errors = 'ignore')

    DF = DF.groupby(['Player']).apply(single_player) # Apply single player function to each player
    DF = DF.reset_index(drop = True)

    return DF

In [10]:
# Scrape player stats for every year from 1978 through 2022, printing progress as we go
years = [*range(1978, 2023)]
dfs = []
for yr in years:
    dfs += [stat_year(yr)]
    clear_output(wait = True)
    print(str(yr) + ' / 2022')

# Stack the yearly tables, renumber the rows and strip '*' characters from player names
data = pd.concat(dfs).reset_index(drop = True)
data['Player'] = data['Player'].str.replace('*', '', regex = False)

2022 / 2022


In [11]:
# Add individual ORtg and DRtg (can only be found in team data)
# The (team, year) pairs are computed once, up front; re-grouping `data`
# on every iteration just to get the total was loop-invariant work.
team_years = list(data.groupby(['Tm','Year']).groups.keys())

dfs_rtg = []
for i, tm_yr in enumerate(team_years):

    # Progress line; it is cleared when the next output arrives
    prog = round(100 * (i+1) / len(team_years), 2)
    print(f'{prog}% {tm_yr[0]} {tm_yr[1]-1}-{tm_yr[1]} ...')
    clear_output(wait = True)

    url = f'https://www.basketball-reference.com/teams/{tm_yr[0]}/{tm_yr[1]}.html'
    page = requests.get(url)
    # Fail early on HTTP errors (e.g. rate limiting) rather than on the split below
    page.raise_for_status()

    # Cut the per-possession <table> out of the raw page text
    table = page.text.split('<div class="table_container current" id="div_per_poss">')[1].split('</table>')[0] + '</table>'
    soup = BeautifulSoup(table, 'html')

    # Wrap the markup in StringIO: passing literal HTML to read_html is deprecated
    data_rtg = pd.read_html(StringIO(str(soup)))[0]
    data_rtg['Tm'] = tm_yr[0]
    data_rtg['Year'] = tm_yr[1]

    # The player-name column is read as 'Unnamed: 1'; keep only keys and ratings
    data_rtg = data_rtg.rename(columns = {'Unnamed: 1' : 'Player'})[['Player','Tm','Year','ORtg','DRtg']]
    dfs_rtg.append(data_rtg)

100.0% WSB 1996-1997 ...


In [12]:
# Stack the per-team rating tables and give the rating columns their '/100' names
data_rtg = pd.concat(dfs_rtg)
data_rtg = data_rtg.rename(columns = {'ORtg':'ORtg/100', 'DRtg':'DRtg/100'})
# Strip '*' characters so the names match the cleaned names in `data`
data_rtg['Player'] = data_rtg['Player'].str.replace('*', '', regex = False)

# Inner join: only rows whose (Player, Year, Tm) exists in both tables are kept
data = data.merge(data_rtg, how = 'inner', on = ['Player','Year','Tm'])

# Save the combined player table
data.to_csv(path + 'Scrapping/player_data.csv', index = None)

## C. Teams scraping

In [13]:
# Function to format team name post-2022, get rid of parentheses with seed following team name
#def team_name(tm, sd):
 #   l = []
 #   for team, seed in zip(tm, sd):
 #       if len(str(seed)) == 1:
 #           l.append(team[:-4])
 #       else:
  #          l.append(team[:-5])
  #  return l
###  NO LONGER FORMATTED AS SUCH ###

# Helper: archive the raw text of a downloaded page in the team data folder
def _save_team_page(page, filename):
    """Write ``page.text`` to 'Scrapping/team yearly data/<filename>' under the working path."""
    with open(path + f'Scrapping/team yearly data/{filename}', 'w', encoding = 'utf-8') as file:
        file.write(page.text)

# Helper: parse one conference's standings table and add the seed of each team
def _conference_standings(soup, table_id, conference_header):
    """Return the standings table ``table_id`` sorted by seed, with a 'Team' column.

    ``conference_header`` is the name of the column that holds the team names
    (e.g. 'Eastern Conference'); it is renamed to 'Team'.
    """
    table = soup.find('table', id = table_id)
    # Wrap the markup in StringIO: passing literal HTML to read_html is deprecated
    standings = pd.read_html(StringIO(str(table)))[0]
    standings = standings.rename(columns = {conference_header: 'Team'})
    # Seed = 1 + number of teams with a strictly better W/L%, so tied teams share a seed
    standings['Seed'] = [len(standings[standings['W/L%'] > wl])+1 for wl in standings['W/L%']]
    return standings.sort_values('Seed')

# Function to scrape team stats
def team_history(yr):
    """Scrape standings and ratings of every team for one season.

    Parameters
    ----------
    yr : int
        Season identifier used in the page URLs (``NBA_{yr}.html`` and
        ``NBA_{yr}_ratings.html``).

    Returns
    -------
    pandas.DataFrame
        Eastern then Western conference standings (each sorted by seed),
        without the 'GB' column, with 'Seed' and 'Year' columns, outer-merged
        on 'Team' with the ratings columns 'MOV', 'ORtg', 'DRtg', 'NRtg',
        'MOV/A', 'ORtg/A', 'DRtg/A', 'NRtg/A'. The index is reset.

    Raises
    ------
    requests.HTTPError
        If a page request comes back with an error status.
    """
    # Scrape team standings
    url = f'https://www.basketball-reference.com/leagues/NBA_{yr}.html'
    page = requests.get(url)
    page.raise_for_status() # Fail early on HTTP errors rather than while parsing
    _save_team_page(page, f'sds_{yr}.html')

    soup = BeautifulSoup(page.content, 'html')

    # Remove every in-table header row (class 'thead') so they are not read as
    # team rows; the previous version removed at most six of them
    for header_row in soup.find_all('tr', class_='thead'):
        header_row.decompose()

    # Conference DataFrames, each sorted by seed
    teams_E = _conference_standings(soup, 'divs_standings_E', 'Eastern Conference')
    teams_W = _conference_standings(soup, 'divs_standings_W', 'Western Conference')
    # Assemble league-wise table
    teams = pd.concat([teams_E, teams_W])

    # Data cleaning
    del teams['GB']
    teams['Team'] = teams['Team'].str.replace('*','', regex = False)
    teams['Year'] = yr # Keep track of year

    # Scrape team ratings
    url = f'https://www.basketball-reference.com/leagues/NBA_{yr}_ratings.html'
    page = requests.get(url)
    page.raise_for_status()
    _save_team_page(page, f'rtg_{yr}.html')

    soup = BeautifulSoup(page.content, 'html')
    # Drop the 'over_header' row when present; the unguarded call used to raise
    # AttributeError whenever that row was missing
    over_header = soup.find('tr', class_='over_header')
    if over_header is not None:
        over_header.decompose()
    table = soup.find('table')
    rating_cols = ['Team', 'MOV', 'ORtg', 'DRtg', 'NRtg', 'MOV/A', 'ORtg/A', 'DRtg/A', 'NRtg/A']
    # .copy() so the column assignment below works on an independent frame
    ratings = pd.read_html(StringIO(str(table)))[0][rating_cols].copy()

    # Deal with Charlotte Hornets name problem (trailing non-breaking space);
    # regex = False makes the literal match explicit, like the other replace calls
    ratings['Team'] = ratings['Team'].str.replace('Charlotte Hornets\xa0', 'Charlotte Hornets', regex = False)
    teams['Team'] = teams['Team'].str.replace('Charlotte Hornets\xa0', 'Charlotte Hornets', regex = False)

    # Merge standings and ratings; the outer join keeps teams found in only one table
    teams = teams.merge(ratings, on = 'Team', how = 'outer')
    teams = teams.reset_index(drop = True)

    return teams

In [14]:
# Collect the team table of every year from 1978 through 2022, printing progress as we go
years = [*range(1978, 2023)]
dfs = []
for yr in years:
    dfs += [team_history(yr)]
    clear_output(wait = True)
    print(str(yr) + ' / 2022')

# Stack the yearly tables into a single frame
data_tm = pd.concat(dfs)

2022 / 2022


In [15]:
# Lookup table from full team name to three-letter team code.
# 'Charlotte Hornets' appears twice: once plain and once with a trailing '\xa0'
# (non-breaking space); both map to 'CHO'.
team_dic = {'Syracuse Nationals': 'SYR','Buffalo Braves': 'BUF','New York Nets': 'NYN','Cincinnati Royals': 'CIN','Kansas City-Omaha Kings': 'KCO',
    'Capital Bullets': 'CAP','Sacramento Kings': 'SAC','Seattle SuperSonics': 'SEA','Los Angeles Clippers': 'LAC','Washington Wizards': 'WAS',
    'New Orleans/Oklahoma City Hornets': 'NOK','Phoenix Suns': 'PHO','New Jersey Nets': 'NJN','Washington Bullets': 'WSB','Boston Celtics': 'BOS',
    'Golden State Warriors': 'GSW','Denver Nuggets': 'DEN','Vancouver Grizzlies': 'VAN','Orlando Magic': 'ORL','Chicago Bulls': 'CHI','Utah Jazz': 'UTA',
    'Toronto Raptors': 'TOR','Los Angeles Lakers': 'LAL','Portland Trail Blazers': 'POR','Memphis Grizzlies': 'MEM','Miami Heat': 'MIA',
    'New Orleans Hornets': 'NOH','San Diego Rockets': 'SDR','Atlanta Hawks': 'ATL','Oklahoma City Thunder': 'OKC','Philadelphia Warriors': 'PHW',
    'Milwaukee Bucks': 'MIL','New Orleans Jazz': 'NOJ','San Antonio Spurs': 'SAS','Charlotte Hornets': 'CHO','Brooklyn Nets': 'BRK',
    'Cleveland Cavaliers': 'CLE','San Diego Clippers': 'SDC','San Francisco Warriors': 'SFW','Rochester Royals': 'ROC','Philadelphia 76ers': 'PHI',
    'Houston Rockets': 'HOU','Fort Wayne Pistons': 'FTW','Dallas Mavericks': 'DAL','New York Knicks': 'NYK','Kansas City Kings': 'KCK',
    'Indiana Pacers': 'IND','St. Louis Hawks': 'STL','Baltimore Bullets': 'BAL','Detroit Pistons': 'DET','Minnesota Timberwolves': 'MIN','New Orleans Pelicans': 'NOP',
    'Chicago Packers': 'CHP','Charlotte Bobcats': 'CHA','Chicago Zephyrs': 'CHZ','Minneapolis Lakers': 'MNL','Charlotte Hornets\xa0': 'CHO'}

In [16]:
# Translate every full team name into its three-letter code
Tm = []
for yr, tm in zip(data_tm['Year'], data_tm['Team']):
    # 'Charlotte Hornets' rows before 2015 get 'CHH' instead of the dictionary's 'CHO';
    # names missing from team_dic become None
    Tm.append('CHH' if tm == 'Charlotte Hornets' and yr < 2015 else team_dic.get(tm))

# Attach the codes, renumber the rows and save the team table
data_tm['Tm'] = Tm
data_tm = data_tm.reset_index(drop = True)
data_tm.to_csv(path + 'Scrapping/team_data.csv', index = None)