In [58]:
### IMPORT NECESSARY LIBRARIES / PACKAGES ###
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import requests
from IPython.display import clear_output
from matplotlib import pyplot as plt
from time import sleep
import warnings
warnings.filterwarnings('ignore')

In [59]:
cap = pd.read_csv('salary_cap.csv') # Load salary cap history
cap['Salary Cap'] = cap['Salary Cap'].str.replace('$', '').str.replace(',', '').astype(float)
cap['2021$'] = cap['2021$'].str.replace('$', '').str.replace(',', '').astype(float)

cols_adv = ['Player','Age','G','MP','PER','TS%','3PAr','FTr','ORB%','DRB%','TRB%','AST%','STL%','BLK%','TOV%','USG%','OWS','DWS','WS','WS/48','OBPM','DBPM','BPM','VORP']
cols_pg = ['Player','GS','FG','FGA','FG%','3P','3PA','3P%','2P','2PA','2P%','eFG%','FT','FTA','FT%','ORB','DRB','TRB','AST','STL','BLK','TOV','PF','PTS/G']
cols_sal = ['Player','Salary']

In [1]:
# Function takes into input team and year and returns advanced statistics / salaries df
def adv_and_sal(team, year, min_mins = 250, min_games = 10):

    # URL to scrape to get headers for loop
    url = f"https://www.basketball-reference.com/teams/{team}/{year}.html"
    page = requests.get(url)
    # Raise an error if webpage is empty
    if int(str(page)[-5:-2]) == 404:
        print(team, year)
        raise ValueError('Error')

    ### SALARIES ###
    table_sal = page.text.split('<div class="table_container" id="div_salaries2">')[1].split('</table>')[0] + '</table>'
    soup_sal = BeautifulSoup(table_sal, 'html')
    salaries = pd.read_html(str(soup_sal))[0].rename(columns = {'Unnamed: 1': 'Player'})
    salaries['Tm'] = len(salaries) * [team]
    del salaries['Rk']

    ### ADVANCED STATS ###
    table_adv = page.text.split('<div class="table_container current" id="div_advanced">')[1].split('</table>')[0] + '</table>'
    soup_adv = BeautifulSoup(table_adv, 'html')
    adv_stats = pd.read_html(str(soup_adv))[0].rename(columns = {'Unnamed: 1': 'Player'})
    adv_stats['Tm'] = len(adv_stats) * [team]
    del adv_stats['Rk']

    ### PER GAME ###
    table_pg = page.text.split('<div class="table_container current" id="div_per_game">')[1].split('</table>')[0] + '</table>'
    soup_pg = BeautifulSoup(table_pg, 'html')
    per_game = pd.read_html(str(soup_pg))[0].rename(columns = {'Unnamed: 1': 'Player'})
    per_game['Tm'] = len(per_game) * [team]
    del per_game['Rk']

    temp = pd.merge(adv_stats[cols_adv], salaries[cols_sal], on = ['Player', 'Tm'])
    df = pd.merge(per_game[cols_pg], temp, on = ['Player', 'Tm'])

    df = df.apply(pd.to_numeric, errors = 'ignore')
    df = df[df['MP'] >= 2]

    for stat in list(df) :
        df[stat] = df[stat].replace(np.nan, 0)

    df['Year'] = len(df) * [year]
    df['Salary'] = salaries['Salary'].str.replace('$', '').str.replace(',', '').astype(float) # Convert salary to numeric
    df['Salary cap'] = len(df) * [cap.groupby('Year').get_group(year)['Salary Cap'].iloc[0]] # Add salary cap
    df['%Cap'] = [int(i) / int(j) for i, j in zip(list(df['Salary']), list(df['Salary cap']))]
        
    df = df[df['%Cap'] > 0.5 / 100]
    
    return df

In [70]:
### ITERATING OVER TEAMS AND YEARS ###
years = np.arange(2000, 2023)
Control = True
dfs_yearly = []

for yr in years:

    # Set list of teams for given year
    url = f"https://www.basketball-reference.com/leagues/NBA_{yr}_coaches.html"
    page = requests.get(url)
    soup_y = BeautifulSoup(page.content, features = 'html')
    soup_y.find('tr', 'over_header').decompose()
    soup_y.find('tr', 'over_header').decompose()
    coaches = pd.read_html(str(soup_y))[0]
    teams = list(set(coaches['Tm']))
    
    dfs = []
    for i, tm in enumerate(teams):
        sleep(2)
        if Control:
            print(str(i+1) + '.' + tm + ' ', end = '')
        dfs.append(adv_and_sal(tm, yr))
        
    dfs_yearly.append(pd.concat(dfs).reset_index(drop = True))
    
    if Control:
        print('')
        print('Year ' + str(yr) + ': Processed!')

    clear_output(wait = True)
    sleep(2)

1.MIA 2.HOU 3.UTA 4.POR 5.ATL 6.TOR 7.LAL 8.MIL 9.IND 10.DEN 11.LAC 12.CHI 13.CLE 14.PHI 15.PHO 16.NYK 17.DAL 18.DET 19.BOS 20.CHO 21.BRK 22.SAC 23.SAS 24.WAS 25.OKC 26.MIN 27.GSW 28.ORL 29.MEM 30.NOP 
Year 2022: Processed!


In [75]:
Data_Frame = pd.concat(dfs_yearly).reset_index(drop = True).sort_values(by = '%Cap', ascending = False)
Data_Frame.head()

Unnamed: 0,Player,GS,FG,FGA,FG%,3P,3PA,3P%,2P,2PA,...,WS,WS/48,OBPM,DBPM,BPM,VORP,Salary,Year,Salary cap,%Cap
1924,Kevin Garnett,82,9.8,19.6,0.499,0.1,0.5,0.256,9.7,19.1,...,18.3,0.272,6.8,3.3,10.2,10.0,28000000.0,2004,43840000.0,0.638686
1983,Dwyane Wade,77,8.2,17.1,0.478,0.2,0.6,0.289,8.0,16.5,...,11.0,0.177,3.9,0.7,4.6,4.9,27696430.0,2005,43870000.0,0.63133
1540,Kevin Garnett,82,9.1,18.1,0.502,0.2,0.9,0.282,8.8,17.2,...,15.6,0.225,6.0,2.4,8.4,8.7,25200000.0,2003,40271000.0,0.62576
1317,Kobe Bryant,82,10.6,23.5,0.451,1.5,4.0,0.383,9.1,19.5,...,14.9,0.21,6.3,0.7,7.1,7.7,23571429.0,2003,40271000.0,0.58532
1700,Kobe Bryant,64,7.9,18.1,0.438,1.1,3.3,0.327,6.8,14.8,...,10.7,0.21,5.1,0.5,5.6,4.7,24749999.0,2004,43840000.0,0.564553


In [76]:
Data_Frame.to_csv('nba_salaries_data.csv', index = False)