## Collects NBA player data from Basketball-Reference and ESPN.com

In [1]:
from urllib.request import urlopen
from bs4 import BeautifulSoup, Comment
import pandas as pd
import math

from ipynb.fs.full.UtilCollections import TEAM_TO_ABBR, ALL_NEW_ABBR
from ipynb.fs.full.UtilFunctions import format_season

In [2]:
def get_player_salaries(season):
    """Scrape ESPN's NBA salary listing for ``season`` into a DataFrame.

    The first listing page is fetched to read the total number of results,
    from which the number of pages is derived; every page is then fetched
    and its table rows are collected.

    Args:
        season: Year placed in the ESPN salaries URL.

    Returns:
        A DataFrame indexed by ``RK`` with the columns ``Player`` and
        ``Salary`` (salary as a float, with ``$`` and ``,`` stripped).
    """
    players_per_page = 40
    headers_row = ['RK', 'NAME', 'TEAM', 'SALARY']

    # Use the requested season here as well, so the page count belongs to
    # the same season whose pages are scraped below.
    url = "https://www.espn.com/nba/salaries/_/year/{0}".format(season)
    with urlopen(url) as response:
        soup = BeautifulSoup(response)

    total_results = soup.find('div', class_='totalResults').getText()
    total_results = int(total_results.split(' ')[0])
    total_pages = math.ceil(total_results / players_per_page)

    season_salaries = []

    # Walk every listing page instead of a single hard-coded one.
    for page in range(1, total_pages + 1):
        url = "http://www.espn.com/nba/salaries/_/year/{0}/page/{1}".format(season, page)
        with urlopen(url) as response:
            soup = BeautifulSoup(response)

        for row in soup.findAll('tr')[1:]:
            cells = [td.getText() for td in row.find_all('td')]
            # Skip repeated header rows and anything that is not a
            # four-cell player row.
            if cells == headers_row or len(cells) != len(headers_row):
                continue

            # Keep everything before the last ", " so a cell without the
            # separator, or with an extra comma in the name, does not raise.
            # Presumably the trailing part is the position - TODO confirm.
            name = cells[1].rsplit(', ', 1)[0]
            salary_value = float(cells[3].replace('$', '').replace(',', ''))
            season_salaries.append([cells[0], name, cells[2], salary_value])

    # The column names are the same fixed labels used above to recognise
    # header rows, so they no longer depend on parsing the page header.
    df = pd.DataFrame(season_salaries, columns=headers_row)
    df = df.set_index('RK')
    df = df.drop(columns=['TEAM'])
    df = df.rename(columns={"NAME": "Player", "SALARY": "Salary"})

    return df

In [3]:
def get_player_stats(season, regular_or_playoffs):
    """Download a basketball-reference per-game table as a DataFrame.

    Args:
        season: Value placed in the ``NBA_<season>_per_game.html`` part of
            the URL.
        regular_or_playoffs: URL path segment placed before the page name
            (callers in this file pass "leagues" or "playoffs").

    Returns:
        A DataFrame with one row per ``<tr>`` after the first; every value
        is the cell text. A ``<tr>`` without ``<td>`` cells contributes an
        empty row.
    """
    page_url = "https://www.basketball-reference.com/{0}/NBA_{1}_per_game.html".format(regular_or_playoffs, season)
    page = BeautifulSoup(urlopen(page_url))

    # Column labels come from the <th> cells of the first <tr>; the first
    # label is dropped because the rows below only collect <td> cells.
    first_row = page.findAll('tr', limit=2)[0]
    column_names = [cell.getText() for cell in first_row.findAll('th')][1:]

    table_rows = []
    for tr in page.findAll('tr')[1:]:
        table_rows.append([cell.getText() for cell in tr.findAll('td')])

    return pd.DataFrame(table_rows, columns=column_names)


In [4]:
def get_player_totals(season, regular_or_playoffs):
    """Download a basketball-reference totals table as a DataFrame.

    Args:
        season: Value placed in the ``NBA_<season>_totals.html`` part of
            the URL.
        regular_or_playoffs: URL path segment placed before the page name.

    Returns:
        A DataFrame of cell texts, one row per ``<tr>`` after the first.
    """
    page_url = "https://www.basketball-reference.com/{0}/NBA_{1}_totals.html".format(regular_or_playoffs, season)
    page = BeautifulSoup(urlopen(page_url))

    # Labels are the <th> texts of the first <tr>, minus the leading one.
    header_cells = page.findAll('tr', limit=2)[0].findAll('th')
    column_names = [cell.getText() for cell in header_cells]
    column_names = column_names[1:]

    # Only <td> cells are read, so a row made purely of <th> cells ends up
    # as an empty list here.
    table_rows = [
        [cell.getText() for cell in tr.findAll('td')]
        for tr in page.findAll('tr')[1:]
    ]
    return pd.DataFrame(table_rows, columns=column_names)


In [5]:
def get_advanced_player_stats(season, regular_or_playoffs):
    """Download a basketball-reference advanced-stats table as a DataFrame.

    Args:
        season: Value placed in the ``NBA_<season>_advanced.html`` part of
            the URL.
        regular_or_playoffs: URL path segment placed before the page name.

    Returns:
        A DataFrame of cell texts without the columns named ``Pos``,
        ``Age``, ``G``, ``MP``, ``\\xa0`` and `` .1``.
    """
    page_url = "https://www.basketball-reference.com/{0}/NBA_{1}_advanced.html".format(regular_or_playoffs, season)
    page = BeautifulSoup(urlopen(page_url))

    # Labels are the <th> texts of the first <tr>, minus the leading one.
    first_row = page.findAll('tr', limit=2)[0]
    column_names = [cell.getText() for cell in first_row.findAll('th')][1:]

    table_rows = []
    for tr in page.findAll('tr')[1:]:
        table_rows.append([cell.getText() for cell in tr.findAll('td')])
    df = pd.DataFrame(table_rows, columns=column_names)

    # Drop columns that are not needed or that carry no data.
    unwanted = df.columns.isin(['Pos', 'Age', 'G', 'MP', '\xa0', ' .1'])
    return df.loc[:, ~unwanted]

In [6]:
def get_player_shooting_stats(season):
    """Download a basketball-reference shooting table as a DataFrame.

    Args:
        season: Value placed in the ``NBA_<season>_shooting.html`` URL.

    Returns:
        A DataFrame of cell texts with the ``\\xa0`` columns and the
        all-missing rows removed, relabelled with the 28 column names
        assigned at the end of this function.
    """
    page_url = "https://www.basketball-reference.com/leagues/NBA_{}_shooting.html".format(season)
    page = BeautifulSoup(urlopen(page_url))

    # Labels come from the second <tr>; the leading label is dropped.
    header_cells = page.findAll('tr', limit=2)[1].findAll('th')
    labels = [cell.getText() for cell in header_cells][1:]

    # Rename headers to make the data easier to read. The end-relative
    # positions are assigned before the slices, as in the original order.
    tail_renames = (
        (-7, "Dunks made"),
        (-5, "%3PA corner"),
        (-4, "3P% corner"),
        (-2, "Heaved att."),
        (-1, "Heaves made"),
    )
    for position, label in tail_renames:
        labels[position] = label
    labels[9:15] = ["% Att 2P", "% Att 0-3ft", "% Att 3-10ft", "% Att 10-16ft", "% Att 16ft-3P", "% Att 3P"]
    labels[16:22] = ["% Made 2P", "% Made 0-3ft", "% Made 3-10ft", "% Made 10-16ft", "% Made 16ft-3P", "% Made 3P"]
    labels[23:25] = ["% 2P Asst.", "% 3P Asst."]

    # The first two <tr> elements are skipped; only <td> cells are read.
    body_rows = [
        [cell.getText() for cell in tr.findAll('td')]
        for tr in page.findAll('tr')[2:]
    ]

    df = pd.DataFrame(body_rows, columns=labels)
    spacer_columns = df.columns.isin(['\xa0'])
    df = df.loc[:, ~spacer_columns]
    df = df.dropna(how='all')
    df.columns = [
        "Player", "Pos", "Age", "Tm", "G", "MP", "FG%", "AvgDistance", "2PAr",
        "0-3Ar", "3-10Ar", "10-16Ar", "16-3PAr", "3PAr", "2P%", "0-3%", "3-10%", "10-16%",
        "16-3P%", "3P%", "2P_Assisted", "3P_Assisted", "DunksAr", "Dunks", "3PCornerAr", "3PCorner%",
        "HeavesAttempted", "Heaves",
    ]
    return df

In [7]:
def get_awards_history(seasons):
    """Scrape each season's awards table and write it to a CSV file.

    For every season the league page is fetched, the HTML comment that
    contains ``id="all_awards"`` is parsed and spliced back into the
    document, and the ``all_awards`` table is read with pandas. Rows with
    any missing value and the "Player of the Seeding Games" row are
    removed, the remaining rows are labelled MVP, ROY, DPOY, MIP and SMOY,
    and the result is written to
    ``DataCollection/Awards/awards_<first>-<second>.csv``.

    Args:
        seasons: Iterable of season values accepted by ``format_season``.

    Raises:
        ValueError: If the awards table is not found on a page, or (raised
            by pandas) if the filtered table does not have exactly five
            rows.
    """
    # Local import so this block does not rely on the file's import cell.
    from io import StringIO

    for season in seasons:
        first_year, second_year = format_season(season)
        url = "https://www.basketball-reference.com/leagues/NBA_{}.html#all_all_awards".format(season)
        with urlopen(url) as response:
            soup = BeautifulSoup(response)

        # The table is looked for inside HTML comments; the matching
        # comment is replaced by its parsed content so find() can see it.
        for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
            if 'id="all_awards"' in comment.string:
                tag = BeautifulSoup(comment, 'html.parser')
                comment.replace_with(tag)
                break

        table = soup.find('table', id='all_awards')
        if table is None:
            # Previously str(None) was handed to pandas, which failed with
            # an uninformative "No tables found".
            raise ValueError("No table with id 'all_awards' found at {0}".format(url))

        # Wrap the markup in StringIO: pandas deprecates passing literal
        # HTML strings to read_html.
        df = pd.read_html(StringIO(str(table)))[0]
        df = df.dropna(how='any')
        df = df[df['Award'] != "Player of the Seeding Games"]
        # NOTE(review): assumes exactly these five awards, in this order,
        # remain after filtering - confirm for every season scraped.
        df.index = ['MVP', 'ROY', 'DPOY', 'MIP', 'SMOY']
        df = df.drop('Award', axis=1)
        df.to_csv("DataCollection/Awards/awards_{0}-{1}.csv".format(first_year, second_year))

In [54]:
def get_all_star_voting(seasons):
    """Scrape each season's all-star voting tables and write them to CSV.

    Four pages are fetched per season, one table from each (ids ``bc-w``,
    ``fc-w``, ``bc-e`` and ``fc-e``). The tables are concatenated, the top
    level of the column header is dropped, the ``Season`` and unnamed
    spacer columns are removed, and the remaining columns are renamed.

    Args:
        seasons: Iterable of season values accepted by ``format_season``.

    Raises:
        ValueError: If an expected table is missing from a page.
    """
    # Local import so this block does not rely on the file's import cell.
    from io import StringIO

    url_template = "https://www.basketball-reference.com/allstar/NBA_{0}_voting{1}.html"
    # (URL suffix, id of the table to read from that page)
    pages = [
        ("", "bc-w"),
        ("-frontcourt-western-conference", "fc-w"),
        ("-backcourt-eastern-conference", "bc-e"),
        ("-frontcourt-eastern-conference", "fc-e"),
    ]

    for season in seasons:
        first_year, second_year = format_season(season)

        frames = []
        for suffix, table_id in pages:
            url = url_template.format(season, suffix)
            with urlopen(url) as response:
                soup = BeautifulSoup(response)

            table = soup.find('table', id=table_id)
            if table is None:
                # Previously str(None) reached pandas and failed with a
                # bare "No tables found"; name the page and table instead.
                # NOTE(review): the table may be absent for some seasons or
                # live inside an HTML comment - confirm against the site.
                raise ValueError("No table with id '{0}' found at {1}".format(table_id, url))

            # pandas deprecates literal HTML strings; wrap in StringIO.
            frames.append(pd.read_html(StringIO(str(table)))[0])

        df = pd.concat(frames)

        df = df.droplevel(0, axis=1)
        df = df.drop(columns=["Season", 'Unnamed: 4_level_1', 'Unnamed: 7_level_1', 'Unnamed: 10_level_1'])
        df.columns = ["Player", "Fan_Votes", "Fan_Rank", "Player_Votes", "Player_Rank", "Media_Votes", "Media_Rank", "Score"]

        # One file per season; the old path "DataCol" was truncated and was
        # overwritten on every iteration.
        # NOTE(review): directory and file name follow the pattern of the
        # other writers in this file - confirm the intended location; the
        # directory must already exist.
        df.to_csv("DataCollection/All_Star_Voting/all_star_voting_{0}-{1}.csv".format(first_year, second_year))

In [58]:
# Scrape all-star voting for every season in `seasons`.
# NOTE(review): `seasons` is assigned in a cell further down this file, so
# that cell has to run first. The output recorded below shows this call
# ending in "ValueError: No tables found".
get_all_star_voting(seasons)

ValueError: No tables found

In [8]:
def get_regular_season_player_stats_to_df(season, salaries_df):
    """Combine regular-season per-game stats, advanced stats and salaries.

    Args:
        season: Season passed on to the two stat scrapers.
        salaries_df: DataFrame with a ``Player`` column to join on.

    Returns:
        The per-game and advanced tables merged on ``Player`` and ``Tm``,
        without all-missing rows, left-joined with ``salaries_df`` on
        ``Player``.
    """
    source = "leagues"

    per_game = get_player_stats(season, source)
    advanced = get_advanced_player_stats(season, source)

    combined = per_game.merge(advanced, on=["Player", "Tm"])
    combined = combined.dropna(how='all')

    # Left join: players without a salary row are kept.
    return combined.merge(salaries_df, on="Player", how="left")

In [9]:
def get_playoffs_player_stats_to_df(season):
    """Combine playoff per-game and advanced stats for one season.

    Args:
        season: Season passed on to the two stat scrapers.

    Returns:
        The two tables merged on ``Player``, without all-missing rows, with
        ``Tm_x`` renamed to ``Tm`` and each ``Tm`` value replaced by its
        ``ALL_NEW_ABBR`` entry.

    Raises:
        KeyError: If a ``Tm`` value has no entry in ``ALL_NEW_ABBR``.
    """
    source = 'playoffs'

    per_game = get_player_stats(season, source)
    advanced = get_advanced_player_stats(season, source)

    combined = per_game.merge(advanced, on="Player")
    combined = combined.dropna(how='all')
    # Merging on "Player" alone suffixes the shared team column; keep the
    # left-hand one under its plain name.
    combined = combined.rename(columns={'Tm_x': 'Tm'})

    combined['Tm'] = combined['Tm'].apply(lambda abbr: ALL_NEW_ABBR[abbr])

    return combined

In [10]:
def get_player_totals_to_df(season):
    """Combine regular-season player totals with salaries for one season.

    Args:
        season: Season passed to the totals and salary scrapers.

    Returns:
        The totals table without all-missing rows, left-joined with the
        salary table on ``Player``.
    """
    source = "leagues"

    # Totals are fetched before salaries, as in the original call order.
    totals = get_player_totals(season, source)
    salaries = get_player_salaries(season)

    totals = totals.dropna(how='all')
    return totals.merge(salaries, on="Player", how="left")

In [11]:
def player_stats_to_csv(seasons, salaries_df=None, playoffs=False):
    """Write one player-stats CSV per season.

    Args:
        seasons: Iterable of season values accepted by ``format_season``.
        salaries_df: Salary table joined onto regular-season stats. When
            omitted (``None``), salaries are scraped per season with
            ``get_player_salaries``. Not used for playoffs.
        playoffs: When true, playoff stats are written to
            ``DataCollection/Player_Stats_Playoffs``; otherwise
            regular-season stats go to ``DataCollection/Player_Stats``.
    """
    for season in seasons:
        first_year, second_year = format_season(season)

        if playoffs:
            player_stats = get_playoffs_player_stats_to_df(season)
            csv_file_name = "DataCollection/Player_Stats_Playoffs/player_stats_playoffs_{0}-{1}.csv".format(first_year, second_year)
        else:
            # Fall back to scraping this season's salaries so callers that
            # only pass the seasons keep working.
            if salaries_df is None:
                season_salaries = get_player_salaries(season)
            else:
                season_salaries = salaries_df
            player_stats = get_regular_season_player_stats_to_df(season, season_salaries)
            csv_file_name = "DataCollection/Player_Stats/player_stats_{0}-{1}.csv".format(first_year, second_year)

        player_stats.to_csv(csv_file_name, index=False)

In [12]:
def player_totals_to_csv(seasons):
    """Write one player-totals CSV per season.

    Args:
        seasons: Iterable of season values accepted by ``format_season``.
    """
    path_template = "DataCollection/Player_Totals/player_totals_{0}-{1}.csv"
    for season in seasons:
        totals = get_player_totals_to_df(season)

        first_year, second_year = format_season(season)
        # The row index is not written to the file.
        totals.to_csv(path_template.format(first_year, second_year), index=False)

In [13]:
def player_shooting_stats_to_csv(seasons):
    """Write one regular-season shooting-stats CSV per season.

    Args:
        seasons: Iterable of season values accepted by ``format_season``.
    """
    path_template = "DataCollection/Player_Shooting_Stats/Regular_Season/player_shooting_stats_{0}-{1}.csv"
    for season in seasons:
        shooting = get_player_shooting_stats(season)

        first_year, second_year = format_season(season)
        # The row index is not written to the file.
        shooting.to_csv(path_template.format(first_year, second_year), index=False)

In [57]:
# NBA seasons to analyze: 2000 through 2023 (range() excludes its end value)
seasons = range(2000,2024)

In [95]:
# Regular-season stats for 2023 only. The salary table is passed explicitly
# because player_stats_to_csv takes it as its second argument.
player_stats_to_csv([2023], get_player_salaries(2023))

In [None]:
# Write the player-totals CSV for every season in `seasons`.
player_totals_to_csv(seasons)

In [21]:
# Write the shooting-stats CSV for every season in `seasons`.
player_shooting_stats_to_csv(seasons)

In [None]:
# Playoff stats for every season. `playoffs` is passed by keyword: a
# positional True would bind to `salaries_df` and leave `playoffs` False.
# The salary table is not used for playoffs, so None is passed.
player_stats_to_csv(seasons, None, playoffs=True)