## Collects NBA player data from Basketball-Reference.com and ESPN.com

In [1]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import pandas as pd
import math

In [None]:
# Template cell: write a DataFrame to a CSV file inside a directory that may
# not exist yet.
from pathlib import Path

# Placeholder file name and target directory.
output_file = 'my_file.csv'
output_dir = Path('long_path/to/my_dir')

# Create the directory together with any missing parents; exist_ok=True keeps
# this from raising when the directory is already there.
output_dir.mkdir(parents=True, exist_ok=True)

# NOTE(review): `df` is not defined in any cell visible before this one --
# confirm where it is supposed to come from before running this cell.
df.to_csv(output_dir / output_file) 

In [2]:


# Maps an upper-case full team name (current or historical) to a three-letter
# team abbreviation. Entries are grouped by franchise; several historical
# names share the abbreviation of the franchise's current name.
TEAM_TO_ABBR = {
    # Hawks
    'ATLANTA HAWKS': 'ATL',
    'ST. LOUIS HAWKS': 'SLH',
    'MILWAUKEE HAWKS': 'MIL',
    'TRI-CITIES BLACKHAWKS': 'TCB',
    # Celtics
    'BOSTON CELTICS': 'BOS',
    # Nets
    'BROOKLYN NETS': 'BRK',
    'NEW JERSEY NETS': 'BRK',
    'NEW YORK NETS': 'NYN',
    # Bulls
    'CHICAGO BULLS': 'CHI',
    # Hornets / Bobcats
    'CHARLOTTE HORNETS': 'CHO',
    'CHARLOTTE BOBCATS': 'CHO',
    # Cavaliers, Mavericks, Nuggets
    'CLEVELAND CAVALIERS': 'CLE',
    'DALLAS MAVERICKS': 'DAL',
    'DENVER NUGGETS': 'DEN',
    # Pistons
    'DETROIT PISTONS': 'DET',
    'FORT WAYNE PISTONS': 'FWP',
    # Warriors
    'GOLDEN STATE WARRIORS': 'GSW',
    'SAN FRANCISCO WARRIORS': 'SFW',
    'PHILADELPHIA WARRIORS': 'PHI',
    # Rockets
    'HOUSTON ROCKETS': 'HOU',
    'SAN DIEGO ROCKETS': 'HOU',
    # Pacers
    'INDIANA PACERS': 'IND',
    # Clippers / Braves
    'LOS ANGELES CLIPPERS': 'LAC',
    'SAN DIEGO CLIPPERS': 'SDC',
    'BUFFALO BRAVES': 'BUF',
    # Lakers
    'LOS ANGELES LAKERS': 'LAL',
    'MINNEAPOLIS LAKERS': 'MIN',
    # Grizzlies
    'MEMPHIS GRIZZLIES': 'MEM',
    'VANCOUVER GRIZZLIES': 'MEM',
    # Heat, Bucks, Timberwolves
    'MIAMI HEAT': 'MIA',
    'MILWAUKEE BUCKS': 'MIL',
    'MINNESOTA TIMBERWOLVES': 'MIN',
    # Pelicans / New Orleans Hornets
    'NEW ORLEANS PELICANS': 'NOP',
    'NEW ORLEANS/OKLAHOMA CITY HORNETS': 'NOP',
    'NEW ORLEANS HORNETS': 'NOP',
    # Knicks
    'NEW YORK KNICKS': 'NYK',
    # Thunder / SuperSonics
    'OKLAHOMA CITY THUNDER': 'OKC',
    'SEATTLE SUPERSONICS': 'OKC',
    # Magic
    'ORLANDO MAGIC': 'ORL',
    # 76ers / Nationals
    'PHILADELPHIA 76ERS': 'PHI',
    'SYRACUSE NATIONALS': 'SYR',
    # Suns, Trail Blazers
    'PHOENIX SUNS': 'PHO',
    'PORTLAND TRAIL BLAZERS': 'POR',
    # Kings / Royals
    'SACRAMENTO KINGS': 'SAC',
    'KANSAS CITY KINGS': 'KCK',
    'KANSAS CITY-OMAHA KINGS': 'KCK',
    'CINCINNATI ROYALS': 'CIN',
    'ROCHESTER ROYALS': 'ROR',
    # Spurs, Raptors
    'SAN ANTONIO SPURS': 'SAS',
    'TORONTO RAPTORS': 'TOR',
    # Jazz
    'UTAH JAZZ': 'UTA',
    'NEW ORLEANS JAZZ': 'NOJ',
    # Wizards / Bullets / Zephyrs / Packers
    'WASHINGTON WIZARDS': 'WAS',
    'WASHINGTON BULLETS': 'WAS',
    'CAPITAL BULLETS': 'CAP',
    'BALTIMORE BULLETS': 'BAL',
    'CHICAGO ZEPHYRS': 'CHI',
    'CHICAGO PACKERS': 'CHI',

    # Defunct franchises -- intentionally left disabled:
    # 'ANDERSON PACKERS': 'AND',
    # 'CHICAGO STAGS': 'CHI',
    # 'INDIANAPOLIS OLYMPIANS': 'IND',
    # 'SHEBOYGAN RED SKINS': 'SRS',
    # 'ST. LOUIS BOMBERS': 'SLB',
    # 'WASHINGTON CAPITOLS': 'WAS',
    # 'WATERLOO HAWKS': 'WAT',
}

In [3]:
# Maps a team abbreviation (current or historical) to the abbreviation used
# for that franchise in the exported data. Current abbreviations map to
# themselves; 'TOT' is passed through unchanged.
ALL_NEW_ABBR = {
    'ATL': 'ATL',
    'BOS': 'BOS',
    # Nets
    'BRK': 'BRK',
    'NJN': 'BRK',
    # Charlotte / Chicago
    'CHH': 'CHO',
    'CHI': 'CHI',
    'CHO': 'CHO',
    'CHA': 'CHO',
    'CLE': 'CLE',
    'DAL': 'DAL',
    'DEN': 'DEN',
    'DET': 'DET',
    'GSW': 'GSW',
    'HOU': 'HOU',
    'IND': 'IND',
    # Clippers
    'LAC': 'LAC',
    'SDC': 'LAC',
    'LAL': 'LAL',
    # Grizzlies
    'MEM': 'MEM',
    'VAN': 'MEM',
    'MIA': 'MIA',
    'MIL': 'MIL',
    'MIN': 'MIN',
    # New Orleans
    'NOP': 'NOP',
    'NOK': 'NOP',
    'NOH': 'NOP',
    'NYK': 'NYK',
    # Thunder / SuperSonics
    'OKC': 'OKC',
    'SEA': 'OKC',
    'ORL': 'ORL',
    'PHI': 'PHI',
    'PHO': 'PHO',
    'POR': 'POR',
    'SAC': 'SAC',
    'SAS': 'SAS',
    'TOR': 'TOR',
    'UTA': 'UTA',
    # Washington
    'WAS': 'WAS',
    'WSB': 'WAS',
    'TOT': 'TOT',
}

In [4]:
def format_season(season):
    """Split a season's end year into the two parts used in file names.

    Returns a tuple ``(start_year, end_suffix)`` where ``start_year`` is
    ``season - 1`` and ``end_suffix`` is everything after the first two
    characters of ``str(season)``.

    Example: ``2021`` gives ``(2020, '21')``, which callers join as "2020-21".
    """
    end_suffix = str(season)[2:]
    start_year = season - 1
    return start_year, end_suffix

In [5]:
def get_player_salaries(season):
    """Scrape ESPN's NBA salary listing for one season into a DataFrame.

    The listing is paginated. The number of pages is derived from the
    "totalResults" element of the first page, assuming 40 players per page,
    and every page is then read in order.

    Args:
        season: Value substituted into the ``/year/<season>/`` part of the URL.

    Returns:
        A DataFrame indexed by 'RK' with the columns 'Player' (the name with
        its trailing ", <suffix>" part removed) and 'Salary' (a float with
        the '$' and thousands separators stripped).

    Raises:
        ValueError: If the first page has no "totalResults" element, so the
            page count cannot be determined.
    """
    players_per_page = 40
    headers_row = ['RK', 'NAME', 'TEAM', 'SALARY']
    url_template = "http://www.espn.com/nba/salaries/_/year/{0}/page/{1}"

    def fetch_page(page):
        # Close the HTTP response as soon as the document has been parsed, and
        # name the parser so results do not depend on which optional parsers
        # happen to be installed.
        with urlopen(url_template.format(season, page)) as response:
            return BeautifulSoup(response, 'html.parser')

    soup = fetch_page(1)

    results_div = soup.find('div', class_='totalResults')
    if results_div is None:
        raise ValueError(
            "Could not find the total result count on the ESPN salaries page "
            "for season {0}".format(season)
        )
    # The element text starts with the count, e.g. "<count> <something>".
    total_results = int(results_div.getText().split(' ')[0])
    total_pages = math.ceil(total_results / players_per_page)

    season_salaries = []
    for page in range(1, total_pages + 1):
        # Page 1 is already in hand from the result-count lookup above.
        if page > 1:
            soup = fetch_page(page)

        # The first <tr> is skipped, as in the header row of the table.
        for row in soup.findAll('tr')[1:]:
            cells = [td.getText() for td in row.find_all('td')]
            # Skip repeated header rows and anything that is not a
            # four-cell data row.
            if cells == headers_row or len(cells) != len(headers_row):
                continue

            # Keep only the part before the last ", "; a name without that
            # separator is kept whole instead of raising.
            name = cells[1].rsplit(', ', 1)[0]
            salary_value = float(cells[3].replace('$', '').replace(',', ''))
            season_salaries.append([cells[0], name, cells[2], salary_value])

    df = pd.DataFrame(season_salaries, columns=headers_row)
    df = df.set_index('RK')
    df = df.drop(columns=['TEAM'])
    df = df.rename(columns={"NAME": "Player", "SALARY": "Salary"})

    return df

In [7]:
def get_player_stats(season, regular_or_playoffs):
    """Scrape the per-game player stats table from basketball-reference.

    Args:
        season: Value substituted into ``NBA_<season>_per_game.html``.
        regular_or_playoffs: URL path segment placed before the page name
            (callers in this file pass "leagues" or "playoffs").

    Returns:
        A DataFrame with one row per ``<tr>`` after the first one. Column
        names come from the ``<th>`` cells of the first ``<tr>``, minus the
        leading one. Rows that contain no ``<td>`` cells come through as
        empty rows; callers in this file remove them with
        ``dropna(how='all')``.
    """
    url = "https://www.basketball-reference.com/{0}/NBA_{1}_per_game.html".format(regular_or_playoffs, season)
    # Close the HTTP response once parsed, and name the parser so results do
    # not depend on which optional parsers are installed.
    with urlopen(url) as response:
        soup = BeautifulSoup(response, 'html.parser')

    table_rows = soup.findAll('tr')

    # The first header cell is dropped because it has no matching <td> in the
    # data rows (presumably the rank column -- verify against the page).
    headers = [th.getText() for th in table_rows[0].findAll('th')][1:]

    player_stats = [[td.getText() for td in row.findAll('td')] for row in table_rows[1:]]
    return pd.DataFrame(player_stats, columns=headers)


In [8]:
def get_player_totals(season, regular_or_playoffs):
    """Scrape the season-totals player stats table from basketball-reference.

    Args:
        season: Value substituted into ``NBA_<season>_totals.html``.
        regular_or_playoffs: URL path segment placed before the page name
            (the caller in this file passes "leagues").

    Returns:
        A DataFrame with one row per ``<tr>`` after the first one. Column
        names come from the ``<th>`` cells of the first ``<tr>``, minus the
        leading one. Rows that contain no ``<td>`` cells come through as
        empty rows; the caller in this file removes them with
        ``dropna(how='all')``.
    """
    url = "https://www.basketball-reference.com/{0}/NBA_{1}_totals.html".format(regular_or_playoffs, season)
    # Close the HTTP response once parsed, and name the parser so results do
    # not depend on which optional parsers are installed.
    with urlopen(url) as response:
        soup = BeautifulSoup(response, 'html.parser')

    table_rows = soup.findAll('tr')

    # The first header cell is dropped because it has no matching <td> in the
    # data rows (presumably the rank column -- verify against the page).
    headers = [th.getText() for th in table_rows[0].findAll('th')][1:]

    player_stats = [[td.getText() for td in row.findAll('td')] for row in table_rows[1:]]
    return pd.DataFrame(player_stats, columns=headers)


In [9]:
def get_advanced_player_stats(season, regular_or_playoffs):
    """Scrape the advanced player stats table from basketball-reference.

    Args:
        season: Value substituted into ``NBA_<season>_advanced.html``.
        regular_or_playoffs: URL path segment placed before the page name
            (callers in this file pass "leagues" or "playoffs").

    Returns:
        A DataFrame with one row per ``<tr>`` after the first one. Column
        names come from the ``<th>`` cells of the first ``<tr>``, minus the
        leading one. The columns 'Pos', 'Age', 'G', 'MP', '\\xa0' and ' .1'
        are removed when present. Rows that contain no ``<td>`` cells come
        through as empty rows; callers in this file remove them with
        ``dropna(how='all')``.
    """
    url = "https://www.basketball-reference.com/{0}/NBA_{1}_advanced.html".format(regular_or_playoffs, season)
    # Close the HTTP response once parsed, and name the parser so results do
    # not depend on which optional parsers are installed.
    with urlopen(url) as response:
        soup = BeautifulSoup(response, 'html.parser')

    table_rows = soup.findAll('tr')

    # The first header cell is dropped because it has no matching <td> in the
    # data rows (presumably the rank column -- verify against the page).
    headers = [th.getText() for th in table_rows[0].findAll('th')][1:]

    advanced_stats = [[td.getText() for td in row.findAll('td')] for row in table_rows[1:]]
    df = pd.DataFrame(advanced_stats, columns=headers)

    # Drop columns that are not needed or carry no data (including the
    # blank-named spacer columns).
    unwanted_columns = ['Pos', 'Age', 'G', 'MP', '\xa0', ' .1']
    df = df.loc[:, ~df.columns.isin(unwanted_columns)]
    return df

In [23]:
def get_player_shooting_stats(season):
    """Scrape the regular-season shooting stats table from basketball-reference.

    Column names are taken from the second ``<tr>`` of the page (minus its
    leading ``<th>``), and several of them are then overwritten by position
    with more descriptive labels.

    Args:
        season: Value substituted into ``NBA_<season>_shooting.html``.

    Returns:
        A DataFrame built from every ``<tr>`` after the first two, with any
        '\\xa0' columns removed and all-empty rows dropped.
    """
    url = "https://www.basketball-reference.com/leagues/NBA_{}_shooting.html".format(season)
    # Close the HTTP response once parsed, and name the parser so results do
    # not depend on which optional parsers are installed.
    with urlopen(url) as response:
        soup = BeautifulSoup(response, 'html.parser')

    table_rows = soup.findAll('tr')

    # The second <tr> holds the column names; its first <th> is dropped
    # because it has no matching <td> in the data rows.
    headers = [th.getText() for th in table_rows[1].findAll('th')][1:]

    # Positional relabelling. NOTE(review): these indexes assume the page's
    # current column layout -- re-check them if the site's table changes.
    headers[-7] = "Dunks made"
    headers[-5] = "%3PA corner"
    headers[-4] = "3P% corner"
    headers[-2] = "Heaved att."
    headers[-1] = "Heaves made"
    headers[9:15] = ["% Att 2P", "% Att 0-3", "% Att 3-10", "% Att 10-16", "% Att 16-3P", "% Att 3P"]
    headers[16:22] = ["% Made 2P", "% Made 0-3", "% Made 3-10", "% Made 10-16", "% Made 16-3P", "% Made 3P"]
    headers[23:25] = ["% 2P Asst.", "% 3P Asst."]

    # Data starts after the two leading header rows.
    player_stats = [[td.getText() for td in row.findAll('td')] for row in table_rows[2:]]

    df = pd.DataFrame(player_stats, columns=headers)
    df = df.loc[:, ~df.columns.isin(['\xa0'])]
    df = df.dropna(how='all')

    return df

In [11]:
def get_adjusted_player_shooting_stats(season):
    """Fetch the adjusted-shooting page and print its ``<table>`` elements.

    This function is unfinished: it only prints what the parser finds and
    returns None. The DataFrame-building logic below is still commented out.

    Args:
        season: Value substituted into ``NBA_<season>_adj_shooting.html``.
    """
    url = "https://www.basketball-reference.com/leagues/NBA_{}_adj_shooting.html".format(season)
    # Close the HTTP response once parsed, and name the parser so results do
    # not depend on which optional parsers are installed.
    with urlopen(url) as response:
        soup = BeautifulSoup(response, 'html.parser')

    print(soup.findAll('table'))

    # TODO: draft of the parsing logic, not enabled yet. As written it
    # references `shooting_df`, which is not defined in this function, and
    # passes a set rather than a dict to `rename`.
#     headers = [th.getText() for th in soup.findAll('tr', limit=2)[0].findAll('th')]
#     headers = headers[1:]
#     rows = soup.findAll('tr')[1:]
#     player_stats = [[td.getText() for td in rows[i].findAll('td')]for i in range(len(rows))]
#     player_stats = player_stats[1:]

#     df = pd.DataFrame(player_stats, columns=headers)
#     df = shooting_df.loc[:, ~shooting_df.columns.isin(['\xa0', 'Pos', 'Age', 'G', 'MP', '2P', '3P'])]
#     df = df.rename(columns={"Team : Tm"})

#     return df
    

In [12]:
def get_player_stats_to_df(season):
    """Build one regular-season table of per-game stats, advanced stats and salaries.

    Per-game and advanced stats are fetched from the "leagues" section and
    inner-joined on ('Player', 'Tm'); all-empty rows are dropped; salaries
    are then left-joined on 'Player', so players without a salary entry are
    kept.

    Args:
        season: Season value forwarded to the three scraping helpers.

    Returns:
        The merged DataFrame.
    """
    section = "leagues"

    # Fetch order: per-game stats, advanced stats, then salaries.
    per_game = get_player_stats(season, section)
    advanced = get_advanced_player_stats(season, section)
    salaries = get_player_salaries(season)

    combined = pd.merge(per_game, advanced, on=["Player", "Tm"]).dropna(how='all')
    return pd.merge(combined, salaries, on="Player", how="left")

In [13]:
def get_playoffs_player_stats_to_df(season):
    """Build one playoffs table of per-game and advanced player stats.

    The two tables are inner-joined on 'Player' only, so a team column
    present in both comes out of the merge with pandas' default suffixes.
    'Tm_x' (from the per-game table) is renamed back to 'Tm' and normalised
    through ``ALL_NEW_ABBR``; any other suffixed column is left as it is.

    Args:
        season: Season value forwarded to the two scraping helpers.

    Returns:
        The merged DataFrame with all-empty rows dropped.
    """
    # Playoff pages live under the "playoffs" URL section.
    playoffs_url = 'playoffs'

    # Per-game and advanced stats tables.
    traditional_stats_df = get_player_stats(season, playoffs_url)
    advanced_stats_df = get_advanced_player_stats(season, playoffs_url)

    # Merge the two tables into one.
    stats = pd.merge(traditional_stats_df, advanced_stats_df, on="Player")
    stats = stats.dropna(how='all')
    stats = stats.rename(columns={'Tm_x': 'Tm'})

    # Normalise historical abbreviations. An abbreviation missing from the
    # mapping is kept unchanged instead of raising KeyError, so one unknown
    # team does not abort the whole export.
    stats['Tm'] = stats['Tm'].apply(lambda abbr: ALL_NEW_ABBR.get(abbr, abbr))

    return stats

In [14]:
def get_player_totals_to_df(season):
    """Build one regular-season table of player totals and salaries.

    Totals are fetched from the "leagues" section, all-empty rows are
    dropped, and salaries are left-joined on 'Player', so players without a
    salary entry are kept.

    Args:
        season: Season value forwarded to the two scraping helpers.

    Returns:
        The merged DataFrame.
    """
    section = "leagues"

    # Fetch order: totals first, then salaries.
    totals = get_player_totals(season, section)
    salaries = get_player_salaries(season)

    return pd.merge(totals.dropna(how='all'), salaries, on="Player", how="left")

In [15]:
def player_stats_to_csv(seasons, playoffs=False):
    """Write one player-stats CSV per season.

    Args:
        seasons: Iterable of season values; each is passed to the stats
            builder and to ``format_season`` for the file name.
        playoffs: When true, playoff stats are written under
            ``DataCollection/Player_Stats_Playoffs/``; otherwise
            regular-season stats (with salaries) are written under
            ``DataCollection/Player_Stats/``.

    The output directory is created if it does not exist yet.
    """
    # Local import so this function does not depend on another cell having run.
    from pathlib import Path

    # Choose the data source and file-name pattern once for all seasons.
    if playoffs:
        build_stats = get_playoffs_player_stats_to_df
        path_template = "DataCollection/Player_Stats_Playoffs/player_stats_playoffs_{0}-{1}.csv"
    else:
        build_stats = get_player_stats_to_df
        path_template = "DataCollection/Player_Stats/player_stats_{0}-{1}.csv"

    for season in seasons:
        player_stats = build_stats(season)

        first_year, second_year = format_season(season)
        csv_path = Path(path_template.format(first_year, second_year))

        # to_csv does not create missing directories, so make sure the
        # target folder exists before writing.
        csv_path.parent.mkdir(parents=True, exist_ok=True)
        player_stats.to_csv(csv_path, index=False)

In [16]:
def player_totals_to_csv(seasons):
    """Write one regular-season player-totals CSV per season.

    Files are written to
    ``DataCollection/Player_Totals/player_totals_<start>-<end>.csv``; the
    directory is created if it does not exist yet.

    Args:
        seasons: Iterable of season values; each is passed to
            ``get_player_totals_to_df`` and to ``format_season``.
    """
    # Local import so this function does not depend on another cell having run.
    from pathlib import Path

    for season in seasons:
        player_stats = get_player_totals_to_df(season)

        first_year, second_year = format_season(season)
        csv_path = Path(
            "DataCollection/Player_Totals/player_totals_{0}-{1}.csv".format(first_year, second_year)
        )

        # to_csv does not create missing directories, so make sure the
        # target folder exists before writing.
        csv_path.parent.mkdir(parents=True, exist_ok=True)
        player_stats.to_csv(csv_path, index=False)

In [21]:
def player_shooting_stats_to_csv(seasons):
    """Write one regular-season shooting-stats CSV per season.

    Files are written to
    ``DataCollection/Player_Shooting_Stats/Regular_Season/player_shooting_stats_<start>-<end>.csv``;
    the directory is created if it does not exist yet.

    Args:
        seasons: Iterable of season values; each is passed to
            ``get_player_shooting_stats`` and to ``format_season``.
    """
    # Local import so this function does not depend on another cell having run.
    from pathlib import Path

    for season in seasons:
        shooting_df = get_player_shooting_stats(season)
        # TODO: merge in the adjusted shooting stats once
        # get_adjusted_player_shooting_stats returns a DataFrame.

        first_year, second_year = format_season(season)
        csv_path = Path(
            "DataCollection/Player_Shooting_Stats/Regular_Season/player_shooting_stats_{0}-{1}.csv".format(first_year, second_year)
        )

        # to_csv does not create missing directories, so make sure the
        # target folder exists before writing.
        csv_path.parent.mkdir(parents=True, exist_ok=True)
        shooting_df.to_csv(csv_path, index=False)
    


In [18]:
# NBA seasons to export. range() excludes its end value, so this covers
# 2000 through 2022.
seasons = range(2000,2023)

# Writes one regular-season player-stats CSV per season.
player_stats_to_csv(seasons)

In [19]:
# NBA seasons to export. range() excludes its end value, so this covers
# 2000 through 2022.
seasons = range(2000,2023)

# Writes one regular-season player-totals CSV per season.
player_totals_to_csv(seasons)

2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022


In [None]:
# Writes one shooting-stats CSV per season. Relies on `seasons` having been
# defined by an earlier cell.
player_shooting_stats_to_csv(seasons)

In [None]:
# Playoff seasons to export. range() excludes its end value, so this covers
# 2000 through 2022.
playoff_seasons = range(2000, 2023)
# The second positional argument is `playoffs`; True selects the playoff
# data source and output folder.
player_stats_to_csv(playoff_seasons, True)