In [1]:
import os
from io import StringIO
from urllib.request import urlopen

import pandas as pd
from bs4 import BeautifulSoup


In [2]:
def format_season(season):
    """Split a season's end year into the two pieces used in CSV file names.

    Example: ``2021`` -> ``(2020, '21')``, which callers join as ``2020-21``.

    Parameters
    ----------
    season : int
        Year in which the season ends.

    Returns
    -------
    tuple
        ``(season - 1, last characters of str(season) from index 2 on)``;
        the first element is a number, the second a string.
    """
    return season - 1, str(season)[2:]

In [3]:
# Maps upper-cased full team names (the form the scraping functions in this
# notebook produce after stripping '*' and calling .upper()) to three-letter
# team abbreviations.
#
# Several former names deliberately share the abbreviation of another entry
# (e.g. 'NEW JERSEY NETS' -> 'BRK', 'SEATTLE SUPERSONICS' -> 'OKC'), so rows
# scraped under either name end up with the same code.
#
# Commented-out entries are older names that are currently disabled --
# presumably because they do not occur in the seasons scraped here (2001 onward).
# A name that is missing from this dict makes the scraping functions raise
# KeyError, so re-enable the relevant entries before scraping earlier seasons.
TEAM_TO_ABBR = {
    'ATLANTA HAWKS': 'ATL',
#     'ST. LOUIS HAWKS': 'SLH',
#     'MILWAUKEE HAWKS': 'MIL',
#     'TRI-CITIES BLACKHAWKS': 'TCB',
    'BOSTON CELTICS': 'BOS',
    'BROOKLYN NETS': 'BRK',
    'NEW JERSEY NETS' : 'BRK',
#     'NEW YORK NETS' : 'NYN',
    'CHICAGO BULLS': 'CHI',
    'CHARLOTTE HORNETS': 'CHO',
    'CHARLOTTE BOBCATS' : 'CHO',
    'CLEVELAND CAVALIERS': 'CLE',
    'DALLAS MAVERICKS': 'DAL',
    'DENVER NUGGETS': 'DEN',
    'DETROIT PISTONS': 'DET',
#     'FORT WAYNE PISTONS': 'FWP',
    'GOLDEN STATE WARRIORS': 'GSW',
#     'SAN FRANCISCO WARRIORS': 'SFW',
#     'PHILADELPHIA WARRIORS': 'PHI',
    'HOUSTON ROCKETS': 'HOU',
#     'SAN DIEGO ROCKETS': 'HOU',
    'INDIANA PACERS': 'IND',
    'LOS ANGELES CLIPPERS': 'LAC',
#     'SAN DIEGO CLIPPERS': 'SDC',
#     'BUFFALO BRAVES': 'BUF',
    'LOS ANGELES LAKERS': 'LAL',
#     'MINNEAPOLIS LAKERS': 'MIN',
    'MEMPHIS GRIZZLIES': 'MEM',
    'VANCOUVER GRIZZLIES' : 'MEM',
    'MIAMI HEAT': 'MIA',
    'MILWAUKEE BUCKS': 'MIL',
    'MINNESOTA TIMBERWOLVES': 'MIN',
    'NEW ORLEANS PELICANS' : 'NOP',
    'NEW ORLEANS/OKLAHOMA CITY HORNETS' : 'NOP',
    'NEW ORLEANS HORNETS' : 'NOP',
    'NEW YORK KNICKS' : 'NYK',
    'OKLAHOMA CITY THUNDER' : 'OKC',
    'SEATTLE SUPERSONICS' : 'OKC',
    'ORLANDO MAGIC' : 'ORL',
    'PHILADELPHIA 76ERS' : 'PHI',
#     'SYRACUSE NATIONALS' : 'SYR',
    'PHOENIX SUNS' : 'PHO',
    'PORTLAND TRAIL BLAZERS' : 'POR',
    'SACRAMENTO KINGS' : 'SAC',
#     'KANSAS CITY KINGS' : 'KCK',
#     'KANSAS CITY-OMAHA KINGS' : 'KCK',
#     'CINCINNATI ROYALS' : 'CIN',
#     'ROCHESTER ROYALS': 'ROR',
    'SAN ANTONIO SPURS' : 'SAS',
    'TORONTO RAPTORS' : 'TOR',
    'UTAH JAZZ' : 'UTA',
#     'NEW ORLEANS JAZZ' : 'NOJ',
    'WASHINGTON WIZARDS' : 'WAS',
#     'WASHINGTON BULLETS' : 'WAS',
#     'CAPITAL BULLETS' : 'CAP',
#     'BALTIMORE BULLETS' : 'BAL',
#     'CHICAGO ZEPHYRS' : 'CHI',
#     'CHICAGO PACKERS' : 'CHI',

    # DEFUNCT FRANCHISES (all currently disabled)
#     'ANDERSON PACKERS': 'AND',
#     'CHICAGO STAGS': 'CHI',
#     'INDIANAPOLIS OLYMPIANS': 'IND',
#     'SHEBOYGAN RED SKINS': 'SRS',
#     'ST. LOUIS BOMBERS': 'SLB',
#     'WASHINGTON CAPITOLS' : 'WAS',
#     'WATERLOO HAWKS': 'WAT',
    }

In [4]:
def get_team_stats(seasons, playoffs=False):
    """Download per-game team stats for each season and write one CSV per season.

    For every season the ``div_per_game-team`` table is fetched from the
    Sports Reference widget endpoint (``site=bbr``), trimmed to the rows above
    the "League Average" row, tagged with a ``Made_Playoffs`` flag, and saved
    under ``DataCollection/``.

    Parameters
    ----------
    seasons : iterable of int
        Season end years (``2021`` means the 2020-21 season). An empty
        iterable is a no-op.
    playoffs : bool, default False
        If True, scrape the ``playoffs`` pages (team column ``"Tm"``) and write
        to ``DataCollection/Team_Stats_Playoffs/``; otherwise scrape the
        ``leagues`` pages (team column ``"Team"``) and write to
        ``DataCollection/Team_Stats/``.

    Returns
    -------
    None
        The result is the set of CSV files written to disk.

    Raises
    ------
    ValueError
        If the response contains no ``<table>`` element.
    IndexError
        If the table has no "League Average" row.
    KeyError
        If a team name is not present in ``TEAM_TO_ABBR``.
    """
    selector = "div_per_game-team"
    if playoffs:
        regular_or_playoffs = "playoffs"
        team_column = "Tm"
    else:
        regular_or_playoffs = "leagues"
        team_column = "Team"

    for season in seasons:
        # Reads the html data and turns it into a DataFrame
        url = f'https://widgets.sports-reference.com/wg.fcgi?css=1&site=bbr&url=%2F{regular_or_playoffs}%2FNBA_{season}.html&div={selector}'
        # The context manager closes the HTTP connection once the body is read.
        with urlopen(url) as response:
            html = response.read()
        # No parser is named here, so bs4 picks the best one that is installed.
        soup = BeautifulSoup(html)
        table = soup.find('table')
        if table is None:
            raise ValueError(f"No <table> element found in the response from {url}")
        # read_html wants a file-like object; passing literal HTML is deprecated.
        df = pd.read_html(StringIO(str(table)))[0]

        # Eliminates the league average row (and anything below it). The copy
        # makes the frame independent of the parsed table, so the column
        # assignments below do not raise SettingWithCopyWarning.
        league_avg_index = df[df[team_column] == 'League Average'].index[0]
        df = df.iloc[:league_avg_index].copy()

        # Marks whether teams made the playoffs or not: such teams carry a '*'
        # in their name.
        df["Made_Playoffs"] = df[team_column].str.contains('*', regex=False)

        # Normalise names to the keys of TEAM_TO_ABBR; an unknown name raises
        # KeyError instead of silently producing a missing value.
        team_names = df[team_column].str.replace('*', '', regex=False).str.upper()
        df[team_column] = team_names.map(lambda name: TEAM_TO_ABBR[name])

        # The 'Rk' column is not part of the output file.
        df = df.drop(columns='Rk')

        first_year, second_year = format_season(season)

        # NOTE(review): the playoffs file name has no underscore before the
        # season ("playoffs2020-21"), unlike the other playoff outputs in this
        # notebook. Left unchanged so existing readers of these files keep working.
        if playoffs:
            csv_file_name = f"DataCollection/Team_Stats_Playoffs/team_stats_playoffs{first_year}-{second_year}.csv"
        else:
            csv_file_name = f"DataCollection/Team_Stats/team_stats_{first_year}-{second_year}.csv"

        # Make sure the target folder exists so a fresh checkout can run this.
        os.makedirs(os.path.dirname(csv_file_name), exist_ok=True)
        df.to_csv(csv_file_name, index=False)

In [5]:
def get_opponent_stats(seasons, playoffs=False):
    """Download per-game opponent stats for each season and write one CSV per season.

    For every season the ``div_per_game-opponent`` table is fetched from the
    Sports Reference widget endpoint (``site=bbr``), trimmed to the rows above
    the "League Average" row, tagged with a playoff flag, and saved under
    ``DataCollection/``. Every column except the team column, ``"G"`` and
    ``"MP"`` is prefixed with ``"OPP_"``.

    Parameters
    ----------
    seasons : iterable of int
        Season end years (``2021`` means the 2020-21 season). An empty
        iterable is a no-op.
    playoffs : bool, default False
        If True, scrape the ``playoffs`` pages (team column ``"Tm"``) and write
        to ``DataCollection/Opponent_Stats_Playoffs/``; otherwise scrape the
        ``leagues`` pages (team column ``"Team"``) and write to
        ``DataCollection/Opponent_Stats/``.

    Returns
    -------
    None
        The result is the set of CSV files written to disk.

    Raises
    ------
    ValueError
        If the response contains no ``<table>`` element.
    IndexError
        If the table has no "League Average" row.
    KeyError
        If a team name is not present in ``TEAM_TO_ABBR``.
    """
    selector = "div_per_game-opponent"
    if playoffs:
        regular_or_playoffs = "playoffs"
        team_column = "Tm"
    else:
        regular_or_playoffs = "leagues"
        team_column = "Team"

    for season in seasons:
        url = f'https://widgets.sports-reference.com/wg.fcgi?css=1&site=bbr&url=%2F{regular_or_playoffs}%2FNBA_{season}.html&div={selector}'
        # The context manager closes the HTTP connection once the body is read.
        with urlopen(url) as response:
            html = response.read()
        # No parser is named here, so bs4 picks the best one that is installed.
        soup = BeautifulSoup(html)
        table = soup.find('table')
        if table is None:
            raise ValueError(f"No <table> element found in the response from {url}")
        # read_html wants a file-like object; passing literal HTML is deprecated.
        df = pd.read_html(StringIO(str(table)))[0]

        # Keep only the rows above "League Average". The copy makes the frame
        # independent of the parsed table, so the column assignments below do
        # not raise SettingWithCopyWarning.
        league_avg_index = df[df[team_column] == 'League Average'].index[0]
        df = df.iloc[:league_avg_index].copy()

        # Teams whose name carries a '*' are flagged as having made the playoffs.
        df["Made_Playoffs"] = df[team_column].str.contains('*', regex=False)

        # Normalise names to the keys of TEAM_TO_ABBR; an unknown name raises
        # KeyError instead of silently producing a missing value.
        team_names = df[team_column].str.replace('*', '', regex=False).str.upper()
        df[team_column] = team_names.map(lambda name: TEAM_TO_ABBR[name])

        # The 'Rk' column is not part of the output file.
        df = df.drop(columns='Rk')

        # Prefix every non-static column with "OPP_". Renaming by column name
        # (not by position) keeps each label attached to its own data even if
        # the static columns are not the first three in the table.
        # NOTE(review): "Made_Playoffs" is prefixed too ("OPP_Made_Playoffs")
        # although it describes the team itself; kept as-is so the CSV schema
        # stays compatible with files that were already produced.
        static_columns = {team_column, "G", "MP"}
        df = df.rename(
            columns=lambda name: name if name in static_columns else "OPP_" + name
        )

        first_year, second_year = format_season(season)

        # NOTE(review): the playoffs file name contains "playoffS" with a
        # capital S. Left unchanged so existing readers of these files keep working.
        if playoffs:
            csv_file_name = f"DataCollection/Opponent_Stats_Playoffs/opponent_stats_playoffS_{first_year}-{second_year}.csv"
        else:
            csv_file_name = f"DataCollection/Opponent_Stats/opponent_stats_{first_year}-{second_year}.csv"

        # Make sure the target folder exists so a fresh checkout can run this.
        os.makedirs(os.path.dirname(csv_file_name), exist_ok=True)
        df.to_csv(csv_file_name, index=False)

In [6]:
def get_advanced_team_stats(seasons, playoffs=False):
    """Download advanced team stats for each season and write one CSV per season.

    For every season the ``div_advanced-team`` table is fetched from the
    Sports Reference widget endpoint (``site=bbr``). Its two-level header is
    flattened to the second level, rows from "League Average" on are dropped,
    a ``Made_Playoffs`` flag is added, "Unnamed" spacer columns are removed,
    and two groups of four consecutive columns are prefixed with ``Off_`` and
    ``Def_`` before the frame is saved under ``DataCollection/``.

    Parameters
    ----------
    seasons : iterable of int
        Season end years (``2021`` means the 2020-21 season). An empty
        iterable is a no-op.
    playoffs : bool, default False
        If True, scrape the ``playoffs`` pages (team column ``"Tm"``) and write
        to ``DataCollection/Advanced_Team_Stats_Playoffs/``; otherwise scrape
        the ``leagues`` pages (team column ``"Team"``) and write to
        ``DataCollection/Advanced_Team_Stats/``.

    Returns
    -------
    None
        The result is the set of CSV files written to disk.

    Raises
    ------
    ValueError
        If the response contains no ``<table>`` element.
    IndexError
        If the table has no "League Average" row.
    KeyError
        If a team name is not present in ``TEAM_TO_ABBR``.
    """
    if playoffs:
        regular_or_playoffs = "playoffs"
        team_column = "Tm"
        # Position of the first "Off_" column once 'Rk' and the "Unnamed"
        # columns are gone; the playoffs table is laid out two columns earlier.
        off_start = 14
    else:
        regular_or_playoffs = "leagues"
        team_column = "Team"
        off_start = 16
    def_start = off_start + 4
    def_stop = def_start + 4

    for season in seasons:
        url = f'https://widgets.sports-reference.com/wg.fcgi?css=1&site=bbr&url=%2F{regular_or_playoffs}%2FNBA_{season}.html&div=div_advanced-team'
        # The context manager closes the HTTP connection once the body is read.
        with urlopen(url) as response:
            html = response.read()
        # No parser is named here, so bs4 picks the best one that is installed.
        soup = BeautifulSoup(html)
        table = soup.find('table')
        if table is None:
            raise ValueError(f"No <table> element found in the response from {url}")
        # read_html wants a file-like object; passing literal HTML is deprecated.
        df = pd.read_html(StringIO(str(table)))[0]

        # Flatten the two-level header by keeping only the second level.
        df.columns = [column[1] for column in df.columns]

        # Keep only the rows above "League Average". The copy makes the frame
        # independent of the parsed table, so the column assignments below do
        # not raise SettingWithCopyWarning.
        league_avg_index = df[df[team_column] == 'League Average'].index[0]
        df = df.iloc[:league_avg_index].copy()

        # Teams whose name carries a '*' are flagged as having made the playoffs.
        df["Made_Playoffs"] = df[team_column].str.contains('*', regex=False)

        # Normalise names to the keys of TEAM_TO_ABBR; an unknown name raises
        # KeyError instead of silently producing a missing value.
        team_names = df[team_column].str.replace('*', '', regex=False).str.upper()
        df[team_column] = team_names.map(lambda name: TEAM_TO_ABBR[name])

        # The 'Rk' column is not part of the output file; neither are the
        # columns whose flattened name contains "Unnamed".
        df = df.drop(columns='Rk')
        df = df.loc[:, ~df.columns.str.contains("Unnamed")]

        # Prefix two adjacent four-column groups by position.
        # NOTE(review): presumably these are the offensive and defensive
        # four-factor columns, which otherwise share names -- the positions are
        # tied to the current table layout; confirm if the site adds columns.
        df.columns = [
            "Off_" + name if off_start <= position < def_start
            else "Def_" + name if def_start <= position < def_stop
            else name
            for position, name in enumerate(df.columns)
        ]

        first_year, second_year = format_season(season)

        if playoffs:
            csv_file_name = f"DataCollection/Advanced_Team_Stats_Playoffs/adv_team_stats_playoffs_{first_year}-{second_year}.csv"
        else:
            csv_file_name = f"DataCollection/Advanced_Team_Stats/adv_team_stats_{first_year}-{second_year}.csv"

        # Make sure the target folder exists so a fresh checkout can run this.
        os.makedirs(os.path.dirname(csv_file_name), exist_ok=True)
        df.to_csv(csv_file_name, index=False)

In [7]:
# Seasons are identified by the year they end in, so 2001..2022 covers the
# 2000-01 through 2021-22 seasons.
seasons = range(2001, 2023)

# Per-game team stats from the "leagues" pages, one CSV per season.
get_team_stats(seasons, playoffs=False)

In [8]:
# Per-game team stats from the "playoffs" pages, one CSV per season.
get_team_stats(seasons, playoffs=True)

In [9]:
# Per-game opponent stats from the "leagues" pages, one CSV per season.
get_opponent_stats(seasons, playoffs=False)

In [10]:
# Per-game opponent stats from the "playoffs" pages, one CSV per season.
get_opponent_stats(seasons, playoffs=True)

In [11]:
# Re-declared here so this cell can be run without the earlier driver cells:
# season end years 2001..2022, i.e. 2000-01 through 2021-22.
seasons = range(2001, 2023)

# Advanced team stats from the "leagues" pages, one CSV per season.
get_advanced_team_stats(seasons, playoffs=False)

In [12]:
# Advanced team stats from the "playoffs" pages, one CSV per season.
get_advanced_team_stats(seasons, playoffs=True)