In [1]:
from io import StringIO
from urllib.request import urlopen

import pandas as pd
from bs4 import BeautifulSoup

In [2]:
# Lookup from an upper-case franchise name to the three-letter code used in
# this notebook. Some former names share the code of the current franchise
# (e.g. SEATTLE SUPERSONICS -> OKC, NEW JERSEY NETS -> BRK).
# Entries left as comments are disabled: looking those names up raises KeyError.
TEAM_TO_ABBR = {
    "ATLANTA HAWKS": "ATL",
    # "ST. LOUIS HAWKS": "SLH",
    # "MILWAUKEE HAWKS": "MIL",
    # "TRI-CITIES BLACKHAWKS": "TCB",
    "BOSTON CELTICS": "BOS",
    "BROOKLYN NETS": "BRK",
    "NEW JERSEY NETS": "BRK",
    # "NEW YORK NETS": "NYN",
    "CHICAGO BULLS": "CHI",
    "CHARLOTTE HORNETS": "CHO",
    "CHARLOTTE BOBCATS": "CHO",
    "CLEVELAND CAVALIERS": "CLE",
    "DALLAS MAVERICKS": "DAL",
    "DENVER NUGGETS": "DEN",
    "DETROIT PISTONS": "DET",
    # "FORT WAYNE PISTONS": "FWP",
    "GOLDEN STATE WARRIORS": "GSW",
    # "SAN FRANCISCO WARRIORS": "SFW",
    # "PHILADELPHIA WARRIORS": "PHI",
    "HOUSTON ROCKETS": "HOU",
    # "SAN DIEGO ROCKETS": "HOU",
    "INDIANA PACERS": "IND",
    "LOS ANGELES CLIPPERS": "LAC",
    # "SAN DIEGO CLIPPERS": "SDC",
    # "BUFFALO BRAVES": "BUF",
    "LOS ANGELES LAKERS": "LAL",
    # "MINNEAPOLIS LAKERS": "MIN",
    "MEMPHIS GRIZZLIES": "MEM",
    "VANCOUVER GRIZZLIES": "MEM",
    "MIAMI HEAT": "MIA",
    "MILWAUKEE BUCKS": "MIL",
    "MINNESOTA TIMBERWOLVES": "MIN",
    "NEW ORLEANS PELICANS": "NOP",
    "NEW ORLEANS/OKLAHOMA CITY HORNETS": "NOP",
    "NEW ORLEANS HORNETS": "NOP",
    "NEW YORK KNICKS": "NYK",
    "OKLAHOMA CITY THUNDER": "OKC",
    "OKLAHOMA CITY HORNETS": "NOP",
    "SEATTLE SUPERSONICS": "OKC",
    "ORLANDO MAGIC": "ORL",
    "PHILADELPHIA 76ERS": "PHI",
    # "SYRACUSE NATIONALS": "SYR",
    "PHOENIX SUNS": "PHO",
    "PORTLAND TRAIL BLAZERS": "POR",
    "SACRAMENTO KINGS": "SAC",
    # "KANSAS CITY KINGS": "KCK",
    # "KANSAS CITY-OMAHA KINGS": "KCK",
    # "CINCINNATI ROYALS": "CIN",
    # "ROCHESTER ROYALS": "ROR",
    "SAN ANTONIO SPURS": "SAS",
    "TORONTO RAPTORS": "TOR",
    "UTAH JAZZ": "UTA",
    # "NEW ORLEANS JAZZ": "NOJ",
    "WASHINGTON WIZARDS": "WAS",
    # "WASHINGTON BULLETS": "WAS",
    # "CAPITAL BULLETS": "CAP",
    # "BALTIMORE BULLETS": "BAL",
    # "CHICAGO ZEPHYRS": "CHI",
    # "CHICAGO PACKERS": "CHI",

    # Defunct franchises (all disabled)
    # "ANDERSON PACKERS": "AND",
    # "CHICAGO STAGS": "CHI",
    # "INDIANAPOLIS OLYMPIANS": "IND",
    # "SHEBOYGAN RED SKINS": "SRS",
    # "ST. LOUIS BOMBERS": "SLB",
    # "WASHINGTON CAPITOLS": "WAS",
    # "WATERLOO HAWKS": "WAT",
}

In [3]:
def format_season(season):
    """Split a season's end year into the two pieces used in file names.

    Example: 2021 yields (2020, '21'), which callers join as "2020-21".

    Returns:
        A tuple of (season - 1, the text of `season` with its first two
        characters removed).
    """
    season_suffix = str(season)[2:]
    starting_year = season - 1
    return starting_year, season_suffix

In [5]:
def _clean_espn_team_name(raw_name):
    """Reduce a raw ESPN team cell to the upper-case name used as a TEAM_TO_ABBR key."""
    # Keep only the text after the '--' separator when one is present.
    name = raw_name.split('--')[1] if '--' in raw_name else raw_name

    # Any cell mentioning the Clippers is normalised to the full franchise name.
    if "Clippers" in name:
        name = "Los Angeles Clippers"

    # Drop leading characters until the second character is a lower-case letter
    # (presumably this strips the abbreviation glued in front of the name --
    # TODO confirm against the live page). The length guard avoids the
    # IndexError the unguarded loop hit when nothing matched.
    while len(name) > 1 and not ('a' <= name[1] <= 'z'):
        name = name[1:]

    return name.replace('  ', ' ').upper()


def _build_conference_standings(team_table, stats_table):
    """Combine one conference's team-name table and stats table into one frame."""
    teams = pd.read_html(StringIO(str(team_table)))[0]
    # The parsed header of the single column is itself a team cell, so it is
    # re-inserted as a row and the index is shifted so rows are numbered from 1.
    teams.loc[-1] = teams.columns[0]
    teams.index = teams.index + 2
    teams = teams.sort_index()
    teams = teams.rename(columns={teams.columns[0]: 'Team'})

    stats = pd.read_html(StringIO(str(stats_table)))[0]
    stats.index = stats.index + 1  # align with the 1-based team rows

    standings = pd.concat([teams, stats], axis=1)
    raw_names = list(standings["Team"])

    # A playoff team is flagged by one of these characters at index 1 of the raw cell.
    playoff_markers = ('x', 'y', 'z', '*')
    standings["Made_Playoffs"] = [
        len(raw) > 1 and raw[1] in playoff_markers for raw in raw_names
    ]

    cleaned_names = [_clean_espn_team_name(raw) for raw in raw_names]
    unknown = sorted({name for name in cleaned_names if name not in TEAM_TO_ABBR})
    if unknown:
        raise KeyError(
            "No abbreviation in TEAM_TO_ABBR for: {}".format(", ".join(unknown))
        )
    standings["Team"] = [TEAM_TO_ABBR[name] for name in cleaned_names]
    return standings


def get_standings_from_espn(season):
    """Download ESPN's NBA standings page for `season` and parse both conferences.

    Args:
        season: Value interpolated into the ESPN standings URL.

    Returns:
        (eastern_standings, western_standings): two DataFrames indexed from 1,
        each with a "Team" column of TEAM_TO_ABBR codes, the parsed stats
        columns, and a boolean "Made_Playoffs" column.

    Raises:
        KeyError: if a cleaned team name is missing from TEAM_TO_ABBR.
        IndexError: if the page holds fewer than four tables.
    """
    url = "https://www.espn.com/nba/standings/_/season/{}".format(season)

    # Close the HTTP response once the document has been parsed.
    # NOTE(review): no parser is named, so bs4 picks whichever one is installed.
    with urlopen(url) as response:
        soup = BeautifulSoup(response)

    # Tables 0/1 are used as the Eastern names/stats and 2/3 as the Western ones.
    tables = soup.find_all('table')
    eastern_standings = _build_conference_standings(tables[0], tables[1])
    western_standings = _build_conference_standings(tables[2], tables[3])

    return eastern_standings, western_standings

In [6]:
def get_expanded_standings_to_df(season):
    """Fetch the expanded-standings widget for `season` as a DataFrame.

    The frame is still printed, as before, and is now also returned so the
    function matches its name.

    Args:
        season: Value interpolated into the league page name (NBA_{season}.html).

    Returns:
        The first table of the widget response parsed by pandas.

    Raises:
        ValueError: if the response contains no <table> element.
    """
    selector = "div_expanded_standings"
    url = f'https://widgets.sports-reference.com/wg.fcgi?css=1&site=bbr&url=%2Fleagues%2FNBA_{season}.html&div={selector}'

    # Close the HTTP response once the document has been parsed.
    with urlopen(url) as response:
        soup = BeautifulSoup(response)

    table = soup.find('table')
    if table is None:
        raise ValueError("No table found in expanded standings widget for season {}".format(season))

    # read_html expects a file-like object; literal HTML strings are deprecated.
    df = pd.read_html(StringIO(str(table)))[0]

    print(df)
    return df

In [7]:
def _parse_series_result(text):
    """Split "<winner> over <loser> (<w>-<l>)" into (winner, loser, w, l)."""
    if not isinstance(text, str):
        raise ValueError("Playoff series result is not text: {!r}".format(text))

    winner, over, remainder = text.partition(" over ")
    loser, paren, score = remainder.partition("(")
    # Only the text before the closing parenthesis is the "<w>-<l>" score.
    wins_text, dash, losses_text = score.partition(")")[0].partition("-")
    if not (over and paren and dash):
        raise ValueError("Unrecognised playoff series result: {!r}".format(text))

    return winner.strip(), loser.strip(), int(wins_text), int(losses_text)


def get_playoff_series(season):
    """Fetch the playoff-series widget for `season` and return one row per series.

    Args:
        season: Value interpolated into the league page name (NBA_{season}.html).

    Returns:
        DataFrame with columns "Series", "Winner", "Loser",
        "Winner_total_wins" and "Loser_total_wins". Team columns hold
        TEAM_TO_ABBR codes. The index labels of the parsed table are kept
        (the index is not reset).

    Raises:
        ValueError: if no table is found or a result cell cannot be parsed.
        KeyError: if a team name is missing from TEAM_TO_ABBR.
    """
    selector = "div_all_playoffs"
    url = f'https://widgets.sports-reference.com/wg.fcgi?css=1&site=bbr&url=%2Fleagues%2FNBA_{season}.html&div={selector}'

    # Close the HTTP response once the document has been parsed.
    with urlopen(url) as response:
        soup = BeautifulSoup(response)

    table = soup.find('table')
    if table is None:
        raise ValueError("No table found in playoff widget for season {}".format(season))

    # read_html expects a file-like object; literal HTML strings are deprecated.
    raw = pd.read_html(StringIO(str(table)))[0]

    # Only the first two columns are used: the round label and the result text.
    series = raw[[0, 1]].rename(columns={0: 'Series', 1: "Winner_over_loser"})

    # Discard rows whose round label mentions "Game", then fully empty rows.
    # The label column is addressed by name instead of by position on each row.
    is_game_row = series["Series"].astype(str).str.contains("Game", regex=False)
    series = series[~is_game_row].dropna(how='all')

    parsed = [_parse_series_result(text) for text in series["Winner_over_loser"]]

    series = series.drop(columns="Winner_over_loser")
    series["Winner"] = [TEAM_TO_ABBR[winner.upper()] for winner, _, _, _ in parsed]
    series["Loser"] = [TEAM_TO_ABBR[loser.upper()] for _, loser, _, _ in parsed]
    series["Winner_total_wins"] = [wins for _, _, wins, _ in parsed]
    series["Loser_total_wins"] = [losses for _, _, _, losses in parsed]

    return series

In [8]:
def merge_standings(eastern_standings, western_standings):
    """Combine both conferences into one league-wide ranking.

    Rows are ordered by "Made_Playoffs" (True first), then "W" descending,
    then the incoming "Rank" and "Team" ascending. "Rank" is then overwritten
    with the 1-based position in that ordering.

    Returns:
        DataFrame with only the "Team" and "Rank" columns, indexed from 1.
    """
    league = pd.concat([western_standings, eastern_standings])
    ordered = (
        league
        .sort_values(
            by=["Made_Playoffs", "W", "Rank", "Team"],
            ascending=[False, False, True, True],
        )
        .reset_index()
        .drop(columns='index')
    )
    ordered.index = ordered.index + 1
    ordered["Rank"] = ordered.index
    return ordered[["Team", "Rank"]]

In [9]:
def build_playoff_standings(season):
    """Compute each playoff team's win/loss record for `season` and save it as CSV.

    A team's record is the sum over the series it won, plus the series in
    which it was eliminated. The champion's record is only the series it won.
    Records are joined with the league-wide rank from merge_standings, sorted
    by wins (descending), losses (ascending) and rank (ascending), and written
    to DataCollection/Standings_Playoffs/.

    Rounds absent from the playoff table contribute no teams.

    Args:
        season: Season passed to get_playoff_series and format_season.

    Raises:
        ValueError: if the playoff table has no "Finals" row.
    """
    playoffs = get_playoff_series(season)
    first_year, second_year = format_season(season)
    western_standings = pd.read_csv("DataCollection/Standings/standings_western_conference_{0}-{1}.csv".format(first_year, second_year))
    eastern_standings = pd.read_csv("DataCollection/Standings/standings_eastern_conference_{0}-{1}.csv".format(first_year, second_year))
    standings = merge_standings(eastern_standings, western_standings)

    # Totals over the series each team won; only the numeric columns are summed.
    series_won = playoffs.groupby(by='Winner')[['Winner_total_wins', 'Loser_total_wins']].sum()
    wins_in_series_won = series_won['Winner_total_wins']
    losses_in_series_won = series_won['Loser_total_wins']

    finals = playoffs[playoffs["Series"] == "Finals"]
    if finals.empty:
        raise ValueError("No 'Finals' series found for season {}".format(season))

    # Positional access: the Finals row's index label is not assumed to be 0.
    champion = finals["Winner"].iloc[0]
    teams = [(champion, wins_in_series_won[champion], losses_in_series_won[champion])]

    # Rounds in which a team can be eliminated, in the order teams are recorded.
    elimination_rounds = [
        "Finals",
        "Western Conference Finals",
        "Eastern Conference Finals",
        "Western Conference Semifinals",
        "Eastern Conference Semifinals",
        "Western Conference First Round",
        "Eastern Conference First Round",
    ]
    for round_name in elimination_rounds:
        round_series = playoffs[playoffs["Series"] == round_name]
        for loser, loser_wins, winner_wins in zip(
            round_series["Loser"],
            round_series["Loser_total_wins"],
            round_series["Winner_total_wins"],
        ):
            # Earlier series won (none for first-round losers) plus this defeat.
            total_wins = wins_in_series_won.get(loser, 0) + loser_wins
            total_losses = losses_in_series_won.get(loser, 0) + winner_wins
            teams.append((loser, total_wins, total_losses))

    df = pd.DataFrame(teams, columns=["Team", "Playoff_wins", "Playoff_losses"])
    playoff_standings = pd.merge(df, standings, how='inner', on='Team')

    playoff_standings = playoff_standings.sort_values(
        by=["Playoff_wins", "Playoff_losses", "Rank"],
        ascending=[False, True, True],
    )

    playoff_standings.to_csv("DataCollection/Standings_Playoffs/playoff_standings_{0}-{1}.csv".format(first_year, second_year), index=False)

In [13]:
def standings_to_csv(seasons, expanded=False):
    """Download the standings for each season and write one CSV per conference.

    Files go to DataCollection/Standings/ and the frame index is written
    under the column label "Rank".

    Args:
        seasons: Iterable of seasons passed to get_standings_from_espn.
        expanded: Currently unused; kept so existing calls keep working.
    """
    for season in seasons:
        eastern, western = get_standings_from_espn(season)
        first_year, second_year = format_season(season)

        outputs = (
            (eastern, "DataCollection/Standings/standings_eastern_conference_{0}-{1}.csv"),
            (western, "DataCollection/Standings/standings_western_conference_{0}-{1}.csv"),
        )
        for conference_standings, path_template in outputs:
            conference_standings.to_csv(
                path_template.format(first_year, second_year),
                index=True,
                index_label="Rank",
            )

In [14]:
# Seasons 2003 through 2022 (the range end is exclusive).
seasons = range(2003,2023)

# Only the 2022 season is fetched here; `seasons` is not passed to this call.
standings_to_csv([2022])