In [3]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import pandas as pd

In [4]:
TEAM_TO_ABBR = {
    'ATLANTA HAWKS': 'ATL',
#     'ST. LOUIS HAWKS': 'SLH',
#     'MILWAUKEE HAWKS': 'MIL',
#     'TRI-CITIES BLACKHAWKS': 'TCB',
    'BOSTON CELTICS': 'BOS',
    'BROOKLYN NETS': 'BRK',
    'NEW JERSEY NETS' : 'BRK',
#     'NEW YORK NETS' : 'NYN',
    'CHICAGO BULLS': 'CHI',
    'CHARLOTTE HORNETS': 'CHO',
    'CHARLOTTE BOBCATS' : 'CHO',
    'CLEVELAND CAVALIERS': 'CLE',
    'DALLAS MAVERICKS': 'DAL',
    'DENVER NUGGETS': 'DEN',
    'DETROIT PISTONS': 'DET',
#     'FORT WAYNE PISTONS': 'FWP',
    'GOLDEN STATE WARRIORS': 'GSW',
#     'SAN FRANCISCO WARRIORS': 'SFW',
#     'PHILADELPHIA WARRIORS': 'PHI',
    'HOUSTON ROCKETS': 'HOU',
#     'SAN DIEGO ROCKETS': 'HOU',
    'INDIANA PACERS': 'IND',
    'LOS ANGELES CLIPPERS': 'LAC',
#     'SAN DIEGO CLIPPERS': 'SDC',
#     'BUFFALO BRAVES': 'BUF',
    'LOS ANGELES LAKERS': 'LAL',
#     'MINNEAPOLIS LAKERS': 'MIN',
    'MEMPHIS GRIZZLIES': 'MEM',
    'VANCOUVER GRIZZLIES' : 'MEM',
    'MIAMI HEAT': 'MIA',
    'MILWAUKEE BUCKS': 'MIL',
    'MINNESOTA TIMBERWOLVES': 'MIN',
    'NEW ORLEANS PELICANS' : 'NOP',
    'NEW ORLEANS/OKLAHOMA CITY HORNETS' : 'NOP',
    'NEW ORLEANS HORNETS' : 'NOP',
    'NEW YORK KNICKS' : 'NYK',
    'OKLAHOMA CITY THUNDER' : 'OKC',
    'OKLAHOMA CITY HORNETS' : 'NOP',
    'SEATTLE SUPERSONICS' : 'OKC',
    'ORLANDO MAGIC' : 'ORL',
    'PHILADELPHIA 76ERS' : 'PHI',
#     'SYRACUSE NATIONALS' : 'SYR',
    'PHOENIX SUNS' : 'PHO',
    'PORTLAND TRAIL BLAZERS' : 'POR',
    'SACRAMENTO KINGS' : 'SAC',
#     'KANSAS CITY KINGS' : 'KCK',
#     'KANSAS CITY-OMAHA KINGS' : 'KCK',
#     'CINCINNATI ROYALS' : 'CIN',
#     'ROCHESTER ROYALS': 'ROR',
    'SAN ANTONIO SPURS' : 'SAS',
    'TORONTO RAPTORS' : 'TOR',
    'UTAH JAZZ' : 'UTA',
#     'NEW ORLEANS JAZZ' : 'NOJ',
    'WASHINGTON WIZARDS' : 'WAS',
#     'WASHINGTON BULLETS' : 'WAS',
#     'CAPITAL BULLETS' : 'CAP',
#     'BALTIMORE BULLETS' : 'BAL',
#     'CHICAGO ZEPHYRS' : 'CHI',
#     'CHICAGO PACKERS' : 'CHI',

    # DEFUNCT FRANCHISES
#     'ANDERSON PACKERS': 'AND',
#     'CHICAGO STAGS': 'CHI',
#     'INDIANAPOLIS OLYMPIANS': 'IND',
#     'SHEBOYGAN RED SKINS': 'SRS',
#     'ST. LOUIS BOMBERS': 'SLB',
#     'WASHINGTON CAPITOLS' : 'WAS',
#     'WATERLOO HAWKS': 'WAT',
    }

In [5]:
def format_season(season):
        #Formats years in season to use in File Name
        #Ex: 2021 turns into 2020-21
        first_year = season-1
        second_year = str(season)[2:]
        
        return first_year, second_year

In [6]:
def get_standings_to_df(season):
    url = "https://www.basketball-reference.com/leagues/NBA_{}_standings.html".format(season)
    # this is the HTML from the given URL
    html = urlopen(url)
    soup = BeautifulSoup(html, features='lxml')
    #gets the table headers from the BeautifulSoup object
    titles = [th.getText() for th in soup.findAll('tr', limit=2)[0].findAll('th')]
    eastern_headers = titles[1:titles.index("SRS")+1]
    western_headers = titles[1:titles.index("SRS")+1]
    titles = titles[titles.index("SRS")+1:]
    
    try:
        row_titles = titles[0:titles.index("Eastern Conference")]
    except:
        row_titles = titles
        
    for i in eastern_headers:
        row_titles.remove(i)
    
    row_titles.remove("Western Conference")

    #gets the data for each player from the BeautifulSoup object
    rows = soup.findAll('tr')[1:]
    team_stats = [[td.getText() for td in rows[i].findAll('td')]for i in range(len(rows))]
    
    team_stats = [e for e in team_stats if e != []]
    team_stats = team_stats[0:len(row_titles)]
    
    for i in range(0, len(team_stats)):
        team_stats[i].insert(0, row_titles[i])
        team_stats[i].insert(0, season)
    
    eastern_headers.insert(0, "Eastern Conference")
    eastern_headers.insert(0, "Season")
    western_headers.insert(0, "Western Conference")
    western_headers.insert(0, "Season")
    
    eastern_standings = pd.DataFrame(team_stats[:15], columns=eastern_headers)
    western_standings = pd.DataFrame(team_stats[15:], columns=western_headers)
    
    eastern_standings["Made_Playoffs"] = False
    western_standings["Made_Playoffs"] = False
    
    for team in eastern_standings['Eastern Conference']:
        if '*' in team:
            eastern_standings.loc[df['Eastern Conference'] == team, ["Made_Playoffs"]] = True
        eastern_standings['Eastern Conference'] = eastern_standings['Eastern Conference'].apply(lambda x: x.replace('*', '').upper())
        eastern_standings['Eastern Conference'] = eastern_standings['Eastern Conference'].apply(lambda x: TEAM_TO_ABBR[x])
    
    for team in western_standings['Western Conference']:
        if '*' in team:
            western_standings.loc[df['Western Conference'] == team, ["Made_Playoffs"]] = True
        western_standings['Western Conference'] = western_standings['Western Conference'].apply(lambda x: x.replace('*', '').upper())
        western_standings['Western Conference'] = western_standings['Western Conference'].apply(lambda x: TEAM_TO_ABBR[x])
    
    
    return eastern_standings, western_standings
    

In [7]:
def get_standings_from_espn(season):
    first_year, second_year = format_season(season)
    url = "https://www.espn.com/nba/standings/_/season/{}".format(season)
    
    html = urlopen(url)
    soup = BeautifulSoup(html)
    
    tables = soup.findAll('table')
    eastern_teams = pd.read_html(str(tables[0]))[0]
    eastern_teams.loc[-1] = eastern_teams.columns[0]
    eastern_teams.index = eastern_teams.index + 2  # shifting index
    eastern_teams.sort_index(inplace=True)
    eastern_teams.rename(columns={eastern_teams.columns[0]:'Team'}, inplace=True)
    eastern_stats = pd.read_html(str(tables[1]))[0]
    eastern_stats.index = eastern_stats.index + 1
    western_teams = pd.read_html(str(tables[2]))[0]
    western_teams.loc[-1] = western_teams.columns[0]
    western_teams.index = western_teams.index + 2  # shifting index
    western_teams.sort_index(inplace=True) 
    western_teams.rename(columns={western_teams.columns[0]:'Team'}, inplace=True)
    western_stats = pd.read_html(str(tables[3]))[0]
    western_stats.index = western_stats.index + 1
    
    eastern_standings = pd.concat([eastern_teams, eastern_stats], axis=1)
    eastern_standings["Made_Playoffs"] = False
    western_standings = pd.concat([western_teams, western_stats], axis=1)
    western_standings["Made_Playoffs"] = False
    
    for team in eastern_standings["Team"]:
        if team[1] in ['x', 'y', 'z', '*']: #Chars that start the row and indicate the team made the playoffs
            eastern_standings.loc[eastern_standings["Team"] == team, ["Made_Playoffs"]] = True
            
        if '--' in team:
            adjusted_team_name = team.split('--')[1]
        else:
            adjusted_team_name = team

        while not (adjusted_team_name[1]>= 'a' and adjusted_team_name[1] <= 'z'): #Make sure the column only has team names
            adjusted_team_name = adjusted_team_name[1:]
            
        eastern_standings.loc[eastern_standings["Team"] == team, ["Team"]] = adjusted_team_name.upper()

    for team in western_standings["Team"]:
        if team[1] in ['x', 'y', 'z', '*']: #Chars that start the row and indicate the team made the playoffs
            western_standings.loc[western_standings["Team"] == team, ["Made_Playoffs"]] = True                
        
        if '--' in team:
            adjusted_team_name = team.split('--')[1]
        else:
            adjusted_team_name = team
            
        if "Clippers" in adjusted_team_name:
            adjusted_team_name = "Los Angeles Clippers"
            
        while not (adjusted_team_name[1]>= 'a' and adjusted_team_name[1] <= 'z'): #Make sure the column only has team names
            adjusted_team_name = adjusted_team_name[1:]
        
        adjusted_team_name = adjusted_team_name.replace('  ', ' ')
        
        western_standings.loc[western_standings["Team"] == team, ["Team"]] = adjusted_team_name.upper()
        
    eastern_standings["Team"] = eastern_standings['Team'].apply(lambda x: TEAM_TO_ABBR[x])
    western_standings["Team"] = western_standings['Team'].apply(lambda x: TEAM_TO_ABBR[x])

            
    return eastern_standings, western_standings

In [8]:
def get_expanded_standings_to_df(season):
    selector = "div_expanded_standings"
    url = f'https://widgets.sports-reference.com/wg.fcgi?css=1&site=bbr&url=%2Fleagues%2FNBA_{season}.html&div={selector}'
    html = urlopen(url)
    soup = BeautifulSoup(html)
    table = soup.find('table')
    df = pd.read_html(str(table))[0]
    
    print(df)

In [9]:
def get_playoff_series(season):
    selector = "div_all_playoffs"
    url = f'https://widgets.sports-reference.com/wg.fcgi?css=1&site=bbr&url=%2Fleagues%2FNBA_{season}.html&div={selector}'
    html = urlopen(url)
    soup = BeautifulSoup(html)
    table = soup.find('table')
    df = pd.read_html(str(table))[0]
    
    df.drop([2, 3, 4, 5], axis='columns', inplace=True)
    
    df.rename(columns={0 : 'Series', 1: "Winner_over_loser"}, inplace=True)
    
    for row, col in df.iterrows():
        if "Game" in str(col[0]):
            df.drop(row, inplace=True)
            
    df.dropna(how='all', inplace=True)
            
    winners, losers_and_results = [wl.split(" over ")[0] for wl in df["Winner_over_loser"]], [wl.split(" over ")[1] for wl in df["Winner_over_loser"]]
    losers, results = [lr.split("(")[0] for lr in losers_and_results], [lr.split("(")[1] for lr in losers_and_results]
    wins, losses = [int(wl.split("-")[0]) for wl in results], [int(wl.split("-")[1][0]) for wl in results]
    
    df["Winner"] = winners
    df["Loser"] = losers
    df["Winner_total_wins"] = wins
    df["Loser_total_wins"] = losses
    
    df.drop("Winner_over_loser", axis='columns', inplace=True)

    
    return df

In [10]:
def merge_standings(eastern_standings, western_standings):
    standings = pd.concat([western_standings, eastern_standings])
    standings.sort_values(by=["Made_Playoffs", "W", "Rank", "Team"], ascending=[False, False, True, True], inplace=True)
    standings.reset_index(inplace=True)
    standings.drop('index', axis=1, inplace=True)
    standings.index += 1
    teams_ranked = dict(standings["Team"])
    inv_teams_ranked = {v: k for k, v in teams_ranked.items()}
    return inv_teams_ranked

In [35]:
def build_playoff_standings(season):
    playoffs = get_playoff_series(season)
    first_year, second_year = format_season(season)
    western_standings = pd.read_csv("Standings/standings_western_conference_{0}-{1}.csv".format(first_year, second_year))
    eastern_standings = pd.read_csv("Standings/standings_eastern_conference_{0}-{1}.csv".format(first_year, second_year))
    standings = merge_standings(eastern_standings, western_standings)
    
    teams = []
    
    winners = pd.DataFrame(playoffs.groupby(by='Winner').sum())
    
    
    #add champion and finalist
    finals = playoffs[playoffs["Series"] == "Finals"]
    champion = finals["Winner"][0].strip()
    finalist = finals["Loser"][0].strip()
    
    champion_wins = winners.loc[champion]['Winner_total_wins']
    champion_losses = winners.loc[champion]['Loser_total_wins']
    finalist_wins = winners.loc[finalist]['Winner_total_wins'] + finals["Loser_total_wins"][0]
    finalist_losses = winners.loc[finalist]['Loser_total_wins'] + finals["Winner_total_wins"][0]
    
    teams.append((champion, champion_wins, champion_losses))
    teams.append((finalist, finalist_wins, finalist_losses))
    
    #add conference finalists
    
    western_conf_finals = playoffs[playoffs["Series"] == "Western Conference Finals"]
    western_conf_finals.reset_index(drop=True, inplace=True)
    eastern_conf_finals = playoffs[playoffs["Series"] == "Eastern Conference Finals"]
    eastern_conf_finals.reset_index(drop=True, inplace=True)
    
    west_finalist = western_conf_finals['Loser'][0].strip()
    east_finalist = eastern_conf_finals['Loser'][0].strip()
    
    west_finalist_wins = winners.loc[west_finalist]["Winner_total_wins"] + western_conf_finals["Loser_total_wins"][0]
    west_finalist_losses = winners.loc[west_finalist]["Loser_total_wins"] + western_conf_finals["Winner_total_wins"][0]
    east_finalist_wins = winners.loc[east_finalist]["Winner_total_wins"] + eastern_conf_finals["Loser_total_wins"][0]    
    east_finalist_losses = winners.loc[east_finalist]["Loser_total_wins"] + eastern_conf_finals["Winner_total_wins"][0]

    teams.append((west_finalist, west_finalist_wins, west_finalist_losses))
    teams.append((east_finalist, east_finalist_wins, east_finalist_losses))
    
    #add conference semifinalists
    
    western_conf_semifinals = playoffs[playoffs["Series"] == "Western Conference Semifinals"]
    eastern_conf_semifinals = playoffs[playoffs["Series"] == "Eastern Conference Semifinals"]
    
    for team in western_conf_semifinals['Loser']:
        semifinals = western_conf_semifinals[western_conf_semifinals['Loser'] == team]
        semifinals.reset_index(drop=True, inplace=True)
        team = team.strip()
        team_wins = winners.loc[team]["Winner_total_wins"] + semifinals['Loser_total_wins'][0]
        team_losses = winners.loc[team]["Loser_total_wins"] + semifinals['Winner_total_wins'][0]
        teams.append((team, team_wins, team_losses))
        
    for team in eastern_conf_semifinals['Loser']:
        semifinals = eastern_conf_semifinals[eastern_conf_semifinals['Loser'] == team]
        semifinals.reset_index(drop=True, inplace=True)
        team = team.strip()
        team_wins = winners.loc[team]["Winner_total_wins"] + semifinals['Loser_total_wins'][0]
        team_losses = winners.loc[team]["Loser_total_wins"] + semifinals['Winner_total_wins'][0]
        teams.append((team, team_wins, team_losses))
        
    western_conf_first_round = playoffs[playoffs["Series"] == "Western Conference First Round"]
    eastern_conf_first_round = playoffs[playoffs["Series"] == "Eastern Conference First Round"]
    
    for team in western_conf_first_round['Loser']:
        first_round = western_conf_first_round[western_conf_first_round['Loser'] == team]
        first_round.reset_index(drop=True, inplace=True)
        team_wins = playoffs[playoffs["Loser"] == team]["Winner_total_wins"] + first_round['Loser_total_wins'][0]
        team_losses = winners.loc[team]["Loser_total_wins"] + first_round['Winner_total_wins'][0]
        team = team.strip()
        teams.append((team, team_wins, team_losses))
        
    for team in eastern_conf_first_round['Loser']:
        first_round = eastern_conf_first_round[eastern_conf_first_round['Loser'] == team]
        first_round.reset_index(drop=True, inplace=True)
        team_wins = winners.loc[team]["Winner_total_wins"] + first_round['Loser_total_wins'][0]
        team_losses = winners.loc[team]["Loser_total_wins"] + first_round['Winner_total_wins'][0]
        team = team.strip()
        teams.append((team, team_wins, team_losses))
        
    
    print(teams)

In [36]:
build_playoff_standings(2021)

KeyError: 'Portland Trail Blazers '

In [109]:
get_playoff_series(2021)

Unnamed: 0,Series,Winner,Loser,Winner_total_wins,Loser_total_wins
0,Finals,Milwaukee Bucks,Phoenix Suns,4,2
9,Eastern Conference Finals,Milwaukee Bucks,Atlanta Hawks,4,2
17,Western Conference Finals,Phoenix Suns,Los Angeles Clippers,4,2
26,Eastern Conference Semifinals,Atlanta Hawks,Philadelphia 76ers,4,3
35,Eastern Conference Semifinals,Milwaukee Bucks,Brooklyn Nets,4,3
44,Western Conference Semifinals,Los Angeles Clippers,Utah Jazz,4,2
52,Western Conference Semifinals,Phoenix Suns,Denver Nuggets,4,0
59,Eastern Conference First Round,Atlanta Hawks,New York Knicks,4,1
66,Eastern Conference First Round,Brooklyn Nets,Boston Celtics,4,1
73,Eastern Conference First Round,Milwaukee Bucks,Miami Heat,4,0


In [8]:
get_expanded_standings_to_df(2021)

ValueError: No tables found

In [12]:
def standings_to_csv(seasons, expanded=False):
    for season in seasons:
        eastern, western = get_standings_from_espn(season)
        
        first_year, second_year = format_season(season)
        
        csv_file_name_eastern = ""
        csv_file_name_western = ""
#         if expanded:
#             csv_file_name = "Standings/standings_{0}-{1}.csv".format(first_year, second_year)
#         else:
        csv_file_name_eastern = "Standings/standings_eastern_conference_{0}-{1}.csv".format(first_year, second_year)
        csv_file_name_western = "Standings/standings_western_conference_{0}-{1}.csv".format(first_year, second_year)    
        
        eastern.to_csv(csv_file_name_eastern, index=True, index_label="Rank")
        western.to_csv(csv_file_name_western, index=True, index_label="Rank")       

In [13]:
seasons = range(2003,2023)

standings_to_csv(seasons)

In [14]:
eastern_standings = pd.read_csv("Standings/standings_eastern_conference_2002-03.csv")
eastern_standings

Unnamed: 0,Rank,Team,W,L,PCT,GB,HOME,AWAY,DIV,CONF,PPG,OPP PPG,DIFF,STRK,L10,Made_Playoffs
0,1,DET,50,32,0.61,-,30-11,20-21,19-9,0-0,91.4,87.7,3.7,L1,4-6,True
1,2,BRK,49,33,0.598,1,33-8,16-25,16-8,0-0,95.4,90.1,5.3,L2,5-5,True
2,3,IND,48,34,0.585,2,32-9,16-25,19-9,0-0,96.8,93.3,3.5,W2,6-4,True
3,4,PHI,48,34,0.585,2,25-16,23-18,17-7,0-0,96.8,94.5,2.3,W1,5-5,True
4,5,NOP,47,35,0.573,3,29-12,18-23,17-11,0-0,93.9,91.8,2.1,W5,7-3,True
5,6,BOS,44,38,0.537,6,25-16,19-22,13-12,0-0,92.7,93.0,-0.3,W2,6-4,True
6,7,MIL,42,40,0.512,8,25-16,17-24,16-12,0-0,99.5,99.3,0.2,W4,8-2,True
7,8,ORL,42,40,0.512,8,26-15,16-25,14-11,0-0,98.5,98.4,0.1,L2,4-6,True
8,9,WAS,37,45,0.451,13,23-18,14-27,11-13,0-0,91.5,92.5,-1.0,L3,3-7,False
9,10,NYK,37,45,0.451,13,24-17,13-28,9-15,0-0,95.9,97.2,-1.3,L1,5-5,False


In [15]:
western_standings = pd.read_csv("Standings/standings_western_conference_2021-22.csv")
western_standings

Unnamed: 0,Rank,Team,W,L,PCT,GB,HOME,AWAY,DIV,CONF,PPG,OPP PPG,DIFF,STRK,L10,Made_Playoffs
0,1,GSW,29,8,0.784,-,17-3,12-5,7-1,16-5,110.5,101.4,9.1,L1,7-3,False
1,2,PHO,29,8,0.784,-,16-4,13-4,4-4,19-7,112.6,104.9,7.7,W2,7-3,False
2,3,UTA,28,10,0.737,1.5,14-7,14-3,9-0,18-4,116.0,106.2,9.8,W2,8-2,False
3,4,MEM,25,14,0.641,5,13-8,12-6,4-2,19-9,111.3,108.1,3.2,W6,7-3,False
4,5,DAL,20,18,0.526,9.5,10-8,10-10,7-2,17-10,104.9,103.4,1.5,W4,6-4,False
5,6,LAL,20,19,0.513,10,13-10,7-9,2-5,12-13,111.3,112.0,-0.7,W3,4-6,False
6,7,LAC,19,19,0.5,10.5,12-11,7-8,3-4,12-15,105.5,105.7,-0.2,L1,3-7,False
7,8,DEN,18,18,0.5,10.5,8-7,10-11,2-5,11-11,105.9,106.3,-0.4,L2,5-5,False
8,9,MIN,18,20,0.474,11.5,11-10,7-10,3-4,13-13,108.1,108.4,-0.3,W2,5-5,False
9,10,SAS,15,22,0.405,14,7-10,8-12,1-4,8-15,110.9,110.4,0.5,W1,5-5,False


In [11]:
get_expanded_standings_to_df(2022)

IndexError: list index out of range