# Web Scrape NBA Official Stats Site [nba.com/stats/](https://www.nba.com/stats/)

In [None]:
import copy
import time

from bs4 import BeautifulSoup as soup
import pandas as pd
from splinter import Browser
from webdriver_manager.chrome import ChromeDriverManager

In [None]:
# Set up Splinter (prep the automated browser).
# The value returned by ChromeDriverManager().install() is handed to Browser
# as its "executable_path" keyword argument (presumably the local
# chromedriver path -- confirm against webdriver_manager's docs).
executable_path = {"executable_path": ChromeDriverManager().install()}
# `browser` is the shared module-level session that every scraping function
# in this notebook drives; headless=False is requested explicitly.
browser = Browser("chrome", **executable_path, headless=False)

<br>
<hr>
<br>

## Collect Team Statistics URLs

In [None]:
# Open the NBA stats landing page in the automated browser. base_url is kept
# separate from the path because it is reused when building absolute links.
base_url = "https://www.nba.com"
href = "/stats"
browser.visit(base_url + href)
# Pause so the page has time to load before its html is read.
time.sleep(4)

In [None]:
# Retrieve the html of the page currently loaded in the browser and parse it
# with the "html.parser" backend so it can be searched for the sidebar links.
html = browser.html
nba_soup = soup(html, "html.parser")

In [None]:
# Collect every <div> whose class attribute is the sidebar-module... string.
sidebar_divs = nba_soup.find_all(
    "div",
    class_="sidebar-module / sidebar__leaders / sidebar-module-next / sidebar-module-quick-links",
)
# The links of the fifth matching sidebar module (index 4) are the ones used
# as the team stats pages.
team_stats_a_tags = sidebar_divs[4].find_all("a")
# Prefix each link's href with base_url to form the absolute team stats urls.
team_stats_urls = [base_url + link.attrs["href"] for link in team_stats_a_tags]
team_stats_urls

## Collect Data From Each Team Statistics URL

### URL: "Teams General Traditional" Stats

#### Selectable Filters
* `browser.find_by_tag("select")[0]`: **Season**
    * 1996-97 to 2021-22 (default)
* `browser.find_by_tag("select")[1]`: **Season Type**
    * Regular Season (default)
    * Playoffs
* `browser.find_by_tag("select")[2]`: **Per Mode**
    * Per Game (default)
    * Totals
* `browser.find_by_tag("select")[3]`: **Season Segment**
    * All Games (default)

#### Function to Retrieve the DataFrame for One Season

In [None]:
def season_stat_table(stat_table_soup):
    """Build a DataFrame from the stats table found in ``stat_table_soup``.

    The table is the first ``<table>`` inside the first
    ``<div class="nba-stat-table">``. Header cells whose text contains
    ``RANK`` are ignored, and the leading ranking column is dropped from both
    the headers and every row.

    Column conversion (positions counted after the ranking column is dropped):

    * column 0 (team name) is kept as a string;
    * columns 1-3 (GP, W and L) are converted to ``int``;
    * every remaining column is converted to ``float``.

    Thousands separators (``,``) are stripped before any numeric conversion,
    so ``"8,387"`` becomes ``8387.0`` and ``"1,234.5"`` becomes ``1234.5``.

    Args:
        stat_table_soup: Parsed html (BeautifulSoup-style object) of a page
            that contains the stats table.

    Returns:
        pandas.DataFrame with one row per ``<tr>`` of the table body and one
        column per retained header.

    Raises:
        AttributeError: If the expected div/table/thead/tbody is missing.
        ValueError: If a numeric cell cannot be parsed.
        IndexError: If a row has more cells than there are retained headers.
    """
    table = stat_table_soup.find("div", class_="nba-stat-table").find("table")

    # Column names come from the table header. Hidden "... RANK" headers have
    # no meaning to us and are filtered out; the first remaining header is the
    # ranking column, which is not needed either.
    header_cells = table.find("thead").find_all("th")
    headers = [
        cell.decode_contents().replace('<br/>', ' ').replace('\xa0', ' ').strip().upper()
        for cell in header_cells
        if 'RANK' not in cell.text
    ][1:]

    # Positions of the team record columns (GP, W, L) that are whole numbers.
    int_columns = (1, 2, 3)

    # dataframe_data holds one dict per table row.
    dataframe_data = []
    for row in table.find("tbody").find_all("tr"):
        # Cell texts for the row, without the leading ranking cell.
        values = [td.text.strip() for td in row.find_all("td")][1:]

        row_dict = {}
        for index, value in enumerate(values):
            # Team name stays a string.
            if index == 0:
                row_dict[headers[index]] = value
                continue
            # Strip thousands separators once, then convert by column kind so
            # every value in a stat column has the same numeric type.
            number = value.replace(',', '')
            if index in int_columns:
                row_dict[headers[index]] = int(number)
            else:
                row_dict[headers[index]] = float(number)
        dataframe_data.append(row_dict)

    return pd.DataFrame(dataframe_data)

#### Function to Retrieve the DataFrames for All Seasons

In [None]:
def retrieve_all_seasons(seasons):
    """Collect one stats DataFrame per season from the page currently loaded.

    ``seasons`` holds ``(season label, option value)`` pairs. For each pair
    the "SEASON" dropdown (the first ``<select>`` on the page) is set to the
    option value the nba website assigned, the refreshed html is parsed, and
    the resulting table is stored under the season label.

    Returns a dict mapping each season label to its DataFrame.
    """
    season_frames = {}
    for entry in seasons:
        # Pick the season in the first dropdown, then wait for the table to
        # be replaced with that season's data.
        season_dropdown = browser.find_by_tag("select")[0]
        season_dropdown.select(value=entry[1])
        time.sleep(2)

        # Parse the page as it looks now and turn its table into a DataFrame.
        page_soup = soup(browser.html, "html.parser")
        frame = season_stat_table(page_soup)

        # Key the DataFrame by the season string.
        season_frames[entry[0]] = frame

    return season_frames

#### Function to Retrieve All Team Statistics Data

In [None]:
def retrieve_team_stats(team_stats_urls):
    """Scrape the per-season tables from every team stats url.

    For each url the page is visited, the available seasons are read from the
    first ``<select>`` ("SEASON" dropdown), the table title is assembled from
    the attribute values of the ``nav-dropdown`` tag, and all seasons are
    scraped with ``retrieve_all_seasons``.

    Returns a dict mapping table title to the dict of season DataFrames
    scraped for that url.
    """
    stats_by_title = {}
    for url in team_stats_urls:
        # Load the page and give it time to render.
        browser.visit(url)
        time.sleep(4)

        page_soup = soup(browser.html, "html.parser")

        # Each <option> of the season dropdown yields (season text, value).
        seasons = []
        for option in page_soup.find('select').find_all('option'):
            seasons.append((option.text, option['value']))

        # The title is spread over the attributes of the nav tag; join the
        # attribute values with spaces and title-case the result.
        title_nav = page_soup.find("nav-dropdown")
        title_words = list(title_nav.attrs.values())
        table_title = ' '.join(title_words).title()

        stats_by_title[table_title] = retrieve_all_seasons(seasons)
    return stats_by_title

#### Retrieve the Team Statistics Data

In [None]:
# Scrape every collected team stats url. The result maps each table title to
# a dict of per-season DataFrames.
team_stats = retrieve_team_stats(team_stats_urls)

In [None]:
# Show the table titles that were scraped (the keys of team_stats).
team_stats.keys()

In [None]:
def retrieve_playoff_teams(url):
    """Scrape the playoff team tables for the seasons listed at ``url``.

    The page is visited, the "SEASON TYPE" dropdown (second ``<select>``) is
    switched to its "Playoffs" option, and the seasons are then scraped with
    ``retrieve_all_playoff_seasons``.

    NOTE: the season list is read from the html captured before the Playoffs
    option was selected.

    Returns whatever ``retrieve_all_playoff_seasons`` returns for the seasons
    found (a dict keyed by season string).
    """
    # Load the page and give it time to render.
    browser.visit(url)
    time.sleep(4)

    page_soup = soup(browser.html, "html.parser")

    # Look up the value the site assigned to the "Playoffs" option.
    season_type_select = page_soup.find(attrs={'name':'SeasonType'})
    playoff_values = []
    for option in season_type_select.find_all('option'):
        if option.text == 'Playoffs':
            playoff_values.append(option['value'])
    # Switch the second dropdown to playoffs and wait for the table to reload.
    browser.find_by_tag("select")[1].select(value=playoff_values[0])
    time.sleep(2)

    # (season text, option value) for every entry of the "SEASON" dropdown.
    season_select = page_soup.find(attrs={'name':'Season'})
    seasons = [(option.text, option['value'])
               for option in season_select.find_all('option')]

    return retrieve_all_playoff_seasons(seasons)

In [None]:
def retrieve_all_playoff_seasons(seasons):
    """Collect one playoff-teams DataFrame per season, skipping the first.

    ``seasons`` holds ``(season label, option value)`` pairs. The first pair
    is skipped (2021-22 season, nba site shows no data currently). For every
    other pair the "SEASON" dropdown (first ``<select>``) is set to the
    option value and the refreshed table is parsed.

    Returns a dict mapping each season label to its DataFrame.
    """
    playoff_frames = {}

    remaining = iter(seasons)
    # Discard the first entry without processing it.
    next(remaining, None)

    for season in remaining:
        # Bring up this season's table and wait for it to load.
        browser.find_by_tag("select")[0].select(value=season[1])
        time.sleep(2)

        # Parse the current page and store its table under the season string.
        page_soup = soup(browser.html, "html.parser")
        playoff_frames[season[0]] = season_playoff_teams(page_soup)

    return playoff_frames

In [None]:
def season_playoff_teams(stat_table_soup):
    """Return team names and win counts from the table in ``stat_table_soup``.

    The table is the first ``<table>`` inside the first
    ``<div class="nba-stat-table">``. Headers whose text contains ``RANK``
    are ignored and the leading ranking column is dropped. Only two columns
    are kept per row: position 0 (team name, as a string) and position 2
    (wins, as an int, used later to determine champions).
    """
    stat_table = stat_table_soup.find("div", class_="nba-stat-table").find("table")

    # Clean up the visible header labels; hidden "... RANK" headers carry no
    # meaning for us and are skipped.
    column_names = []
    for th in stat_table.find("thead").find_all("th"):
        if 'RANK' in th.text:
            continue
        label = th.decode_contents().replace('<br/>', ' ').replace('\xa0', ' ')
        column_names.append(label.strip().upper())
    # The first header refers to the ranking column, which is not needed.
    column_names = column_names[1:]

    # One record per table row.
    records = []
    for tr in stat_table.find("tbody").find_all("tr"):
        # Cell texts without the leading ranking cell.
        cells = [td.text.strip() for td in tr.find_all("td")][1:]
        team_name, team_wins = cells[0], int(cells[2])
        records.append({column_names[0]: team_name, column_names[2]: team_wins})

    return pd.DataFrame(records)

#### Retrieve the Playoff Team Data

In [None]:
# Scrape the playoff tables from the first collected team stats url; the
# result is keyed by season string.
playoff_teams = retrieve_playoff_teams(team_stats_urls[0])

In [None]:
# Build one wide DataFrame with a column per season. Within each column the
# teams are ordered by playoff wins (most wins first).
playoff_teams_df = None
for season, playoff_dataframe in playoff_teams.items():
    # Order by wins, renumber the rows 0..n-1 and drop the win counts so only
    # the team-name column remains.
    playoff_dataframe = (
        playoff_dataframe.sort_values(by=['W'], ascending=False)
        .reset_index(drop=True)
        .drop(columns='W')
    )
    # Rename the single remaining column to the season label.
    playoff_dataframe.columns = [season]
    # Seed the combined frame with the first season iterated. This avoids
    # depending on a hard-coded '2020-21' label, which raised NameError when
    # that season was not the first key and discarded earlier columns when it
    # appeared later.
    if playoff_teams_df is None:
        playoff_teams_df = playoff_dataframe
        continue
    # Join on the row index (DataFrame.join's default left join), so later
    # seasons are aligned to the rows of the first season's frame.
    playoff_teams_df = playoff_teams_df.join(playoff_dataframe)
playoff_teams_df

In [None]:
# Row 0 holds, for every season column, the team listed first after sorting by
# playoff wins in descending order; that team is taken as the champion.
champions = playoff_teams_df.iloc[0]
champions

In [None]:
# Shut down the automated browser session now that scraping is finished.
browser.quit()