## Parsing NBA Data

In [None]:
# import libraries
import os
from io import StringIO

import pandas as pd
from bs4 import BeautifulSoup

#### Get the Box Score HTML Files

In [None]:
# Directory that is scanned for saved box score pages.
SCORES_DIR = "data/scores"

# os.listdir returns names in arbitrary order, so sort the paths: the
# processing loop derives its column list from the first file it sees, and a
# stable order keeps that choice (and the output row order) reproducible.
box_scores = sorted(
    os.path.join(SCORES_DIR, name)
    for name in os.listdir(SCORES_DIR)
    if name.endswith(".html")
)

#### Write Function for Parsing the HTML

In [None]:
# parse a single saved box score page and strip its repeated header rows
def parse_html(temp_box_score, parser="html.parser"):
    """Read a box score HTML file and return it as a cleaned BeautifulSoup tree.

    Args:
        temp_box_score: Path of the HTML file to read.
        parser: Name of the parser handed to BeautifulSoup. Naming it
            explicitly avoids ``GuessedAtParserWarning`` and keeps the parse
            from depending on which optional parsers happen to be installed.

    Returns:
        The parsed document with every ``tr.over_header`` and ``tr.thead``
        row removed.
    """
    with open(temp_box_score) as f:
        html = f.read()

    soup = BeautifulSoup(html, parser)

    # drop the over-header and repeated header rows so that the tables read
    # later contain a single header line followed by data rows only
    for selector in ("tr.over_header", "tr.thead"):
        for row in soup.select(selector):
            row.decompose()

    return soup

#### Write Function for Reading Line Scores

In [None]:
# read the line score table and keep only the team and final total columns
def read_line_score(soup):
    """Extract the team names and final totals from the ``line_score`` table.

    Args:
        soup: Parsed page (anything whose ``str()`` is the page HTML).

    Returns:
        pandas.DataFrame with two columns: ``team`` (the table's first column)
        and ``total`` (the table's last column), one row per table row.

    Raises:
        ValueError: Propagated from ``pd.read_html`` when no table with
            ``id="line_score"`` is found.
    """
    # Wrap the markup in StringIO: passing literal HTML to read_html is
    # deprecated in recent pandas releases.
    tables = pd.read_html(StringIO(str(soup)), attrs={"id": "line_score"})
    line_score = tables[0]

    # The first and last columns are renamed by position; the columns in
    # between (per-period scores) keep their labels and are dropped below.
    cols = list(line_score.columns)
    cols[0] = "team"
    cols[-1] = "total"
    line_score.columns = cols

    return line_score[["team", "total"]]

#### Write Function for Reading Stats

In [None]:
# read one team's stats table and coerce every cell to a number
def read_stats(soup, temp_team, stat):
    """Load the table with id ``box-{temp_team}-game-{stat}`` as numeric data.

    Args:
        soup: Parsed page (anything whose ``str()`` is the page HTML).
        temp_team: Team label used inside the table id.
        stat: Table kind used inside the table id (the caller in this file
            passes ``"basic"`` and ``"advanced"``).

    Returns:
        pandas.DataFrame indexed by the table's first column. Cells that
        cannot be parsed as numbers become NaN.

    Raises:
        ValueError: Propagated from ``pd.read_html`` when no matching table
            is found.
    """
    table_id = f"box-{temp_team}-game-{stat}"
    # Wrap the markup in StringIO: passing literal HTML to read_html is
    # deprecated in recent pandas releases.
    df = pd.read_html(StringIO(str(soup)), attrs={"id": table_id}, index_col=0)[0]
    # errors="coerce" turns any non-numeric text into NaN instead of raising
    df = df.apply(pd.to_numeric, errors="coerce")
    return df

#### Write Function for Getting Season and Game Information

In [None]:
# pull the season label out of the page's bottom navigation links
def get_season_info(soup):
    """Return the season label taken from the bottom navigation container.

    The label is the part of the second link's file name that precedes the
    first underscore (presumably the season year -- confirm against the
    saved pages).

    Args:
        soup: Parsed page exposing ``select`` and, on its elements,
            ``find_all``.

    Returns:
        str: the extracted label.
    """
    container = soup.select("#bottom_nav_container")[0]

    links = []
    for anchor in container.find_all('a'):
        links.append(anchor["href"])

    # the second link is the one whose file name carries the label
    file_name = os.path.basename(links[1])
    label, _, _ = file_name.partition("_")
    return label

#### Gather Stats from All Games

In [None]:
# Column labels kept for every team summary. Captured from the first team
# processed (see below) and reused afterwards so every row has the same labels.
base_cols = None
# One two-row DataFrame per box score file, appended by the loop below.
games = []

for box_score in box_scores:
    my_soup = parse_html(box_score)
    
    my_line_score = read_line_score(my_soup)
    # the team labels from the line score are also what read_stats uses to
    # build the id of each team's stats table
    teams = list(my_line_score["team"])

    game_summaries = []
    for team in teams:
        basic_stats = read_stats(my_soup, team, "basic")
        advanced_stats = read_stats(my_soup, team, "advanced")

        # last row of the basic and advanced tables joined into one Series with
        # lower-cased labels (presumably the team totals row -- confirm against
        # the page layout)
        total_values = pd.concat([basic_stats.iloc[-1,:], advanced_stats.iloc[-1,:]])
        total_values.index = total_values.index.str.lower()
        
        # column-wise maximum over every row except the last one, labelled
        # with the lower-cased column name plus a "_max" suffix
        max_values = pd.concat([basic_stats.iloc[:-1,:].max(), advanced_stats.iloc[:-1,:].max()])
        max_values.index = max_values.index.str.lower() + "_max"

        # last-row values followed by the per-column maxima, as one Series
        team_summary = pd.concat([total_values, max_values])

        # on the first team only: remember the distinct labels, minus any
        # label containing "bpm" (per the original note, such columns only
        # show up for certain games)
        if base_cols is None:
            base_cols = list(team_summary.index.drop_duplicates(keep="first"))
            base_cols = [b for b in base_cols if "bpm" not in b]

        # restrict every team to the remembered labels so all summaries line up
        # NOTE(review): a later table missing one of these labels would
        # presumably raise KeyError here -- confirm
        team_summary = team_summary[base_cols]
        game_summaries.append(team_summary)

    # one row per team: the Series become columns, then are transposed
    summary = pd.concat(game_summaries, axis=1).T

    # put the team name and final total next to each team's summary row
    game_review = pd.concat([summary, my_line_score], axis=1)

    # first line score row gets home=0, second gets home=1
    # (assumes the page lists the visiting team first -- TODO confirm)
    game_review["home"] = [0, 1]
    # reversed copy of the rows so that each team is paired with the other
    # team's values; every column is suffixed with "_opp"
    # NOTE(review): reset_index() is called without drop=True, so the old
    # index appears to be kept as an extra column that ends up as "index_opp"
    game_review_opp = game_review.iloc[::-1].reset_index()
    game_review_opp.columns += "_opp"

    # side by side: a team's own columns followed by its opponent's columns
    entire_game = pd.concat([game_review, game_review_opp], axis=1)

    # season label comes from the page; the date is parsed from the first
    # eight characters of the file name, read as YYYYMMDD
    entire_game["season"] = get_season_info(my_soup)
    entire_game["date"] = os.path.basename(box_score)[:8]
    entire_game["date"] = pd.to_datetime(entire_game["date"], format="%Y%m%d")

    # True for the row whose total is strictly greater than the opponent's
    entire_game["won"] = entire_game["total"] > entire_game["total_opp"]

    # collect this game's two rows
    games.append(entire_game)

#### Turn All the Data into a Dataframe

In [None]:
# stack the per-game frames row-wise into one table with a fresh 0..n-1 index
games_df = pd.concat(objs=games, axis=0, ignore_index=True)

#### Export as a CSV File

In [None]:
# write the combined table to disk; the row index is written as the first column
games_df.to_csv(path_or_buf="nba_games.csv", index=True)