In [30]:
# In this notebook, our goal is to parse data to use later in models

import os
import pandas as pd
from bs4 import BeautifulSoup
from io import StringIO
# %pip install pandas

In [2]:
%pip install pandas

Note: you may need to restart the kernel to use updated packages.


In [8]:
SCORE_DIR = "data/scores"

In [10]:
box_scores = os.listdir(SCORE_DIR)

In [12]:
len(box_scores)

454

In [14]:
box_scores = [os.path.join(SCORE_DIR, f) for f in box_scores if f.endswith(".html")]

In [16]:
box_scores

['data/scores/202001160NOP.html',
 'data/scores/202001170DAL.html',
 'data/scores/202106100MIL.html',
 'data/scores/202110200POR.html',
 'data/scores/202001060SAC.html',
 'data/scores/202001040BRK.html',
 'data/scores/202106100UTA.html',
 'data/scores/202009130DEN.html',
 'data/scores/202001040CHI.html',
 'data/scores/202110260OKC.html',
 'data/scores/202003020CLE.html',
 'data/scores/202110270OKC.html',
 'data/scores/202110280PHI.html',
 'data/scores/202001220DET.html',
 'data/scores/202001220MIA.html',
 'data/scores/202001240NYK.html',
 'data/scores/202001280DAL.html',
 'data/scores/202110270BRK.html',
 'data/scores/202001100PHO.html',
 'data/scores/202001220ATL.html',
 'data/scores/202003060NOP.html',
 'data/scores/202003060DAL.html',
 'data/scores/202110270TOR.html',
 'data/scores/202009060LAL.html',
 'data/scores/202110240SAC.html',
 'data/scores/202106120LAC.html',
 'data/scores/202001200CHO.html',
 'data/scores/202110270BOS.html',
 'data/scores/202001030WAS.html',
 'data/scores/

In [18]:
# Since we downloaded the html in the previous notebook, its time to parse it
# Here we clean up the html by removing over_head and thead

def parse_html(box_score):
    with open(box_score) as f:
        html = f.read()
        
    soup = BeautifulSoup(html)
    [s.decompose() for s in soup.select("tr.over_header")]
    [s.decompose() for s in soup.select("tr.thead")]
    return soup

In [32]:
# This function grabs the scores and assigns them accordingly

def read_line_score(soup):
    line_score = pd.read_html(StringIO(str(soup)), attrs={"id": "line_score"})[0]
    cols = list(line_score.columns)
    cols[0] = "team"
    cols[-1] = "total"
    line_score.columns = cols

    line_score = line_score[["team", "total"]]
    return line_score

In [40]:
def read_stats(soup, team, stat):
    df = pd.read_html(StringIO(str(soup)), attrs={"id": f"box-{team}-game-{stat}"}, index_col=0)[0]
    df = df.apply(pd.to_numeric, errors="coerce")
    return df

In [90]:
def read_season_info(soup):
    nav = soup.select("#bottom_nav_container")[0]
    hrefs = [a["href"] for a in nav.find_all("a")]
    season = os.path.basename(hrefs[1]).split("_")[0]
    return season

In [106]:
base_cols = None
games = []

for box_score in box_scores:
    soup = parse_html(box_score)
    line_score = read_line_score(soup)
    teams = list(line_score["team"])
    
    # Getting stats ready for machine learning application
    # Data processing and data clean up
    # Summaries contains data from both teams
    summaries = []
    for team in teams:
        # Individual stats for each player advanced and basic metrics
        basic = read_stats(soup, team, "basic")
        advanced = read_stats(soup, team, "advanced")
    
        # Puts all values into a single column, giving team totals
        totals = pd.concat([basic.iloc[-1,:], advanced.iloc[-1,:]])
        totals.index = totals.index.str.lower()
    
        # Maximum value for each player, take the best player in each category
        maxes = pd.concat([basic.iloc[:-1,:].max(), advanced.iloc[:-1,:].max()])
        maxes.index = maxes.index.str.lower() + "_max"
    
        summary = pd.concat([totals, maxes])
    
        # Some table discrepencies so we have to make sure the data is consistent
        # Remove duplicates while we are at it
        if base_cols is None:
            base_cols = list(summary.index.drop_duplicates(keep="first"))
            base_cols = [b for b in base_cols if "bpm" not in b]
    
        summary = summary[base_cols]
        summaries.append(summary)
    summary = pd.concat(summaries, axis=1).T
    game = pd.concat([summary, line_score], axis=1)
    
    game ["home"] = [0,1]
    game_opp = game.iloc[::-1].reset_index()
    game_opp.columns += "_opp"
    
    # Concatenating opponent and home for one game
    full_game = pd.concat([game, game_opp], axis=1)
    
    full_game["season"] = read_season_info(soup)
    full_game["date"] = os.path.basename(box_score)[:8]
    
    full_game["date"] = pd.to_datetime(full_game["date"], format="%Y%m%d")
    full_game["won"] = full_game["total"] > full_game["total_opp"]
    games.append(full_game)

    if len(games) % 100 == 0:
        print(f"{len(games)} / {len(box_scores)}")

100 / 454
200 / 454
300 / 454
400 / 454


In [104]:
full_game

Unnamed: 0,mp,mp.1,fg,fga,fg%,3p,3pa,3p%,ft,fta,...,tov%_max_opp,usg%_max_opp,ortg_max_opp,drtg_max_opp,team_opp,total_opp,home_opp,season,date,won
0,265.0,265.0,46.0,100.0,0.46,15.0,39.0,0.385,25.0,32.0,...,20.7,39.9,160.0,125.0,NOP,138,1,2020,2020-01-16,False
1,265.0,265.0,51.0,93.0,0.548,11.0,26.0,0.423,25.0,35.0,...,25.0,39.5,197.0,132.0,UTA,132,0,2020,2020-01-16,True


In [108]:
games_df = pd.concat(games, ignore_index=True)

In [110]:
games_df

Unnamed: 0,mp,mp.1,fg,fga,fg%,3p,3pa,3p%,ft,fta,...,tov%_max_opp,usg%_max_opp,ortg_max_opp,drtg_max_opp,team_opp,total_opp,home_opp,season,date,won
0,265.0,265.0,46.0,100.0,0.460,15.0,39.0,0.385,25.0,32.0,...,20.7,39.9,160.0,125.0,NOP,138,1,2020,2020-01-16,False
1,265.0,265.0,51.0,93.0,0.548,11.0,26.0,0.423,25.0,35.0,...,25.0,39.5,197.0,132.0,UTA,132,0,2020,2020-01-16,True
2,240.0,240.0,44.0,85.0,0.518,9.0,30.0,0.300,15.0,22.0,...,29.1,36.5,153.0,120.0,DAL,120,1,2020,2020-01-17,False
3,240.0,240.0,39.0,89.0,0.438,20.0,47.0,0.426,22.0,28.0,...,20.0,36.3,157.0,131.0,POR,112,0,2020,2020-01-17,True
4,240.0,240.0,34.0,94.0,0.362,8.0,32.0,0.250,7.0,8.0,...,12.5,41.2,136.0,94.0,MIL,86,1,2021,2021-06-10,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
903,240.0,240.0,43.0,86.0,0.500,14.0,29.0,0.483,23.0,28.0,...,20.0,35.3,149.0,125.0,NOP,113,0,2020,2020-01-03,True
904,240.0,240.0,43.0,92.0,0.467,10.0,34.0,0.294,24.0,27.0,...,17.2,27.6,159.0,115.0,BRK,139,1,2020,2020-03-06,False
905,240.0,240.0,51.0,96.0,0.531,15.0,36.0,0.417,22.0,31.0,...,13.9,29.1,200.0,136.0,SAS,120,0,2020,2020-03-06,True
906,240.0,240.0,41.0,85.0,0.482,9.0,26.0,0.346,26.0,30.0,...,27.7,27.1,150.0,126.0,MIA,106,1,2020,2020-09-19,True


In [115]:
[g.shape[1] for g in games if g.shape[1] != 154]

[]

In [117]:
games_df.to_csv("nba_games.csv")