In [11]:
import io
import os

import pandas as pd
from bs4 import BeautifulSoup
from bs4 import BeautifulSoup, Comment

In [7]:
# Relative path of the directory that is scanned for saved box-score .html files.
SCORE_DIR = 'data/scores'

In [8]:
# Full path of every saved box-score page, built in a single step.
# Deriving box_scores straight from os.listdir (rather than re-assigning the
# same name in a second cell) keeps this cell idempotent: re-running it can
# never prefix SCORE_DIR onto already-joined paths.  os.listdir returns
# entries in arbitrary order, so the result is sorted to give a reproducible
# processing order.
box_scores = sorted(
    os.path.join(SCORE_DIR, f)
    for f in os.listdir(SCORE_DIR)
    if f.endswith('.html')
)

In [92]:
def parse_html(box_score):
    """Load one saved box-score page and return it as a parsed soup.

    Every ``tr.over_header`` and ``tr.thead`` row is removed from the tree
    before it is returned.

    Args:
        box_score: Path of the HTML file to read (decoded as UTF-8).

    Returns:
        The parsed ``BeautifulSoup`` document, or ``None`` when the file
        cannot be decoded as UTF-8.
    """
    try:
        with open(box_score, encoding='utf-8') as handle:
            markup = handle.read()

        # NOTE(review): no parser is named here, so bs4 chooses one itself;
        # the result may depend on which parser libraries are installed.
        document = BeautifulSoup(markup)
        for selector in ('tr.over_header', 'tr.thead'):
            for row in document.select(selector):
                row.decompose()
        return document
    except UnicodeDecodeError:
        # Undecodable files are signalled to the caller with None.
        return None

In [52]:
def read_line_score(soup):
    """Extract the per-team final score from the page's line-score table.

    The table is searched for inside the HTML comments of the page: every
    comment containing the text ``'table'`` is parsed on its own and queried
    for ``<table id="line_score">``.  The first match is used.

    Args:
        soup: Parsed box-score page (a ``BeautifulSoup`` document).

    Returns:
        DataFrame with two columns: ``team`` (the table's first column) and
        ``total`` (the table's last column).

    Raises:
        ValueError: If no comment contains a ``line_score`` table.
    """
    table = None
    for comment in soup.find_all(string=lambda s: isinstance(s, Comment)):
        if 'table' not in comment:
            continue
        candidate = BeautifulSoup(comment, 'html.parser').find(
            'table', {'id': 'line_score'}
        )
        if candidate is not None:
            table = candidate
            break

    # Fail with a clear message instead of an UnboundLocalError (no candidate
    # comment at all) or an obscure parse error on the string 'None'.
    if table is None:
        raise ValueError('line_score table not found in page comments')

    # read_html expects a file-like object; passing literal HTML is deprecated.
    line_score = pd.read_html(io.StringIO(str(table)))[0]

    # Only the first and last columns are kept, under stable names.
    cols = list(line_score.columns)
    cols[0] = 'team'
    cols[-1] = 'total'
    line_score.columns = cols

    return line_score[['team', 'total']]

In [80]:
def read_season_info(soup):
    """Return the season label taken from the page's bottom navigation.

    The label is the text before the first underscore in the file name of
    the second link found inside ``#bottom_nav_container``.

    Args:
        soup: Parsed box-score page.

    Returns:
        The season label as a string.
    """
    nav_container = soup.select('#bottom_nav_container')[0]

    links = []
    for anchor in nav_container.find_all('a'):
        links.append(anchor['href'])

    # Second link -> file name -> prefix before the first underscore.
    file_name = os.path.basename(links[1])
    season, _, _ = file_name.partition('_')
    return season

In [36]:
def read_stats(soup, team, stat):
    """Read one team's box-score table as an all-numeric DataFrame.

    Args:
        soup: Parsed page; its ``str()`` form is handed to ``pd.read_html``.
        team: Team identifier used in the table id.
        stat: Table kind used in the table id (this notebook passes
            ``'basic'`` and ``'advanced'``).

    Returns:
        DataFrame for the table with id ``box-{team}-game-{stat}``, indexed
        by the table's first column.  Every cell is converted with
        ``pd.to_numeric``; values that cannot be converted become NaN.
    """
    table_id = f'box-{team}-game-{stat}'
    # Wrap the markup in StringIO: passing literal HTML to read_html is
    # deprecated in recent pandas releases.
    tables = pd.read_html(io.StringIO(str(soup)), attrs={'id': table_id}, index_col=0)
    return tables[0].apply(pd.to_numeric, errors='coerce')

In [94]:
# Build one two-row frame per box score (one row per team, each row also
# carrying the opponent's numbers in '*_opp' columns) and collect them in
# `games`.  Processing is best-effort: a file that cannot be processed is
# skipped, but it is recorded in `failed_box_scores` instead of being
# silently dropped.
base_cols = None        # column labels kept for every team; fixed by the first team summarised
games = []              # one DataFrame per successfully processed box score
failed_box_scores = []  # (path, repr(error)) for every skipped box score

for box_score in box_scores:

    try:
        soup = parse_html(box_score)
        if soup is None:
            # parse_html returns None when the file is not valid UTF-8.
            raise ValueError('file could not be decoded as UTF-8')

        line_score = read_line_score(soup)
        teams = list(line_score['team'])

        summaries = []
        for team in teams:
            basic = read_stats(soup, team, 'basic')
            advanced = read_stats(soup, team, 'advanced')

            # Last row of each table (presumably the team-totals row - TODO confirm).
            totals = pd.concat([basic.iloc[-1, :], advanced.iloc[-1, :]])
            totals.index = totals.index.str.lower()
            # Column-wise maximum over every row except the last one.
            maxes = pd.concat([basic.iloc[:-1, :].max(), advanced.iloc[:-1, :].max()])
            maxes.index = maxes.index.str.lower() + '_max'

            summary = pd.concat([totals, maxes])

            if base_cols is None:
                # Unique labels of the first summary, minus anything containing 'bpm'.
                base_cols = list(summary.index.drop_duplicates(keep='first'))
                base_cols = [b for b in base_cols if 'bpm' not in b]

            # Label-based selection: a label that occurs in both tables
            # (e.g. 'mp') still yields both entries, which is why the final
            # frame has an 'mp' and an 'mp.1' column.
            summary = summary[base_cols]

            summaries.append(summary)

        # One row per team.
        summary = pd.concat(summaries, axis=1).T

        game = pd.concat([summary, line_score], axis=1)

        # Requires exactly two team rows; presumably the first row is the
        # visiting team - TODO confirm against the line-score layout.
        game['home'] = [0, 1]
        # Reverse the rows so each team lines up with its opponent.  The
        # plain reset_index() also keeps the old index as a column, which
        # ends up as 'index_opp'; left as-is so the output schema is unchanged.
        game_opp = game.iloc[::-1].reset_index()
        game_opp.columns += '_opp'

        full_game = pd.concat([game, game_opp], axis=1)

        full_game['season'] = read_season_info(soup)

        # The first eight characters of the file name are parsed as YYYYMMDD.
        full_game['date'] = os.path.basename(box_score)[:8]
        full_game['date'] = pd.to_datetime(full_game['date'], format='%Y%m%d')

        full_game['won'] = full_game['total'] > full_game['total_opp']
        games.append(full_game)

        if len(games) % 100 == 0:
            print(f'{len(games)} / {len(box_scores)}')
    except Exception as exc:
        # Best-effort: skip the file but remember why.  Catching Exception
        # (not a bare except) lets KeyboardInterrupt stop the loop.
        failed_box_scores.append((box_score, repr(exc)))

if failed_box_scores:
    print(f'skipped {len(failed_box_scores)} / {len(box_scores)} box scores; see failed_box_scores')

100 / 9932
200 / 9932
300 / 9932
400 / 9932
500 / 9932
600 / 9932
700 / 9932
800 / 9932
900 / 9932
1000 / 9932
1100 / 9932
1200 / 9932
1300 / 9932
1400 / 9932
1500 / 9932
1600 / 9932
1700 / 9932
1800 / 9932
1900 / 9932
2000 / 9932
2100 / 9932
2200 / 9932
2300 / 9932
2400 / 9932
2500 / 9932
2600 / 9932
2700 / 9932
2800 / 9932
2900 / 9932
3000 / 9932
3100 / 9932
3200 / 9932
3300 / 9932
3400 / 9932
3500 / 9932
3600 / 9932
3700 / 9932
3800 / 9932
3900 / 9932
4000 / 9932
4100 / 9932
4200 / 9932
4300 / 9932
4400 / 9932
4500 / 9932
4600 / 9932
4700 / 9932
4800 / 9932
4900 / 9932
5000 / 9932
5100 / 9932
5200 / 9932
5300 / 9932
5400 / 9932
5500 / 9932
5600 / 9932
5700 / 9932
5800 / 9932
5900 / 9932
6000 / 9932
6100 / 9932
6200 / 9932
6300 / 9932
6400 / 9932
6500 / 9932
6600 / 9932
6700 / 9932
6800 / 9932
6900 / 9932
7000 / 9932
7100 / 9932
7200 / 9932
7300 / 9932
7400 / 9932
7500 / 9932
7600 / 9932
7700 / 9932
7800 / 9932
7900 / 9932
8000 / 9932
8100 / 9932
8200 / 9932
8300 / 9932
8400 / 9932
8

In [98]:
# Stack the per-game frames collected in `games` into one table with a fresh 0..n-1 row index.
games_df = pd.concat(games, ignore_index=True)

In [99]:
# Display the combined frame as a visual sanity check.
games_df

Unnamed: 0,mp,mp.1,fg,fga,fg%,3p,3pa,3p%,ft,fta,...,tov%_max_opp,usg%_max_opp,ortg_max_opp,drtg_max_opp,team_opp,total_opp,home_opp,season,date,won
0,240.0,240.0,37.0,96.0,0.385,12.0,29.0,0.414,20.0,26.0,...,57.1,33.8,258.0,121.0,ATL,94,1,2016,2015-10-27,True
1,240.0,240.0,37.0,82.0,0.451,8.0,27.0,0.296,12.0,15.0,...,33.3,23.6,132.0,104.0,DET,106,0,2016,2015-10-27,False
2,240.0,240.0,38.0,94.0,0.404,9.0,29.0,0.310,10.0,17.0,...,53.2,34.6,162.0,104.0,CHI,97,1,2016,2015-10-27,False
3,240.0,240.0,37.0,87.0,0.425,7.0,19.0,0.368,16.0,23.0,...,30.4,29.0,138.0,105.0,CLE,95,0,2016,2015-10-27,True
4,240.0,240.0,35.0,83.0,0.422,6.0,18.0,0.333,19.0,27.0,...,69.4,43.7,206.0,104.0,GSW,111,1,2016,2015-10-27,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19853,240.0,240.0,47.0,89.0,0.528,11.0,32.0,0.344,18.0,26.0,...,26.7,33.9,214.0,126.0,IND,139,0,2023,2023-03-16,False
19854,240.0,240.0,43.0,86.0,0.500,8.0,28.0,0.286,19.0,23.0,...,33.3,30.6,154.0,118.0,PHO,116,1,2023,2023-03-16,False
19855,240.0,240.0,43.0,90.0,0.478,13.0,35.0,0.371,17.0,17.0,...,25.0,27.6,154.0,122.0,ORL,113,0,2023,2023-03-16,True
19856,240.0,240.0,40.0,94.0,0.426,13.0,31.0,0.419,18.0,20.0,...,33.3,42.1,226.0,122.0,TOR,128,1,2023,2023-03-16,False


In [100]:
# Persist the combined frame; with to_csv's defaults the row index is written as an unnamed first column.
games_df.to_csv('nba_games.csv')