In [1]:
import logging
from logging.config import dictConfig

import pyarrow
from pyarrow import parquet
from pbpstats.client import Client

from pbpstats_parser import parse_season


# Settings handed to the pbpstats Client: a data directory plus, for every
# resource type, the same "read stats_nba data from file" provider entry.
# NOTE(review): "dir" and DATA_PATH are absolute, machine-specific paths;
# adjust them before running on another machine.
SETTINGS = {
    "dir": "/home/matt/Desktop/nba_data/pbpstats",
    **{
        resource: {"source": "file", "data_provider": "stats_nba"}
        for resource in (
            "Boxscore",
            "EnhancedPbp",
            "Games",
            "Pbp",
            "Possessions",
            "Shots",
        )
    },
}

# Leagues to parse.
LEAGUES = ["nba"]

# Season labels "2000-01" through "2019-20": the four-digit start year followed
# by the zero-padded last two digits of the following year.
YEARS = [f"{start}-{(start + 1) % 100:02d}" for start in range(2000, 2020)]

# Season types parsed for every season.
SEASON_TYPES = ["Regular Season", "Playoffs"]

# Destination directory of the parquet dataset written by this notebook.
DATA_PATH = "/home/matt/Desktop/nba_data/parsed_pbpstats"


def configure_logger():
    """Configure logging via ``dictConfig`` and return the notebook's logger.

    The root logger gets a single stdout ``StreamHandler`` (handler level
    DEBUG, "standard" formatter) and is itself set to WARNING. The
    ``parse_pbpstats`` logger is set to DEBUG, has no handlers of its own and
    propagates, so its records are emitted through the root console handler.
    Loggers that already exist are left enabled.

    Returns:
        logging.Logger: the logger named ``'parse_pbpstats'``.
    """
    formatters = {
        'standard': {
            'format': '%(asctime)s %(module)s:%(lineno)d %(levelname)s %(message)s',
        },
    }
    handlers = {
        'console': {
            'class': 'logging.StreamHandler',
            'level': 'DEBUG',
            'formatter': 'standard',
            'stream': 'ext://sys.stdout',
        },
    }
    loggers = {
        # No handlers here: records travel up to the root's console handler.
        'parse_pbpstats': {'level': 'DEBUG', 'handlers': [], 'propagate': True},
    }
    dictConfig({
        'version': 1,
        'disable_existing_loggers': False,
        'formatters': formatters,
        'handlers': handlers,
        'loggers': loggers,
        'root': {'level': 'WARNING', 'handlers': ['console']},
    })
    return logging.getLogger('parse_pbpstats')


# Notebook-wide logger: logging is configured once, when this cell runs, and
# the functions defined in later cells log through this object.
logger = configure_logger()

In [2]:
def parse_pbpstats(leagues, years, season_types):
    """Parse and save every league / season / season-type combination.

    One ``Client`` is built from ``SETTINGS`` and reused throughout. For each
    combination, in league -> year -> season-type order, the season is parsed
    with ``parse_season`` and the result is passed to ``save_df_to_pq`` with
    ``DATA_PATH`` as the destination. Progress is logged at INFO level before
    parsing and before saving, and once more when everything is done.

    Args:
        leagues: iterable of league identifiers.
        years: iterable of season labels.
        season_types: iterable of season-type names.

    NOTE(review): every combination is written to the same ``DATA_PATH``;
    presumably a re-run adds files to the existing dataset rather than
    replacing it. Confirm against pyarrow's ``write_to_dataset`` behaviour.
    """
    client = Client(SETTINGS)
    # Lazy cross product: iterates the inputs in the same order as three
    # nested for-loops would.
    combinations = (
        (league, year, season_type)
        for league in leagues
        for year in years
        for season_type in season_types
    )
    for league, year, season_type in combinations:
        logger.info(f'Parsing pbpstats for {league} {year} {season_type}')
        season_df = parse_season(client, league, year, season_type)
        logger.info(f'Saving pbpstats for {league} {year} {season_type}')
        save_df_to_pq(season_df, DATA_PATH)
    logger.info('Complete')

def save_df_to_pq(df, path, *args, **kwargs):
    """Write a pandas DataFrame to the parquet dataset rooted at ``path``.

    The frame is converted to a ``pyarrow.Table`` without its index
    (``preserve_index=False``) and handed to ``parquet.write_to_dataset``.

    Args:
        df: DataFrame to write.
        path: root directory of the parquet dataset.
        *args, **kwargs: forwarded unchanged to ``parquet.write_to_dataset``.
    """
    parquet.write_to_dataset(
        pyarrow.Table.from_pandas(df, preserve_index=False),
        path,
        *args,
        **kwargs,
    )

In [3]:
# Run the full job: parse and save every configured league, season and season
# type. Progress is reported through the INFO log lines shown in the output.
parse_pbpstats(LEAGUES, YEARS, SEASON_TYPES)

2021-03-10 18:48:30,939 <ipython-input-2-2244a591c71a>:6 INFO Parsing pbpstats for nba 2000-01 Regular Season
2021-03-10 18:49:59,915 <ipython-input-2-2244a591c71a>:8 INFO Saving pbpstats for nba 2000-01 Regular Season
2021-03-10 18:50:00,080 <ipython-input-2-2244a591c71a>:6 INFO Parsing pbpstats for nba 2000-01 Playoffs
2021-03-10 18:50:04,798 <ipython-input-2-2244a591c71a>:8 INFO Saving pbpstats for nba 2000-01 Playoffs
2021-03-10 18:50:04,816 <ipython-input-2-2244a591c71a>:6 INFO Parsing pbpstats for nba 2001-02 Regular Season
2021-03-10 18:51:31,637 <ipython-input-2-2244a591c71a>:8 INFO Saving pbpstats for nba 2001-02 Regular Season
2021-03-10 18:51:31,805 <ipython-input-2-2244a591c71a>:6 INFO Parsing pbpstats for nba 2001-02 Playoffs
2021-03-10 18:51:37,297 <ipython-input-2-2244a591c71a>:8 INFO Saving pbpstats for nba 2001-02 Playoffs
2021-03-10 18:51:37,316 <ipython-input-2-2244a591c71a>:6 INFO Parsing pbpstats for nba 2002-03 Regular Season
2021-03-10 18:53:05,080 <ipython-input