In [1]:
import pandas as pd

In [2]:
from nba_api.stats.endpoints import ScheduleLeagueV2

# Show the endpoint's constructor signature using IPython's `?` help syntax.
# NOTE(review): this is interactive-only syntax and is not valid plain Python.
ScheduleLeagueV2?

[31mInit signature:[39m
ScheduleLeagueV2(
    league_id=[33m'00'[39m,
    season=[33m'2024-25'[39m,
    proxy=[38;5;28;01mNone[39;00m,
    headers=[38;5;28;01mNone[39;00m,
    timeout=[32m30[39m,
    get_request=[38;5;28;01mTrue[39;00m,
)
[31mDocstring:[39m      <no docstring>
[31mFile:[39m           ~/anaconda3/envs/MBAI/lib/python3.13/site-packages/nba_api/stats/endpoints/scheduleleaguev2.py
[31mType:[39m           type
[31mSubclasses:[39m     

In [3]:
# Build the endpoint object with its default arguments.
# NOTE(review): with get_request=True as the default, this presumably performs
# the HTTP request right here - confirm against the nba_api docs.
schedule = ScheduleLeagueV2()

In [4]:
# get_data_frames() is indexed like a list; keep its first element.
# NOTE(review): presumably the first frame is the per-game schedule table - confirm.
schedule_df = schedule.get_data_frames()[0]

In [5]:
# List every column name the endpoint returned, as a plain Python list.
list(schedule_df.columns)

['leagueId',
 'seasonYear',
 'gameDate',
 'gameId',
 'gameCode',
 'gameStatus',
 'gameStatusText',
 'gameSequence',
 'gameDateEst',
 'gameTimeEst',
 'gameDateTimeEst',
 'gameDateUTC',
 'gameTimeUTC',
 'gameDateTimeUTC',
 'awayTeamTime',
 'homeTeamTime',
 'day',
 'monthNum',
 'weekNumber',
 'weekName',
 'ifNecessary',
 'seriesGameNumber',
 'gameLabel',
 'gameSubLabel',
 'seriesText',
 'arenaName',
 'arenaState',
 'arenaCity',
 'postponedStatus',
 'branchLink',
 'gameSubtype',
 'isNeutral',
 'homeTeam_teamId',
 'homeTeam_teamName',
 'homeTeam_teamCity',
 'homeTeam_teamTricode',
 'homeTeam_teamSlug',
 'homeTeam_wins',
 'homeTeam_losses',
 'homeTeam_score',
 'homeTeam_seed',
 'awayTeam_teamId',
 'awayTeam_teamName',
 'awayTeam_teamCity',
 'awayTeam_teamTricode',
 'awayTeam_teamSlug',
 'awayTeam_wins',
 'awayTeam_losses',
 'awayTeam_score',
 'awayTeam_seed',
 'pointsLeaders_0_personId',
 'pointsLeaders_0_firstName',
 'pointsLeaders_0_lastName',
 'pointsLeaders_0_teamId',
 'pointsLeaders_0_t

In [6]:
# Subset of raw schedule columns that are inspected below,
# grouped as: game identity/time, arena, participating teams.
cols2keep = [
    # game identity and time
    'gameId',
    'gameDateUTC',
    'gameTimeUTC',
    # arena
    'arenaName',
    'arenaState',
    'arenaCity',
    # teams
    'homeTeam_teamId',
    'awayTeam_teamId',
]

In [7]:
# Preview only the selected columns (all rows, label-based column selection).
schedule_df.loc[:, cols2keep]

Unnamed: 0,gameId,gameDateUTC,gameTimeUTC,arenaName,arenaState,arenaCity,homeTeam_teamId,awayTeam_teamId
0,0012400001,2024-10-04T04:00:00Z,1900-01-01T16:00:00Z,Etihad Arena,,Abu Dhabi,1610612743,1610612738
1,0012400002,2024-10-04T04:00:00Z,1900-01-01T01:00:00Z,Delta Center,UT,Salt Lake City,1610612762,15020
2,0012400003,2024-10-04T04:00:00Z,1900-01-01T02:30:00Z,Acrisure Arena,CA,Palm Desert,1610612747,1610612750
3,0012400004,2024-10-05T04:00:00Z,1900-01-01T23:00:00Z,Stan Sheriff Center,HI,Honolulu,1610612746,1610612744
4,0012400005,2024-10-06T04:00:00Z,1900-01-01T14:00:00Z,Etihad Arena,,Abu Dhabi,1610612738,1610612743
...,...,...,...,...,...,...,...,...
1395,0042400403,2025-06-11T04:00:00Z,1900-01-01T00:30:00Z,Gainbridge Fieldhouse,IN,Indianapolis,1610612754,1610612760
1396,0042400404,2025-06-13T04:00:00Z,1900-01-01T00:30:00Z,Gainbridge Fieldhouse,IN,Indianapolis,1610612754,1610612760
1397,0042400405,2025-06-16T04:00:00Z,1900-01-01T00:30:00Z,Paycom Center,OK,Oklahoma City,1610612760,1610612754
1398,0042400406,2025-06-19T04:00:00Z,1900-01-01T00:30:00Z,Gainbridge Fieldhouse,IN,Indianapolis,1610612754,1610612760


In [8]:
# Start from an empty frame; the following cells add one typed column at a time.
df = pd.DataFrame()

In [9]:
# Store game ids with pandas' string dtype; the ids shown above carry leading
# zeros (e.g. 0012400001), which a numeric dtype would drop.
df['GAME_ID'] = schedule_df['gameId'].astype('string')

In [10]:
# Tip-off time as a timezone-aware UTC timestamp.
#
# Use the endpoint's combined 'gameDateTimeUTC' column instead of stitching
# 'gameDateUTC' and 'gameTimeUTC' together: in the preview above 'gameDateUTC'
# always carries a 04:00:00Z time, i.e. it appears to encode the local (Eastern)
# calendar day rather than the UTC day of tip-off. Normalizing it and adding the
# UTC time-of-day therefore stamps every game that starts after midnight UTC
# (e.g. 00:30, 01:00, 02:30) one day too early.
df['GAME_DATE'] = pd.to_datetime(schedule_df['gameDateTimeUTC'], utc=True)

In [11]:
# Copy the three arena descriptors over as pandas string columns.
# Mapping: output column name -> raw schedule column name.
arena_columns = {
    'ARENA_NAME': 'arenaName',
    'ARENA_STATE': 'arenaState',
    'ARENA_CITY': 'arenaCity',
}
for target_col, source_col in arena_columns.items():
    df[target_col] = schedule_df[source_col].astype('string')

In [12]:
# Convert both team-id columns to numbers; downcast='unsigned' asks pandas for
# the smallest unsigned integer dtype that can hold the values.
home_team_ids = pd.to_numeric(schedule_df['homeTeam_teamId'], downcast='unsigned')
away_team_ids = pd.to_numeric(schedule_df['awayTeam_teamId'], downcast='unsigned')

df['HOME_TEAM_ID'] = home_team_ids
df['AWAY_TEAM_ID'] = away_team_ids

In [13]:
# Sanity check: display the dtype each output column ended up with.
df.dtypes

GAME_ID              string[python]
GAME_DATE       datetime64[ns, UTC]
ARENA_NAME           string[python]
ARENA_STATE          string[python]
ARENA_CITY           string[python]
HOME_TEAM_ID                 uint32
AWAY_TEAM_ID                 uint32
dtype: object

We need to check that the `GAME_ID`s are consistent with the ones we got from the game logs.

In [24]:
from pathlib import Path

# Local data layout: <data_path>/rs<season id>/ holds one season's files.
current_season_id = "22024"
data_path = Path("~/MBAI/data").expanduser()  # resolve '~' to the user's home
current_season_path = data_path.joinpath(f"rs{current_season_id}")

In [26]:
# Game ids available on disk: every sub-directory whose name starts with 'g'
# contributes its name without that first character.
season_subdirs = (
    entry for entry in current_season_path.iterdir() if entry.is_dir()
)
rs_game_ids = [
    entry.name[1:] for entry in season_subdirs if entry.name.startswith('g')
]

In [32]:
# Keep only the schedule rows whose GAME_ID has a matching game directory.
has_local_game = df['GAME_ID'].isin(rs_game_ids)
rs_games_df = df[has_local_game]

In [33]:
# The filtered schedule must have as many rows as there are game directories.
# On failure, list the ids that have no schedule row so the gap can be
# investigated directly instead of re-deriving it by hand.
missing_game_ids = sorted(set(rs_game_ids) - set(rs_games_df['GAME_ID']))
assert len(rs_games_df) == len(rs_game_ids), (
    f"We are missing some games!!! Not found in the schedule: {missing_game_ids}"
)

We can now get all the schedules and save them.

In [None]:
def _season_param_from_dir(dir_name):
    """Turn a season directory name such as 'rs22024' into 'YYYY-YY' (e.g. '2024-25').

    The slice drops the 'rs' marker and the single character that precedes the
    four-digit start year (directories are named f"rs{season_id}" with ids like
    "22024"). The result follows the 'YYYY-YY' format of the endpoint's default
    season argument ('2024-25').

    Raises:
        ValueError: if the remainder of the name is not an integer.
    """
    start_year = int(dir_name[3:])
    return f"{start_year}-{(start_year + 1) % 100:02d}"


def _tidy_schedule(raw_schedule):
    """Project a raw schedule frame onto the typed columns that get saved.

    Args:
        raw_schedule: frame with the raw endpoint columns used below.

    Returns:
        A new DataFrame sharing raw_schedule's index, with columns GAME_ID,
        GAME_DATE, ARENA_NAME, ARENA_STATE, ARENA_CITY, HOME_TEAM_ID and
        AWAY_TEAM_ID (in that order).
    """
    return pd.DataFrame({
        'GAME_ID': raw_schedule['gameId'].astype('string'),
        # Use the combined UTC timestamp. 'gameDateUTC' appears to encode the
        # local (Eastern) calendar day (it shows up as ...T04:00:00Z), so
        # normalizing it and adding the 'gameTimeUTC' time-of-day would stamp
        # games that tip off after midnight UTC one day too early.
        'GAME_DATE': pd.to_datetime(raw_schedule['gameDateTimeUTC'], utc=True),
        'ARENA_NAME': raw_schedule['arenaName'].astype('string'),
        'ARENA_STATE': raw_schedule['arenaState'].astype('string'),
        'ARENA_CITY': raw_schedule['arenaCity'].astype('string'),
        # downcast='unsigned' picks the smallest unsigned dtype that fits.
        'HOME_TEAM_ID': pd.to_numeric(raw_schedule['homeTeam_teamId'], downcast='unsigned'),
        'AWAY_TEAM_ID': pd.to_numeric(raw_schedule['awayTeam_teamId'], downcast='unsigned'),
    })


# For every season directory: download that season's schedule, keep only the
# games that exist on disk, and save the tidy result next to the game folders.
for season_dir in data_path.iterdir():
    if not (season_dir.is_dir() and season_dir.name.startswith('rs')):
        continue

    season_param = _season_param_from_dir(season_dir.name)
    season_schedule = ScheduleLeagueV2(season=season_param).get_data_frames()[0]

    # Game ids on disk: 'g'-prefixed sub-directories, prefix stripped.
    local_game_ids = [
        game_dir.name[1:]
        for game_dir in season_dir.iterdir()
        if game_dir.is_dir() and game_dir.name.startswith('g')
    ]
    season_schedule = season_schedule[season_schedule['gameId'].isin(local_game_ids)]

    # Same consistency check as for the current season, but say which season
    # and which ids failed so a long batch run is easy to diagnose.
    missing_game_ids = sorted(set(local_game_ids) - set(season_schedule['gameId']))
    assert len(season_schedule) == len(local_game_ids), (
        f"We are missing some games for {season_dir.name} "
        f"(season {season_param})!!! Not found in the schedule: {missing_game_ids}"
    )

    _tidy_schedule(season_schedule).to_parquet(season_dir / "schedule.parquet")