In [2]:
import os
import re
import pandas as pd
from nba_api.stats.endpoints import leaguegamelog
from datetime import timedelta, datetime
import pprint

In [36]:
# Lookup table: three-letter NBA team abbreviation (as it appears in the
# MATCHUP strings parsed further down) -> full team name.
team_name_mapping = dict(
    ATL="Atlanta Hawks",
    BOS="Boston Celtics",
    BKN="Brooklyn Nets",
    CHA="Charlotte Hornets",
    CHI="Chicago Bulls",
    CLE="Cleveland Cavaliers",
    DAL="Dallas Mavericks",
    DEN="Denver Nuggets",
    DET="Detroit Pistons",
    GSW="Golden State Warriors",
    HOU="Houston Rockets",
    IND="Indiana Pacers",
    LAC="Los Angeles Clippers",
    LAL="Los Angeles Lakers",
    MEM="Memphis Grizzlies",
    MIA="Miami Heat",
    MIL="Milwaukee Bucks",
    MIN="Minnesota Timberwolves",
    NOP="New Orleans Pelicans",
    NYK="New York Knicks",
    OKC="Oklahoma City Thunder",
    ORL="Orlando Magic",
    PHI="Philadelphia 76ers",
    PHX="Phoenix Suns",
    POR="Portland Trail Blazers",
    SAC="Sacramento Kings",
    SAS="San Antonio Spurs",
    TOR="Toronto Raptors",
    UTA="Utah Jazz",
    WAS="Washington Wizards",
)
# Lookup table: full team name -> team id, kept as a string.
team_name_to_id_mapping = dict([
    ("Atlanta Hawks", "1610612737"),
    ("Boston Celtics", "1610612738"),
    ("Brooklyn Nets", "1610612751"),
    ("Charlotte Hornets", "1610612766"),
    ("Chicago Bulls", "1610612741"),
    ("Cleveland Cavaliers", "1610612739"),
    ("Dallas Mavericks", "1610612742"),
    ("Denver Nuggets", "1610612743"),
    ("Detroit Pistons", "1610612765"),
    ("Golden State Warriors", "1610612744"),
    ("Houston Rockets", "1610612745"),
    ("Indiana Pacers", "1610612754"),
    ("Los Angeles Clippers", "1610612746"),
    ("Los Angeles Lakers", "1610612747"),
    ("Memphis Grizzlies", "1610612763"),
    ("Miami Heat", "1610612748"),
    ("Milwaukee Bucks", "1610612749"),
    ("Minnesota Timberwolves", "1610612750"),
    ("New Orleans Pelicans", "1610612740"),
    ("New York Knicks", "1610612752"),
    ("Oklahoma City Thunder", "1610612760"),
    ("Orlando Magic", "1610612753"),
    ("Philadelphia 76ers", "1610612755"),
    ("Phoenix Suns", "1610612756"),
    ("Portland Trail Blazers", "1610612757"),
    ("Sacramento Kings", "1610612758"),
    ("San Antonio Spurs", "1610612759"),
    ("Toronto Raptors", "1610612761"),
    ("Utah Jazz", "1610612762"),
    ("Washington Wizards", "1610612764"),
])

def _split_matchup(matchup):
    """Split MATCHUP strings into home/away team abbreviations.

    Two layouts are recognised:

    * ``"LAC @ LAL"``   -> away team first, home team second
    * ``"GSW vs. SAC"`` -> home team first, away team second

    Args:
        matchup: pandas Series of MATCHUP strings.

    Returns:
        DataFrame indexed like ``matchup`` with columns ``HOME_NAME`` and
        ``AWAY_NAME``; rows matching neither layout hold missing values.
    """
    # The "@" layout takes precedence over "vs.", as in the original row-wise parser.
    away_at_home = matchup.str.extract(r'^\s*(?P<AWAY_NAME>\S+)\s*@\s*(?P<HOME_NAME>\S+)\s*$')
    home_vs_away = matchup.str.extract(r'^\s*(?P<HOME_NAME>\S+)\s*vs\.\s*(?P<AWAY_NAME>\S+)\s*$')
    return pd.DataFrame({
        'HOME_NAME': away_at_home['HOME_NAME'].fillna(home_vs_away['HOME_NAME']),
        'AWAY_NAME': away_at_home['AWAY_NAME'].fillna(home_vs_away['AWAY_NAME']),
    })


def get_season_schedule(season, start_date, end_date):
    """Return one row per game played in ``season`` between two dates.

    The season's league game log is fetched from the NBA API, restricted to
    ``start_date <= GAME_DATE <= end_date`` (both ends inclusive), reduced to
    the first row seen for each GAME_ID, and annotated with the full names and
    ids of the home and away teams.

    Args:
        season: Season label passed to ``LeagueGameLog``, e.g. ``'2023-24'``.
        start_date: First date to keep; anything ``pd.to_datetime`` accepts.
        end_date: Last date to keep; anything ``pd.to_datetime`` accepts.

    Returns:
        DataFrame with columns SEASON_ID, GAME_DATE (datetime), GAME_ID,
        AWAY_NAME, HOME_NAME, AWAY_ID, HOME_ID. Abbreviations missing from
        ``team_name_mapping`` (and names missing from
        ``team_name_to_id_mapping``) yield missing values. When no game falls
        inside the window, an empty frame with these same columns is returned
        instead of raising ``KeyError``.
    """
    # Fetch the season's game log data from the NBA API
    game_log = leaguegamelog.LeagueGameLog(season=season)
    raw_log = game_log.get_data_frames()[0]

    # Parse the dates once and build the inclusive date-window mask
    game_dates = pd.to_datetime(raw_log['GAME_DATE'])
    in_window = game_dates.between(pd.to_datetime(start_date), pd.to_datetime(end_date))

    # Keep only the needed columns and de-duplicate *before* parsing MATCHUP:
    # the log appears to carry one row per participating team, so this avoids
    # parsing every game twice. Keeping the first row per GAME_ID matches the
    # previous behaviour.
    games = (
        raw_log.assign(GAME_DATE=game_dates)
        .loc[in_window, ['SEASON_ID', 'GAME_DATE', 'GAME_ID', 'MATCHUP']]
        .drop_duplicates(subset='GAME_ID')
    )

    # Abbreviation -> full name -> team id
    abbreviations = _split_matchup(games['MATCHUP'])
    away_name = abbreviations['AWAY_NAME'].map(team_name_mapping)
    home_name = abbreviations['HOME_NAME'].map(team_name_mapping)

    # .assign builds a fresh frame, so no SettingWithCopyWarning is triggered
    return games.drop(columns='MATCHUP').assign(
        AWAY_NAME=away_name,
        HOME_NAME=home_name,
        AWAY_ID=away_name.map(team_name_to_id_mapping),
        HOME_ID=home_name.map(team_name_to_id_mapping),
    )

#print(get_season_schedule('2023-24', '2023-11-01', '2023-12-31').head())

In [34]:
def get_game_pks(start_season, end_season, output_file):
    """Write the schedules of consecutive seasons to a single CSV file.

    Every season from ``start_season`` through ``end_season`` (inclusive) is
    fetched with ``get_season_schedule`` using a July 1 - June 30 date window,
    the results are concatenated, sorted by GAME_DATE and written to
    ``output_file`` without the index column.

    Args:
        start_season: First season label, e.g. ``'2020-21'``; only the leading
            four-digit year is read.
        end_season: Last season label, same format.
        output_file: Destination path of the CSV file.

    Returns:
        None. Progress messages are printed to stdout.
    """
    first_year, last_year = int(start_season[:4]), int(end_season[:4])

    def fetch_season(season_year):
        # Season label in "YYYY-YY" form, e.g. 2023 -> "2023-24"
        label = f"{season_year}-{str(season_year + 1)[-2:]}"
        print(f"Fetching data for season {label}...")
        # Window runs from July 1 of the starting year to June 30 of the next
        return get_season_schedule(label, f"{season_year}-07-01", f"{season_year + 1}-06-30")

    per_season_frames = [fetch_season(season_year) for season_year in range(first_year, last_year + 1)]

    # Stack the seasons, order chronologically and persist in one go
    (
        pd.concat(per_season_frames, ignore_index=True)
        .sort_values('GAME_DATE')
        .to_csv(output_file, index=False)
    )
    print(f"Data saved to {output_file}")

# Export seasons 2020-21 through 2023-24 to CSV; each season is fetched via get_season_schedule.
get_game_pks('2020-21', '2023-24', 'nba_REGULARSEASON_ONLY_game_pks.csv')

Fetching data for season 2020-21...
Fetching data for season 2021-22...
Fetching data for season 2022-23...
Fetching data for season 2023-24...
Data saved to nba_REGULARSEASON_ONLY_game_pks.csv


# Run the cell below once the 2024-25 season has started (note: it redefines get_game_pks)

In [None]:
def get_game_pks(start_season, output_file, today=None):
    """Write every game from ``start_season`` up to today to one CSV file.

    Seasons are identified by the calendar year in which they start and are
    treated as rolling over on July 1, matching the July 1 - June 30 date
    windows used below. The loop therefore ends at the season that contains
    ``today``. A date in March 2025 stops at season 2024-25 rather than also
    requesting the not-yet-existing 2025-26 season with an inverted window.

    Args:
        start_season: First season label, e.g. ``'2020-21'``; only the leading
            four-digit year is read.
        output_file: Destination path of the CSV file.
        today: Optional reference date (anything ``pd.to_datetime`` accepts).
            Defaults to ``datetime.now()``; mainly useful for reproducible
            re-runs.

    Returns:
        None. Progress messages are printed to stdout.

    Raises:
        ValueError: If ``start_season`` begins after the season containing
            ``today``, so there would be nothing to fetch.
    """
    now = datetime.now() if today is None else pd.to_datetime(today)
    current_date = now.strftime('%Y-%m-%d')

    # Calendar year != season year: January-June still belongs to the season
    # that started the previous calendar year.
    current_season_year = now.year if now.month >= 7 else now.year - 1

    first_year = int(start_season[:4])
    if first_year > current_season_year:
        raise ValueError(
            f"start_season {start_season!r} is later than the current season "
            f"({current_season_year}-{str(current_season_year + 1)[-2:]})"
        )

    # Collect one DataFrame per season
    all_seasons_data = []
    for year in range(first_year, current_season_year + 1):
        season = f"{year}-{str(year+1)[-2:]}"

        # The first requested season starts at October 1 (assumed start of
        # the NBA season); later seasons use the full July 1 window.
        start_date = f"{year}-10-01" if year == first_year else f"{year}-07-01"

        # The in-progress season ends today; completed seasons end on June 30.
        end_date = current_date if year == current_season_year else f"{year+1}-06-30"

        print(f"Fetching data for season {season}...")
        all_seasons_data.append(get_season_schedule(season, start_date, end_date))

    # Combine all seasons' data and sort chronologically
    combined_data = pd.concat(all_seasons_data, ignore_index=True).sort_values('GAME_DATE')

    # Save to CSV
    combined_data.to_csv(output_file, index=False)
    print(f"Data saved to {output_file}")

# Uses the get_game_pks variant defined in this cell, whose second positional argument is the output file.
get_game_pks('2020-21', 'nba_game_pks.csv')