In [1]:
import requests
from datetime import datetime, timedelta
import pandas as pd

def fetch_nba_games_and_stats(start_date_str, end_date_str):
    # Base URL for the balldontlie API
    games_url = "https://www.balldontlie.io/api/v1/games"
    stats_url = "https://www.balldontlie.io/api/v1/stats"

    # Initialize variables for pagination
    all_games = []
    all_stats = []
    games_page = 1
    games_per_page = 100  # Maximum value as per the API documentation
    games_total_pages = 1  # Placeholder, will be updated after the first request

    # Fetch games
    while games_page <= games_total_pages:
        params = {
            "start_date": start_date_str,
            "end_date": end_date_str,
            "per_page": games_per_page,
            "page": games_page
        }

        response = requests.get(games_url, params=params)
        if response.status_code == 200:
            data = response.json()
            all_games.extend(data['data'])

            if games_page == 1:
                games_total_pages = data['meta']['total_pages']

            games_page += 1

    # Fetch stats for each game
    for game in all_games:
        game_id = game['id']
        stats_page = 1
        stats_per_page = 100
        stats_total_pages = 1

        while stats_page <= stats_total_pages:
            params = {
                "game_ids[]": game_id,
                "per_page": stats_per_page,
                "page": stats_page
            }

            response = requests.get(stats_url, params=params)
            if response.status_code == 200:
                data = response.json()
                all_stats.extend(data['data'])

                if stats_page == 1:
                    stats_total_pages = data['meta']['total_pages']

                stats_page += 1

    # Merge games and stats with flattened structure
    merged_data = []
    for stat in all_stats:
        game_info = next((game for game in all_games if game['id'] == stat['game']['id']), None)
        if game_info:
            # Flatten the game, player, team, home_team, and visitor_team data
            flattened_game_info = {f'game_{k}': v for k, v in game_info.items() if k != 'home_team' and k != 'visitor_team'}
            flattened_player_info = {f'player_{k}': v for k, v in stat['player'].items()}
            flattened_team_info = {f'player_team_{k}': v for k, v in stat['team'].items()}
            flattened_home_team_info = {f'home_team_{k}': v for k, v in game_info['home_team'].items()}
            flattened_visitor_team_info = {f'visitor_team_{k}': v for k, v in game_info['visitor_team'].items()}

            # Merge all flattened data
            merged_entry = {**flattened_game_info, **flattened_player_info, **flattened_team_info, **flattened_home_team_info, **flattened_visitor_team_info, **stat}
            merged_data.append(merged_entry)

    return merged_data

def backfill_nba_data(latest_date):
    # Calculate the day after the latest date
    next_day = latest_date + timedelta(days=1)
    
    # Format dates in the required format (YYYY-MM-DD)
    start_date_str = next_day.strftime("%Y-%m-%d")
    end_date_str = datetime.now().strftime("%Y-%m-%d")

    # Fetch data from next_day to today
    backfilled_data = fetch_nba_games_and_stats(start_date_str, end_date_str)
    return backfilled_data

def backfill_nba_data(latest_date):
    # Calculate the day after the latest date
    next_day = latest_date + timedelta(days=1)
    
    # Format the start and end dates in the required format (YYYY-MM-DD)
    start_date_str = next_day.strftime("%Y-%m-%d")
    end_date_str = datetime.now().strftime("%Y-%m-%d")

    # Fetch data from next_day to today
    backfilled_data = fetch_nba_games_and_stats(start_date_str, end_date_str)
    return backfilled_data

# Load your existing data from the CSV file
nba_data = pd.read_csv("./nba_games.csv")

# Convert the 'game_date' column to datetime and find the latest date
nba_data['game_date'] = pd.to_datetime(nba_data['game_date'])
latest_date_in_dataset = nba_data['game_date'].max()

# Call the backfill function with the latest date from your dataset
backfilled_data = backfill_nba_data(latest_date_in_dataset)

new_data = pd.DataFrame(backfilled_data)
all_data = pd.concat([nba_data, new_data])
all_data.to_csv('nba_games.csv', index=False)


In [13]:
import pandas as pd

# Load the original dataset
nba_data = pd.read_csv('./nba_games.csv') 

# Aggregate player statistics for each team in each game
team_game_stats = nba_data.groupby(['game_id', 'player_team_id']).agg({
    'ast': 'sum', 
    'blk': 'sum', 
    'dreb': 'sum', 
    'fg3_pct': 'mean', 
    'fg3a': 'sum', 
    'fg3m': 'sum',
    'fg_pct': 'mean', 
    'fga': 'sum', 
    'fgm': 'sum', 
    'ft_pct': 'mean', 
    'fta': 'sum', 
    'ftm': 'sum', 
    'min': 'sum', 
    'oreb': 'sum', 
    'pf': 'sum', 
    'pts': 'sum', 
    'reb': 'sum', 
    'stl': 'sum', 
    'turnover': 'sum'
}).reset_index()

# Extract team meta information
team_meta = nba_data[['player_team_id', 'player_team_abbreviation', 'player_team_city', 'player_team_conference', 'player_team_division', 'player_team_full_name', 'player_team_name']].drop_duplicates()

# Merge team stats with team meta information
team_game_stats = pd.merge(team_game_stats, team_meta, left_on='player_team_id', right_on='player_team_id', how='left')

# Separating home and visitor team stats
home_team_stats = team_game_stats.add_suffix('_home')
visitor_team_stats = team_game_stats.add_suffix('_visitor')

# Merge home and visitor team stats into a single row per game
final_data = nba_data[['game_id', 'home_team_id', 'visitor_team_id', 'game_date', 'game_home_team_score', 'game_visitor_team_score', 'game_season']].drop_duplicates()
final_data = final_data.merge(home_team_stats, left_on=['game_id', 'home_team_id'], right_on=['game_id_home', 'player_team_id_home'], how='left')
final_data = final_data.merge(visitor_team_stats, left_on=['game_id', 'visitor_team_id'], right_on=['game_id_visitor', 'player_team_id_visitor'], how='left')

# Drop redundant columns
final_data.drop(['game_id_home', 'player_team_id_home', 'game_id_visitor', 'player_team_id_visitor'], axis=1, inplace=True)

# Rename columns to remove 'player_' prefix
final_data = final_data.rename(columns=lambda x: x.replace('player_', ''))

# Calculate point spread
final_data['point_spread'] = final_data['game_home_team_score'] - final_data['game_visitor_team_score']

# Add a column to indicate the game winner
final_data['game_winner'] = final_data.apply(lambda row: 'HOME' if row['game_home_team_score'] > row['game_visitor_team_score'] else 'VISITOR', axis=1)

# Final structured dataset
final_dataset = final_data


In [14]:
# Convert to DataFrame
df = pd.DataFrame(final_dataset)

# Export to CSV
csv_filename = './nba_games_formatted.csv'
df.to_csv(csv_filename, index=False)

csv_filename

'./nba_games_formatted.csv'

In [20]:
import pandas as pd

# Load the dataset
file_path = './nba_games_formatted.csv'  # Replace with your dataset's path
nba_data = pd.read_csv(file_path)

# Convert 'game_date' to datetime format
nba_data['game_date'] = pd.to_datetime(nba_data['game_date']).dt.date

# List of statistics for which to calculate 10-game and season averages
stats_columns_home = [f'{stat}_home' for stat in ['ast', 'blk', 'dreb', 'fg3_pct', 'fg_pct', 'ft_pct', 'oreb', 'pf', 'pts', 'reb', 'stl', 'turnover']]
stats_columns_visitor = [f'{stat}_visitor' for stat in ['ast', 'blk', 'dreb', 'fg3_pct', 'fg_pct', 'ft_pct', 'oreb', 'pf', 'pts', 'reb', 'stl', 'turnover']]

# Function to calculate 10-game averages
def calculate_10_game_averages(row, data):
    game_date = row['game_date']
    home_team_id = row['home_team_id']
    visitor_team_id = row['visitor_team_id']

    # Filter for the last 10 games for each team before the game date
    last_10_home = data[((data['home_team_id'] == home_team_id) | (data['visitor_team_id'] == home_team_id)) & 
                        (data['game_date'] < game_date)].sort_values(by='game_date', ascending=False).head(10)
    last_10_visitor = data[((data['home_team_id'] == visitor_team_id) | (data['visitor_team_id'] == visitor_team_id)) & 
                           (data['game_date'] < game_date)].sort_values(by='game_date', ascending=False).head(10)

    averages = {}
    for stat in stats_columns_home + stats_columns_visitor:
        home_stat = last_10_home[last_10_home['home_team_id'] == home_team_id][stat]
        visitor_stat = last_10_visitor[last_10_visitor['visitor_team_id'] == visitor_team_id][stat]
        averages[f"{stat}_10game_avg"] = pd.concat([home_stat, visitor_stat]).mean()

    return averages

# Apply 10-game averages calculation
for index, row in nba_data.iterrows():
    averages = calculate_10_game_averages(row, nba_data)
    nba_data.loc[index, averages.keys()] = averages.values()

# Function to calculate season-level averages up to the date of each game
def calculate_season_averages_up_to_date(row, data, team_id_column, stats_columns):
    game_date = row['game_date']
    team_id = row[team_id_column]

    # Filter for games of the team up to the date of the current game
    previous_games = data[(data[team_id_column] == team_id) & (data['game_date'] < game_date)]

    # Calculate the mean of each statistic
    season_averages = previous_games[stats_columns].mean()

    # Rename columns and return
    return season_averages.add_suffix(f"_season_avg")

# Apply season averages calculation for each game
for index, row in nba_data.iterrows():
    home_averages = calculate_season_averages_up_to_date(row, nba_data, 'home_team_id', stats_columns_home)
    visitor_averages = calculate_season_averages_up_to_date(row, nba_data, 'visitor_team_id', stats_columns_visitor)
    nba_data.loc[index, home_averages.keys()] = home_averages.values
    nba_data.loc[index, visitor_averages.keys()] = visitor_averages.values

# Display the updated dataset
nba_data.head()


Unnamed: 0,game_id,home_team_id,visitor_team_id,game_date,game_home_team_score,game_visitor_team_score,game_season,ast_home,blk_home,dreb_home,...,dreb_visitor_season_avg,fg3_pct_visitor_season_avg,fg_pct_visitor_season_avg,ft_pct_visitor_season_avg,oreb_visitor_season_avg,pf_visitor_season_avg,pts_visitor_season_avg,reb_visitor_season_avg,stl_visitor_season_avg,turnover_visitor_season_avg
0,3277446,30,20,2023-11-17,99,120,2023,25,4,25,...,34.166667,0.154394,0.242693,0.215666,13.666667,18.833333,106.833333,47.833333,7.5,13.666667
1,3277447,1,23,2023-11-17,116,126,2023,24,4,25,...,34.0,0.1426,0.246903,0.21619,8.0,23.0,115.0,42.0,10.0,10.666667
2,3277451,5,22,2023-11-17,97,103,2023,19,3,29,...,32.5,0.142714,0.261598,0.274846,10.0,20.666667,103.666667,42.5,11.0,14.166667
3,3277452,19,8,2023-11-17,115,110,2023,33,5,35,...,34.75,0.152998,0.268897,0.17037,10.0,20.25,107.25,44.75,7.25,14.0
4,3277453,29,24,2023-11-17,128,131,2023,27,10,29,...,35.8,0.165639,0.256099,0.308153,13.6,17.4,107.8,49.4,7.6,17.6


In [22]:
# Export to CSV
csv_filename = './game_stats.csv'
nba_data.to_csv(csv_filename, index=False)

csv_filename

'./game_stats.csv'