# Backfill season data

In [1]:
import requests
from datetime import datetime, timedelta
import pandas as pd

def _get_json_with_retry(url, params, max_attempts=5, timeout=30):
    """GET *url* and return the decoded JSON body, retrying transient failures.

    A 429 or 5xx response is retried with exponential backoff, up to
    ``max_attempts`` tries in total. Any other non-200 response, or
    exhausting the retries, raises instead of looping.

    Raises:
        requests.HTTPError: for a 4xx/5xx response that is not (or no longer)
            retried.
        RuntimeError: for a non-200 response that is not an HTTP error status.
    """
    import time  # local import: only this helper needs it

    for attempt in range(1, max_attempts + 1):
        response = requests.get(url, params=params, timeout=timeout)
        if response.status_code == 200:
            return response.json()

        retryable = response.status_code == 429 or response.status_code >= 500
        if not retryable or attempt == max_attempts:
            response.raise_for_status()
            # raise_for_status() only raises for 4xx/5xx; anything else that is
            # not 200 is still unusable here.
            raise RuntimeError(
                f"Unexpected status {response.status_code} from {url}"
            )

        # Back off before retrying so a throttled API is not hammered.
        time.sleep(min(2 ** attempt, 60))


def _fetch_all_pages(url, base_params, per_page=100):
    """Return the concatenated ``data`` lists of every page of a paginated endpoint.

    The page count is read from ``meta.total_pages`` of the first response.
    """
    records = []
    page = 1
    total_pages = 1  # Placeholder, replaced after the first response
    while page <= total_pages:
        payload = _get_json_with_retry(
            url, {**base_params, "per_page": per_page, "page": page}
        )
        records.extend(payload['data'])
        if page == 1:
            total_pages = payload['meta']['total_pages']
        page += 1
    return records


def _with_prefix(prefix, mapping):
    """Return a copy of *mapping* whose keys are prefixed with ``prefix``."""
    return {f'{prefix}{key}': value for key, value in mapping.items()}


def fetch_nba_games_and_stats(season_year):
    """Download one season's games and per-player stats and merge them.

    Games are requested for the window from October 24 of ``season_year``
    to July 1 of the following year. Stats are then requested game by game.

    Args:
        season_year: Calendar year in which the season starts.

    Returns:
        A list with one dict per stat row. Each dict contains the game fields
        prefixed ``game_``, the player fields prefixed ``player_``, the
        player's team prefixed ``player_team_``, both teams prefixed
        ``home_team_`` / ``visitor_team_``, and finally the raw stat row
        itself (including its nested ``game``/``player``/``team`` objects).
        Stat rows whose game id was not among the fetched games are dropped.

    Raises:
        requests.HTTPError: if a request keeps failing after the retries.
    """
    # Base URLs for the balldontlie API
    # NOTE(review): confirm these v1 endpoints are still served.
    games_url = "https://www.balldontlie.io/api/v1/games"
    stats_url = "https://www.balldontlie.io/api/v1/stats"

    # Season window in the YYYY-MM-DD format the API expects.
    # NOTE(review): the October 24 start is hard-coded; games played earlier
    # in a season are not requested — confirm this is intended.
    start_date_str = datetime(season_year, 10, 24).strftime("%Y-%m-%d")
    end_date_str = datetime(season_year + 1, 7, 1).strftime("%Y-%m-%d")

    # Fetch games
    all_games = _fetch_all_pages(
        games_url,
        {"start_date": start_date_str, "end_date": end_date_str},
    )

    # Fetch stats for each game
    all_stats = []
    for game in all_games:
        all_stats.extend(_fetch_all_pages(stats_url, {"game_ids[]": game['id']}))

    # Index games by id once instead of scanning the list for every stat row.
    # setdefault keeps the first game seen for an id, as the linear scan did.
    games_by_id = {}
    for game in all_games:
        games_by_id.setdefault(game['id'], game)

    # Merge games and stats with flattened structure
    merged_data = []
    for stat in all_stats:
        game_info = games_by_id.get(stat['game']['id'])
        if not game_info:
            continue

        game_fields = {
            key: value
            for key, value in game_info.items()
            if key not in ('home_team', 'visitor_team')
        }
        merged_data.append({
            **_with_prefix('game_', game_fields),
            **_with_prefix('player_', stat['player']),
            **_with_prefix('player_team_', stat['team']),
            **_with_prefix('home_team_', game_info['home_team']),
            **_with_prefix('visitor_team_', game_info['visitor_team']),
            **stat,  # raw stat fields last, so they win on any key clash
        })

    return merged_data

# Fetch all NBA games and stats for the most recent seasons
# Number of seasons to backfill, counting back from the latest one
NUM_SEASONS = 3

# A season is keyed by the year it starts in; before October the latest
# season is the one that began in the previous calendar year.
now = datetime.now()  # read the clock once so year and month agree
current_year = now.year if now.month >= 10 else now.year - 1
all_seasons_data = []

for season_year in range(current_year, current_year - NUM_SEASONS, -1):
    season_data = fetch_nba_games_and_stats(season_year)
    all_seasons_data.extend(season_data)

# You can convert the data into a DataFrame or process it as needed
df = pd.DataFrame(all_seasons_data)


In [3]:
# Convert to DataFrame
# all_seasons_data is the module-level accumulator filled by the backfill
# loop; `merged_data` only exists inside fetch_nba_games_and_stats.
df = pd.DataFrame(all_seasons_data)

# Export to CSV
csv_filename = './nba_games_historical.csv'
df.to_csv(csv_filename, index=False)

csv_filename

'./nba_games.csv'

In [2]:
import pandas as pd

# Load the original dataset (one row per player per game)
nba_data = pd.read_csv('./nba_games_historical.csv')  # Replace with your file path

# Aggregate player statistics for each team in each game.
# The *_pct entries are only placeholders that reserve the column position;
# they are overwritten below with percentages derived from the summed totals.
# NOTE(review): 'min' is summed as-is — if the CSV stores minutes as text
# (e.g. "mm:ss") this will not yield a numeric total; confirm the column type.
team_game_stats = nba_data.groupby(['game_id', 'player_team_id']).agg({
    'ast': 'sum',
    'blk': 'sum',
    'dreb': 'sum',
    'fg3_pct': 'mean',
    'fg3a': 'sum',
    'fg3m': 'sum',
    'fg_pct': 'mean',
    'fga': 'sum',
    'fgm': 'sum',
    'ft_pct': 'mean',
    'fta': 'sum',
    'ftm': 'sum',
    'min': 'sum',
    'oreb': 'sum',
    'pf': 'sum',
    'pts': 'sum',
    'reb': 'sum',
    'stl': 'sum',
    'turnover': 'sum'
}).reset_index()

# A team's shooting percentage is total makes / total attempts. Averaging the
# per-player percentages would weight a 0-for-1 player the same as a 10-for-20
# player. Zero attempts yields NaN.
for pct_column, made_column, attempted_column in (
    ('fg_pct', 'fgm', 'fga'),
    ('fg3_pct', 'fg3m', 'fg3a'),
    ('ft_pct', 'ftm', 'fta'),
):
    team_game_stats[pct_column] = (
        team_game_stats[made_column] / team_game_stats[attempted_column]
    )

# Extract team meta information, keeping exactly one row per team id so the
# merge below cannot multiply stat rows if a descriptive field ever differs.
team_meta_columns = [
    'player_team_id',
    'player_team_abbreviation',
    'player_team_city',
    'player_team_conference',
    'player_team_division',
    'player_team_full_name',
    'player_team_name',
]
team_meta = nba_data[team_meta_columns].drop_duplicates(subset='player_team_id')

# Merge team stats with team meta information
team_game_stats = pd.merge(team_game_stats, team_meta, on='player_team_id', how='left')

# Separating home and visitor team stats: the same table is suffixed twice and
# matched against the home and visitor team ids respectively.
home_team_stats = team_game_stats.add_suffix('_home')
visitor_team_stats = team_game_stats.add_suffix('_visitor')

# Merge home and visitor team stats into a single row per game
final_data = nba_data[['game_id', 'home_team_id', 'visitor_team_id', 'game_date', 'game_home_team_score', 'game_visitor_team_score']].drop_duplicates()
final_data = final_data.merge(home_team_stats, left_on=['game_id', 'home_team_id'], right_on=['game_id_home', 'player_team_id_home'], how='left')
final_data = final_data.merge(visitor_team_stats, left_on=['game_id', 'visitor_team_id'], right_on=['game_id_visitor', 'player_team_id_visitor'], how='left')

# Drop redundant columns (duplicates of the merge keys)
final_data.drop(['game_id_home', 'player_team_id_home', 'game_id_visitor', 'player_team_id_visitor'], axis=1, inplace=True)

# Rename columns to remove 'player_' prefix
final_data = final_data.rename(columns=lambda x: x.replace('player_', ''))

# Calculate point spread (positive when the home team scored more)
final_data['point_spread'] = final_data['game_home_team_score'] - final_data['game_visitor_team_score']

# Add a column to indicate the game winner. Vectorized comparison; anything
# other than a strictly higher home score is labelled 'VISITOR'.
home_won = final_data['game_home_team_score'] > final_data['game_visitor_team_score']
final_data['game_winner'] = home_won.map({True: 'HOME', False: 'VISITOR'})

# Final structured dataset
final_dataset = final_data


FileNotFoundError: [Errno 2] No such file or directory: './nba_games.csv'

In [None]:
# Destination for the one-row-per-game dataset
csv_filename = './nba_games_formatted.csv'

# Wrap the final dataset in a DataFrame and write it without the index column
df = pd.DataFrame(final_dataset)
df.to_csv(csv_filename, index=False)

csv_filename

In [None]:
import pandas as pd

# Statistics for which trailing averages are calculated
stats_columns = [
    'ast', 'blk', 'dreb',
    'fg3_pct', 'fg_pct', 'ft_pct',
    'oreb', 'pf', 'pts',
    'reb', 'stl', 'turnover',
]

# Read the per-game dataset and turn 'game_date' into plain date objects
file_path = './nba_games_formatted.csv'  # Replace with your dataset's path
nba_data = pd.read_csv(file_path).assign(
    game_date=lambda frame: pd.to_datetime(frame['game_date']).dt.date
)

# Function to calculate trailing averages over each team's most recent games
def calculate_10_day_averages(row, data, stats=None, window=10):
    """Return trailing stat averages for both teams of the game in *row*.

    Despite the name, the window is counted in games, not calendar days. For
    each of the two teams, the ``window`` most recent games dated strictly
    before ``row['game_date']`` are selected, whichever side the team played
    on. Each stat is then averaged over the team's own column in those games:
    ``<stat>_home`` when it was the home team, ``<stat>_visitor`` otherwise.

    Args:
        row: A game record providing ``game_date``, ``home_team_id`` and
            ``visitor_team_id``.
        data: DataFrame of games with those same three columns plus
            ``<stat>_home`` and ``<stat>_visitor`` for every requested stat.
        stats: Stat names to average. Defaults to the module-level
            ``stats_columns`` list.
        window: Maximum number of prior games per team to average over.

    Returns:
        A dict with keys ``<stat>_home_<window>day_avg`` and
        ``<stat>_visitor_<window>day_avg`` for each stat, in that order. A
        value is NaN when the team has no earlier game in *data*.
    """
    if stats is None:
        stats = stats_columns

    # Only games before this one may contribute, so filter by date once.
    earlier_games = data[data['game_date'] < row['game_date']]

    # For each side of this game, select the team's latest games once and
    # split them by the side the team played on.
    recent_by_side = {}
    for side in ('home', 'visitor'):
        team_id = row[f'{side}_team_id']
        played_home = earlier_games['home_team_id'] == team_id
        played_visitor = earlier_games['visitor_team_id'] == team_id
        recent = (
            earlier_games[played_home | played_visitor]
            .sort_values(by='game_date', ascending=False)
            .head(window)
        )
        recent_by_side[side] = (
            recent[recent['home_team_id'] == team_id],
            recent[recent['visitor_team_id'] == team_id],
        )

    # Stat-major order keeps each stat's home and visitor keys adjacent.
    averages = {}
    for stat in stats:
        for side, (as_home, as_visitor) in recent_by_side.items():
            values = pd.concat([as_home[f"{stat}_home"], as_visitor[f"{stat}_visitor"]])
            averages[f"{stat}_{side}_{window}day_avg"] = values.mean()

    return averages

# Calculating the trailing averages for every game, then appending them to the
# dataset in one step rather than growing the frame cell by cell while it is
# being iterated.
average_rows = [
    calculate_10_day_averages(row, nba_data)
    for _, row in nba_data.iterrows()
]
averages_df = pd.DataFrame(average_rows, index=nba_data.index)

# Drop any average columns left from an earlier run of this cell so the join
# cannot collide on column names.
nba_data = nba_data.drop(columns=averages_df.columns, errors='ignore').join(averages_df)

# Display the updated dataset
nba_data.head()



In [None]:

# Write the dataset, including the trailing-average columns, to CSV
csv_filename = './nba_games_formatted_with_10_day_averages.csv'
nba_data.to_csv(path_or_buf=csv_filename, index=False)

csv_filename