In [1]:
# GENERAL USE LIBRARY IMPORTS

import datetime as dt
import random
import random as rand
import warnings
from datetime import datetime
import time

import matplotlib.pyplot as plt
import numpy as np
import numpy.random
import pandas as pd
import os
import requests
from zipfile import ZipFile

warnings.filterwarnings('ignore')

In [2]:
# Lookup from full NBA team name to its three-letter abbreviation.
# The two parallel lists are derived from the mapping so they cannot drift apart.
name_to_abbv = {
    'Atlanta Hawks': 'ATL',
    'Boston Celtics': 'BOS',
    'Brooklyn Nets': 'BRK',
    'Charlotte Hornets': 'CHO',
    'Chicago Bulls': 'CHI',
    'Cleveland Cavaliers': 'CLE',
    'Dallas Mavericks': 'DAL',
    'Denver Nuggets': 'DEN',
    'Detroit Pistons': 'DET',
    'Golden State Warriors': 'GSW',
    'Houston Rockets': 'HOU',
    'Indiana Pacers': 'IND',
    'Los Angeles Clippers': 'LAC',
    'Los Angeles Lakers': 'LAL',
    'Memphis Grizzlies': 'MEM',
    'Miami Heat': 'MIA',
    'Milwaukee Bucks': 'MIL',
    'Minnesota Timberwolves': 'MIN',
    'New Orleans Pelicans': 'NOP',
    'New York Knicks': 'NYK',
    'Oklahoma City Thunder': 'OKC',
    'Orlando Magic': 'ORL',
    'Philadelphia 76ers': 'PHI',
    'Phoenix Suns': 'PHO',
    'Portland Trail Blazers': 'POR',
    'Sacramento Kings': 'SAC',
    'San Antonio Spurs': 'SAS',
    'Toronto Raptors': 'TOR',
    'Utah Jazz': 'UTA',
    'Washington Wizards': 'WAS',
}
team_names = list(name_to_abbv)
team_abbvs = list(name_to_abbv.values())

In [3]:
def _merge_team_tables(basic, advanced):
    """Join one team's basic and advanced tables into a single flat-column frame."""
    merged = pd.merge(basic, advanced, left_index=True, right_index=True)
    # Row label 5 is dropped unconditionally — presumably the "Reserves" header
    # row that follows the five starters (TODO confirm against a live page).
    # The advanced table's copy of the name column (suffixed "_y" by the merge)
    # is redundant with the basic table's copy.
    merged = merged.drop(5, axis=0).drop(columns=('Unnamed: 0_level_0_y', 'Starters'))
    merged = merged.rename(columns={'Starters': 'Players'})
    # Discard the top header level so only the stat names remain.
    merged.columns = merged.columns.droplevel(0)
    return merged


def get_boxscore(response):
    """Parse a box-score page into one merged stats table per team.

    Parameters
    ----------
    response : str, bytes, or anything ``pandas.read_html`` accepts
        Literal HTML (str/bytes starting with ``<``) is wrapped in a file-like
        object before parsing; any other value is passed through untouched.

    Returns
    -------
    (awayteam, hometeam) : tuple of pandas.DataFrame
        Basic and advanced stats joined row-wise for each team, with a single
        level of column names and the name column renamed to ``Players``.

    Raises
    ------
    ValueError
        If the page does not yield an even number (at least 4) of tables, since
        the tables could not then be split evenly between the two teams.

    Notes
    -----
    Assumes the first half of the tables belongs to the away team and the
    second half to the home team, each half starting with the basic table and
    ending with the advanced one. The number of tables in between varies
    (e.g. with overtime), which is why positions are computed from the count.
    """
    from io import BytesIO, StringIO

    # Passing literal HTML directly to read_html is deprecated (pandas >= 2.1);
    # wrap it so the call keeps working once the deprecation is enforced.
    if isinstance(response, str) and response.lstrip().startswith('<'):
        response = StringIO(response)
    elif isinstance(response, bytes) and response.lstrip().startswith(b'<'):
        response = BytesIO(response)

    tables = pd.read_html(response)

    table_count = len(tables)
    if table_count < 4 or table_count % 2:
        raise ValueError(
            f"Expected an even number (>= 4) of box score tables, found {table_count}"
        )
    per_team = table_count // 2

    awayteam = _merge_team_tables(tables[0], tables[per_team - 1])
    hometeam = _merge_team_tables(tables[per_team], tables[table_count - 1])

    return awayteam, hometeam

In [7]:
# Load the season schedule and append three derived columns in one pass:
# a YYYYMMDD date string and the home/visitor team abbreviations.
full_schedule_df = pd.read_csv("full_scedule.csv").assign(
    f_Date=lambda sched: pd.to_datetime(sched['Date']).dt.strftime('%Y%m%d'),
    Home_abbr=lambda sched: sched['Home/Neutral'].map(name_to_abbv),
    Away_abbr=lambda sched: sched['Visitor/Neutral'].map(name_to_abbv),
)

#print(full_schedule_df)

In [9]:
# Keep only the three columns needed to locate a game and give them short names.
simple_games_df = (
    full_schedule_df
    .loc[:, ['f_Date', 'Home_abbr', 'Away_abbr']]
    .set_axis(['Date', 'Home', 'Away'], axis=1)
)
print(simple_games_df)

          Date Home Away
0     20231024  DEN  LAL
1     20231024  GSW  PHO
2     20231025  ORL  HOU
3     20231025  NYK  BOS
4     20231025  IND  WAS
...        ...  ...  ...
1111  20240331  HOU  DAL
1112  20240331  MIN  CHI
1113  20240331  NYK  OKC
1114  20240331  SAS  GSW
1115  20240331  SAC  UTA

[1116 rows x 3 columns]


In [10]:
# Build the list of dates to process.
start_date = '2023-10-24'
# Deliberately short window for testing; move end_date later to cover more days.
end_date = '2023-10-30'

# One entry per calendar day, both endpoints included.
date_range = pd.date_range(start=start_date, end=end_date)
date_list = [*date_range]

# Same dates rendered as YYYYMMDD strings.
formatted_date_list = date_range.strftime('%Y%m%d').tolist()

print(formatted_date_list)


['20231024', '20231025', '20231026', '20231027', '20231028', '20231029', '20231030']


In [11]:
def _retry_after_seconds(header_value, default=60):
    """Convert a Retry-After header value into a non-negative wait in seconds."""
    import math
    from datetime import datetime, timezone
    from email.utils import parsedate_to_datetime

    if header_value is None:
        return default

    # Form 1: plain delay in seconds.
    try:
        return max(0, int(header_value))
    except (TypeError, ValueError):
        pass

    # Form 2: an HTTP-date naming the moment at which to retry.
    try:
        resume_at = parsedate_to_datetime(str(header_value))
    except (TypeError, ValueError, IndexError, OverflowError):
        return default
    if resume_at.tzinfo is None:
        # A date parsed without an offset is treated as UTC.
        resume_at = resume_at.replace(tzinfo=timezone.utc)
    remaining = (resume_at - datetime.now(timezone.utc)).total_seconds()
    return max(0, math.ceil(remaining))


def fetch_with_retry_after(url, max_retries=None, timeout=30):
    """GET ``url``, waiting and retrying whenever the server answers 429.

    Parameters
    ----------
    url : str
        Address to fetch.
    max_retries : int or None, optional
        Maximum number of retries after a 429. ``None`` (default) retries
        indefinitely, matching the original behaviour.
    timeout : float or None, optional
        Passed to ``requests.get`` so a stalled connection cannot hang forever.

    Returns
    -------
    The response object returned by ``requests.get`` for the first non-429
    answer that passes ``raise_for_status``.

    Raises
    ------
    Whatever ``requests.get`` or ``response.raise_for_status()`` raise, which
    includes a 429 that is still being returned once ``max_retries`` is used up.
    """
    attempts = 0
    # Loop rather than recurse so a long run of 429s cannot exhaust the stack.
    while True:
        response = requests.get(url, timeout=timeout)
        if response.status_code != 429:
            break
        if max_retries is not None and attempts >= max_retries:
            break
        # Default to 60 seconds if the header is missing or unparseable.
        retry_after = _retry_after_seconds(response.headers.get('Retry-After'))
        print(f"Rate limit exceeded. Retrying after {retry_after} seconds...")
        time.sleep(retry_after)
        attempts += 1
    response.raise_for_status()  # Raise an error for other status codes
    return response

In [12]:
def save_box_scores(formatted_date_list, simple_games_df, url_base,
                    request_delay=3.5, skip_existing=False):
    """Download box scores for every scheduled game on the given dates.

    For each game a folder ``NBA_Box_Scores/<date>/<Away>@<Home>`` is created
    and the two tables returned by ``get_boxscore`` are written there as
    ``away_team.csv`` and ``home_team.csv``. Afterwards every file under
    ``NBA_Box_Scores`` is added to ``NBA_Box_Scores.zip``.

    Parameters
    ----------
    formatted_date_list : iterable
        Dates to process; each is compared against ``simple_games_df['Date']``
        and interpolated into the URL and folder name.
    simple_games_df : pandas.DataFrame
        Must provide ``Date``, ``Home`` and ``Away`` columns.
    url_base : str
        Prefix of the page URL; ``<date>0<Home>.html`` is appended to it.
    request_delay : float, optional
        Seconds to pause between consecutive page requests. Unthrottled runs
        were observed to trigger a rate-limit lockout of over half an hour.
        NOTE(review): the default assumes a limit of roughly 20 requests per
        minute — confirm against the site's current policy.
    skip_existing : bool, optional
        When True, games whose two CSV files already exist are not fetched
        again, so an interrupted run can be resumed. Default False re-fetches
        everything, as before.

    Returns
    -------
    None

    Notes
    -----
    Any exception raised while fetching, parsing or writing one game is
    printed and that game is skipped; the loop continues with the next game.
    """
    # Create a main directory to hold all data before zipping
    main_folder = 'NBA_Box_Scores'
    os.makedirs(main_folder, exist_ok=True)

    request_made = False
    for date in formatted_date_list:
        games_df = simple_games_df[simple_games_df['Date'] == date]
        for _, row in games_df.iterrows():
            home_abbr = row['Home']
            away_abbr = row['Away']
            # Folder layout: NBA_Box_Scores/YYYYMMDD/Away@Home
            full_folder_path = os.path.join(main_folder, str(date), f"{away_abbr}@{home_abbr}")
            away_csv = os.path.join(full_folder_path, 'away_team.csv')
            home_csv = os.path.join(full_folder_path, 'home_team.csv')

            if skip_existing and os.path.isfile(away_csv) and os.path.isfile(home_csv):
                continue

            os.makedirs(full_folder_path, exist_ok=True)

            # Format the URL
            formatted_url = f"{url_base}{date}0{home_abbr}.html"

            # Space requests out; no pause is needed before the very first one.
            if request_made and request_delay and request_delay > 0:
                time.sleep(request_delay)
            request_made = True

            try:
                response = fetch_with_retry_after(formatted_url)
                # Save each team's box score in the specific game folder
                away_df, home_df = get_boxscore(response.text)
                away_df.to_csv(away_csv, index=False)
                home_df.to_csv(home_csv, index=False)
            except Exception as e:
                # Best effort: report the failure and move on to the next game.
                print(f"Error fetching data for URL {formatted_url}: {str(e)}")

    # Zip the entire directory; archive names start with the main folder name.
    archive_root = os.path.join(main_folder, '..')
    with ZipFile(f"{main_folder}.zip", 'w') as zipf:
        for root, dirs, files in os.walk(main_folder):
            for file in files:
                file_path = os.path.join(root, file)
                zipf.write(file_path, os.path.relpath(file_path, archive_root))

# Example usage: scrape every game in the selected date window.
url_base = "https://www.basketball-reference.com/boxscores/"

save_box_scores(
    formatted_date_list=formatted_date_list,
    simple_games_df=simple_games_df,
    url_base=url_base,
)

Rate limit exceeded. Retrying after 2153 seconds...


KeyboardInterrupt: 