In [1]:
import os

# BeautifulSoup for parsing the html pages
from bs4 import BeautifulSoup

# PlayWright for using the web to access basketball reference
# An async function runs in a different thread while the rest of your code runs.
# This saves time especially when your async function is taking a lot of time.
from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeout

import time

In [2]:
# A constant which we use to store the seasons which we want to scrape.
SEASONS = list(range(2016,2024))

In [3]:
# os.path.join just makes the strings for the directories have backslashes when you join them
# print(DATA_DIR)
# print(STANDINGS_DIR)
# print(SCORES_DIR)

In [4]:
# Creating a directory called data2
DATA_DIR = "data2"

# A directory for standings, stemming from data2,
STANDINGS_DIR = os.path.join(DATA_DIR, "standings")

# A directory for scores, stemming from data2,
SCORES_DIR = os.path.join(DATA_DIR, "scores")

In [21]:
# Async function to get html
async def get_html(url, selector, sleep=5, retries=3): 
    html = None
    for i in range (1, retries+1):
        time.sleep(sleep*i) # incrementally pausing code so that server doesn't ban for scraping

        try:
            # initialize the playwright instance (like opening a file).
            async with async_playwright() as p:
                browser = await p.firefox.launch() # await allows you to wait for the async function to finish
                page = await browser.new_page() # creating new tab
                await page.goto(url)
                print(await page.title())
                html = await page.inner_html(selector)
        except PlaywrightTimeout: # Imported earlier as PlaywrightTimeout because 'Timeout' already exists in base python
            print(f"Timeout error on {url}")
            continue
        else:
            break
    return html

In [23]:
# the code for scraping the season page
async def scrape_season(season):
    url = f"https://www.basketball-reference.com/leagues/NBA_{season}_games.html"
    html = await get_html(url, "#content .filter") # looks for the id named content and then finds the class named filter

    # initialize beautiful soup
    soup = BeautifulSoup(html)
    links = soup.find_all("a")
    href = [l["href"] for l in links]  # Adds the link's href tag for every link in the links list.
    # create full urls
    standings_pages = [f"https://basketball-reference.com{l}" for l in href]

    for url in standings_pages:
        # split() will split the string into a list based on the given separator. If no separator is give, it defaults to space
        save_path = os.path.join(STANDINGS_DIR, url.split("/")[-1]) 
        if os.path.exists(save_path):
            continue
        html = await get_html(url, "#all_schedule") # id in which our table is
        with open(save_path, "w+") as f: # create a new file with the same path name, save_path
            f.write(html)

In [24]:
# loop through and scrape the seasons
for season in SEASONS:
    await scrape_season(season)

Timeout error on https://www.basketball-reference.com/leagues/NBA_2016_games.html
2015-16 NBA Schedule | Basketball-Reference.com
2015-16 NBA Schedule | Basketball-Reference.com
2015-16 NBA Schedule | Basketball-Reference.com
2015-16 NBA Schedule | Basketball-Reference.com
Timeout error on https://basketball-reference.com/leagues/NBA_2016_games-january.html
2015-16 NBA Schedule | Basketball-Reference.com
2015-16 NBA Schedule | Basketball-Reference.com
2015-16 NBA Schedule | Basketball-Reference.com
2015-16 NBA Schedule | Basketball-Reference.com
2015-16 NBA Schedule | Basketball-Reference.com
2015-16 NBA Schedule | Basketball-Reference.com
2016-17 NBA Schedule | Basketball-Reference.com
2016-17 NBA Schedule | Basketball-Reference.com
2016-17 NBA Schedule | Basketball-Reference.com
2016-17 NBA Schedule | Basketball-Reference.com
2016-17 NBA Schedule | Basketball-Reference.com
2016-17 NBA Schedule | Basketball-Reference.com
2016-17 NBA Schedule | Basketball-Reference.com
Timeout error on

In [25]:
await scrape_season(2024) # I deleted april

Timeout error on https://www.basketball-reference.com/leagues/NBA_2024_games.html
2023-24 NBA Schedule | Basketball-Reference.com
2023-24 NBA Schedule | Basketball-Reference.com
2023-24 NBA Schedule | Basketball-Reference.com
2023-24 NBA Schedule | Basketball-Reference.com
Timeout error on https://basketball-reference.com/leagues/NBA_2024_games-january.html
Timeout error on https://basketball-reference.com/leagues/NBA_2024_games-january.html
2023-24 NBA Schedule | Basketball-Reference.com
2023-24 NBA Schedule | Basketball-Reference.com
2023-24 NBA Schedule | Basketball-Reference.com
Timeout error on https://basketball-reference.com/leagues/NBA_2024_games-april.html
2023-24 NBA Schedule | Basketball-Reference.com


In [26]:
standings_files = os.listdir(STANDINGS_DIR)

In [27]:
standings_files

['NBA_2022_games-october.html',
 'NBA_2021_games-june.html',
 'NBA_2020_games-march.html',
 'NBA_2020_games-september.html',
 'NBA_2020_games-january.html',
 'NBA_2020_games-august.html',
 'NBA_2023_games-may.html',
 'NBA_2019_games-april.html',
 'NBA_2023_games-april.html',
 'NBA_2022_games-may.html',
 '.DS_Store',
 'NBA_2019_games-february.html',
 'NBA_2018_games-february.html',
 'NBA_2023_games-january.html',
 'NBA_2016_games-april.html',
 'NBA_2024_games-march.html',
 'NBA_2021_games-march.html',
 'NBA_2018_games-january.html',
 'NBA_2024_games-november.html',
 'NBA_2017_games-february.html',
 'NBA_2024_games-december.html',
 'NBA_2016_games-february.html',
 'NBA_2017_games-october.html',
 'NBA_2018_games-april.html',
 'NBA_2020_games-december.html',
 'NBA_2019_games-october.html',
 'NBA_2020_games-november.html',
 'NBA_2021_games-may.html',
 'NBA_2021_games-december.html',
 'NBA_2022_games-april.html',
 'NBA_2022_games-december.html',
 'NBA_2023_games-november.html',
 'NBA_2024_ga

In [32]:
standings_file = os.path.join(STANDINGS_DIR, standings_files[0])
with open(standings_file, 'r') as f:
    html = f.read()

soup = BeautifulSoup(html)
links = soup.find_all("a")
hrefs = [l.get("href") for l in links] # not sure why, but use l.get("href")
box_scores = [l for l in hrefs if l and "boxscore" in l and ".html" in l] # makes sure the link is not a 'NONE' and is a boxscore

In [33]:
box_scores

['/boxscores/202110190MIL.html',
 '/boxscores/202110190LAL.html',
 '/boxscores/202110200CHO.html',
 '/boxscores/202110200DET.html',
 '/boxscores/202110200NYK.html',
 '/boxscores/202110200TOR.html',
 '/boxscores/202110200MEM.html',
 '/boxscores/202110200MIN.html',
 '/boxscores/202110200NOP.html',
 '/boxscores/202110200SAS.html',
 '/boxscores/202110200UTA.html',
 '/boxscores/202110200POR.html',
 '/boxscores/202110200PHO.html',
 '/boxscores/202110210ATL.html',
 '/boxscores/202110210MIA.html',
 '/boxscores/202110210GSW.html',
 '/boxscores/202110220ORL.html',
 '/boxscores/202110220WAS.html',
 '/boxscores/202110220CLE.html',
 '/boxscores/202110220BOS.html',
 '/boxscores/202110220PHI.html',
 '/boxscores/202110220HOU.html',
 '/boxscores/202110220CHI.html',
 '/boxscores/202110220DEN.html',
 '/boxscores/202110220LAL.html',
 '/boxscores/202110220SAC.html',
 '/boxscores/202110230CLE.html',
 '/boxscores/202110230IND.html',
 '/boxscores/202110230TOR.html',
 '/boxscores/202110230CHI.html',
 '/boxscor