In [1]:
import os

# BeautifulSoup for parsing the html pages
from bs4 import BeautifulSoup

# Playwright drives a real browser so we can load pages from Basketball Reference.
# An async function is a coroutine: while it awaits something slow (e.g. a page load),
# the event loop is free to run other code on the same thread instead of sitting idle.
# This saves time when the async function spends most of its time waiting.
from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeout

import time

In [2]:
# Seasons to scrape: every year from 2016 through 2023 (the range end, 2024, is excluded).
SEASONS = [*range(2016, 2024)]

In [3]:
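# os.path.join joins path components with the separator of the current OS
# (a backslash on Windows, a forward slash on Linux/macOS).
# Uncomment to inspect the paths once the next cell has defined them:
# print(DATA_DIR)
# print(STANDINGS_DIR)
# print(SCORES_DIR)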

In [4]:
# Name of the root data folder. This only builds path strings; nothing is
# created on disk by this cell.
DATA_DIR = "data2"

# Paths of the "standings" and "scores" sub-folders located under DATA_DIR.
STANDINGS_DIR, SCORES_DIR = (
    os.path.join(DATA_DIR, leaf) for leaf in ("standings", "scores")
)

In [5]:
# Async function to get html
async def get_html(url, selector, sleep=5, retries=3):
    """Load ``url`` in Firefox via Playwright and return the inner HTML of ``selector``.

    Before attempt ``i`` (1-based) the coroutine waits ``sleep * i`` seconds, so
    the pause grows with every retry and the server is less likely to ban the
    scraper.

    Args:
        url: Address of the page to load.
        selector: Selector of the element whose inner HTML is returned.
        sleep: Base delay in seconds; attempt ``i`` waits ``sleep * i``.
        retries: Maximum number of attempts.

    Returns:
        The element's inner HTML, or ``None`` when every attempt timed out
        (or when ``retries`` is less than 1 and no attempt is made).
    """
    # Imported locally so this function works without touching the import cell.
    import asyncio

    html = None
    for attempt in range(1, retries + 1):
        # Incremental pause between requests. asyncio.sleep is used instead of
        # time.sleep because a blocking sleep inside a coroutine freezes the
        # whole event loop (including the notebook kernel's) for its duration.
        await asyncio.sleep(sleep * attempt)

        try:
            # Start Playwright; the context manager shuts it down on exit.
            async with async_playwright() as p:
                browser = await p.firefox.launch()
                try:
                    page = await browser.new_page()  # open a new tab
                    await page.goto(url)
                    print(await page.title())
                    html = await page.inner_html(selector)
                finally:
                    # Always release the browser, whether the attempt
                    # succeeded or raised.
                    await browser.close()
        except PlaywrightTimeout:
            # Playwright's TimeoutError is aliased at import time so it does
            # not shadow Python's built-in TimeoutError.
            print(f"Timeout error on {url}")
            continue
        else:
            break
    return html

In [6]:
# the code for scraping the season page

season = 2016

url = f"https://www.basketball-reference.com/leagues/NBA_{season}_games.html"

html = await get_html(url, "#content .filter") # looks for the id named content and then finds the class named filter

2015-16 NBA Schedule | Basketball-Reference.com


In [7]:
# Show the raw HTML fragment that get_html returned for "#content .filter".
html

'\n\n<div class="">\n\t<a href="/leagues/NBA_2016_games-october.html">October</a>\n</div><div class="">\n\t<a href="/leagues/NBA_2016_games-november.html">November</a>\n</div><div class="">\n\t<a href="/leagues/NBA_2016_games-december.html">December</a>\n</div><div class="">\n\t<a href="/leagues/NBA_2016_games-january.html">January</a>\n</div><div class="">\n\t<a href="/leagues/NBA_2016_games-february.html">February</a>\n</div><div class="">\n\t<a href="/leagues/NBA_2016_games-march.html">March</a>\n</div><div class="">\n\t<a href="/leagues/NBA_2016_games-april.html">April</a>\n</div><div class="">\n\t<a href="/leagues/NBA_2016_games-may.html">May</a>\n</div><div class="">\n\t<a href="/leagues/NBA_2016_games-june.html">June</a>\n</div>'