In [96]:
from enum import Enum

In [97]:
class BREFTeams(Enum):
    ANGELS = "ANA"
    DIAMONDBACKS = "ARI"
    BRAVES = "ATL"
    ORIOLES = "BAL"
    RED_SOX = "BOS"
    CUBS = "CHC"
    WHITE_SOX = "CHW"
    REDS = "CIN"
    GUARDIANS = "CLE"
    ROCKIES = "COL"
    TIGERS = "DET"
    MARLINS = "FLA"
    ASTROS = "HOU"
    ROYALS = "KCR"
    DODGERS = "LAD"
    BREWERS = "MIL"
    TWINS = "MIN"
    METS = "NYM"
    YANKEES = "NYY"
    ATHLETICS = "OAK"
    PHILLIES = "PHI"
    PIRATES = "PIT"
    PADRES = "SDP"
    MARINERS = "SEA"
    GIANTS = "SFG"
    CARDINALS = "STL"
    RAYS = "TBD"
    RANGERS = "TEX"
    BLUE_JAYS = "TOR"
    NATIONALS = "WSN"

    @classmethod
    def show_options(cls):
        return "\n".join([f"{team.name}: {team.value}" for team in cls])

In [None]:
import nest_asyncio

nest_asyncio.apply()  # patch asyncio to allow nested loops in notebooks

In [99]:
BREF_TEAM_RECORD_URL = "https://www.baseball-reference.com/teams/{team_code}/"

In [100]:
import asyncio

from bs4 import BeautifulSoup
from playwright.async_api import async_playwright

In [101]:
async def fetch_baseball_page(url: str) -> str:
    """
    Fetches the full HTML of a Baseball Reference page using Playwright
    (bypasses Cloudflare JavaScript challenge).
    """
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        page = await browser.new_page()
        await page.goto(url)
        # Wait until network is idle (all JS/XHRs done)
        await page.wait_for_load_state("networkidle")
        html = await page.content()
        await browser.close()
        return html

In [102]:
url = "https://www.baseball-reference.com/teams/NYY/"

In [103]:
html = asyncio.run(fetch_baseball_page(url))

TimeoutError: Timeout 30000ms exceeded.

In [None]:
soup = BeautifulSoup(html, "html.parser")

In [None]:
print("Page Title:", soup.title.string)

Page Title: New York Yankees Team History & Encyclopedia | Baseball-Reference.com


In [None]:
franch_history_table = soup.find("table", {"id": "franchise_years"})

In [None]:
franch_history_table

<table class="sortable stats_table now_sortable sticky_table eq1 re1 le1" data-cols-to-freeze=",1" id="franchise_years">
<caption>Franchise History</caption>
<colgroup><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/></colgroup>
<thead>
<tr>
<th aria-label="Year" class="poptip sort_default_asc show_partial_when_sorting left" data-stat="year_ID" data-tip="A Star indicates an all-star that season.&lt;br&gt;A Ring indicates the player appeared in WS for winning team." scope="col">Year</th>
<th aria-label="Tm" class="poptip sort_default_asc left" data-stat="team_name" scope="col">Tm</th>
<th aria-label="Lg" class="poptip sort_default_asc center" data-stat="lg_ID" data-tip="&lt;strong&gt;League&lt;/strong&gt;&lt;br&gt; &lt;strong&gt;AL&lt;/strong&gt; - American League (1901-present)&lt;br&gt; &lt;strong&gt;NL&lt;/strong&gt; - National League (1876-present)&lt;br&gt; &lt;strong&gt;AA&lt;/strong&gt; - American Associ

In [None]:
def _extract_table(table):
    trs = table.tbody.find_all("tr")
    row_data = {}
    for tr in trs:
        if tr.has_attr("class") and "thead" in tr["class"]:
            continue
        tds = tr.find_all("th")
        tds.extend(tr.find_all("td"))
        if len(tds) == 0:
            continue
        for td in tds:
            data_stat = td.attrs["data-stat"]
            if data_stat not in row_data:
                row_data[data_stat] = []
            if td.find("a"):
                row_data[data_stat].append(td.find("a").text)
            elif td.find("span"):
                row_data[data_stat].append(td.find("span").string)
            elif td.find("strong"):
                row_data[data_stat].append(td.find("strong").string)
            else:
                row_data[data_stat].append(td.string)
    return row_data


In [115]:
import polars as pl

In [116]:
df = pl.DataFrame(_extract_table(franch_history_table))

In [117]:
df

year_ID,team_name,lg_ID,G,W,L,ties,win_loss_perc,win_loss_perc_pythag,finish,games_back,playoffs,R,RA,attendance,age_bat,age_pit,batters_used,pitchers_used,war_leader,managers
str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str
"""2025""","""New York Yankees""","""AL East""","""132""","""72""","""60""","""0""",""".545""",""".580""","""3rd of 5""","""4.5""",,"""681""","""571""","""2,859,427""","""28.5""","""30.7""","""52""","""33""","""A.Judge""","""A.Boone"""
"""2024""","""New York Yankees""","""AL East""","""162""","""94""","""68""","""0""",""".580""",""".590""","""1st of 5""","""--""","""Lost WS (4-1)""","""815""","""668""","""3,309,838""","""28.0""","""30.1""","""54""","""33""","""A.Judge""","""A.Boone"""
"""2023""","""New York Yankees""","""AL East""","""162""","""82""","""80""","""0""",""".506""",""".483""","""4th of 5""","""19.0""",,"""673""","""698""","""3,269,016""","""28.5""","""29.1""","""54""","""32""","""G.Cole""","""A.Boone"""
"""2022""","""New York Yankees""","""AL East""","""162""","""99""","""63""","""0""",""".611""",""".656""","""1st of 5""","""--""","""Lost ALCS (4-0)""","""807""","""567""","""3,136,207""","""30.3""","""29.2""","""54""","""33""","""A.Judge""","""A.Boone"""
"""2021""","""New York Yankees""","""AL East""","""162""","""92""","""70""","""0""",""".568""",""".528""","""2nd of 5""","""8.0""","""Lost ALWC (1-0)""","""711""","""669""","""1,959,854""","""29.3""","""29.3""","""59""","""30""","""A.Judge""","""A.Boone"""
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""1907""","""New York Highlanders""","""AL""","""152""","""70""","""78""","""4""",""".473""",""".455""","""5th of 8""","""21.0""",,"""605""","""667""","""350,020""","""28.1""","""29.3""","""33""","""17""","""J.Chesbro""","""C.Griffith"""
"""1906""","""New York Highlanders""","""AL""","""155""","""90""","""61""","""4""",""".596""",""".575""","""2nd of 8""","""3.0""",,"""640""","""543""","""434,700""","""28.2""","""29.4""","""26""","""11""","""A.Orth""","""C.Griffith"""
"""1905""","""New York Highlanders""","""AL""","""152""","""71""","""78""","""3""",""".477""",""".473""","""6th of 8""","""21.5""",,"""586""","""621""","""309,100""","""29.2""","""29.0""","""35""","""11""","""A.Orth""","""C.Griffith"""
"""1904""","""New York Highlanders""","""AL""","""155""","""92""","""59""","""4""",""".609""",""".558""","""2nd of 8""","""1.5""",,"""598""","""526""","""438,919""","""29.3""","""29.1""","""26""","""9""","""J.Chesbro""","""C.Griffith"""


In [118]:
df = df.with_columns(
    [
        pl.col(
            [
                "G",
                "W",
                "L",
                "ties",
                "R",
                "RA",
                "batters_used",
                "pitchers_used",
                "year_ID",
            ]
        ).cast(pl.Int16),
        pl.col(
            [
                "win_loss_perc",
                "win_loss_perc_pythag",
                "age_bat",
                "age_pit",
            ]
        ).cast(pl.Float32),
    ]
)
df.head()

year_ID,team_name,lg_ID,G,W,L,ties,win_loss_perc,win_loss_perc_pythag,finish,games_back,playoffs,R,RA,attendance,age_bat,age_pit,batters_used,pitchers_used,war_leader,managers
i16,str,str,i16,i16,i16,i16,f32,f32,str,str,str,i16,i16,str,f32,f32,i16,i16,str,str
2025,"""New York Yankees""","""AL East""",132,72,60,0,0.545,0.58,"""3rd of 5""","""4.5""",,681,571,"""2,859,427""",28.5,30.700001,52,33,"""A.Judge""","""A.Boone"""
2024,"""New York Yankees""","""AL East""",162,94,68,0,0.58,0.59,"""1st of 5""","""--""","""Lost WS (4-1)""",815,668,"""3,309,838""",28.0,30.1,54,33,"""A.Judge""","""A.Boone"""
2023,"""New York Yankees""","""AL East""",162,82,80,0,0.506,0.483,"""4th of 5""","""19.0""",,673,698,"""3,269,016""",28.5,29.1,54,32,"""G.Cole""","""A.Boone"""
2022,"""New York Yankees""","""AL East""",162,99,63,0,0.611,0.656,"""1st of 5""","""--""","""Lost ALCS (4-0)""",807,567,"""3,136,207""",30.299999,29.200001,54,33,"""A.Judge""","""A.Boone"""
2021,"""New York Yankees""","""AL East""",162,92,70,0,0.568,0.528,"""2nd of 5""","""8.0""","""Lost ALWC (1-0)""",711,669,"""1,959,854""",29.299999,29.299999,59,30,"""A.Judge""","""A.Boone"""


In [119]:
df = df.with_columns(pl.col("games_back").str.replace("--", "0").cast(pl.Float32))
df.head()

year_ID,team_name,lg_ID,G,W,L,ties,win_loss_perc,win_loss_perc_pythag,finish,games_back,playoffs,R,RA,attendance,age_bat,age_pit,batters_used,pitchers_used,war_leader,managers
i16,str,str,i16,i16,i16,i16,f32,f32,str,f32,str,i16,i16,str,f32,f32,i16,i16,str,str
2025,"""New York Yankees""","""AL East""",132,72,60,0,0.545,0.58,"""3rd of 5""",4.5,,681,571,"""2,859,427""",28.5,30.700001,52,33,"""A.Judge""","""A.Boone"""
2024,"""New York Yankees""","""AL East""",162,94,68,0,0.58,0.59,"""1st of 5""",0.0,"""Lost WS (4-1)""",815,668,"""3,309,838""",28.0,30.1,54,33,"""A.Judge""","""A.Boone"""
2023,"""New York Yankees""","""AL East""",162,82,80,0,0.506,0.483,"""4th of 5""",19.0,,673,698,"""3,269,016""",28.5,29.1,54,32,"""G.Cole""","""A.Boone"""
2022,"""New York Yankees""","""AL East""",162,99,63,0,0.611,0.656,"""1st of 5""",0.0,"""Lost ALCS (4-0)""",807,567,"""3,136,207""",30.299999,29.200001,54,33,"""A.Judge""","""A.Boone"""
2021,"""New York Yankees""","""AL East""",162,92,70,0,0.568,0.528,"""2nd of 5""",8.0,"""Lost ALWC (1-0)""",711,669,"""1,959,854""",29.299999,29.299999,59,30,"""A.Judge""","""A.Boone"""
