In [1]:
import polars as pl
import requests
from bs4 import BeautifulSoup

# Raise the displayed-column limit so very wide tables are shown without truncation.
pl.Config.set_tbl_cols(100000)

polars.config.Config

In [2]:
# URL template for the FanGraphs major-league batting leaderboard (stats=bat).
# The {placeholders} are filled in with str.format() by get_table_data.
url = "https://www.fangraphs.com/leaders/major-league?pos={pos}&stats=bat&lg={league}&qual={qual}&type={stat_type}&season={end_season}&season1={start_season}&ind=0&startdate={start_date}&enddate={end_date}&month=0&team=0&pagenum=1&pageitems=2000000000"

# Placeholder values:
# pos options: c, 1b, 2b, 3b, ss, lf, cf, rf, dh, of, p, all
# qual options: y, n
# league options: "", "al", "nl"
# start_date, end_date: strings in the format "yyyy-mm-dd"
# start_season, end_season: fill the `season1` and `season` query parameters respectively
# stat_type options: 8 (dashboard), 0 (standard), 1 (advanced), 2 (batted ball), 3 (win_probability), 6 (value), 23 (+stats), 24 (statcast), 48 (violations)

In [3]:
def _coerce_cell_text(text):
    """Turn raw cell text into None, an int, a float, or the cleaned string."""
    cleaned = text.strip().replace("%", "")
    if cleaned == "":
        return None
    try:
        # Values containing a decimal point become floats, everything else ints.
        return float(cleaned) if "." in cleaned else int(cleaned)
    except ValueError:
        # Not numeric (e.g. "$18.1"): keep the cleaned text as-is.
        return cleaned


def get_table_data(
    stat_type,
    pos,
    league,
    start_date,
    end_date,
    qual,
    start_season,
    end_season,
    timeout=60,
):
    """Download one FanGraphs leaderboard page and parse its table into a DataFrame.

    The module-level ``url`` template is filled with the arguments, the page is
    fetched with ``requests`` and parsed with BeautifulSoup, and every ``<tr>``
    of the table body becomes one row keyed by the cells' ``data-col-id``.

    Args:
        stat_type: Value for the ``type`` query parameter (e.g. 8 for dashboard).
        pos: Position filter (e.g. "all", "c", "1b").
        league: League filter: "", "al" or "nl".
        start_date: Start date string in the format "yyyy-mm-dd".
        end_date: End date string in the format "yyyy-mm-dd".
        qual: Qualified-only flag: "y" or "n".
        start_season: Value for the ``season1`` query parameter.
        end_season: Value for the ``season`` query parameter.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        A polars DataFrame with one row per table row, or ``None`` when the
        leaderboard table cannot be found in the returned page.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
    """
    response = requests.get(
        url.format(
            pos=pos,
            league=league,
            stat_type=stat_type,
            start_season=start_season,
            end_season=end_season,
            qual=qual,
            start_date=start_date,
            end_date=end_date,
        ),
        timeout=timeout,
    )
    # Fail loudly on HTTP errors instead of trying to parse an error page.
    response.raise_for_status()
    cont = response.content.decode("utf-8")

    # Parse the HTML content with BeautifulSoup
    soup = BeautifulSoup(cont, "html.parser")

    # Find the main table using the provided CSS selector.
    # NOTE(review): the "__hcmbm" class suffix looks build-generated and may
    # change when the site is redeployed - confirm if this starts returning None.
    main_table = soup.select_one(
        "#content > div.leaders-major_leaders-major__table__hcmbm > div.fg-data-grid.table-type > div.table-wrapper-outer > div > div.table-scroll > table"
    )
    if main_table is None:
        # Callers check for None to detect "no data returned".
        return None

    # Extract column names from the data-col-id attribute of the <th> elements,
    # excluding "divider" columns.
    thead = main_table.find("thead")
    header_cells = thead.find_all("th") if thead is not None else []
    headers = [
        th["data-col-id"]
        for th in header_cells
        if "data-col-id" in th.attrs and th["data-col-id"] != "divider"
    ]

    # Collect the body rows; a table without <tbody> simply yields no rows.
    tbody = main_table.find("tbody")
    body_rows = tbody.find_all("tr") if tbody is not None else []

    data = []
    for row in body_rows:
        # Start every row with all header columns set to None so that cells
        # missing from a row still produce a value for that column.
        row_data = {header: None for header in headers}
        for cell in row.find_all("td"):
            col_id = cell.get("data-col-id")
            if not col_id or col_id == "divider":
                continue

            # Prefer the text of a nested link, then of a nested span; these
            # are stored verbatim. Plain cells are coerced to numbers.
            link = cell.find("a")
            if link is not None:
                row_data[col_id] = link.text
                continue
            span = cell.find("span")
            if span is not None:
                row_data[col_id] = span.text
                continue
            row_data[col_id] = _coerce_cell_text(cell.text)
        data.append(row_data)

    # Create a Polars DataFrame from the extracted data; scanning all rows
    # (infer_schema_length=None) lets columns that start with nulls get a dtype.
    df = pl.DataFrame(data, infer_schema_length=None)
    return df

In [4]:
# Define the available stat types as an Enum
from enum import Enum


class FangraphsBattingStatType(Enum):
    """FanGraphs leaderboard stat-table identifiers.

    Each member's value is the number substituted into the ``type`` query
    parameter of the leaderboard URL (passed as ``stat_type`` to
    ``get_table_data``). Members iterate in definition order, so ``DASHBOARD``
    is the first table produced by any loop over this enum.
    """

    # Tables listed in the stat options comment next to the URL template.
    DASHBOARD = 8
    STANDARD = 0
    ADVANCED = 1
    BATTED_BALL = 2
    WIN_PROBABILITY = 3
    VALUE = 6
    PLUS_STATS = 23
    STATCAST = 24
    VIOLATIONS = 48
    # Pitch-type / plate-discipline tables.
    # NOTE(review): the SPORTS_INFO_* and STATCAST_* prefixes presumably name
    # the data source of each table - confirm against the site.
    SPORTS_INFO_PITCH_TYPE = 4
    SPORTS_INFO_PITCH_VALUE = 7
    SPORTS_INFO_PLATE_DISCIPLINE = 5
    STATCAST_PITCH_TYPE = 9
    STATCAST_VELO = 10
    STATCAST_H_MOVEMENT = 11
    STATCAST_V_MOVEMENT = 12
    STATCAST_PITCH_TYPE_VALUE = 13
    STATCAST_PITCH_TYPE_VALUE_PER_100 = 14
    STATCAST_PLATE_DISCIPLINE = 15

In [5]:
# Map every enum member to the numeric code used in the leaderboard URL,
# keeping the enum's definition order.
stat_types = {member: member.value for member in FangraphsBattingStatType}

In [6]:
# Fetch one leaderboard table per stat type and keep the ones that came back.
# NOTE(review): the saved output for this 2024-04-01..2024-05-01 request shows
# full-season totals (e.g. 152 G), so the date range may not be taking effect -
# confirm which query parameters enable a custom date range.
df_list = []
for stat_type, type_code in stat_types.items():
    print(f"Fetching data for {stat_type}...")
    stat_df = get_table_data(
        stat_type=type_code,
        pos="all",
        league="",
        start_date="2024-04-01",
        end_date="2024-05-01",
        qual="y",
        start_season="",
        end_season="",
    )
    if stat_df is None:
        print(f"Warning: No data returned for {stat_type}")
        continue
    print(f"Data fetched for {stat_type}")
    df_list.append(stat_df)

Fetching data for FangraphsBattingStatType.DASHBOARD...
Data fetched for FangraphsBattingStatType.DASHBOARD
Fetching data for FangraphsBattingStatType.STANDARD...
Data fetched for FangraphsBattingStatType.STANDARD
Fetching data for FangraphsBattingStatType.ADVANCED...
Data fetched for FangraphsBattingStatType.ADVANCED
Fetching data for FangraphsBattingStatType.BATTED_BALL...
Data fetched for FangraphsBattingStatType.BATTED_BALL
Fetching data for FangraphsBattingStatType.WIN_PROBABILITY...
Data fetched for FangraphsBattingStatType.WIN_PROBABILITY
Fetching data for FangraphsBattingStatType.VALUE...
Data fetched for FangraphsBattingStatType.VALUE
Fetching data for FangraphsBattingStatType.PLUS_STATS...
Data fetched for FangraphsBattingStatType.PLUS_STATS
Fetching data for FangraphsBattingStatType.STATCAST...
Data fetched for FangraphsBattingStatType.STATCAST
Fetching data for FangraphsBattingStatType.VIOLATIONS...
Data fetched for FangraphsBattingStatType.VIOLATIONS
Fetching data for Fang

In [7]:
from polars import selectors as cs

# Start from the first fetched table and fold every remaining table into it.
df = df_list[0]
len_cols = len(df.columns)  # running total of columns across all tables, before de-duplication
for other in df_list[1:]:
    len_cols += len(other.columns)
    joined = df.join(other, on="Name", how="inner")
    # Columns that already exist on the left side come back with a "_right"
    # suffix; drop those duplicates and keep the left-hand versions.
    # NOTE(review): joining on "Name" alone assumes player names are unique
    # within each table - confirm, or join on a stable player id if available.
    df = joined.select(
        ~cs.ends_with("_right"),
    )
df.head()

Name,Team,G,PA,HR,R,RBI,SB,BB%,K%,ISO,BABIP,AVG,OBP,SLG,wOBA,xwOBA,wRC+,BaseRunning,Offense,Defense,WAR,AB,H,1B,2B,3B,BB,IBB,SO,HBP,SF,SH,GDP,CS,BB/K,OPS,Spd,UBR,GDPRuns,XBR,wBsR,wRC,wRAA,GB/FB,LD%,GB%,FB%,IFFB%,HR/FB,IFH,IFH%,BUH,BUH%,Pull%,Cent%,Oppo%,Soft%,Med%,Hard%,WPA,-WPA,+WPA,RE24,REW,pLI,phLI,PH,WPA/LI,Clutch,Batting,Fielding,Positional,wLeague,Replacement,RAR,Dollars,BB%+,K%+,AVG+,OBP+,SLG+,ISO+,BABIP+,LD%+,GB%+,FB%+,Pull%+,Cent%+,Oppo%+,Events,EV,maxEV,LA,Barrels,Barrel%,HardHit,HardHit%,xAVG,xSLG,PPTV,CPTV,DGV,DSV,BPTV,BTV,rPPTV,rCPTV,rDGV,rDSV,rBPTV,rBTV,EBV,ESV,rFTeamV,rBTeamV,rTV,FB%1,FBv,SL%,SLv,CT%,CTv,CB%,CBv,CH%,CHv,SF%,SFv,KN%,KNv,XX%,wFB,wSL,wCT,wCB,wCH,wSF,wKN,wFB/C,wSL/C,wCT/C,wCB/C,wCH/C,wSF/C,wKN/C,O-Swing%,Z-Swing%,Swing%,O-Contact%,Z-Contact%,Contact%,Zone%,F-Strike%,SwStr%,CStr%,C+SwStr%,pfxFA%,pfxFT%,pfxFC%,pfxFS%,pfxFO%,pfxSI%,pfxSL%,pfxCU%,pfxKC%,pfxEP%,pfxCH%,pfxSC%,pfxKN%,pfxUN%,pfxvFA,pfxvFT,pfxvFC,pfxvFS,pfxvFO,pfxvSI,pfxvSL,pfxvCU,pfxvKC,pfxvEP,pfxvCH,pfxvSC,pfxvKN,pfxFA-X,pfxFT-X,pfxFC-X,pfxFS-X,pfxFO-X,pfxSI-X,pfxSL-X,pfxCU-X,pfxKC-X,pfxEP-X,pfxCH-X,pfxSC-X,pfxKN-X,pfxFA-Z,pfxFT-Z,pfxFC-Z,pfxFS-Z,pfxFO-Z,pfxSI-Z,pfxSL-Z,pfxCU-Z,pfxKC-Z,pfxEP-Z,pfxCH-Z,pfxSC-Z,pfxKN-Z,pfxwFA,pfxwFT,pfxwFC,pfxwFS,pfxwFO,pfxwSI,pfxwSL,pfxwCU,pfxwKC,pfxwEP,pfxwCH,pfxwSC,pfxwKN,pfxwFA/C,pfxwFT/C,pfxwFC/C,pfxwFS/C,pfxwFO/C,pfxwSI/C,pfxwSL/C,pfxwCU/C,pfxwKC/C,pfxwEP/C,pfxwCH/C,pfxwSC/C,pfxwKN/C,pfxO-Swing%,pfxZ-Swing%,pfxSwing%,pfxO-Contact%,pfxZ-Contact%,pfxContact%,pfxZone%,pfxPace
str,str,i64,i64,i64,i64,i64,i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,i64,f64,f64,f64,f64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,f64,f64,f64,f64,f64,f64,f64,i64,f64,f64,f64,f64,f64,f64,f64,i64,f64,i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,i64,f64,f64,f64,f64,f64,f64,f64,f64,str,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,f64,f64,f64,i64,f64,i64,f64,f64,f64,i64,i64,i64,i64,i64,i64,f64,f64,f64,f64,f64,f64,i64,i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,null,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,null,f64,null,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,null,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,null,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,null,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,null,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""Josh Naylor""","""CLE""",152,633,31,84,108,6,9.2,16.6,0.213,0.246,0.243,0.32,0.456,0.332,0.335,118,-2.5,10.7,-9.9,2.3,563,137,79,27,0,58,7,105,7,4,0,17,2,0.55,0.776,3.2,-2.3,-2.1,-1.8,-0.7,85,10.9,1.29,17.1,46.8,36.1,10.8,18.6,5,2.3,0,0.0,41.6,37.4,21.0,16.0,52.6,31.4,0.52,-11.62,12.14,14.6,1.46,1.0,0.41,1,1.28,-0.76,13.2,1.9,-11.8,2.0,19.1,21.9,"""$18.1""",113,73,102,103,116,138,86,88,113,93,102,107,87,463,89.9,110.9,11.0,39,8.4,189,40.8,0.257,0.443,5,0,0,0,0,0,0.4,,,,,,5,0,0.4,0.0,0.4,39.7,93.5,20.1,84.1,8.5,89.3,10.1,80.8,16.4,86.2,4.9,87.5,0.4,80.3,1.2,5.4,1.1,-0.4,1.7,1.1,-1.7,-0.3,0.6,0.25,-0.21,0.75,0.3,-1.53,-3.19,35.5,73.7,50.3,69.8,86.7,79.4,38.8,59.9,10.4,12.6,23.0,27.2,,8.6,4.6,,12.2,19.1,8.1,3.0,0.3,16.7,,0.4,,94.1,,89.3,87.2,,93.1,84.2,80.0,82.6,51.7,86.2,,80.2,-2.3,,0.4,-6.0,,0.3,0.8,2.9,3.1,1.3,-7.7,,-2.9,9.1,,4.3,2.2,,4.8,1.0,-4.5,-5.5,7.6,2.7,,-2.8,10.2,,-0.9,-1.1,,-3.7,1.3,-1.5,2.5,-0.4,0.2,,-0.3,1.65,,-0.44,-1.05,,-1.33,0.3,-0.8,3.75,-7.41,0.06,,-3.22,32.1,70.8,50.3,64.7,86.7,79.3,47.1,20.1
"""Christian Walker""","""ARI""",130,552,26,72,84,2,10.0,24.1,0.217,0.287,0.251,0.335,0.468,0.343,0.351,119,-3.2,9.4,1.3,3.0,479,120,68,26,0,55,4,133,10,8,0,12,2,0.41,0.803,2.2,1.2,-0.2,-1.8,-1.4,79,14.8,0.81,15.3,37.9,46.9,12.7,15.7,6,4.5,0,0.0,42.1,35.3,22.6,14.4,46.9,38.7,0.14,-9.34,9.48,6.37,0.67,0.96,1.45,3,0.95,-0.81,12.7,10.8,-9.5,1.7,16.7,29.1,"""$24.1""",121,107,101,106,116,138,97,77,88,126,104,100,93,354,91.3,113.9,16.7,47,13.3,170,48.0,0.246,0.457,3,0,0,0,0,0,0.1,,,,,,3,0,0.1,0.0,0.1,49.0,94.2,24.6,85.1,5.7,89.9,8.3,80.7,9.7,85.2,2.1,86.2,0.6,77.0,0.5,3.0,-3.0,0.5,4.1,6.0,-1.3,1.0,0.27,-0.53,0.4,2.15,2.66,-2.7,7.43,28.3,76.6,48.1,58.0,85.7,76.1,41.1,57.2,11.5,13.4,24.8,33.1,,5.9,1.9,0.1,15.9,24.7,6.4,1.6,,9.9,,0.6,,94.5,,89.9,86.6,83.6,93.5,85.0,79.9,82.0,,85.1,,77.1,-1.1,,1.0,-5.2,2.9,-4.7,2.5,1.5,-0.1,,2.6,,-4.7,9.1,,4.8,3.1,4.7,4.9,1.2,-4.3,-3.7,,3.8,,-2.0,2.4,,1.3,-1.3,0.1,-0.2,-2.5,2.3,0.8,,5.8,,1.1,0.31,,0.98,-3.1,6.39,-0.06,-0.44,1.52,2.28,,2.56,,7.6,24.2,70.4,48.0,50.0,84.7,76.2,51.6,19.9
"""Anthony Santander""","""BAL""",155,665,44,91,102,2,8.7,19.4,0.271,0.225,0.235,0.308,0.506,0.345,0.324,129,-2.7,19.5,-9.6,3.3,595,140,69,25,2,58,3,129,7,5,0,9,0,0.45,0.814,3.2,-1.2,0.7,-2.1,-0.6,97,18.7,0.56,14.4,30.8,54.8,16.7,17.1,4,2.8,0,0.0,44.8,31.6,23.6,18.9,46.1,35.0,1.98,-11.11,13.08,13.17,1.29,0.92,4.37,1,2.77,-0.63,22.2,-1.2,-8.5,2.1,20.1,32.1,"""$26.5""",107,86,98,100,128,175,79,74,74,140,110,90,98,471,89.8,114.4,22.7,55,11.7,195,41.4,0.228,0.445,4,0,0,0,0,0,0.1,,,,,,4,0,0.1,0.0,0.1,49.6,94.2,14.3,84.8,11.3,89.9,9.3,79.8,11.8,86.1,3.7,85.9,,,0.5,-4.4,3.0,5.7,3.6,7.3,3.1,,-0.33,0.77,1.85,1.4,2.25,3.04,,37.5,68.8,49.7,76.5,87.1,82.2,39.2,61.4,8.8,15.6,24.4,39.0,,10.6,3.3,,10.7,14.6,7.7,2.0,,12.2,,,,94.2,,90.0,85.8,,93.8,85.0,79.4,81.1,,86.0,,,-2.0,,0.5,-5.7,,-3.0,2.2,2.6,3.8,,-3.2,,,9.3,,4.5,2.3,,4.6,1.1,-5.8,-6.3,,3.2,,,-5.2,,5.2,3.6,,0.4,2.4,3.3,0.5,,6.2,,,-0.49,,1.79,3.99,,0.12,0.6,1.56,0.94,,1.88,,,35.1,66.1,49.7,72.9,87.7,82.2,47.3,19.7
"""Ryan Mountcastle""","""BAL""",124,507,13,54,63,3,5.3,22.5,0.154,0.327,0.271,0.308,0.425,0.316,0.332,108,-0.9,4.0,-6.5,1.5,473,128,83,30,2,27,0,114,1,6,0,13,0,0.24,0.733,3.6,-4.2,-1.5,-0.6,-0.3,61,2.2,1.31,21.6,44.4,34.0,8.1,10.5,11,6.8,0,0.0,27.9,38.6,33.4,14.0,52.6,33.4,0.29,-7.59,7.87,1.63,0.21,0.86,1.46,11,-0.06,0.39,4.8,2.6,-9.1,1.6,15.3,14.4,"""$11.9""",66,100,113,100,108,100,114,111,107,87,68,110,139,365,90.8,113.5,10.4,32,8.8,165,45.2,0.272,0.445,3,0,0,0,0,0,0.2,,,,,,3,0,0.2,0.0,0.2,48.9,93.9,26.2,84.3,8.1,89.5,6.1,80.5,8.5,84.7,2.3,85.9,,,0.2,3.9,-1.8,-3.3,3.5,2.3,0.2,,0.42,-0.36,-2.19,3.02,1.45,0.52,,40.5,77.9,56.1,65.3,85.7,77.1,41.7,63.9,12.8,12.4,25.3,31.5,,7.6,2.2,,17.3,26.5,5.7,0.7,,8.5,,0.1,,94.0,,89.6,86.1,,93.6,84.4,79.8,84.2,,84.5,,58.6,-0.7,,0.8,-4.6,,-5.6,3.1,2.7,3.1,,1.8,,6.4,9.4,,4.7,1.9,,4.7,1.0,-4.7,-5.5,,3.3,,2.6,3.3,,-2.7,-0.3,,-3.9,-5.3,5.1,-0.6,,2.9,,-0.1,0.55,,-1.84,-0.77,,-1.17,-1.06,4.74,-4.31,,1.78,,-5.48,36.9,75.1,56.1,59.0,85.7,76.9,50.3,19.5
"""Cal Raleigh""","""SEA""",153,628,34,73,100,6,11.1,28.0,0.216,0.251,0.22,0.312,0.436,0.323,0.346,117,-2.6,9.7,21.8,5.4,546,120,70,16,0,70,4,176,6,6,0,7,1,0.4,0.748,2.9,-2.3,1.5,-2.2,-0.3,80,6.3,0.6,17.0,31.1,51.9,12.8,17.4,8,6.8,0,0.0,48.9,29.3,21.8,16.5,44.1,39.4,1.46,-10.59,12.04,18.24,1.92,1.03,1.88,7,0.64,0.77,12.3,14.3,7.6,1.9,19.0,52.5,"""$43.3""",137,124,92,101,110,140,88,87,75,133,120,83,91,376,91.0,113.9,21.2,58,15.4,181,48.1,0.233,0.476,3,0,0,0,0,0,0.1,,,,,,3,0,0.1,0.0,0.1,41.1,94.2,17.3,84.5,8.2,89.6,10.5,81.1,18.0,85.8,5.1,87.6,,,0.7,-0.1,3.8,2.5,-1.3,-0.5,-0.2,,-0.01,0.88,1.21,-0.49,-0.1,-0.13,,36.1,76.1,52.2,58.0,81.7,72.0,40.4,63.5,14.6,11.8,26.4,31.7,,8.1,4.5,,9.1,17.5,7.9,2.7,0.1,18.3,0.0,,,94.4,,89.9,86.8,,93.8,84.5,80.8,81.8,60.5,85.9,85.5,,-2.3,,0.6,-6.9,,-3.5,2.2,3.0,3.4,-6.6,-2.8,-7.2,,9.3,,4.9,1.9,,4.4,1.1,-5.3,-5.2,8.3,3.5,0.7,,-0.7,,-0.7,-0.5,,0.8,8.4,-1.7,-1.6,0.5,-0.4,0.0,,-0.09,,-0.37,-0.41,,0.37,1.91,-0.85,-2.37,15.71,-0.08,-3.94,,33.0,71.4,52.2,51.1,81.7,72.0,50.0,19.4


In [9]:
# (rows, columns) of the joined table
df.shape

(129, 244)