In [None]:
import polars as pl
import requests
from bs4 import BeautifulSoup

# Raise the limit on how many columns polars prints, so the very wide
# leaderboard frames built below are displayed without column elision.
pl.Config.set_tbl_cols(100000)

In [None]:
# Leaderboard URL template; the {placeholders} are filled in with str.format().
# Adjacent string literals are concatenated, one query parameter per line.
url = (
    "https://www.fangraphs.com/leaders/major-league"
    "?pos={pos}"
    "&stats=bat"
    "&lg={league}"
    "&qual={qual}"
    "&type={stat_type}"
    "&season={end_season}"
    "&season1={start_season}"
    "&ind=0"
    "&startdate={start_date}"
    "&enddate={end_date}"
    "&month=0"
    "&team=0"
    "&pagenum=1"
    "&pageitems=2000000000"
)

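# Values accepted by the placeholders in the URL template above:
# pos options: c, 1b, 2b, 3b, ss, lf, cf, rf, dh, of, p, all
# qual options: y, n
# league options: "", "al", "nl"
# start_date and end_date are strings in the format "yyyy-mm-dd"
# stat_type options: 8 (dashboard), 0 (standard), 1 (advanced), 2 (batted ball), 3 (win probability), 6 (value), 23 (+stats), 24 (statcast), 48 (violations)
# (the FangraphsBattingStatType enum defined later in this notebook lists further stat_type codes)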

In [None]:
def get_table_data(
    stat_type, pos, league, start_date, end_date, qual, start_season, end_season
):
    """Download one FanGraphs leaderboard page and return its table as a DataFrame.

    The arguments are substituted into the module-level ``url`` template, the
    page is fetched with ``requests`` and the leaderboard ``<table>`` is parsed
    cell by cell.

    Parameters
    ----------
    stat_type
        Value substituted for ``type=`` in the URL.
    pos
        Value substituted for ``pos=``.
    league
        Value substituted for ``lg=``.
    start_date, end_date
        Values substituted for ``startdate=`` / ``enddate=``.
    qual
        Value substituted for ``qual=``.
    start_season, end_season
        Values substituted for ``season1=`` / ``season=``.

    Returns
    -------
    pl.DataFrame or None
        One row per ``<tr>`` of the table body, with columns named after the
        ``data-col-id`` attributes of the header cells ("divider" columns are
        skipped). ``None`` is returned when the page does not contain the
        expected table, header or body, so callers can skip that stat type.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    requests.Timeout
        If the server does not respond within the timeout.
    """
    response = requests.get(
        url.format(
            pos=pos,
            league=league,
            stat_type=stat_type,
            start_season=start_season,
            end_season=end_season,
            qual=qual,
            start_date=start_date,
            end_date=end_date,
        ),
        # Without a timeout a stalled connection would hang the notebook forever.
        timeout=30,
    )
    # Fail loudly on an error page instead of trying to parse it as a leaderboard.
    response.raise_for_status()
    cont = response.content.decode("utf-8")

    soup = BeautifulSoup(cont, "html.parser")

    # select_one() returns None when the selector matches nothing (for example
    # after a site redesign); report "no data" rather than raising AttributeError.
    main_table = soup.select_one(
        "#content > div.leaders-major_leaders-major__table__hcmbm > div.fg-data-grid.table-type > div.table-wrapper-outer > div > div.table-scroll > table"
    )
    if main_table is None:
        return None

    thead = main_table.find("thead")
    tbody = main_table.find("tbody")
    if thead is None or tbody is None:
        return None

    # Column names come from the data-col-id attribute of each <th>;
    # "divider" columns are visual separators and are skipped.
    headers = [
        th["data-col-id"]
        for th in thead.find_all("th")
        if "data-col-id" in th.attrs and th["data-col-id"] != "divider"
    ]

    data = []
    for row in tbody.find_all("tr"):
        # Pre-fill every known column with None so all rows share the same keys.
        row_data = dict.fromkeys(headers)
        for cell in row.find_all("td"):
            col_id = cell.get("data-col-id")
            if col_id and col_id != "divider":
                row_data[col_id] = _parse_cell_value(cell)
        data.append(row_data)

    # infer_schema_length=None makes polars scan every row when inferring dtypes.
    return pl.DataFrame(data, infer_schema_length=None)


def _parse_cell_value(cell):
    """Convert one ``<td>`` element into a str, int, float or None."""
    # Cells wrapping a link or a span keep that element's text verbatim.
    link = cell.find("a")
    if link is not None:
        return link.text
    span = cell.find("span")
    if span is not None:
        return span.text

    # Plain cells: drop percent signs, then try to read the text as a number.
    text = cell.text.strip().replace("%", "")
    if text == "":
        return None
    try:
        return float(text) if "." in text else int(text)
    except ValueError:
        # Not numeric: keep the cleaned text.
        return text

In [None]:
# Define the available stat types as an Enum
from enum import Enum


class FangraphsBattingStatType(Enum):
    """Numeric codes for the FanGraphs batting leaderboard stat views.

    Each member's value is what this notebook substitutes for ``type=`` in the
    leaderboard URL template (via ``stat_types`` and ``get_table_data``).
    Iterating the enum follows definition order, which is also the order in
    which the tables are fetched and joined later in the notebook.
    """

    # Codes that are also listed in the comment under the URL template.
    DASHBOARD = 8
    STANDARD = 0
    ADVANCED = 1
    BATTED_BALL = 2
    WIN_PROBABILITY = 3
    VALUE = 6
    PLUS_STATS = 23
    STATCAST = 24
    VIOLATIONS = 48
    # "SPORTS_INFO_*" views.
    SPORTS_INFO_PITCH_TYPE = 4
    SPORTS_INFO_PITCH_VALUE = 7
    SPORTS_INFO_PLATE_DISCIPLINE = 5
    # "STATCAST_*" views.
    STATCAST_PITCH_TYPE = 9
    STATCAST_VELO = 10
    STATCAST_H_MOVEMENT = 11
    STATCAST_V_MOVEMENT = 12
    STATCAST_PITCH_TYPE_VALUE = 13
    STATCAST_PITCH_TYPE_VALUE_PER_100 = 14
    STATCAST_PLATE_DISCIPLINE = 15

In [None]:
# Map each stat-type enum member to the numeric code used in the URL's `type=` parameter.
stat_types = {member: member.value for member in FangraphsBattingStatType}

In [None]:
# Fetch one leaderboard table per stat type; only successful fetches are kept.
df_list = []
for stat_type, type_code in stat_types.items():
    print(f"Fetching data for {stat_type}...")
    df = get_table_data(
        type_code, "all", "", "2024-04-01", "2024-05-01", "y", "", ""
    )
    if df is None:
        print(f"Warning: No data returned for {stat_type}")
        continue
    print(f"Data fetched for {stat_type}")
    df_list.append(df)

In [None]:
from polars import selectors as cs

# Fold every fetched table into one wide frame, matching rows on "Name".
df = df_list[0]
len_cols = len(df.columns)  # running total of columns across all source tables
for next_df in df_list[1:]:
    len_cols += len(next_df.columns)
    joined = df.join(next_df, on="Name", how="inner")
    # Columns present on both sides come back with a "_right" suffix; drop them.
    df = joined.select(
        ~cs.ends_with("_right"),
    )
df.head()

In [None]:
# Show the (rows, columns) shape of the joined frame.
df.shape