In [None]:
import requests
from bs4 import BeautifulSoup, Comment
import pandas as pd
import time
from io import StringIO

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                  "AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/120.0.0.0 Safari/537.36",
    "Accept-Language": "en-US,en;q=0.9",
    "Referer": "https://fbref.com/"
}

url = "https://fbref.com/en/comps/9/passing/Premier-League-Stats"

session = requests.Session()
session.headers.update(headers)

In [7]:
resp = session.get(url, timeout=15)

In [8]:
if "Cloudflare" in resp.text or "Checking your browser" in resp.text:
    raise RuntimeError("❌ Blocked by Cloudflare — not receiving real HTML")

In [9]:
def scraping_fbref(url, table_id, retries=3, delay=2):

    for attempt in range(1, retries + 1):

        resp = session.get(url, timeout=15)

        if "Checking your browser" in resp.text:
            raise RuntimeError("Blocked by Cloudflare")

        soup = BeautifulSoup(resp.text, "html.parser")

        table = None
        comments = soup.find_all(string=lambda t: isinstance(t, Comment))

        for c in comments:
            comment_soup = BeautifulSoup(c, "html.parser")
            t = comment_soup.find("table", id=table_id)
            if t:
                table = t
                break

        if table is None:
            table = soup.find("table", id=table_id)

        if table is None:
            print(f"[Attempt {attempt}] Could not find {table_id}, retrying...")
            time.sleep(delay)
            continue

        df = pd.read_html(StringIO(str(table)))[0]
        df.columns = ["_".join(col).strip() for col in df.columns]

        rk_col = [c for c in df.columns if "Rk" in c][0]
        df = df[df[rk_col].astype(str).str.isdigit()]
        df[rk_col] = df[rk_col].astype(int)

        df.columns = [
            col.split("_")[-1] if col.startswith("Unnamed") else col
            for col in df.columns
        ]

        return df

    raise ValueError(f"❌ Table '{table_id}' not found after {retries} attempts.")


In [None]:
# Pass the base ID instead of the full dynamic ID
df_fixtures = scraping_fbref(
    url="https://fbref.com/en/comps/9/passing/Premier-League-Stats", 
    table_id="stats_passing"
)

[Attempt 1] Could not find sched_2024-2025_9_1, retrying...
[Attempt 2] Could not find sched_2024-2025_9_1, retrying...
[Attempt 3] Could not find sched_2024-2025_9_1, retrying...


ValueError: ❌ Table 'sched_2024-2025_9_1' not found after 3 attempts.