In [1]:
import pandas as pd
import time, random, requests
from pathlib import Path
import inspect, sys

In [None]:
# Throttled replacement for pandas.read_html: pausing between requests makes it
# less likely that the remote site blocks us.
#
# Keep a handle on the genuine pandas parser so the wrapper can delegate to it.
# When this cell is re-run, pd.read_html is already the wrapper; the parser it
# stored on itself is reused, because capturing the wrapper here would make it
# call itself until RecursionError.
_orig_read_html = getattr(pd.read_html, "_original_read_html", pd.read_html)

# Page source keyed by URL, so reading several tables from one page costs a
# single download. Re-running this cell (or _html_cache.clear()) empties it.
_html_cache = {}


def read_html_polite(url, *args, **kwargs):
    """
    Drop-in replacement for pandas.read_html that downloads http(s) pages itself.

    For an http(s) URL the page is
    • fetched with browser-like headers (one retry, 30 s later, on HTTP 429),
    • followed by a random 3-6 s pause (polite),
    • remembered, so later calls for the same URL reuse the download,
    • then handed to pandas for parsing.
    Any other input (local path, file-like object) goes straight to pandas.

    Parameters
    ----------
    url : first argument of pandas.read_html (URL, path or file-like object).
    *args, **kwargs : forwarded unchanged to pandas.read_html.

    Returns
    -------
    Whatever pandas.read_html returns (a list of DataFrames).

    Raises
    ------
    requests.HTTPError
        If the response status is still an error after the optional retry.
    """
    # Anything that is not an http(s) URL is left entirely to pandas.
    if not (isinstance(url, str) and url.startswith(("http://", "https://"))):
        return _orig_read_html(url, *args, **kwargs)

    # Local import keeps this cell independent of the notebook's import cell.
    from io import StringIO

    html = _html_cache.get(url)
    if html is None:
        headers = {
            "User-Agent": (
                "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
                "AppleWebKit/537.36 (KHTML, like Gecko) "
                "Chrome/124.0 Safari/537.36"
            )
        }
        resp = requests.get(url, headers=headers, timeout=20)

        # quick & simple retry on 429 (Too Many Requests)
        if resp.status_code == 429:
            time.sleep(30)               # wait 30 s, then one more try
            resp = requests.get(url, headers=headers, timeout=20)

        resp.raise_for_status()          # still bad? → raise the HTTP error
        html = resp.text
        _html_cache[url] = html

        # polite random delay, needed only after real network traffic
        time.sleep(random.uniform(3, 6))

    # pandas >= 2.1 deprecates literal HTML strings as input; a file-like
    # object is accepted by old and new versions alike.
    return _orig_read_html(StringIO(html), *args, **kwargs)


# Remember the real parser on the wrapper (read back at the top of this cell on
# a re-run), then monkey-patch pandas so every subsequent call uses the polite
# version.
read_html_polite._original_read_html = _orig_read_html
pd.read_html = read_html_polite


In [None]:
# Season 2024-2025
# The season is spelled out in the URL, like for the older seasons: the bare
# .../Valencia-Stats address shows whichever season is current, so the *_2425
# frames (and the CSV files written from them) would silently hold another
# season's numbers once a new season starts.
url_2425 = 'https://fbref.com/en/squads/dcc91a7b/2024-2025/Valencia-Stats'
df_player_stats_2425 = pd.read_html(url_2425, attrs={"id": "stats_standard_12"})[0]
df_player_shooting_2425 = pd.read_html(url_2425, attrs={"id": "stats_shooting_12"})[0]
df_player_passing_2425 = pd.read_html(url_2425, attrs={"id": "stats_passing_12"})[0]
df_player_passing_types_2425 = pd.read_html(url_2425, attrs={"id": "stats_passing_types_12"})[0]
df_player_gca_2425 = pd.read_html(url_2425, attrs={"id": "stats_gca_12"})[0]
df_player_defense_2425 = pd.read_html(url_2425, attrs={"id": "stats_defense_12"})[0]
df_player_possession_2425 = pd.read_html(url_2425, attrs={"id": "stats_possession_12"})[0]

In [None]:
# Season 2023-2024: seven stat tables, each picked out of the same squad page
# by the id attribute of its <table> element.
url_2324 = 'https://fbref.com/en/squads/dcc91a7b/2023-2024/Valencia-Stats'
df_player_stats_2324 = pd.read_html(url_2324, attrs={"id": "stats_standard_12"})[0]
df_player_shooting_2324 = pd.read_html(url_2324, attrs={"id": "stats_shooting_12"})[0]
df_player_passing_2324 = pd.read_html(url_2324, attrs={"id": "stats_passing_12"})[0]
df_player_passing_types_2324 = pd.read_html(url_2324, attrs={"id": "stats_passing_types_12"})[0]
df_player_gca_2324 = pd.read_html(url_2324, attrs={"id": "stats_gca_12"})[0]
df_player_defense_2324 = pd.read_html(url_2324, attrs={"id": "stats_defense_12"})[0]
df_player_possession_2324 = pd.read_html(url_2324, attrs={"id": "stats_possession_12"})[0]

In [None]:
# Season 2022-2023: seven stat tables, each picked out of the same squad page
# by the id attribute of its <table> element.
url_2223 = 'https://fbref.com/en/squads/dcc91a7b/2022-2023/Valencia-Stats'
df_player_stats_2223 = pd.read_html(url_2223, attrs={"id": "stats_standard_12"})[0]
df_player_shooting_2223 = pd.read_html(url_2223, attrs={"id": "stats_shooting_12"})[0]
df_player_passing_2223 = pd.read_html(url_2223, attrs={"id": "stats_passing_12"})[0]
df_player_passing_types_2223 = pd.read_html(url_2223, attrs={"id": "stats_passing_types_12"})[0]
df_player_gca_2223 = pd.read_html(url_2223, attrs={"id": "stats_gca_12"})[0]
df_player_defense_2223 = pd.read_html(url_2223, attrs={"id": "stats_defense_12"})[0]
df_player_possession_2223 = pd.read_html(url_2223, attrs={"id": "stats_possession_12"})[0]

In [None]:
# Lift pandas' display limits so frames are shown without truncated
# columns or rows.
pd.options.display.max_columns = None
pd.options.display.max_rows = None

In [None]:
# Preview the first five rows of the 2024-2025 standard-stats table.
df_player_stats_2425.head(n=5)

In [None]:
# Step 1: output folder data/raw, created together with missing parents.
RAW_DIR = Path("data") / "raw"
RAW_DIR.mkdir(parents=True, exist_ok=True)

# Step 2: collect every notebook-level DataFrame whose variable name starts
# with "df_". A comprehension is used on purpose: its loop variables stay out
# of globals(), so the namespace is not modified while it is being iterated.
frames = {
    var_name: value
    for var_name, value in globals().items()
    if var_name.startswith("df_") and isinstance(value, pd.DataFrame)
}

# Step 3: one CSV per DataFrame, named after the variable, without the index.
for name, df in frames.items():
    filepath = RAW_DIR.joinpath(f"{name}.csv")
    df.to_csv(filepath, index=False)
    print(f"✔️  {filepath}")