## Simplified Data Cleaning Pipeline for FBRef (Valencia)

In [3]:
from pathlib import Path
import sys
sys.path.append("..")

from src.preprocessing.clean_fbref_data import (
    load_fbref_season_data,
    drop_matches_column,
    find_players_in_multiple_seasons,
    add_age_from_latest_season,
    save_to_interim
)

# Club whose raw FBref exports are cleaned in this cell.
team_name = "Valencia CF"
raw_base = Path("..") / "data" / "raw" / team_name / "fbref"

# Read each season's tables, oldest season first.
tables_by_season = {
    season: load_fbref_season_data(season, raw_base)
    for season in ("2223", "2324", "2425")
}
data_2223 = tables_by_season["2223"]
data_2324 = tables_by_season["2324"]
data_2425 = tables_by_season["2425"]

# Flatten the per-season dicts into one mapping (a later season wins on a
# key clash), then clean noisy columns through the project helper.
all_data = {}
for season_tables in tables_by_season.values():
    all_data.update(season_tables)
all_data = drop_matches_column(all_data)

# Continuity tracking (optional): pair every season's player-stats table
# with its season code, find the players present in several seasons, and
# attach ages taken from the 2024-25 table.
season_stats = [
    (tables[f"df_player_stats_{season}"], season)
    for season, tables in tables_by_season.items()
]
multi_season_players = add_age_from_latest_season(
    find_players_in_multiple_seasons(season_stats),
    data_2425["df_player_stats_2425"],
)

# Persist the cleaned tables.
save_to_interim(all_data, team_name)

Saved: ../data/interim/Valencia CF/fbref/df_player_stats_2223.csv
Saved: ../data/interim/Valencia CF/fbref/df_player_shooting_2223.csv
Saved: ../data/interim/Valencia CF/fbref/df_player_passing_2223.csv
Saved: ../data/interim/Valencia CF/fbref/df_player_passing_types_2223.csv
Saved: ../data/interim/Valencia CF/fbref/df_player_gca_2223.csv
Saved: ../data/interim/Valencia CF/fbref/df_player_defense_2223.csv
Saved: ../data/interim/Valencia CF/fbref/df_player_possession_2223.csv
Saved: ../data/interim/Valencia CF/fbref/df_player_stats_2324.csv
Saved: ../data/interim/Valencia CF/fbref/df_player_shooting_2324.csv
Saved: ../data/interim/Valencia CF/fbref/df_player_passing_2324.csv
Saved: ../data/interim/Valencia CF/fbref/df_player_passing_types_2324.csv
Saved: ../data/interim/Valencia CF/fbref/df_player_gca_2324.csv
Saved: ../data/interim/Valencia CF/fbref/df_player_defense_2324.csv
Saved: ../data/interim/Valencia CF/fbref/df_player_possession_2324.csv
Saved: ../data/interim/Valencia CF/fbref

## FBRef multi-team cleaning pipeline

In [None]:
# ------------------------------------------------------------------------------
# 0) Imports & helper fix
# ------------------------------------------------------------------------------
from pathlib import Path
import sys
sys.path.append("..")

from src.preprocessing.clean_fbref_data import (        # existing helpers
    load_fbref_season_data, drop_matches_column,
    find_players_in_multiple_seasons, save_to_interim,
)

import pandas as pd

# patched helper -----------------------------------------------------------
def add_age_from_latest_season(players_df: pd.DataFrame,
                               latest_df: pd.DataFrame) -> pd.DataFrame:
    """
    Attach an 'Age' column to players_df, looked up by player name.

    Ages are read from latest_df. When a player appears on several rows of
    latest_df only the first row is used, so the lookup index is unique.
    Players missing from latest_df receive a missing value.

    players_df is modified in place and the same object is returned.
    """
    # One row per player: the first occurrence wins.
    one_row_per_player = latest_df.drop_duplicates(subset="Player", keep="first")

    # Series of ages indexed by player name, used as the lookup table.
    age_by_player = one_row_per_player.set_index("Player")["Age"]

    looked_up_ages = players_df["Player"].map(age_by_player)
    players_df["Age"] = looked_up_ages
    return players_df

# ------------------------------------------------------------------------------
# 1) Parameters
# ------------------------------------------------------------------------------
# Season codes to look for, oldest first.
SEASONS = ["2223", "2324", "2425"]
# Teams whose raw FBref exports should be cleaned.
TEAMS = [
    "Real Madrid CF", "FC Barcelona", "Sevilla FC", "Atlético Madrid",
    "Athletic Club", "Villarreal CF", "Real Sociedad", "Real Betis",
    "Valencia CF",
]

# Multi-season player table for every processed team, keyed by team name.
# Without this the table was rebuilt on each iteration and then overwritten,
# so only the last team's result was still available after the loop.
multi_season_by_team = {}

# ------------------------------------------------------------------------------
# 2) Cleaning loop
# ------------------------------------------------------------------------------
for team in TEAMS:
    print(f"\n🔧  Cleaning {team}")
    raw_base = Path("..", "data", "raw", team, "fbref")

    # --- load only seasons that actually exist on disk ------------------------
    # The player-stats CSV is used as the marker for "this season is present".
    seasons_data = {}
    for s in SEASONS:
        stats_csv = raw_base / f"df_player_stats_{s}.csv"
        if stats_csv.exists():
            seasons_data[s] = load_fbref_season_data(s, raw_base)
        else:
            print(f"   {team} {s}: raw CSV not found – skipped")

    if not seasons_data:
        print(f"   No seasons found for {team}; skipping team.")
        continue

    # --- merge & clean --------------------------------------------------------
    # Flatten the per-season dicts into one mapping of table name -> table.
    all_data = {k: v for d in seasons_data.values() for k, v in d.items()}
    all_data = drop_matches_column(all_data)

    # --- identify multi-season players + add age ------------------------------
    season_stats = [
        (seasons_data[s][f"df_player_stats_{s}"], s) for s in seasons_data
    ]
    multi = find_players_in_multiple_seasons(season_stats)

    # Season codes are equal-length digit strings, so the string maximum is
    # the newest season available for this team.
    latest = max(seasons_data)
    latest_stats = seasons_data[latest][f"df_player_stats_{latest}"]
    multi = add_age_from_latest_season(multi, latest_stats)

    # Keep this team's result instead of letting the next iteration discard it.
    multi_season_by_team[team] = multi

    # --- save everything to interim ------------------------------------------
    save_to_interim(all_data, team)


🔧  Cleaning Real Madrid CF
Saved: ../data/interim/Real Madrid CF/fbref/df_player_stats_2223.csv
Saved: ../data/interim/Real Madrid CF/fbref/df_player_shooting_2223.csv
Saved: ../data/interim/Real Madrid CF/fbref/df_player_passing_2223.csv
Saved: ../data/interim/Real Madrid CF/fbref/df_player_passing_types_2223.csv
Saved: ../data/interim/Real Madrid CF/fbref/df_player_gca_2223.csv
Saved: ../data/interim/Real Madrid CF/fbref/df_player_defense_2223.csv
Saved: ../data/interim/Real Madrid CF/fbref/df_player_possession_2223.csv
Saved: ../data/interim/Real Madrid CF/fbref/df_player_stats_2324.csv
Saved: ../data/interim/Real Madrid CF/fbref/df_player_shooting_2324.csv
Saved: ../data/interim/Real Madrid CF/fbref/df_player_passing_2324.csv
Saved: ../data/interim/Real Madrid CF/fbref/df_player_passing_types_2324.csv
Saved: ../data/interim/Real Madrid CF/fbref/df_player_gca_2324.csv
Saved: ../data/interim/Real Madrid CF/fbref/df_player_defense_2324.csv
Saved: ../data/interim/Real Madrid CF/fbref/