# **EXTRACCIÓN DE LOS DATOS**

In [None]:
!pip install soccerdata



In [None]:
!pip install -q requests_html beautifulsoup4 tqdm pyarrow

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.1/50.1 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.9/84.9 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m82.9/82.9 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m144.2/144.2 kB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for websockets (setup.py) ... [?25l[?25hdone
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
selenium 4.35.0 requires urllib3[socks]<3.0,>=2.5.0, but you have urllib3 1.26.20 which is incompatible.
google-adk 1.13.0 requires websockets<16.0.0,>=15.0.1, but you have websockets 10.4 which is incompatible.
d

In [None]:
!pip install fake_useragent



In [None]:
from google.colab import drive
from itertools import product
from datetime import datetime
from bs4 import BeautifulSoup
from tqdm import tqdm
from fake_useragent import UserAgent

import pandas as pd
import numpy as np
import soccerdata as sd

import io
import os
import requests
import unicodedata
import re
import time

In [None]:
drive.mount('/content/drive')

Mounted at /content/drive


# 1. football-data.co.uk

In [None]:
# ╔════════════════════════════════════════════════════════╗
# ║  FOOTBALL-DATA.CO.UK  •  La Liga  •  MATCHES           ║
# ╚════════════════════════════════════════════════════════╝

# Root of the project folder inside the mounted Google Drive.
DRIVE_BASE = '/content/drive/MyDrive/TFM'
# Folder for raw football-data files (created below).
RAW_DIR  = f"{DRIVE_BASE}/data/raw/football-data"
# Folder holding the processed parquet datasets.
PROC_DIR = f"{DRIVE_BASE}/data/processed"
# Master parquet that this cell updates in place.
PARQUET_PATH = f"{PROC_DIR}/football-data.co.uk_2005_2025.parquet"

# Make sure both folders exist before anything is read or written.
os.makedirs(RAW_DIR,  exist_ok=True)
os.makedirs(PROC_DIR, exist_ok=True)

# Division codes to download; each one becomes "<code>.csv" in the download URL.
DIVISIONS = ["SP1"]
# Odds columns whose value already stored in the master file takes precedence
# over a freshly downloaded one (the new value is only used where the stored
# one is missing).
FROZEN = ["B365H", "B365D", "B365A"]


def current_season_code(today=None):
    """Return the four-digit season code ("YYZZ") that contains *today*.

    The code is the two-digit start year followed by the two-digit end year,
    e.g. "2526". From July onwards the season starts in the current year;
    before July it started in the previous year.

    Args:
        today: Date-like object exposing ``year`` and ``month``. Defaults to
            the current local date and time.

    Returns:
        The season code as a four-character string.
    """
    from datetime import datetime

    reference = datetime.now() if today is None else today
    two_digit_year = reference.year % 100
    if reference.month >= 7:
        first = two_digit_year
    else:
        # Still in the season that began last year (wraps 00 -> 99).
        first = (two_digit_year - 1) % 100
    second = (first + 1) % 100
    return "{:02d}{:02d}".format(first, second)

def season_codes(first_start=5, last_code=None):
    """List consecutive season codes from ``first_start`` up to ``last_code``.

    Each code is the two-digit start year followed by the two-digit end year
    ("0506", "0607", ...). Two-digit years wrap around after 99.

    Args:
        first_start: Two-digit start year (0-99) of the first season.
        last_code: Last season code to include. Defaults to the code returned
            by ``current_season_code()``.

    Returns:
        The season codes in chronological order, ending with ``last_code``.

    Raises:
        ValueError: If ``first_start`` is outside 0-99, or if ``last_code`` is
            not reached within one full cycle of two-digit years (for
            example because it is malformed).
    """
    if not 0 <= first_start <= 99:
        raise ValueError(f"first_start must be within 0-99, got {first_start!r}")
    if last_code is None:
        last_code = current_season_code()

    codes = []
    year = first_start
    # There are only 100 distinct two-digit seasons, so a bounded loop is
    # enough; an unreachable last_code fails loudly instead of spinning forever.
    for _ in range(100):
        code = f"{year:02d}{(year + 1) % 100:02d}"
        codes.append(code)
        if code == last_code:
            return codes
        year = (year + 1) % 100
    raise ValueError(
        f"last_code {last_code!r} is not a valid season code reachable "
        f"from start year {first_start!r}"
    )

def fetch_fd_csv(season:str, div:str="SP1") -> pd.DataFrame:
    """Download one season/division CSV from football-data.co.uk.

    Args:
        season: Season code used in the URL path, e.g. "2526".
        div: Division code used as the CSV file name (default "SP1").

    Returns:
        The CSV content parsed into a DataFrame.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    request_headers = {"User-Agent": "Mozilla/5.0", "Cache-Control": "no-cache"}
    csv_url = "https://www.football-data.co.uk/mmz4281" + f"/{season}/{div}.csv"
    response = requests.get(csv_url, headers=request_headers, timeout=30)
    response.raise_for_status()
    # Parse from the raw bytes of the response body.
    return pd.read_csv(io.BytesIO(response.content))

def norm_str(s):
    """Normalise a Series to trimmed, upper-case text with single spaces.

    Values are cast to ``str`` first, then stripped, upper-cased, and every
    run of whitespace is collapsed into one space.
    """
    as_text = s.astype(str)
    trimmed = as_text.str.strip()
    shouted = trimmed.str.upper()
    return shouted.str.replace(r"\s+", " ", regex=True)

def make_temp_key(df: pd.DataFrame) -> pd.Series:
    """Build a temporary in-memory key used to pair matches.

    The key is ``Div|Date|HomeTeam|AwayTeam`` with every part normalised by
    ``norm_str``. It is NOT meant to be saved to disk.

    A key column that is absent from ``df`` contributes an empty string. The
    input frame is left untouched (no columns are added to it).

    Args:
        df: Frame with (ideally) the columns Div, Date, HomeTeam, AwayTeam.

    Returns:
        A Series aligned with ``df.index`` holding one key per row.
    """
    parts = []
    for col in ("Div", "Date", "HomeTeam", "AwayTeam"):
        if col in df.columns:
            values = df[col]
        else:
            # Stand-in for a missing column, kept local so the caller's
            # DataFrame is not modified as a side effect.
            values = pd.Series("", index=df.index, dtype=object)
        parts.append(norm_str(values))
    div, date, home, away = parts
    return div + "|" + date + "|" + home + "|" + away


# Incremental refresh of the master parquet:
#   1. load the master file and key every match,
#   2. download every season CSV and force it into the master column layout,
#   3. keep already-stored FROZEN odds, take everything else from the download,
#   4. write the result back and print a before/after summary.

# The master file must already exist: it defines the column layout that every
# downloaded season is reindexed to.
if not os.path.isfile(PARQUET_PATH):
    raise FileNotFoundError(
        f"No existe el parquet maestro en {PARQUET_PATH}. "
        "Crea primero el archivo inicial con tus 191 columnas."
    )

master = pd.read_parquet(PARQUET_PATH)
cols_master = list(master.columns)
master_key = make_temp_key(master)
master["_TMP_KEY_"] = master_key

# Snapshot used only for the summary printed at the end.
pre_rows = len(master)
pre_keys = set(master["_TMP_KEY_"])


seasons = season_codes(first_start=5)

live_list = []
for season, div in product(seasons, DIVISIONS):
    try:
        df_season = fetch_fd_csv(season, div)
    except requests.RequestException as exc:
        # One unavailable season (e.g. a CSV not published yet, or a transient
        # network error) must not abort the whole refresh. Rows of that season
        # that are already in the master file are kept further below.
        print(f"Aviso: no se pudo descargar {div} {season} ({exc}); se omite.")
        continue

    # Drop columns unknown to the master file and add the missing ones as NA.
    df_season = df_season.reindex(columns=cols_master, fill_value=pd.NA)

    for c in ["Div","Date","HomeTeam","AwayTeam"]:
        if c in df_season.columns:
            df_season[c] = df_season[c].astype(str)

    df_season["_TMP_KEY_"] = make_temp_key(df_season)
    df_season = df_season.drop_duplicates(subset=["_TMP_KEY_"], keep="last")

    live_list.append(df_season)

if not live_list:
    # Nothing could be downloaded: rewrite the master file unchanged.
    print("No se descargó nada nuevo.")
    if "_TMP_KEY_" in master.columns:
        master = master.drop(columns=["_TMP_KEY_"])
    master.to_parquet(PARQUET_PATH, index=False)
else:
    live = pd.concat(live_list, ignore_index=True)

    keep_cols_live = cols_master + ["_TMP_KEY_"]
    live = live.reindex(columns=keep_cols_live)

    if not master.empty:
        # Bring the stored FROZEN odds next to the downloaded ones ("_OLD").
        prev = master[["_TMP_KEY_"] + [c for c in FROZEN if c in cols_master]].copy()
        merged = live.merge(prev, on="_TMP_KEY_", how="left", suffixes=("", "_OLD"))

        for col in FROZEN:
            if col in cols_master:
                old = f"{col}_OLD"
                if old in merged.columns:
                    # Stored value wins; the downloaded value only fills gaps.
                    merged[col] = merged[old].combine_first(merged[col])

        merged = merged.drop(columns=[c for c in merged.columns if c.endswith("_OLD")])
    else:
        merged = live.copy()

    # Master rows that are absent from the download are carried over as they are.
    to_keep_old = master[~master["_TMP_KEY_"].isin(set(merged["_TMP_KEY_"]))].copy()
    updated = merged.copy()

    combo = pd.concat([to_keep_old, updated], ignore_index=True)

    combo = combo.drop_duplicates(subset=["_TMP_KEY_"], keep="last")

    combo = combo.reindex(columns=cols_master + ["_TMP_KEY_"])

    # The helper key is never written to disk.
    combo = combo.drop(columns=["_TMP_KEY_"])
    combo.to_parquet(PARQUET_PATH, index=False)

    post_rows = len(combo)
    post_keys = set(make_temp_key(combo))

    added   = len(post_keys - pre_keys)
    touched = len(post_keys & pre_keys)

    print(f"Estructura mantenida => {len(cols_master)} columnas (sin columnas nuevas).")
    print(f"Partidos antes: {pre_rows:,}")
    print(f"Partidos ahora: {post_rows:,}")
    print(f"Nuevos añadidos: {added:,}")
    print(f"Coincidentes (posiblemente actualizados): {touched:,}")
    print("(B365H/B365D/B365A preservadas si ya existían; si estaban NaN, se rellenan con el valor nuevo).")

Estructura mantenida => 191 columnas (sin columnas nuevas).
Partidos antes: 7,631
Partidos ahora: 7,631
Nuevos añadidos: 0
Coincidentes (posiblemente actualizados): 7,631
(B365H/B365D/B365A preservadas si ya existían; si estaban NaN, se rellenan con el valor nuevo).


In [None]:
FD_PATH  = "/content/drive/MyDrive/TFM/data/processed/football-data.co.uk_2005_2025.parquet"
fd = pd.read_parquet(FD_PATH)

fd

Unnamed: 0,Div,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,...,BMGMCA,BVCH,BVCD,BVCA,CLCH,CLCD,CLCA,LBCH,LBCD,LBCA
0,SP1,27/08/05,Alaves,Barcelona,0,0,D,0,0,D,...,,,,,,,,,,
1,SP1,27/08/05,Ath Bilbao,Sociedad,3,0,H,0,0,D,...,,,,,,,,,,
2,SP1,27/08/05,Valencia,Betis,1,0,H,0,0,D,...,,,,,,,,,,
3,SP1,28/08/05,Ath Madrid,Zaragoza,0,0,D,0,0,D,...,,,,,,,,,,
4,SP1,28/08/05,Cadiz,Real Madrid,1,2,A,0,1,A,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7626,SP1,30/08/2025,Real Madrid,Mallorca,2,1,H,2,1,H,...,16.00,1.15,7.50,17.00,1.18,7.50,15.00,1.17,7.50,15.00
7627,SP1,31/08/2025,Celta,Villarreal,1,1,D,0,0,D,...,2.28,3.20,3.50,2.20,3.10,3.50,2.25,3.00,3.50,2.25
7628,SP1,31/08/2025,Betis,Ath Bilbao,1,2,A,0,0,D,...,2.43,3.20,3.13,2.38,3.10,3.10,2.40,3.10,3.10,2.40
7629,SP1,31/08/2025,Espanol,Osasuna,1,0,H,0,0,D,...,3.10,2.45,3.10,3.10,2.37,2.87,2.87,2.37,2.87,2.87


# 2. Understat (xG)

In [None]:
# ╔════════════════════════════════════╗
# ║  UNDERSTAT  •  La Liga  •  xG      ║
# ╚════════════════════════════════════╝

# Download Understat team match statistics for La Liga, keep the expected-goals
# columns, and store them as a parquet file.
PROC_DIR = '/content/drive/MyDrive/TFM/data/processed'
os.makedirs(PROC_DIR, exist_ok=True)
SAVE_PATH = "understat_2014_2025.parquet"
save_file = os.path.join(PROC_DIR, SAVE_PATH)

# Extend this list to add future seasons.
UNDER_SEASONS = [1415, 1516, 1617, 1718, 1819, 1920, 2021, 2122, 2223, 2324, 2425, 2526]

us = sd.Understat(leagues="ESP-La Liga", seasons=UNDER_SEASONS)

team_stats = us.read_team_match_stats()
print("Rows team_stats:", len(team_stats))

# Keep only the match id, date, teams and xG, renamed to the column names used
# by the football-data table (Date / HomeTeam / AwayTeam).
xg_df = (
    team_stats[[
        "game_id", "date",
        "home_team", "away_team",
        "home_xg",  "away_xg"
    ]]
    .rename(columns={
        "game_id"   : "match_id",
        "date"      : "Date",
        "home_team" : "HomeTeam",
        "away_team" : "AwayTeam",
        "home_xg"   : "h_xg",
        "away_xg"   : "a_xg"
    })
)
# Drop the time of day so matches can later be joined on the calendar date.
xg_df["Date"] = pd.to_datetime(xg_df["Date"]).dt.date

# index=False: the frame's index is not written to the file.
xg_df.to_parquet(save_file, index=False)
print("Guardado:", save_file)
print("Partidos con xG:", len(xg_df))
xg_df.tail(10)

Rows team_stats: 4211
Guardado: /content/drive/MyDrive/TFM/data/processed/understat_2014_2025.parquet
Partidos con xG: 4211


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,match_id,Date,HomeTeam,AwayTeam,h_xg,a_xg
league,season,game,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
ESP-La Liga,2526,2025-08-29 Elche-Levante,29179,2025-08-29,Elche,Levante,1.44309,0.969663
ESP-La Liga,2526,2025-08-29 Valencia-Getafe,29180,2025-08-29,Valencia,Getafe,1.54336,0.757418
ESP-La Liga,2526,2025-08-30 Alaves-Atletico Madrid,29181,2025-08-30,Alaves,Atletico Madrid,0.855094,0.960547
ESP-La Liga,2526,2025-08-30 Girona-Sevilla,29183,2025-08-30,Girona,Sevilla,2.00908,1.62032
ESP-La Liga,2526,2025-08-30 Real Madrid-Mallorca,29184,2025-08-30,Real Madrid,Mallorca,1.84116,0.549872
ESP-La Liga,2526,2025-08-30 Real Oviedo-Real Sociedad,29182,2025-08-30,Real Oviedo,Real Sociedad,0.45706,0.92221
ESP-La Liga,2526,2025-08-31 Celta Vigo-Villarreal,29185,2025-08-31,Celta Vigo,Villarreal,0.876041,0.96978
ESP-La Liga,2526,2025-08-31 Espanyol-Osasuna,29187,2025-08-31,Espanyol,Osasuna,1.36886,1.38167
ESP-La Liga,2526,2025-08-31 Rayo Vallecano-Barcelona,29188,2025-08-31,Rayo Vallecano,Barcelona,1.87124,2.35774
ESP-La Liga,2526,2025-08-31 Real Betis-Athletic Club,29186,2025-08-31,Real Betis,Athletic Club,0.716186,1.11141


In [None]:
# Reload both processed datasets from disk before joining them.
FD_PATH = "/content/drive/MyDrive/TFM/data/processed/football-data.co.uk_2005_2025.parquet"
XG_PATH = "/content/drive/MyDrive/TFM/data/processed/understat_2014_2025.parquet"

fd = pd.read_parquet(FD_PATH)  # football-data.co.uk matches
xg = pd.read_parquet(XG_PATH)  # Understat xG per match

# Row counts of each source, for comparison with the join results.
print("FD partidos:", len(fd))
print("xG partidos:", len(xg))

FD partidos: 7631
xG partidos: 4211


Las tablas que queremos juntar tienen formatos de fecha diferentes, por lo que lo primero que hacemos es unificarlos. Así prevenimos el problema de que el año puede venir con dos o con cuatro dígitos.

In [None]:
# Dates arrive either as dd/mm/yy or as dd/mm/yyyy. Clean the text once so
# both parsing attempts see exactly the same input (previously only the first
# attempt was stripped of surrounding whitespace).
_date_text = fd["Date"].astype(str).str.strip()
fd["Date"] = pd.to_datetime(
    _date_text,
    format="%d/%m/%y", errors="coerce"   # two-digit year; anything else -> NaT
).fillna(
    # Rows the first format rejected are retried with a four-digit year.
    pd.to_datetime(_date_text, format="%d/%m/%Y", errors="coerce")
).dt.date

Además vemos que cada tabla usa nombres diferentes para los equipos.

In [None]:
fd['HomeTeam'].unique()

array(['Alaves', 'Ath Bilbao', 'Valencia', 'Ath Madrid', 'Cadiz', 'Celta',
       'Espanol', 'Mallorca', 'Osasuna', 'Sevilla', 'Betis', 'La Coruna',
       'Real Madrid', 'Barcelona', 'Getafe', 'Malaga', 'Santander',
       'Sociedad', 'Villarreal', 'Zaragoza', 'Recreativo', 'Gimnastic',
       'Levante', 'Murcia', 'Almeria', 'Valladolid', 'Numancia',
       'Sp Gijon', 'Tenerife', 'Xerez', 'Hercules', 'Granada',
       'Vallecano', 'Elche', 'Eibar', 'Cordoba', 'Las Palmas', 'Leganes',
       'Girona', 'Huesca', 'Oviedo'], dtype=object)

In [None]:
xg['HomeTeam'].unique()

<StringArray>
[            'Almeria',             'Granada',              'Malaga',
             'Sevilla',           'Barcelona',          'Celta Vigo',
               'Eibar',             'Levante',      'Rayo Vallecano',
         'Real Madrid',              'Getafe',            'Valencia',
       'Athletic Club',     'Atletico Madrid',             'Cordoba',
            'Espanyol', 'Deportivo La Coruna',               'Elche',
       'Real Sociedad',          'Villarreal',      'Sporting Gijon',
          'Real Betis',          'Las Palmas',             'Osasuna',
              'Alaves',             'Leganes',              'Girona',
     'Real Valladolid',           'SD Huesca',            'Mallorca',
               'Cadiz',         'Real Oviedo']
Length: 32, dtype: string

In [None]:
def norm(s: str) -> str:
    """Return a lower-case, accent-free, whitespace-trimmed version of *s*.

    Characters without an ASCII equivalent are dropped, and every run of dots
    and/or spaces is collapsed into one space.
    """
    decomposed = unicodedata.normalize("NFKD", s)
    # Encoding to ASCII with "ignore" discards the detached accent marks.
    ascii_only = decomposed.encode("ascii","ignore").decode("utf-8")
    collapsed = re.sub(r"[. ]+", " ", ascii_only)
    return collapsed.lower().strip()

# Add normalised team-name columns ("HomeTeam_norm" / "AwayTeam_norm") to both
# tables so they can be compared regardless of case, accents or punctuation.
for col in ["HomeTeam", "AwayTeam"]:
    fd[col+"_norm"] = fd[col].apply(norm)
    xg[col+"_norm"] = xg[col].apply(norm)

# Translation table: normalised Understat name -> normalised football-data name.
# Entries that map a name to itself leave that name unchanged.
alias = {
    # Understat → football-data
    "real oviedo"        : "oviedo",
    "almeria"            : "almeria",
    "granada"            : "granada",
    "malaga"             : "malaga",
    "sevilla"            : "sevilla",
    "barcelona"          : "barcelona",
    "celta vigo"         : "celta",
    "eibar"              : "eibar",
    "levante"            : "levante",
    "rayo vallecano"     : "vallecano",
    "real madrid"        : "real madrid",
    "getafe"             : "getafe",
    "valencia"           : "valencia",
    "athletic club"      : "ath bilbao",
    "atletico madrid"    : "ath madrid",
    "cordoba"            : "cordoba",
    "espanyol"           : "espanol",
    "deportivo la coruna": "la coruna",
    "elche"              : "elche",
    "real sociedad"      : "sociedad",
    "villarreal"         : "villarreal",
    "sporting gijon"     : "sp gijon",
    "real betis"         : "betis",
    "las palmas"         : "las palmas",
    "osasuna"            : "osasuna",
    "alaves"             : "alaves",
    "leganes"            : "leganes",
    "girona"             : "girona",
    "real valladolid"    : "valladolid",
    "sd huesca"          : "huesca",
    "mallorca"           : "mallorca",
    "cadiz"              : "cadiz",
    "zaragoza"           : "zaragoza",
    "recreativo"         : "recreativo",
    "gimnastic"          : "gimnastic",
    "murcia"             : "murcia",
    "numancia"           : "numancia",
    "xerez"              : "xerez",
    "tenerife"           : "tenerife",
    "santander"          : "santander",
    "hercules"           : "hercules"
}
# Only the Understat side is translated; names missing from `alias` stay as
# they are.
xg["HomeTeam_norm"] = xg["HomeTeam_norm"].replace(alias)
xg["AwayTeam_norm"] = xg["AwayTeam_norm"].replace(alias)

Ahora las variables "HomeTeam_norm" y "AwayTeam_norm" de ambos datasets tienen los mismos nombres de equipos.

In [None]:
# Columns that identify a match in both tables.
key = ["Date", "HomeTeam_norm", "AwayTeam_norm"]

# Left join: every football-data match is kept; xG is attached where the date
# and both normalised team names match exactly.
merged = (
    fd.merge(
        xg[key + ["h_xg", "a_xg"]],
        on=key,
        how="left",
        # Raise if the xG side has more than one row for the same key.
        validate="many_to_one"
    )
)

print("Partidos totales:", len(merged))
print("Partidos con xG:", merged["h_xg"].notna().sum())

Partidos totales: 7631
Partidos con xG: 4142


Observamos que todavía faltan partidos por emparejar entre los dos datasets.

In [None]:
# Anti-join: Understat rows whose key has no counterpart among the
# football-data matches.
missing = (
    xg.merge(
        merged[key],
        # indicator=True adds a "_merge" column telling where each row came from.
        on=key, how="left", indicator=True
    )
    .query("_merge == 'left_only'")
    .drop(columns="_merge")
)

print("Understat sin pareja exacta:", len(missing))
display(missing.head(15))

Understat sin pareja exacta: 69


Unnamed: 0,match_id,Date,HomeTeam,AwayTeam,h_xg,a_xg,HomeTeam_norm,AwayTeam_norm
385,1403,2015-08-23,Rayo Vallecano,Valencia,0.693517,2.20399,vallecano,valencia
388,1406,2015-08-24,Levante,Celta Vigo,0.485771,1.80598,levante,celta
389,1407,2015-08-24,Real Betis,Villarreal,1.4954,1.63096,betis,villarreal
393,1412,2015-08-30,Celta Vigo,Rayo Vallecano,1.7022,0.081644,celta,vallecano
395,1413,2015-08-30,Real Madrid,Real Betis,3.14512,0.941159,real madrid,betis
398,1417,2015-08-31,Getafe,Granada,0.801319,1.27622,getafe,granada
399,1418,2015-08-31,Las Palmas,Levante,0.825401,0.068596,las palmas,levante
408,1423,2015-09-13,Real Betis,Real Sociedad,0.75201,0.663712,betis,sociedad
417,1433,2015-09-20,Real Sociedad,Espanyol,1.35321,1.89499,sociedad,espanol
424,1441,2015-09-23,Granada,Real Sociedad,0.393118,2.65392,granada,sociedad


Al analizar por qué fallaban estos partidos, vemos que las fechas de ambas fuentes difieren en uno o dos días. Para resolverlo, se hizo lo siguiente:

In [None]:
from datetime import timedelta

xg_cols = ["h_xg","a_xg"]

def fill_by_shift(df_base: pd.DataFrame, df_xg: pd.DataFrame, shift_days: int,
                  key_cols=None, value_cols=None):
    """Fill missing xG values by matching on a date shifted by ``shift_days``.

    The dates of ``df_xg`` are moved by ``shift_days`` and joined to
    ``df_base`` on the key columns. Rows of ``df_base`` whose first value
    column is still missing, and for which the shifted join found a value,
    receive the shifted values. Values already present are never overwritten.

    Args:
        df_base: Match table that already has the value columns (possibly NaN).
        df_xg: Source table with the key columns and the value columns.
        shift_days: Days added to ``df_xg["Date"]`` before joining; may be
            negative.
        key_cols: Join columns; must include "Date". Defaults to the
            notebook-level ``key`` list.
        value_cols: Columns to fill. Defaults to the notebook-level
            ``xg_cols`` list.

    Returns:
        A new DataFrame with the same columns as ``df_base``.

    Raises:
        pandas.errors.MergeError: If ``df_xg`` has more than one row for the
            same (shifted) key, which would otherwise duplicate matches.
    """
    # Fall back to the notebook globals so existing three-argument calls keep
    # working unchanged.
    key_cols = list(key if key_cols is None else key_cols)
    value_cols = list(xg_cols if value_cols is None else value_cols)

    suffix = f"_sh{shift_days:+d}"
    shifted_names = {c: f"{c}{suffix}" for c in value_cols}

    tmp = df_xg[key_cols + value_cols].copy()
    tmp["Date"] = tmp["Date"] + timedelta(days=shift_days)
    tmp = tmp.rename(columns=shifted_names)

    # Same guard as the exact-date merge: never let one match fan out.
    joined = df_base.merge(tmp, on=key_cols, how="left", validate="many_to_one")

    # A row is fillable when it has no value yet but the shifted join found one.
    probe = value_cols[0]
    mask = joined[probe].isna() & joined[shifted_names[probe]].notna()
    for c in value_cols:
        joined.loc[mask, c] = joined.loc[mask, shifted_names[c]]

    return joined.drop(columns=list(shifted_names.values()))

for d in [1, -1, 2, -2]:
    merged = fill_by_shift(merged, xg, d)

print("Emparejados tras fechas flexibles:", merged["h_xg"].notna().sum())

Emparejados tras fechas flexibles: 4211


In [None]:
# Persist the football-data + xG table for the next stage.
proc_dir = '/content/drive/MyDrive/TFM/data/processed'
os.makedirs(proc_dir, exist_ok=True)

output_path = os.path.join(proc_dir, "fd_xg_2005_2025.parquet")
# index=False: the frame's index is not written to the file.
merged.to_parquet(output_path, index=False)
print(f"Guardado dataset final en: {output_path}")

Guardado dataset final en: /content/drive/MyDrive/TFM/data/processed/fd_xg_2005_2025.parquet


# 3. Rating ClubElo

In [None]:
# ╔════════════════════════════════════════╗
# ║  CLUBELO  •  La Liga 2005-25  •  ELO   ║
# ╚════════════════════════════════════════╝

# Download the full ClubElo rating history of every listed club and store all
# histories, stacked, in a single parquet file.
PROC_DIR = '/content/drive/MyDrive/TFM/data/processed'
os.makedirs(PROC_DIR, exist_ok=True)
SAVE_PATH = os.path.join(PROC_DIR, "clubelo_2005_2025.parquet")

# Club names spelled exactly as ClubElo expects them.
CLUBS = [
    "Real Madrid","Barcelona","Atletico","Bilbao","Sevilla",
    "Valencia","Villarreal","Sociedad","Betis","Osasuna","Espanyol",
    "Getafe","Celta","Mallorca","Las Palmas","Cadiz","Almeria","Granada",
    "Alaves","Levante","Rayo Vallecano","Eibar","Girona","Leganes","Huesca",
    "Valladolid","Elche","Cordoba","Gijon","Depor", "Malaga", "Oviedo",
    "Zaragoza","Xerez","Tenerife","Recreativo","Numancia","Murcia",
    "Tarragona","Santander","Hercules"
]

# Normalise a name after removing its spaces.
# NOTE(review): not called anywhere in this cell; `team_norm` below is built
# with norm() directly.
def norm_elo(s):
    return norm(s.replace(" ",""))

ce = sd.ClubElo()
frames = []

for club in CLUBS:
    try:
        hist = ce.read_team_history(club)
        if hist.empty:
            print("Sin datos", club); continue

        # Turn the history index into a regular column named "Date".
        hist = hist.reset_index().rename(columns={hist.index.name or "index":"Date"})
        # Accept either spelling of the rating column.
        rating_col = "elo" if "elo" in hist.columns else "Elo"
        df = hist[["Date", rating_col]].rename(columns={rating_col:"Elo"})
        df["Team"]   = club
        df["team_norm"] = norm(club)
        frames.append(df)
        print("✓", club, "filas:", len(df))
    except Exception as e:
        # Best effort: report the failure, pause, and move on to the next club
        # (the failed club is not retried).
        print("X", club, "→", e)
        time.sleep(3)

# Stack every club's history and keep only the calendar date.
elo_es = pd.concat(frames, ignore_index=True)
elo_es["Date"] = pd.to_datetime(elo_es["Date"]).dt.date

elo_es.to_parquet(SAVE_PATH, index=False)
print(f"Filas Elo España: {len(elo_es)}")
print(f"Guardado en {SAVE_PATH}")

✓ Real Madrid filas: 5530
✓ Barcelona filas: 5677
✓ Atletico filas: 6512
✓ Bilbao filas: 7000
✓ Sevilla filas: 6739
✓ Valencia filas: 6835
✓ Villarreal filas: 3404
✓ Sociedad filas: 6485
✓ Betis filas: 6244
✓ Osasuna filas: 5149
✓ Espanyol filas: 6860
✓ Getafe filas: 3348
✓ Celta filas: 5462
✓ Mallorca filas: 4587
✓ Las Palmas filas: 4699
✓ Cadiz filas: 2862
✓ Almeria filas: 2728
✓ Granada filas: 2887
✓ Alaves filas: 3311
✓ Levante filas: 3673
✓ Rayo Vallecano filas: 4180
✓ Eibar filas: 2831
✓ Girona filas: 1822
✓ Leganes filas: 2029
✓ Huesca filas: 1407
✓ Valladolid filas: 5622
✓ Elche filas: 4583
✓ Cordoba filas: 2778
✓ Gijon filas: 5249
✓ Depor filas: 4867
✓ Malaga filas: 4219
✓ Oviedo filas: 4102
✓ Zaragoza filas: 6377
✓ Xerez filas: 1517
✓ Tenerife filas: 3881
✓ Recreativo filas: 2523
✓ Numancia filas: 2121
✓ Murcia filas: 2789
✓ Tarragona filas: 1521
✓ Santander filas: 4126
✓ Hercules filas: 2647
Filas Elo España: 171183
Guardado en /content/drive/MyDrive/TFM/data/processed/clube

In [None]:
# Attach to every match the Elo rating of the home and away team that was in
# force on the match date, then save the enriched table.
PROC_DIR = '/content/drive/MyDrive/TFM/data/processed'
os.makedirs(PROC_DIR, exist_ok=True)

df = pd.read_parquet(f"{PROC_DIR}/fd_xg_2005_2025.parquet")
# Both sides need a real datetime column (time set to midnight) for merge_asof.
df["Date"] = pd.to_datetime(df["Date"]).dt.normalize()

elo_es = pd.read_parquet(f"{PROC_DIR}/clubelo_2005_2025.parquet")
elo_es["Date"] = pd.to_datetime(elo_es["Date"]).dt.normalize()

# Translation table: normalised ClubElo name -> normalised football-data name
# (the values used in HomeTeam_norm / AwayTeam_norm).
clubelo_to_fd = {
    'real madrid': 'real madrid',
    # NOTE(review): the download list uses "Oviedo", so this key looks unused;
    # it maps to itself and is harmless either way - confirm.
    'real oviedo'    : 'real oviedo',
    'barcelona': 'barcelona',
    'atletico': 'ath madrid',
    'bilbao': 'ath bilbao',
    'sevilla': 'sevilla',
    'valencia': 'valencia',
    'villarreal': 'villarreal',
    'sociedad': 'sociedad',
    'betis': 'betis',
    'osasuna': 'osasuna',
    'espanyol': 'espanol',
    'getafe': 'getafe',
    'celta': 'celta',
    'mallorca': 'mallorca',
    'las palmas': 'las palmas',
    'cadiz': 'cadiz',
    'almeria': 'almeria',
    'granada': 'granada',
    'alaves': 'alaves',
    'levante': 'levante',
    'rayo vallecano': 'vallecano',
    'eibar': 'eibar',
    'girona': 'girona',
    'leganes': 'leganes',
    'huesca': 'huesca',
    'valladolid': 'valladolid',
    'elche': 'elche',
    'cordoba': 'cordoba',
    'gijon': 'sp gijon',
    'depor': 'la coruna',
    'malaga': 'malaga',
    'zaragoza': 'zaragoza',
    'xerez': 'xerez',
    'tenerife': 'tenerife',
    'recreativo': 'recreativo',
    'numancia': 'numancia',
    'murcia': 'murcia',
    'tarragona': 'gimnastic',
    'santander': 'santander',
    'hercules': 'hercules'
}
elo_es['team_norm'] = elo_es['team_norm'].replace(clubelo_to_fd)

# Two views of the same ratings, named so they line up with the home and away
# columns of the match table. merge_asof needs both inputs sorted by "Date".
elo_home = elo_es.rename(columns={"team_norm":"HomeTeam_norm","Elo":"h_elo"}).sort_values("Date")
elo_away = elo_es.rename(columns={"team_norm":"AwayTeam_norm","Elo":"a_elo"}).sort_values("Date")

# For each match, take the home team's latest rating dated on or before the
# match date (direction="backward", matched per team through `by`).
df = pd.merge_asof(
        df.sort_values("Date"),
        elo_home[["Date","HomeTeam_norm","h_elo"]],
        on="Date", by="HomeTeam_norm", direction="backward"
)

# Same lookup for the away team.
df = pd.merge_asof(
        df.sort_values("Date"),
        elo_away[["Date","AwayTeam_norm","a_elo"]],
        on="Date", by="AwayTeam_norm", direction="backward"
)

# Share of matches that received a rating; below 100 % means some team name
# (or date) found no Elo row.
print("Cobertura h_elo:", df['h_elo'].notna().mean()*100, "%")
print("Cobertura a_elo:", df['a_elo'].notna().mean()*100, "%")

save_path = f"{PROC_DIR}/fd_xg_elo_2005_2025.parquet"
df.to_parquet(save_path, index=False)
print(f"Guardado {save_path}")

Cobertura h_elo: 100.0 %
Cobertura a_elo: 100.0 %
Guardado /content/drive/MyDrive/TFM/data/processed/fd_xg_elo_2005_2025.parquet


# 4. Transfermarkt (Plantilla y Mercado)

## **+ 6 HORAS (Mejor no ejecutar)**

Además, habría que comprobar los códigos de los equipos en la URL de Transfermarkt.

In [None]:
# # ╔════════════════════════════════════════════╗
# # ║  TRANSFERMARKT  •  La Liga 2005-25  •  €€  ║
# # ╚════════════════════════════════════════════╝

# HEADERS = {
#     "User-Agent": UserAgent().random,
#     "Accept-Language": "es-ES,es;q=0.9,en;q=0.8",
#     "Referer": "https://www.transfermarkt.com/"
# }
# BASE = "https://www.transfermarkt.com"
# DELAY = 6 + np.random.uniform(0, 5)

# def fetch_html(url, retries=4, base_delay=7, factor=2):
#     for i in range(retries):
#         try:
#             r = requests.get(url, headers=HEADERS, timeout=40)
#             if r.status_code == 200:
#                 return r.text
#             print(f"⟳  Retry {i+1}/{retries}  status {r.status_code}")
#         except Exception as e:
#             print(f"⟳  Retry {i+1}/{retries}  error: {e}")
#         delay = base_delay * (factor ** i) + np.random.uniform(0, 5)
#         print(f"Sleeping {delay:.1f}s before retry {i+1}")
#         time.sleep(delay)
#     return None

# def parse_euro_value(text):
#     """
#     Devuelve el valor en millones de euros (M€) desde el formato Transfermarkt (.es y .com).
#     """
#     text = text.strip().replace('\xa0', '').replace(' ', '').lower()
#     num_match = re.search(r"([\d.,]+)", text)
#     if not num_match:
#         return None

#     value_str = num_match.group(1)
#     if "." in value_str and "," in value_str:
#         value_str = value_str.replace(".", "").replace(",", ".")
#     elif "." in value_str:
#         value_str = value_str.replace(".", "")
#     elif "," in value_str:
#         value_str = value_str.replace(",", ".")
#     try:
#         value = float(value_str)
#     except:
#         return None

#     if "mill" in text or "million" in text or "mio" in text or re.search(r"\bm\b", text):
#         return value
#     elif value >= 10000:
#         return value / 1000
#     elif value >= 1000:
#         return value / 1000
#     else:
#         return value / 1_000_000

# def parse_squad_table(soup):
#     """
#     Extrae el número de jugadores (squad size) y el % de extranjeros (primera nacionalidad ≠ España)
#     """
#     table = soup.find("table", class_="items")
#     if not table:
#         return None, None

#     tbody = table.find("tbody")
#     if not tbody:
#         return None, None

#     rows = tbody.find_all("tr", recursive=False)
#     squad_size = 0
#     n_extranjeros = 0

#     for row in rows:
#         cells = row.find_all("td", recursive=False)
#         if len(cells) < 4:
#             continue

#         nat_cell = cells[3]
#         flags = nat_cell.find_all("img", class_="flaggenrahmen")
#         if flags:
#             primera_nacionalidad = flags[0].get("title", "").strip()
#             if primera_nacionalidad not in ["España", "Spain"]:
#                 n_extranjeros += 1
#         else:
#             n_extranjeros += 1

#         squad_size += 1

#     pct_extranjeros = round(100 * n_extranjeros / squad_size, 2) if squad_size > 0 else None
#     return squad_size, pct_extranjeros

# def parse_tm_row_summary(soup):
#     tfoot = soup.find("tfoot")
#     avg_age = total_value = avg_value = None
#     if tfoot:
#         row = tfoot.find("tr")
#         age_td = row.find("td", class_="zentriert")
#         if age_td:
#             try:
#                 avg_age = float(age_td.get_text(strip=True).replace(",", "."))
#             except:
#                 avg_age = None
#         rechts_tds = row.find_all("td", class_="rechts")
#         if len(rechts_tds) >= 3:
#             total_value = parse_euro_value(rechts_tds[1].get_text(strip=True))
#             avg_value = parse_euro_value(rechts_tds[2].get_text(strip=True))
#     return avg_age, total_value, avg_value

# def scrape_tm(team_slug, team_id, season):
#     url = f"{BASE}/{team_slug}/kader/verein/{team_id}/plus/0/galerie/0?saison_id={season}"
#     html = fetch_html(url)
#     if html is None:
#         return None
#     soup = BeautifulSoup(html, "lxml")
#     avg_age, value_mio, value_avg_mio = parse_tm_row_summary(soup)
#     squad_size, pct_extranjeros = parse_squad_table(soup)
#     return {
#         "Season": season,
#         "team_slug": team_slug,
#         "avg_age": avg_age,
#         "value_mio": value_mio,
#         "value_avg_mio": value_avg_mio,
#         "squad_size": squad_size,
#         "pct_foreigners": pct_extranjeros
#     }

# slug_map = {
#     "real-madrid":         ("real-madrid", 418),
#     "fc-barcelona":        ("fc-barcelona", 131),
#     "atletico-madrid":     ("atletico-madrid", 13),
#     "athletic-bilbao":     ("athletic-club", 621),
#     "sevilla-fc":          ("sevilla-fc", 368),
#     "valencia-cf":         ("valencia-cf", 1049),
#     "villarreal-cf":       ("villarreal-cf", 1050),
#     "real-sociedad":       ("real-sociedad", 681),
#     "real-betis":          ("real-betis", 150),
#     "ca-osasuna":          ("ca-osasuna", 331),
#     "espanyol-barcelona":  ("rcd-espanyol", 714),
#     "getafe-cf":           ("getafe-cf", 3709),
#     "rc-celta-de-vigo":    ("rc-celta-de-vigo", 940),
#     "rcd-mallorca":        ("rcd-mallorca", 237),
#     "ud-las-palmas":       ("ud-las-palmas", 472),
#     "cadiz-cf":            ("cadiz-cf", 2687),
#     "ud-almeria":          ("ud-almeria", 3302),
#     "granada-cf":          ("granada-cf", 16795),
#     "deportivo-alaves":    ("deportivo-alaves", 1108),
#     "levante-ud":          ("levante-ud", 3368),
#     "rayo-vallecano":      ("rayo-vallecano", 367),
#     "sd-eibar":            ("sd-eibar", 1533),
#     "girona-fc":           ("girona-fc", 12321),
#     "cd-leganes":          ("cd-leganes", 1244),
#     "sd-huesca":           ("sd-huesca", 5358),
#     "real-valladolid":     ("real-valladolid", 366),
#     "elche-cf":            ("elche-cf", 1531),
#     "cordoba-cf":          ("cordoba-cf", 993),
#     "real-sporting":       ("sporting-gijon", 2448),
#     "deportivo-la-coruna": ("deportivo-la-coruna", 897),
#     "real-zaragoza":       ("real-zaragoza", 142),
#     "xerez-cd":            ("xerez-cd", 134),
#     "cd-tenerife":         ("cd-tenerife", 648),
#     "recreativo-huelva":   ("recreativo-huelva", 2867),
#     "cd-numancia":         ("cd-numancia", 2296),
#     "real-murcia-cf":      ("real-murcia", 171),
#     "gimnastic-de-tarragona": ("gimnastic-tarragona", 5648),
#     "racing-santander":    ("racing-santander", 630),
#     "hercules-alicante":   ("hercules-cf", 7971),
#     "malaga-cf": ("malaga-cf", 1084)
# }

# seasons_map = {
#     "real-madrid": list(range(2005, 2025)),
#     "fc-barcelona": list(range(2005, 2025)),
#     "atletico-madrid": list(range(2005, 2025)),
#     "athletic-bilbao": list(range(2005, 2025)),
#     "sevilla-fc": list(range(2005, 2025)),
#     "valencia-cf": list(range(2005, 2025)),
#     "villarreal-cf": list(range(2005, 2025)),
#     "real-sociedad": [2005,2006,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023,2024],
#     "real-betis": [2005,2006,2007,2008,2011,2012,2013,2015,2016,2017,2018,2019,2020,2021,2022,2023,2024],
#     "ca-osasuna": [2005,2006,2007,2008,2009,2010,2011,2012,2013,2016,2019,2020,2021,2022,2023,2024],
#     "espanyol-barcelona": [2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2021,2022,2024],
#     "getafe-cf": [2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2017,2018,2019,2020,2021,2022,2023,2024],
#     "rc-celta-de-vigo": [2005,2006,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023,2024],
#     "rcd-mallorca": [2005,2006,2007,2008,2009,2010,2011,2012,2019,2020,2021,2022,2023,2024],
#     "ud-las-palmas": [2015,2016,2017,2023,2024],
#     "cadiz-cf": [2005,2020,2021,2022,2023],
#     "ud-almeria": [2007,2008,2009,2010,2013,2014,2022,2023],
#     "granada-cf": [2011,2012,2013,2014,2015,2016,2019,2020,2021,2023],
#     "deportivo-alaves": [2005,2016,2017,2018,2019,2020,2021,2023,2024],
#     "levante-ud": [2006,2007,2010,2011,2012,2013,2014,2015,2017,2018,2019,2020,2021],
#     "rayo-vallecano": [2011,2012,2013,2014,2015,2018,2021,2022,2023,2024],
#     "sd-eibar": [2014,2015,2016,2017,2018,2019,2020],
#     "girona-fc": [2017,2018,2022,2023,2024],
#     "cd-leganes": [2016,2017,2018,2019,2024],
#     "sd-huesca": [2018,2020],
#     "real-valladolid": [2007,2008,2009,2012,2013,2018,2019,2020,2022,2024],
#     "elche-cf": [2013,2014,2020,2021,2022],
#     "cordoba-cf": [2014],
#     "real-sporting": [2008,2009,2010,2011,2015,2016],
#     "deportivo-la-coruna": [2005,2006,2007,2008,2009,2010,2012,2014,2015,2016,2017],
#     "real-zaragoza": [2005,2006,2007,2009,2010,2011,2012],
#     "xerez-cd": [2009],
#     "cd-tenerife": [2009],
#     "recreativo-huelva": [2006,2007,2008],
#     "cd-numancia": [2008],
#     "real-murcia-cf": [2007],
#     "gimnastic-de-tarragona": [2006],
#     "racing-santander": [2005,2006,2007,2008,2009,2010,2011],
#     "hercules-alicante": [2010],
#     "malaga-cf": [2005,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017]
# }

In [None]:
# records = []
# failures = []

# for slug, (team_slug, team_id) in tqdm(slug_map.items(), desc="Clubs"):
#     for season in seasons_map.get(slug, []):
#         try:
#             rec = scrape_tm(team_slug, team_id, season)
#             if rec is not None and all(v is not None for v in [rec["avg_age"], rec["value_mio"], rec["value_avg_mio"], rec["squad_size"], rec["pct_foreigners"]]):
#                 rec["team_n"] = slug
#                 records.append(rec)
#                 print(f"OK {slug} {season}")
#             else:
#                 failures.append((slug, team_slug, team_id, season))
#                 print(f"FAILED {slug} {season}")
#             time.sleep(np.random.uniform(5, 12))
#         except Exception as e:
#             print(f"Skip {slug} {season} → {e}")
#             failures.append((slug, team_slug, team_id, season))
#             time.sleep(np.random.uniform(20, 35))

# tm_df = pd.DataFrame(records)
# tm_df.to_parquet('/content/drive/MyDrive/TFM/data/processed/transfermarkt_€€_2005_2025.parquet', index=False)
# print("🗸 Scraped rows:", len(tm_df))
# print("Years missed:", failures)

En caso de que haya fallos (`failures`), habría que volver a descargar esos registros:

In [None]:
# parquet_path = '/content/drive/MyDrive/TFM/data/processed/transfermarkt_€€_2005_2025.parquet'
# df_orig = pd.read_parquet(parquet_path)

# retries = []
# failures_retry = []

# for slug, team_slug, team_id, season in tqdm(failures, desc="Retry failures"):
#     try:
#         rec = scrape_tm(team_slug, team_id, season)
#         if rec is not None and all(v is not None for v in [rec["avg_age"], rec["value_mio"], rec["value_avg_mio"], rec["squad_size"], rec["pct_foreigners"]]):
#             rec["team_n"] = slug
#             retries.append(rec)
#             print(f"OK {slug} {season} (retry)")
#         else:
#             print(f"FAILED {slug} {season} (retry)")
#             failures_retry.append((slug, team_slug, team_id, season))
#         time.sleep(np.random.uniform(7, 16))
#     except Exception as e:
#         print(f"Skip {slug} {season} → {e}")
#         failures_retry.append((slug, team_slug, team_id, season))
#         time.sleep(np.random.uniform(25, 35))

# df_retries = pd.DataFrame(retries)

# if not df_retries.empty:
#     df_total = pd.concat([df_orig, df_retries], ignore_index=True)
#     df_total = df_total.drop_duplicates(subset=["Season", "team_slug"], keep="last")
#     df_total.to_parquet(parquet_path, index=False)
#     print(f"Guardado actualizado: {parquet_path} (total filas: {len(df_total)})")
# else:
#     print("No se recuperó ningún nuevo registro. Parquet no actualizado.")

# print("Fallos tras reintento:", failures_retry)
# print(f"Total de fallidos en este reintento: {len(failures_retry)}")

Retry failures:   0%|          | 0/1 [00:00<?, ?it/s]

OK cadiz-cf 2005 (retry)


Retry failures: 100%|██████████| 1/1 [00:10<00:00, 10.58s/it]

Guardado actualizado: /content/drive/MyDrive/TFM/data/processed/transfermarkt_€€_2005_2025.parquet (total filas: 402)
Fallos tras reintento: []
Total de fallidos en este reintento: 0





## Continuar aquí

In [None]:
# Load the previously saved Transfermarkt table from the processed-data folder on Drive.
transfermarkt = pd.read_parquet('/content/drive/MyDrive/TFM/data/processed/transfermarkt_€€_2005_2025.parquet')

In [None]:
# Preview the first 20 rows of the Transfermarkt table.
transfermarkt.head(20)

Unnamed: 0,Season,team_slug,avg_age,value_mio,value_avg_mio,squad_size,pct_foreigners,team_n
0,2005,real-madrid,25.4,281.6,7.82,36,36.11,real-madrid
1,2006,real-madrid,25.3,372.2,9.54,39,38.46,real-madrid
2,2007,real-madrid,26.4,355.8,13.68,26,65.38,real-madrid
3,2008,real-madrid,26.0,428.6,11.91,36,55.56,real-madrid
4,2009,real-madrid,25.7,451.7,14.12,32,50.0,real-madrid
5,2010,real-madrid,24.7,519.0,13.66,38,42.11,real-madrid
6,2011,real-madrid,25.2,539.2,16.34,33,48.48,real-madrid
7,2012,real-madrid,25.9,582.4,17.13,34,55.88,real-madrid
8,2013,real-madrid,25.5,636.8,19.9,32,46.88,real-madrid
9,2014,real-madrid,24.7,787.8,19.21,41,46.34,real-madrid


**Aquí habría que seguir el código en caso de querer añadir los datos de Transfermarkt para más temporadas**

In [None]:
# Load the match-level table from Drive; its displayed columns include the
# match results plus the h_xg/a_xg and h_elo/a_elo columns.
fd_xg_elo = pd.read_parquet('/content/drive/MyDrive/TFM/data/processed/fd_xg_elo_2005_2025.parquet')

# Display the table to inspect it.
fd_xg_elo

Unnamed: 0,Div,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,...,CLCA,LBCH,LBCD,LBCA,HomeTeam_norm,AwayTeam_norm,h_xg,a_xg,h_elo,a_elo
0,SP1,2005-08-27,Alaves,Barcelona,0,0,D,0,0,D,...,,,,,alaves,barcelona,,,1644.251709,1892.859375
1,SP1,2005-08-27,Ath Bilbao,Sociedad,3,0,H,0,0,D,...,,,,,ath bilbao,sociedad,,,1741.242554,1716.347778
2,SP1,2005-08-27,Valencia,Betis,1,0,H,0,0,D,...,,,,,valencia,betis,,,1804.846436,1812.068970
3,SP1,2005-08-28,Ath Madrid,Zaragoza,0,0,D,0,0,D,...,,,,,ath madrid,zaragoza,,,1734.815430,1718.908691
4,SP1,2005-08-28,Cadiz,Real Madrid,1,2,A,0,1,A,...,,,,,cadiz,real madrid,,,1659.813232,1887.151733
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7626,SP1,2025-08-30,Oviedo,Sociedad,1,0,H,1,0,H,...,2.25,3.30,3.10,2.25,oviedo,sociedad,0.45706,0.92221,1580.597412,1667.441528
7627,SP1,2025-08-31,Espanol,Osasuna,1,0,H,0,0,D,...,2.87,2.37,2.87,2.87,espanol,osasuna,1.36886,1.38167,1649.464233,1701.518677
7628,SP1,2025-08-31,Celta,Villarreal,1,1,D,0,0,D,...,2.25,3.00,3.50,2.25,celta,villarreal,0.876041,0.96978,1674.327637,1795.541260
7629,SP1,2025-08-31,Betis,Ath Bilbao,1,2,A,0,0,D,...,2.40,3.10,3.10,2.40,betis,ath bilbao,0.716186,1.11141,1744.222412,1796.607300


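Lo primero que hacemos para poder juntar las dos tablas es crear una variable `Season` que indique el año de inicio de la temporada a la que corresponde cada partido (los partidos de agosto a diciembre pertenecen a la temporada que empieza ese mismo año; los de enero a julio, a la que empezó el año anterior).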

In [None]:
def _season_start_year(match_date):
    """Return the calendar year in which the season of *match_date* began.

    Dates from August onwards belong to the season starting that same year;
    dates from January to July belong to the season that started the
    previous year.
    """
    if match_date.month > 7:
        return match_date.year
    return match_date.year - 1


# Make sure the match date is a real datetime before reading .month/.year.
fd_xg_elo['Date'] = pd.to_datetime(fd_xg_elo['Date'])

# Season key (starting year) used later to join with the Transfermarkt table.
fd_xg_elo['Season'] = fd_xg_elo['Date'].apply(_season_start_year)

In [None]:
# Lookup table: normalised football-data team name -> slug used as
# `team_slug` when joining with the Transfermarkt table.
# NOTE(review): 'oviedo' appears in the 2025 matches displayed in this
# notebook but has no entry here, so it maps to NaN -- confirm whether a
# slug should be added once that season is scraped.
team_norm_to_slug = {
    "alaves": "deportivo-alaves",
    "ath bilbao": "athletic-club",
    "valencia": "valencia-cf",
    "ath madrid": "atletico-madrid",
    "cadiz": "cadiz-cf",
    "celta": "rc-celta-de-vigo",
    "espanol": "rcd-espanyol",
    "mallorca": "rcd-mallorca",
    "osasuna": "ca-osasuna",
    "sevilla": "sevilla-fc",
    "real madrid": "real-madrid",
    "betis": "real-betis",
    "la coruna": "deportivo-la-coruna",
    "barcelona": "fc-barcelona",
    "getafe": "getafe-cf",
    "malaga": "malaga-cf",
    "santander": "racing-santander",
    "sociedad": "real-sociedad",
    "villarreal": "villarreal-cf",
    "zaragoza": "real-zaragoza",
    "recreativo": "recreativo-huelva",
    "levante": "levante-ud",
    "gimnastic": "gimnastic-tarragona",
    "murcia": "real-murcia",
    "almeria": "ud-almeria",
    "valladolid": "real-valladolid",
    "numancia": "cd-numancia",
    "sp gijon": "sporting-gijon",
    "tenerife": "cd-tenerife",
    "xerez": "xerez-cd",
    "hercules": "hercules-cf",
    "granada": "granada-cf",
    "vallecano": "rayo-vallecano",
    "elche": "elche-cf",
    "eibar": "sd-eibar",
    "cordoba": "cordoba-cf",
    "las palmas": "ud-las-palmas",
    "leganes": "cd-leganes",
    "girona": "girona-fc",
    "huesca": "sd-huesca",
}

In [None]:
# Translate the normalised football-data team names into the slugs used as
# join keys against the Transfermarkt table.
fd_xg_elo['home_team_slug'] = fd_xg_elo['HomeTeam_norm'].map(team_norm_to_slug)
fd_xg_elo['away_team_slug'] = fd_xg_elo['AwayTeam_norm'].map(team_norm_to_slug)

# Series.map leaves NaN for any name missing from the dictionary, and those
# rows later end up with empty Transfermarkt features without any error.
# Report the missing names so an incomplete dictionary is noticed here.
unmapped_teams = sorted(
    set(fd_xg_elo['HomeTeam_norm'].dropna())
    .union(fd_xg_elo['AwayTeam_norm'].dropna())
    .difference(team_norm_to_slug)
)
if unmapped_teams:
    print(f"Equipos sin slug en team_norm_to_slug: {unmapped_teams}")

In [None]:
# Attach the home team's Transfermarkt squad features to every match.
# The Transfermarkt columns are renamed with an `h_` prefix and joined on
# (Season, home_team_slug); matches without a counterpart keep NaN (left join).
tm_feature_cols = ['avg_age', 'value_mio', 'value_avg_mio', 'squad_size', 'pct_foreigners']

home_tm = transfermarkt[['Season', 'team_slug'] + tm_feature_cols].rename(
    columns={
        'team_slug': 'home_team_slug',
        **{col: f'h_{col}' for col in tm_feature_cols},
    }
)

fd_xg_elo = fd_xg_elo.merge(
    home_tm,
    on=['Season', 'home_team_slug'],
    how='left',
    # Each (Season, team) must appear at most once in the Transfermarkt table;
    # otherwise the join would silently duplicate matches. Fail loudly instead.
    validate='many_to_one',
)

In [None]:
# Attach the away team's Transfermarkt squad features to every match.
# The Transfermarkt columns are renamed with an `a_` prefix and joined on
# (Season, away_team_slug); matches without a counterpart keep NaN (left join).
tm_feature_cols = ['avg_age', 'value_mio', 'value_avg_mio', 'squad_size', 'pct_foreigners']

away_tm = transfermarkt[['Season', 'team_slug'] + tm_feature_cols].rename(
    columns={
        'team_slug': 'away_team_slug',
        **{col: f'a_{col}' for col in tm_feature_cols},
    }
)

fd_xg_elo = fd_xg_elo.merge(
    away_tm,
    on=['Season', 'away_team_slug'],
    how='left',
    # Each (Season, team) must appear at most once in the Transfermarkt table;
    # otherwise the join would silently duplicate matches. Fail loudly instead.
    validate='many_to_one',
)

In [None]:
# Display fd_xg_elo to inspect its current contents.
fd_xg_elo

Unnamed: 0,Div,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,...,h_avg_age,h_value_mio,h_value_avg_mio,h_squad_size,h_pct_foreigners,a_avg_age,a_value_mio,a_value_avg_mio,a_squad_size,a_pct_foreigners
0,SP1,2005-08-27,Alaves,Barcelona,0,0,D,0,0,D,...,28.2,34.830,1.120,31.0,54.84,25.4,327.50,9.63,34.0,47.06
1,SP1,2005-08-27,Ath Bilbao,Sociedad,3,0,H,0,0,D,...,25.2,47.230,1.150,41.0,2.44,25.9,53.83,1.74,31.0,22.58
2,SP1,2005-08-27,Valencia,Betis,1,0,H,0,0,D,...,27.3,213.550,6.280,34.0,41.18,26.2,85.95,2.60,33.0,24.24
3,SP1,2005-08-28,Ath Madrid,Zaragoza,0,0,D,0,0,D,...,24.2,134.150,4.330,31.0,25.81,27.7,66.55,2.66,25.0,28.00
4,SP1,2005-08-28,Cadiz,Real Madrid,1,2,A,0,1,A,...,28.8,2.215,0.791,28.0,46.43,25.4,281.60,7.82,36.0,36.11
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7626,SP1,2025-08-30,Oviedo,Sociedad,1,0,H,1,0,H,...,,,,,,,,,,
7627,SP1,2025-08-31,Espanol,Osasuna,1,0,H,0,0,D,...,,,,,,,,,,
7628,SP1,2025-08-31,Celta,Villarreal,1,1,D,0,0,D,...,,,,,,,,,,
7629,SP1,2025-08-31,Betis,Ath Bilbao,1,2,A,0,0,D,...,,,,,,,,,,


In [None]:
# Persist the match table to Drive as a parquet file, without writing the
# DataFrame index, and confirm where it was saved.
parquet_path = '/content/drive/MyDrive/TFM/data/processed/fd_xg_elo_transfermarkt_2005_2025.parquet'
fd_xg_elo.to_parquet(path=parquet_path, index=False)
print(f"Archivo guardado en: {parquet_path}")

Archivo guardado en: /content/drive/MyDrive/TFM/data/processed/fd_xg_elo_transfermarkt_2005_2025.parquet
