# 04 ‒ Modelo moderno (forma reciente + métricas avanzadas)

En este notebook construyo un rating moderno basado en el rendimiento reciente de cada selección (2018–2024), combinando:

- forma reciente (últimos 5 partidos),
- calidad de rivales enfrentados,
- goles esperados y rendimiento ofensivo/defensivo,
- importancia del torneo,
- distancia geográfica de los viajes,
- métrica propia del proyecto (power_score_v3).

El propósito es generar un rating actualizado y representativo del momento actual de cada selección, produciendo el archivo team_power_score_v3.csv, que se integra luego en el modelo híbrido utilizado para la simulación Monte Carlo.

In [2]:
import pandas as pd
import numpy as np

# Load the match-level dataset that already carries the basic features.
df = pd.read_csv("../data/processed/matches_ml.csv")

# Parse the date column so it can be compared chronologically.
df["date"] = pd.to_datetime(df["date"])

# Keep matches dated 2018-01-01 or later (no upper bound is applied).
# .copy() yields an independent frame, so columns added to `df_modern`
# later on never touch `df`.
is_modern = df["date"] >= "2018-01-01"
df_modern = df.loc[is_modern].copy()

# Display (rows, columns) for the full and the filtered frames.
df.shape, df_modern.shape


((11182, 12), (4593, 12))

In [3]:
# Inspect which columns the filtered frame provides.
df_modern.columns


Index(['date', 'home_team', 'away_team', 'tournament', 'result', 'home_win',
       'home_power_v3', 'away_power_v3', 'power_diff_v3', 'home_form_5',
       'away_form_5', 'tournament_importance'],
      dtype='object')

In [4]:
# Shapes of the full and the 2018+ frames (repeats the check made right after loading).
df.shape, df_modern.shape


((11182, 12), (4593, 12))

In [5]:
# Same column listing, returned as a plain Python list.
df_modern.columns.tolist()


['date',
 'home_team',
 'away_team',
 'tournament',
 'result',
 'home_win',
 'home_power_v3',
 'away_power_v3',
 'power_diff_v3',
 'home_form_5',
 'away_form_5',
 'tournament_importance']

In [6]:
# Load the external per-team market values and preview the first rows.
# The merges further down rely on its `team` and `market_value_millions` columns.
market = pd.read_csv("../data/external/team_market_values.csv")
market.head()


Unnamed: 0,team,market_value_millions
0,Argentina,900
1,Brazil,1200
2,England,1100
3,France,1050
4,Portugal,950


In [7]:
# Attach each side's market value to the match rows.
# Only the join key and the value column are taken from `market`, so any other
# column in that file cannot leak into `df_modern` or collide (as `_x`/`_y`)
# when the table is merged a second time for the away team.
market_lookup = market[["team", "market_value_millions"]]

for side in ("home", "away"):
    df_modern = (
        df_modern
        # Remove the column left by a previous execution so this cell can be
        # re-run without producing duplicate column names.
        .drop(columns=[f"{side}_market_value"], errors="ignore")
        .merge(
            market_lookup,
            left_on=f"{side}_team",
            right_on="team",
            how="left",  # keep every match; teams without a value get NaN
            validate="many_to_one",  # a team listed twice would duplicate matches
        )
        .rename(columns={"market_value_millions": f"{side}_market_value"})
        .drop(columns=["team"])
    )


In [8]:
# Home-minus-away market value, computed element-wise per match.
# A missing value on either side propagates as NaN into the difference.
df_modern["market_value_diff"] = df_modern["home_market_value"].sub(
    df_modern["away_market_value"]
)


In [9]:
# Replace missing market-value features with 0 so the model receives no NaN.
# NOTE(review): the difference is filled on its own rather than recomputed, so
# a row where only one team lacks a value ends up with diff == 0 even though
# home - away is non-zero after filling — confirm this is the intended encoding.
for value_col in ("home_market_value", "away_market_value", "market_value_diff"):
    df_modern[value_col] = df_modern[value_col].fillna(0)


In [10]:
# Load the country coordinates table; the merges that follow use its
# `country`, `lat` and `lon` columns.
coords = pd.read_csv("../data/external/countries_lat_lon.csv")


In [11]:
# Attach latitude/longitude for the home and the away team's country.
# Restricting the lookup to the three columns actually used keeps any extra
# column of the coordinates file out of `df_modern` and avoids `_x`/`_y`
# suffix collisions on the second merge.
coords_lookup = coords[["country", "lat", "lon"]]

for side in ("home", "away"):
    df_modern = (
        df_modern
        # Remove columns left by a previous execution so the cell is re-runnable.
        .drop(columns=[f"{side}_lat", f"{side}_lon"], errors="ignore")
        .merge(
            coords_lookup,
            left_on=f"{side}_team",
            right_on="country",
            how="left",  # keep every match; unknown countries get NaN coordinates
            validate="many_to_one",  # a country listed twice would duplicate matches
        )
        .rename(columns={"lat": f"{side}_lat", "lon": f"{side}_lon"})
        .drop(columns=["country"])
    )


In [14]:
import numpy as np

def haversine(lat1, lon1, lat2, lon2):
    """Great-circle distance in kilometres between two points on a sphere.

    Works element-wise, so scalars, NumPy arrays and pandas Series are all
    accepted (inputs must be broadcastable against each other).

    Parameters
    ----------
    lat1, lon1 : float or array-like
        Latitude and longitude of the first point(s), in degrees.
    lat2, lon2 : float or array-like
        Latitude and longitude of the second point(s), in degrees.

    Returns
    -------
    float or array-like
        Distance in km on a sphere of radius 6371 km. NaN inputs yield NaN.
    """
    R = 6371  # Earth radius in km
    lat1 = np.radians(lat1)
    lon1 = np.radians(lon1)
    lat2 = np.radians(lat2)
    lon2 = np.radians(lon2)

    dlat = lat2 - lat1
    dlon = lon2 - lon1

    a = np.sin(dlat / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2) ** 2
    # Floating-point rounding can push `a` marginally above 1 for (near-)antipodal
    # points, which would make arcsin return NaN; cap it at the mathematical
    # maximum. np.minimum propagates NaN, so missing inputs still yield NaN.
    a = np.minimum(a, 1.0)
    c = 2 * np.arcsin(np.sqrt(a))

    return R * c


In [15]:
# Distance between the home and the away team's country coordinates,
# computed for all rows at once.
home_point = (df_modern["home_lat"], df_modern["home_lon"])
away_point = (df_modern["away_lat"], df_modern["away_lon"])

# Rows with a missing coordinate come back as NaN and are stored as 0 km.
# NOTE(review): 0 is then indistinguishable from a genuine zero distance —
# confirm this is the intended encoding.
df_modern["distance_km"] = haversine(*home_point, *away_point).fillna(0)


In [16]:
# Columns fed to the model, grouped by what they describe.
feature_cols_modern = [
    # team strength
    "home_power_v3",
    "away_power_v3",
    "power_diff_v3",
    # recent form
    "home_form_5",
    "away_form_5",
    # market value
    "home_market_value",
    "away_market_value",
    "market_value_diff",
    # geography
    "distance_km",
    # match context
    "tournament_importance",
]


In [17]:
# Order matches chronologically so the split below is time-based: the model
# is trained on the older matches and evaluated on the most recent ones.
df_modern = df_modern.sort_values("date").reset_index(drop=True)

X = df_modern[feature_cols_modern]
y = df_modern["home_win"]

# The first 80% of the rows form the training set, the rest the test set.
split_idx = int(len(df_modern) * 0.8)

X_train, X_test = X.iloc[:split_idx], X.iloc[split_idx:]
y_train, y_test = y.iloc[:split_idx], y.iloc[split_idx:]


In [18]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Learn the scaling statistics from the training rows only, then apply the
# same transformation to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Logistic regression on the standardised features.
log_modern = LogisticRegression(max_iter=2000).fit(X_train_scaled, y_train)

# Accuracy of the predicted home-win labels on the held-out rows.
y_pred_modern = log_modern.predict(X_test_scaled)
acc_modern = accuracy_score(y_test, y_pred_modern)
acc_modern


0.8182807399347116

In [19]:
# Persist the enriched match table: every column currently in df_modern is
# written, and index=False keeps the row index out of the file.
output_path = "../data/processed/matches_modern_features.csv"
df_modern.to_csv(output_path, index=False)

# Confirmation message with the destination path.
print("Archivo guardado correctamente en:", output_path)


Archivo guardado correctamente en: ../data/processed/matches_modern_features.csv


In [1]:
# Debugging aid: show the kernel's current working directory, which is what
# the relative "../data/..." paths used in this notebook resolve against.
import os
os.getcwd()


'C:\\Users\\lmosquen'