In [2]:
import sys
!{sys.executable} -m pip install xgboost


Collecting xgboost
  Using cached xgboost-3.1.2-py3-none-win_amd64.whl.metadata (2.1 kB)
Downloading xgboost-3.1.2-py3-none-win_amd64.whl (72.0 MB)
   ---------------------------------------- 0.0/72.0 MB ? eta -:--:--
   - -------------------------------------- 1.8/72.0 MB 11.2 MB/s eta 0:00:07
   -- ------------------------------------- 3.9/72.0 MB 10.7 MB/s eta 0:00:07
   --- ------------------------------------ 6.0/72.0 MB 10.3 MB/s eta 0:00:07
   ---- ----------------------------------- 7.9/72.0 MB 9.9 MB/s eta 0:00:07
   ----- ---------------------------------- 10.0/72.0 MB 9.7 MB/s eta 0:00:07
   ------ --------------------------------- 12.1/72.0 MB 9.8 MB/s eta 0:00:07
   ------- -------------------------------- 13.9/72.0 MB 9.8 MB/s eta 0:00:06
   -------- ------------------------------- 16.0/72.0 MB 9.7 MB/s eta 0:00:06
   --------- ------------------------------ 17.8/72.0 MB 9.7 MB/s eta 0:00:06
   ---------- ----------------------------- 19.7/72.0 MB 9.6 MB/s eta 0:00:06
   

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier


In [5]:
import os
os.listdir("../data/processed")

['.gitkeep',
 'matches_ml.csv',
 'team_power_score_v3.csv',
 'team_ratings_modern.csv']

In [21]:
df = pd.read_csv("../data/processed/matches_ml.csv")
df.head()
df.shape
df.columns

Index(['date', 'home_team', 'away_team', 'tournament', 'result', 'home_win',
       'home_power_v3', 'away_power_v3', 'power_diff_v3', 'home_form_5',
       'away_form_5', 'tournament_importance'],
      dtype='object')

In [8]:
# Pasar result a categoría y crear códigos
df['target'] = df['result'].astype('category').cat.codes

# Guardamos el mapeo para usarlo después (por ejemplo, al predecir)
result_mapping = dict(enumerate(df['result'].astype('category').cat.categories))
print(result_mapping)



{0: -1, 1: 0, 2: 1}


In [9]:
obj_cols = df.select_dtypes(include='object').columns.tolist()
obj_cols


['date', 'home_team', 'away_team']

In [10]:
# Convierto la fecha a datetime y las columnas numéricas a float

# Fecha
df['date'] = pd.to_datetime(df['date'])

# Defino qué columnas quiero como numéricas
numeric_cols = [
    'home_win',
    'home_power_v3',
    'away_power_v3',
    'power_diff_v3',
    'home_form_5',
    'away_form_5',
    'tournament_importance'
]

# Convierto a float (menos home_win que es entera)
for col in numeric_cols:
    if col == 'home_win':
        df[col] = df[col].astype(int)
    else:
        df[col] = df[col].astype(float)

df.dtypes


date                     datetime64[ns]
home_team                        object
away_team                        object
result                            int64
home_win                          int32
home_power_v3                   float64
away_power_v3                   float64
power_diff_v3                   float64
home_form_5                     float64
away_form_5                     float64
tournament_importance           float64
target                             int8
dtype: object

In [11]:
# Reviso la distribución de la variable objetivo
df['home_win'].value_counts(normalize=True)


home_win
0    0.516276
1    0.483724
Name: proportion, dtype: float64

In [12]:
# Defino las columnas que voy a usar como features
feature_cols = [
    'home_power_v3',
    'away_power_v3',
    'power_diff_v3',
    'home_form_5',
    'away_form_5',
    'tournament_importance'
]

X = df[feature_cols].copy()
y = df['home_win'].copy()

# Ordeno por fecha para respetar el tiempo
df_sorted = df.sort_values('date').reset_index(drop=True)
X = df_sorted[feature_cols]
y = df_sorted['home_win']


In [13]:
# Armo un corte 80% - 20% respetando el orden temporal
n = len(df_sorted)
split_idx = int(n * 0.8)

X_train = X.iloc[:split_idx]
y_train = y.iloc[:split_idx]

X_test = X.iloc[split_idx:]
y_test = y.iloc[split_idx:]

len(X_train), len(X_test)


(8945, 2237)

In [14]:
from sklearn.linear_model import LogisticRegression


In [15]:
# Escalo las features y entreno una regresión logística como modelo baseline

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X_train_scaled, y_train)

y_pred = log_reg.predict(X_test_scaled)


In [16]:
# Accuracy y reporte de clasificación
acc = accuracy_score(y_test, y_pred)
print(f"Accuracy del modelo (Logistic Regression): {acc:.4f}")

print("\nReporte de clasificación:")
print(classification_report(y_test, y_pred))

# Matriz de confusión
cm = confusion_matrix(y_test, y_pred)
cm


Accuracy del modelo (Logistic Regression): 0.7944

Reporte de clasificación:
              precision    recall  f1-score   support

           0       0.83      0.78      0.80      1205
           1       0.76      0.82      0.79      1032

    accuracy                           0.79      2237
   macro avg       0.79      0.80      0.79      2237
weighted avg       0.80      0.79      0.79      2237



array([[934, 271],
       [189, 843]], dtype=int64)

In [17]:
continent_map = {
    # CONMEBOL
    "Argentina": "CONMEBOL",
    "Brazil": "CONMEBOL",
    "Uruguay": "CONMEBOL",
    "Chile": "CONMEBOL",
    "Colombia": "CONMEBOL",
    "Peru": "CONMEBOL",
    "Paraguay": "CONMEBOL",
    "Ecuador": "CONMEBOL",
    "Bolivia": "CONMEBOL",
    "Venezuela": "CONMEBOL",

    # UEFA
    "Spain": "UEFA", "France": "UEFA", "Germany": "UEFA", "England": "UEFA",
    "Netherlands": "UEFA", "Portugal": "UEFA", "Belgium": "UEFA",
    "Italy": "UEFA", "Croatia": "UEFA", "Denmark": "UEFA",
    "Sweden": "UEFA", "Norway": "UEFA", "Switzerland": "UEFA",
    # (sumamos más después)

    # CONCACAF
    "United States": "CONCACAF",
    "Mexico": "CONCACAF",
    "Canada": "CONCACAF",
    "Costa Rica": "CONCACAF",
    "Honduras": "CONCACAF",
    "Jamaica": "CONCACAF",
    "Panama": "CONCACAF",
    # ...

    # CAF
    "Senegal": "CAF", "Morocco": "CAF", "Nigeria": "CAF",
    "Ivory Coast": "CAF", "Egypt": "CAF", "Tunisia": "CAF",
    # ...

    # AFC
    "Japan": "AFC", "South Korea": "AFC", "Iran": "AFC",
    "Saudi Arabia": "AFC", "Australia": "AFC",
    # ...

    # OFC
    "New Zealand": "OFC",
    "Vanuatu": "OFC",
    "Solomon Islands": "OFC"
}


In [24]:
df['home_continent'] = df['home_team'].map(continent_map)
df['away_continent'] = df['away_team'].map(continent_map)


In [25]:
def infer_tournament_continent(t):
    if "World Cup" in t:
        return "WORLD"   # Mundial o Eliminatorias, después veo
    elif "Euro" in t or "UEFA Nations League" in t:
        return "UEFA"
    elif "Copa América" in t or "Copa America" in t:
        return "CONMEBOL"
    elif "Gold Cup" in t or "CONCACAF" in t:
        return "CONCACAF"
    elif "Asian Cup" in t or "AFC" in t:
        return "AFC"
    elif "African Cup" in t or "Africa" in t:
        return "CAF"
    elif "OFC" in t:
        return "OFC"
    else:
        return "WORLD"


In [26]:
df['tournament_continent'] = df['tournament'].apply(infer_tournament_continent)


In [27]:
df['home_continent_advantage'] = (
    (df['home_continent'] == df['tournament_continent']) &
    (df['tournament_continent'] != "WORLD")
).astype(int)

df['away_continent_penalty'] = (
    (df['away_continent'] != df['tournament_continent']) &
    (df['tournament_continent'] != "WORLD")
).astype(int)


In [28]:
df[['home_team','away_team','tournament','tournament_continent',
    'home_continent','away_continent',
    'home_continent_advantage','away_continent_penalty']].head(15)


Unnamed: 0,home_team,away_team,tournament,tournament_continent,home_continent,away_continent,home_continent_advantage,away_continent_penalty
0,Haiti,Canada,Gold Cup,CONCACAF,,CONCACAF,0,0
1,Martinique,Costa Rica,Gold Cup,CONCACAF,,CONCACAF,0,0
2,El Salvador,Mexico,Gold Cup,CONCACAF,,CONCACAF,0,0
3,Mali,Liberia,African Cup of Nations,CAF,,,0,1
4,United States,South Korea,Gold Cup,CONCACAF,CONCACAF,AFC,1,1
5,Cameroon,DR Congo,African Cup of Nations,CAF,,,0,1
6,Costa Rica,Trinidad and Tobago,Gold Cup,CONCACAF,CONCACAF,,1,1
7,Ecuador,Haiti,Gold Cup,CONCACAF,CONMEBOL,,0,1
8,Egypt,Senegal,African Cup of Nations,CAF,CAF,CAF,1,0
9,South Africa,Burkina Faso,African Cup of Nations,CAF,,,0,1


In [29]:
feature_cols = [
    'home_power_v3',
    'away_power_v3',
    'power_diff_v3',
    'home_form_5',
    'away_form_5',
    'tournament_importance',
    'home_continent_advantage',
    'away_continent_penalty'
]


In [30]:
X = df[feature_cols].copy()
y = df['home_win'].copy()

df_sorted = df.sort_values('date').reset_index(drop=True)
X = df_sorted[feature_cols]
y = df_sorted['home_win']

n = len(df_sorted)
split_idx = int(n * 0.8)

X_train = X.iloc[:split_idx]
y_train = y.iloc[:split_idx]
X_test = X.iloc[split_idx:]
y_test = y.iloc[split_idx:]

# Escalado + modelo
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X_train_scaled, y_train)

y_pred = log_reg.predict(X_test_scaled)

acc = accuracy_score(y_test, y_pred)
print(f"Accuracy del modelo con continentes: {acc:.4f}")


Accuracy del modelo con continentes: 0.7930


In [31]:
coords = pd.read_csv("../data/external/countries_lat_lon.csv")
coords.head()


Unnamed: 0,country,lat,lon
0,Argentina,-34.6,-58.4
1,Brazil,-15.8,-47.9
2,Chile,-33.4,-70.6
3,Uruguay,-34.9,-56.2
4,Paraguay,-25.3,-57.6


In [32]:
df = df.merge(coords, left_on="home_team", right_on="country", how="left") \
       .rename(columns={"lat": "home_lat", "lon": "home_lon"}) \
       .drop(columns=["country"])

df = df.merge(coords, left_on="away_team", right_on="country", how="left") \
       .rename(columns={"lat": "away_lat", "lon": "away_lon"}) \
       .drop(columns=["country"])


In [33]:
import numpy as np

def haversine(lat1, lon1, lat2, lon2):
    R = 6371  # radio de la Tierra en km
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    
    a = np.sin(dlat/2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
    c = 2 * np.arcsin(np.sqrt(a))
    return R * c


In [34]:
df["distance_km"] = haversine(
    df["home_lat"], df["home_lon"],
    df["away_lat"], df["away_lon"]
)


In [35]:
df[["home_team","away_team","distance_km"]].head(15)


Unnamed: 0,home_team,away_team,distance_km
0,Haiti,Canada,
1,Martinique,Costa Rica,
2,El Salvador,Mexico,
3,Mali,Liberia,
4,United States,South Korea,11164.331578
5,Cameroon,DR Congo,
6,Costa Rica,Trinidad and Tobago,
7,Ecuador,Haiti,
8,Egypt,Senegal,5236.195569
9,South Africa,Burkina Faso,


In [36]:
features = [
    "home_power_v3", "away_power_v3", "power_diff_v3",
    "home_form_5", "away_form_5",
    "tournament_importance",
    "home_continent_advantage", "away_continent_penalty",
    "distance_km"
]


In [37]:
df[["home_lat", "away_lat"]].isna().mean()


home_lat    0.672241
away_lat    0.697460
dtype: float64

In [38]:
# Por si quedaron NaN de equipos sin coordenadas
df["distance_km"] = df["distance_km"].fillna(0)

df["distance_km"].describe()


count    11182.000000
mean       555.774655
std       1868.591799
min          0.000000
25%          0.000000
50%          0.000000
75%          0.000000
max      19605.981095
Name: distance_km, dtype: float64

In [40]:
feature_cols = [
    'home_power_v3',
    'away_power_v3',
    'power_diff_v3',
    'home_form_5',
    'away_form_5',
    'tournament_importance',
    'home_continent_advantage',
    'away_continent_penalty',
    'distance_km'
]


In [41]:
# Ordeno por fecha para respetar el tiempo
df_sorted = df.sort_values('date').reset_index(drop=True)

X = df_sorted[feature_cols]
y = df_sorted['home_win']

n = len(df_sorted)
split_idx = int(n * 0.8)

X_train = X.iloc[:split_idx]
y_train = y.iloc[:split_idx]
X_test = X.iloc[split_idx:]
y_test = y.iloc[split_idx:]


In [42]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X_train_scaled, y_train)

y_pred = log_reg.predict(X_test_scaled)

acc = accuracy_score(y_test, y_pred)
print(f"Accuracy del modelo con distancia: {acc:.4f}")


Accuracy del modelo con distancia: 0.7930


In [43]:
print("\nReporte de clasificación:")
print(classification_report(y_test, y_pred))

print("\nMatriz de confusión:")
print(confusion_matrix(y_test, y_pred))



Reporte de clasificación:
              precision    recall  f1-score   support

           0       0.83      0.77      0.80      1205
           1       0.76      0.81      0.78      1032

    accuracy                           0.79      2237
   macro avg       0.79      0.79      0.79      2237
weighted avg       0.80      0.79      0.79      2237


Matriz de confusión:
[[933 272]
 [191 841]]


In [44]:
market = pd.read_csv("../data/external/team_market_values.csv")
market.head()


Unnamed: 0,team,market_value_millions
0,Argentina,900
1,Brazil,1200
2,England,1100
3,France,1050
4,Portugal,950


In [45]:
df = df.merge(market, left_on="home_team", right_on="team", how="left") \
       .rename(columns={"market_value_millions": "home_market_value"}) \
       .drop(columns=["team"])

df = df.merge(market, left_on="away_team", right_on="team", how="left") \
       .rename(columns={"market_value_millions": "away_market_value"}) \
       .drop(columns=["team"])


In [46]:
df["market_value_diff"] = df["home_market_value"] - df["away_market_value"]


In [47]:
df["home_market_value"] = df["home_market_value"].fillna(0)
df["away_market_value"] = df["away_market_value"].fillna(0)
df["market_value_diff"] = df["market_value_diff"].fillna(0)


In [48]:
feature_cols = [
    "home_power_v3", "away_power_v3", "power_diff_v3",
    "home_form_5", "away_form_5",
    "tournament_importance",
    "home_continent_advantage", "away_continent_penalty",
    "distance_km",
    "home_market_value", "away_market_value", "market_value_diff"
]


In [50]:
y_pred = log_reg.predict(X_test_scaled)
accuracy_score(y_test, y_pred)


0.7930263746088512

In [1]:
import os
os.getcwd()


'C:\\Users\\lmosquen'