# 02 - Features por partido para el modelo de predicción

En este notebook preparo un dataset a nivel partido, uniendo:
- resultado real (win/draw/loss)
- ratings de las selecciones (`power_score_v3`)
- forma reciente (form_5)
- importancia del torneo

El objetivo es dejar una tabla lista para entrenar modelos de Machine Learning.


In [27]:
import pandas as pd
import numpy as np


In [28]:
# resultados crudos
df = pd.read_csv("../data/raw/results.csv")
df['date'] = pd.to_datetime(df['date'])

# ratings modernos por selección (lo que guardamos en el 01)
ratings = pd.read_csv("../data/processed/team_ratings_modern.csv", index_col=0)

df.head(), ratings.head()


(        date home_team away_team  home_score  away_score tournament     city  \
 0 1872-11-30  Scotland   England         0.0         0.0   Friendly  Glasgow   
 1 1873-03-08   England  Scotland         4.0         2.0   Friendly   London   
 2 1874-03-07  Scotland   England         2.0         1.0   Friendly  Glasgow   
 3 1875-03-06   England  Scotland         2.0         2.0   Friendly   London   
 4 1876-03-04  Scotland   England         3.0         0.0   Friendly  Glasgow   
 
     country  neutral  
 0  Scotland    False  
 1   England    False  
 2  Scotland    False  
 3   England    False  
 4  Scotland    False  ,
                 avg_goals_for  avg_goals_against  win_rate  \
 team                                                         
 Afghanistan          0.777778           2.200000  0.200000   
 Albania              1.014085           1.239437  0.330986   
 Algeria              1.743590           0.974359  0.519231   
 American Samoa       0.909091           3.909091  0

In [29]:
torneos_serios = [
    'FIFA World Cup',
    'FIFA World Cup qualification',
    'UEFA Euro',
    'UEFA Euro qualification',
    'Copa América',
    'African Cup of Nations',
    'African Cup of Nations qualification',
    'AFC Asian Cup',
    'AFC Asian Cup qualification',
    'Gold Cup',
    'CONCACAF Nations League',
    'UEFA Nations League'
]

df_serio = df[df['tournament'].isin(torneos_serios)].copy()

# Me quedo con fútbol moderno
df_moderno = df_serio[df_serio['date'] >= '2002-01-01'].copy()
df_moderno.shape


(11182, 9)

In [30]:
def resultado_fila(row):
    if row['home_score'] > row['away_score']:
        return 1   # gana local
    elif row['home_score'] < row['away_score']:
        return -1  # gana visitante
    else:
        return 0   # empate

matches = df_moderno.copy()
matches['result'] = matches.apply(resultado_fila, axis=1)
matches['home_win'] = (matches['result'] == 1).astype(int)

matches[['date', 'home_team', 'away_team', 'home_score', 'away_score', 'result', 'home_win']].head()


Unnamed: 0,date,home_team,away_team,home_score,away_score,result,home_win
26159,2002-01-18,Haiti,Canada,0.0,2.0,-1,0
26160,2002-01-18,Martinique,Costa Rica,0.0,2.0,-1,0
26161,2002-01-19,El Salvador,Mexico,0.0,1.0,-1,0
26163,2002-01-19,Mali,Liberia,1.0,1.0,0,0
26165,2002-01-19,United States,South Korea,2.0,1.0,1,1


In [31]:
# me quedo solo con la métrica final por equipo
team_power = ratings[['power_score_v3']].copy()

# power del local
matches = matches.merge(
    team_power,
    left_on='home_team',
    right_index=True,
    how='left'
).rename(columns={'power_score_v3': 'home_power_v3'})

# power del visitante
matches = matches.merge(
    team_power,
    left_on='away_team',
    right_index=True,
    how='left'
).rename(columns={'power_score_v3': 'away_power_v3'})

matches[['home_team','away_team','home_power_v3','away_power_v3']].head()


Unnamed: 0,home_team,away_team,home_power_v3,away_power_v3
26159,Haiti,Canada,0.616172,0.684707
26160,Martinique,Costa Rica,0.401563,0.632365
26161,El Salvador,Mexico,0.535305,0.785911
26163,Mali,Liberia,0.652291,0.36414
26165,United States,South Korea,0.761712,0.74957


In [32]:
rows = []

for _, row in df_moderno.iterrows():
    # equipo local
    rows.append({
        'date': row['date'],
        'team': row['home_team'],
        'goals_for': row['home_score'],
        'goals_against': row['away_score'],
        'is_home': 1,
        'result': 1 if row['home_score'] > row['away_score'] else (0 if row['home_score'] == row['away_score'] else -1)
    })
    # equipo visitante
    rows.append({
        'date': row['date'],
        'team': row['away_team'],
        'goals_for': row['away_score'],
        'goals_against': row['home_score'],
        'is_home': 0,
        'result': 1 if row['away_score'] > row['home_score'] else (0 if row['away_score'] == row['home_score'] else -1)
    })

df_team_results = pd.DataFrame(rows)
df_team_results = df_team_results.sort_values(['team', 'date'])

# puntos por partido: victoria=3, empate=1, derrota=0
df_team_results['points'] = np.where(
    df_team_results['result'] == 1, 3,
    np.where(df_team_results['result'] == 0, 1, 0)
)

# racha de los últimos 5 partidos (promedio de puntos)
df_team_results['form_5'] = (
    df_team_results
    .groupby('team')['points']
    .rolling(5, min_periods=1)
    .mean()
    .reset_index(level=0, drop=True)
)

df_team_results.head()


Unnamed: 0,date,team,goals_for,goals_against,is_home,result,points,form_5
435,2003-03-16,Afghanistan,2.0,1.0,0,1,3,3.0
437,2003-03-18,Afghanistan,0.0,4.0,0,-1,0,1.5
1241,2003-11-19,Afghanistan,0.0,11.0,0,-1,0,1.0
1250,2003-11-23,Afghanistan,0.0,2.0,1,-1,0,0.75
4279,2007-10-08,Afghanistan,0.0,3.0,0,-1,0,0.6


In [33]:
# local
matches = matches.merge(
    df_team_results[['team','date','form_5']],
    left_on=['home_team','date'],
    right_on=['team','date'],
    how='left'
).rename(columns={'form_5': 'home_form_5'}).drop(columns=['team'])

# visitante
matches = matches.merge(
    df_team_results[['team','date','form_5']],
    left_on=['away_team','date'],
    right_on=['team','date'],
    how='left'
).rename(columns={'form_5': 'away_form_5'}).drop(columns=['team'])

matches[['home_team','away_team','home_form_5','away_form_5']].head()


Unnamed: 0,home_team,away_team,home_form_5,away_form_5
0,Haiti,Canada,0.0,3.0
1,Martinique,Costa Rica,0.0,3.0
2,El Salvador,Mexico,0.0,3.0
3,Mali,Liberia,1.0,1.0
4,United States,South Korea,3.0,0.0


In [34]:
tournament_importance = {
    'FIFA World Cup': 3.0,
    'FIFA World Cup qualification': 2.5,
    'UEFA Euro': 2.5,
    'UEFA Euro qualification': 2.0,
    'Copa América': 2.5,
    'African Cup of Nations': 2.0,
    'African Cup of Nations qualification': 1.8,
    'AFC Asian Cup': 2.0,
    'AFC Asian Cup qualification': 1.8,
    'Gold Cup': 2.0,
    'CONCACAF Nations League': 1.6,
    'UEFA Nations League': 1.6
}

matches['tournament_importance'] = matches['tournament'].map(
    lambda x: tournament_importance.get(x, 1.0)
)


In [35]:
matches['power_diff_v3'] = matches['home_power_v3'] - matches['away_power_v3']


In [36]:
features_cols = [
    'home_power_v3',
    'away_power_v3',
    'power_diff_v3',
    'home_form_5',
    'away_form_5',
    'tournament_importance'
]

target_col = 'home_win'   # por ahora modelo binario: gana local sí/no

matches_ml = matches[['date','home_team','away_team','tournament','result','home_win'] + features_cols].dropna()

matches_ml.head(), matches_ml.shape


(        date      home_team    away_team              tournament  result  \
 0 2002-01-18          Haiti       Canada                Gold Cup      -1   
 1 2002-01-18     Martinique   Costa Rica                Gold Cup      -1   
 2 2002-01-19    El Salvador       Mexico                Gold Cup      -1   
 3 2002-01-19           Mali      Liberia  African Cup of Nations       0   
 4 2002-01-19  United States  South Korea                Gold Cup       1   
 
    home_win  home_power_v3  away_power_v3  power_diff_v3  home_form_5  \
 0         0       0.616172       0.684707      -0.068536          0.0   
 1         0       0.401563       0.632365      -0.230802          0.0   
 2         0       0.535305       0.785911      -0.250606          0.0   
 3         0       0.652291       0.364140       0.288151          1.0   
 4         1       0.761712       0.749570       0.012141          3.0   
 
    away_form_5  tournament_importance  
 0          3.0                    2.0  
 1      

In [37]:
import os
os.makedirs("../data/processed", exist_ok=True)

matches_ml.to_csv("../data/processed/matches_ml.csv", index=False)

print("Dataset para modelo guardado en ../data/processed/matches_ml.csv")


Dataset para modelo guardado en ../data/processed/matches_ml.csv


In [38]:
matches_ml.isna().sum()



date                     0
home_team                0
away_team                0
tournament               0
result                   0
home_win                 0
home_power_v3            0
away_power_v3            0
power_diff_v3            0
home_form_5              0
away_form_5              0
tournament_importance    0
dtype: int64

In [39]:
matches_ml['home_win'].value_counts(normalize=True)


home_win
0    0.516276
1    0.483724
Name: proportion, dtype: float64

In [40]:
import os
os.makedirs("../data/processed", exist_ok=True)

matches_ml.to_csv("../data/processed/matches_ml.csv", index=False)

print("Dataset para modelo guardado en ../data/processed/matches_ml.csv")


Dataset para modelo guardado en ../data/processed/matches_ml.csv


In [41]:
import os
os.getcwd()


'C:\\Users\\lmosquen\\Desktop\\Python\\Proyecto mundial\\worldcup-2026-prediction-main\\worldcup-2026-prediction-main\\notebooks'

In [43]:
!dir



 El volumen de la unidad C no tiene etiqueta.
 El n£mero de serie del volumen es: 76BE-16F2

 Directorio de C:\Users\lmosquen\Desktop\Python\Proyecto mundial\worldcup-2026-prediction-main\worldcup-2026-prediction-main\notebooks

28/11/2025  15:37    <DIR>          .
28/11/2025  15:37    <DIR>          ..
28/11/2025  11:11                21 .gitkeep
28/11/2025  11:36    <DIR>          .ipynb_checkpoints
28/11/2025  15:37           210.580 01-exploracion.ipynb
               2 archivos        210.601 bytes
               3 dirs  386.771.206.144 bytes libres
