# 1. Carga del dataset original y exploración inicial

En esta sección cargo el dataset `results.csv`, convierto las fechas, reviso sus columnas y hago una primera exploración general.


In [1]:
import pandas as pd

df = pd.read_csv('../data/raw/results.csv')
df.head()

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral
0,1872-11-30,Scotland,England,0.0,0.0,Friendly,Glasgow,Scotland,False
1,1873-03-08,England,Scotland,4.0,2.0,Friendly,London,England,False
2,1874-03-07,Scotland,England,2.0,1.0,Friendly,Glasgow,Scotland,False
3,1875-03-06,England,Scotland,2.0,2.0,Friendly,London,England,False
4,1876-03-04,Scotland,England,3.0,0.0,Friendly,Glasgow,Scotland,False


In [2]:
df.shape


(48850, 9)

In [3]:
df.columns


Index(['date', 'home_team', 'away_team', 'home_score', 'away_score',
       'tournament', 'city', 'country', 'neutral'],
      dtype='object')

In [4]:
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48850 entries, 0 to 48849
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   date        48850 non-null  object 
 1   home_team   48850 non-null  object 
 2   away_team   48850 non-null  object 
 3   home_score  48849 non-null  float64
 4   away_score  48849 non-null  float64
 5   tournament  48850 non-null  object 
 6   city        48850 non-null  object 
 7   country     48850 non-null  object 
 8   neutral     48850 non-null  bool   
dtypes: bool(1), float64(2), object(6)
memory usage: 3.0+ MB


In [5]:
df['tournament'].value_counts().head(20)


tournament
Friendly                                18205
FIFA World Cup qualification             8688
UEFA Euro qualification                  2824
African Cup of Nations qualification     2278
FIFA World Cup                            964
Copa América                              869
AFC Asian Cup qualification               829
African Cup of Nations                    793
UEFA Nations League                       658
CECAFA Cup                                620
CFU Caribbean Cup qualification           606
Merdeka Tournament                        599
British Home Championship                 523
CONCACAF Nations League                   422
AFC Asian Cup                             421
Gold Cup                                  420
Gulf Cup                                  410
Island Games                              394
UEFA Euro                                 388
Asian Games                               368
Name: count, dtype: int64

In [6]:
df['date'].min(), df['date'].max()


('1872-11-30', '2025-11-20')

# 2. Filtro de torneos competitivos ("serios")

En esta sección selecciono solo los torneos que aportan valor predictivo: Mundial, Eliminatorias, Euro, Copa América, etc.  
El resultado principal es el dataframe `df_serio`.


In [7]:
# torneos que considero "serios" para predicción
torneos_serios = [
    'FIFA World Cup',
    'FIFA World Cup qualification',
    'UEFA Euro',
    'UEFA Euro qualification',
    'CONMEBOL World Cup Qualifiers',
    'Copa América',
    'African Cup of Nations',
    'African Cup of Nations qualification',
    'AFC Asian Cup',
    'AFC Asian Cup qualification',
    'Gold Cup',
    'CONCACAF Nations League',
    'UEFA Nations League'
]

df_serio = df[df['tournament'].isin(torneos_serios)].copy()
df_serio.shape


(19554, 9)

In [8]:
# agregar columnas de goles anotados y recibidos
df_serio['gf_local'] = df_serio['home_score']
df_serio['ga_local'] = df_serio['away_score']
df_serio['gf_visit'] = df_serio['away_score']
df_serio['ga_visit'] = df_serio['home_score']

# armar dataset donde cada fila representa un equipo en un partido
local = df_serio[['date', 'home_team', 'gf_local', 'ga_local']].rename(columns={
    'home_team': 'team',
    'gf_local': 'goals_for',
    'ga_local': 'goals_against'
})

visit = df_serio[['date', 'away_team', 'gf_visit', 'ga_visit']].rename(columns={
    'away_team': 'team',
    'gf_visit': 'goals_for',
    'ga_visit': 'goals_against'
})

df_equipos = pd.concat([local, visit], ignore_index=True)

df_equipos.head()


Unnamed: 0,date,team,goals_for,goals_against
0,1916-07-02,Chile,0.0,4.0
1,1916-07-06,Argentina,6.0,1.0
2,1916-07-08,Brazil,1.0,1.0
3,1916-07-10,Argentina,1.0,1.0
4,1916-07-12,Brazil,1.0,2.0


In [9]:
stats_globales = df_equipos.groupby('team').agg({
    'goals_for': 'mean',
    'goals_against': 'mean'
}).rename(columns={
    'goals_for': 'avg_goals_for',
    'goals_against': 'avg_goals_against'
})

stats_globales.head(10)


Unnamed: 0_level_0,avg_goals_for,avg_goals_against
team,Unnamed: 1_level_1,Unnamed: 2_level_1
Afghanistan,0.732143,2.5
Albania,0.824903,1.610895
Algeria,1.599379,1.0
American Samoa,0.666667,6.666667
Andorra,0.314815,2.604938
Angola,1.232558,1.037209
Anguilla,0.294118,3.705882
Antigua and Barbuda,1.390625,2.03125
Argentina,1.987152,0.91863
Armenia,0.966667,1.838889


In [10]:
stats_globales.shape


(223, 2)

In [11]:
# resultado desde el punto de vista del equipo local
df_serio['resultado_local'] = df_serio.apply(
    lambda x: 'win' if x['home_score'] > x['away_score']
              else 'loss' if x['home_score'] < x['away_score']
              else 'draw',
    axis=1
)

# resultado para el visitante
df_serio['resultado_visit'] = df_serio.apply(
    lambda x: 'win' if x['away_score'] > x['home_score']
              else 'loss' if x['away_score'] < x['home_score']
              else 'draw',
    axis=1
)


In [12]:
local_res = df_serio[['home_team', 'resultado_local']].rename(columns={
    'home_team': 'team',
    'resultado_local': 'result'
})

visit_res = df_serio[['away_team', 'resultado_visit']].rename(columns={
    'away_team': 'team',
    'resultado_visit': 'result'
})

df_resultados = pd.concat([local_res, visit_res], ignore_index=True)
df_resultados.head()


Unnamed: 0,team,result
0,Chile,loss
1,Argentina,win
2,Brazil,draw
3,Argentina,draw
4,Brazil,loss


In [13]:
win_rate = df_resultados.groupby('team')['result'].apply(
    lambda x: (x == 'win').mean()
)

win_rate = win_rate.to_frame().rename(columns={0: 'win_rate'})
win_rate.head(10)


Unnamed: 0_level_0,result
team,Unnamed: 1_level_1
Afghanistan,0.160714
Albania,0.229572
Algeria,0.481366
American Samoa,0.2
Andorra,0.04321
Angola,0.35814
Anguilla,0.029412
Antigua and Barbuda,0.328125
Argentina,0.593148
Armenia,0.222222


In [14]:
stats_completas = stats_globales.join(win_rate, how='left')
stats_completas.head(10)


Unnamed: 0_level_0,avg_goals_for,avg_goals_against,result
team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Afghanistan,0.732143,2.5,0.160714
Albania,0.824903,1.610895,0.229572
Algeria,1.599379,1.0,0.481366
American Samoa,0.666667,6.666667,0.2
Andorra,0.314815,2.604938,0.04321
Angola,1.232558,1.037209,0.35814
Anguilla,0.294118,3.705882,0.029412
Antigua and Barbuda,1.390625,2.03125,0.328125
Argentina,1.987152,0.91863,0.593148
Armenia,0.966667,1.838889,0.222222


In [15]:
win_rate.shape


(223, 1)

In [18]:
stats_completas.columns


Index(['avg_goals_for', 'avg_goals_against', 'result'], dtype='object')

## Reconstrucción de estadísticas - Versión limpia


In [20]:
# Parto de df_serio, que ya tenía filtrado por torneos serios

# 1) Creo columnas de goles a favor / en contra para local y visitante
df_serio['gf_local'] = df_serio['home_score']
df_serio['ga_local'] = df_serio['away_score']
df_serio['gf_visit'] = df_serio['away_score']
df_serio['ga_visit'] = df_serio['home_score']

# 2) Armo df_equipos: cada fila = un equipo en un partido
local = df_serio[['date', 'home_team', 'gf_local', 'ga_local']].rename(columns={
    'home_team': 'team',
    'gf_local': 'goals_for',
    'ga_local': 'goals_against'
})

visit = df_serio[['date', 'away_team', 'gf_visit', 'ga_visit']].rename(columns={
    'away_team': 'team',
    'gf_visit': 'goals_for',
    'ga_visit': 'goals_against'
})

df_equipos = pd.concat([local, visit], ignore_index=True)

# 3) Estadísticas globales de goles por selección
stats_globales = df_equipos.groupby('team').agg({
    'goals_for': 'mean',
    'goals_against': 'mean'
}).rename(columns={
    'goals_for': 'avg_goals_for',
    'goals_against': 'avg_goals_against'
})

stats_globales.head()


Unnamed: 0_level_0,avg_goals_for,avg_goals_against
team,Unnamed: 1_level_1,Unnamed: 2_level_1
Afghanistan,0.732143,2.5
Albania,0.824903,1.610895
Algeria,1.599379,1.0
American Samoa,0.666667,6.666667
Andorra,0.314815,2.604938


In [21]:
# 1) Creo columnas de resultado para local y visitante
df_serio['resultado_local'] = df_serio.apply(
    lambda x: 'win' if x['home_score'] > x['away_score']
              else 'loss' if x['home_score'] < x['away_score']
              else 'draw',
    axis=1
)

df_serio['resultado_visit'] = df_serio.apply(
    lambda x: 'win' if x['away_score'] > x['home_score']
              else 'loss' if x['away_score'] < x['home_score']
              else 'draw',
    axis=1
)

# 2) Unifico resultados por equipo
local_res = df_serio[['home_team', 'resultado_local']].rename(columns={
    'home_team': 'team',
    'resultado_local': 'result'
})

visit_res = df_serio[['away_team', 'resultado_visit']].rename(columns={
    'away_team': 'team',
    'resultado_visit': 'result'
})

df_resultados = pd.concat([local_res, visit_res], ignore_index=True)

# 3) Win rate por selección
win_rate = df_resultados.groupby('team')['result'].apply(
    lambda x: (x == 'win').mean()
)

win_rate = win_rate.to_frame(name='win_rate')

win_rate.head()


Unnamed: 0_level_0,win_rate
team,Unnamed: 1_level_1
Afghanistan,0.160714
Albania,0.229572
Algeria,0.481366
American Samoa,0.2
Andorra,0.04321


In [22]:
# Uno tablas por índice (team es el index en ambas)
stats_completas = stats_globales.join(win_rate, how='left')

stats_completas.head()
stats_completas.columns


Index(['avg_goals_for', 'avg_goals_against', 'win_rate'], dtype='object')

In [23]:
stats_completas['power_score'] = (
    stats_completas['win_rate'] * 0.6 +
    stats_completas['avg_goals_for'] * 0.3 -
    stats_completas['avg_goals_against'] * 0.2
)

ranking = stats_completas.sort_values('power_score', ascending=False)
ranking.head(20)


Unnamed: 0_level_0,avg_goals_for,avg_goals_against,win_rate,power_score
team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Germany,2.378641,0.910194,0.65534,0.924757
Spain,2.18599,0.806763,0.635266,0.875604
Iran,2.14,0.693333,0.62,0.875333
Netherlands,2.216625,0.853904,0.617128,0.864484
New Zealand,2.450549,1.076923,0.527473,0.836264
England,2.071053,0.697368,0.589474,0.835526
Australia,2.215859,0.837004,0.559471,0.83304
Brazil,2.130342,0.886752,0.604701,0.824573
Japan,2.103053,0.793893,0.583969,0.822519
Argentina,1.987152,0.91863,0.593148,0.768308


In [24]:
stats_completas.columns
ranking.head(10)


Unnamed: 0_level_0,avg_goals_for,avg_goals_against,win_rate,power_score
team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Germany,2.378641,0.910194,0.65534,0.924757
Spain,2.18599,0.806763,0.635266,0.875604
Iran,2.14,0.693333,0.62,0.875333
Netherlands,2.216625,0.853904,0.617128,0.864484
New Zealand,2.450549,1.076923,0.527473,0.836264
England,2.071053,0.697368,0.589474,0.835526
Australia,2.215859,0.837004,0.559471,0.83304
Brazil,2.130342,0.886752,0.604701,0.824573
Japan,2.103053,0.793893,0.583969,0.822519
Argentina,1.987152,0.91863,0.593148,0.768308


## Construcción del dataset de partidos para el modelo

En esta sección construyo un dataset a nivel partido, donde cada fila representa un encuentro oficial entre dos selecciones.  
A cada partido le agrego el `power_score` de la selección local y de la visitante, y creo una variable objetivo que indica si la selección local ganó o no. Este dataset lo voy a usar más adelante para entrenar modelos de predicción de resultados.


In [27]:
# Me quedo solo con el power_score de cada selección
team_power = stats_completas[['power_score']].copy()
team_power.head()


Unnamed: 0_level_0,power_score
team,Unnamed: 1_level_1
Afghanistan,-0.183929
Albania,0.063035
Algeria,0.568634
American Samoa,-1.013333
Andorra,-0.400617


In [28]:
# Construyo un dataset de partidos con las columnas principales
matches = df_serio[['date', 'home_team', 'away_team', 'home_score', 'away_score', 'tournament']].copy()

matches.head()


Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament
478,1916-07-02,Chile,Uruguay,0.0,4.0,Copa América
480,1916-07-06,Argentina,Chile,6.0,1.0,Copa América
481,1916-07-08,Brazil,Chile,1.0,1.0,Copa América
482,1916-07-10,Argentina,Brazil,1.0,1.0,Copa América
484,1916-07-12,Brazil,Uruguay,1.0,2.0,Copa América


In [29]:
# Creo la variable objetivo: si ganó el equipo local
matches['home_win'] = (matches['home_score'] > matches['away_score']).astype(int)

# También podría guardar si fue empate, por si después quiero un modelo de 3 clases
matches['draw'] = (matches['home_score'] == matches['away_score']).astype(int)

matches[['home_score', 'away_score', 'home_win', 'draw']].head(10)


Unnamed: 0,home_score,away_score,home_win,draw
478,0.0,4.0,0,0
480,6.0,1.0,1,0
481,1.0,1.0,0,1
482,1.0,1.0,0,1
484,1.0,2.0,0,0
486,0.0,0.0,0,1
514,4.0,0.0,1,0
515,4.0,2.0,1,0
516,1.0,0.0,1,0
519,4.0,0.0,1,0


In [30]:
# Agrego el power_score de la selección local
matches = matches.merge(
    team_power,
    left_on='home_team',
    right_index=True,
    how='left'
).rename(columns={'power_score': 'home_power'})

# Agrego el power_score de la selección visitante
matches = matches.merge(
    team_power,
    left_on='away_team',
    right_index=True,
    how='left'
).rename(columns={'power_score': 'away_power'})

matches.head()


Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,home_win,draw,home_power,away_power
478,1916-07-02,Chile,Uruguay,0.0,4.0,Copa América,0,0,0.335802,0.576356
480,1916-07-06,Argentina,Chile,6.0,1.0,Copa América,1,0,0.768308,0.335802
481,1916-07-08,Brazil,Chile,1.0,1.0,Copa América,0,1,0.824573,0.335802
482,1916-07-10,Argentina,Brazil,1.0,1.0,Copa América,0,1,0.768308,0.824573
484,1916-07-12,Brazil,Uruguay,1.0,2.0,Copa América,0,0,0.824573,0.576356


In [31]:
matches[['home_power', 'away_power']].isna().sum()


home_power    0
away_power    0
dtype: int64

In [32]:
model_data = matches[['home_power', 'away_power', 'home_win']].copy()
model_data.head()


Unnamed: 0,home_power,away_power,home_win
478,0.335802,0.576356,0
480,0.768308,0.335802,1
481,0.824573,0.335802,0
482,0.768308,0.824573,0
484,0.824573,0.576356,0


In [33]:
from sklearn.model_selection import train_test_split

X = model_data[['home_power', 'away_power']]
y = model_data['home_win']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [34]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
model.fit(X_train, y_train)


In [35]:
from sklearn.metrics import accuracy_score

y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

accuracy


0.7128611608284326

In [36]:
import numpy as np

coef = model.coef_[0]
features = ['home_power', 'away_power']

for f, c in zip(features, coef):
    print(f"{f}: {c:.4f}")


home_power: 2.9566
away_power: -3.1549


In [37]:
matches['power_diff'] = matches['home_power'] - matches['away_power']
matches[['home_power', 'away_power', 'power_diff']].head()


Unnamed: 0,home_power,away_power,power_diff
478,0.335802,0.576356,-0.240553
480,0.768308,0.335802,0.432506
481,0.824573,0.335802,0.48877
482,0.768308,0.824573,-0.056264
484,0.824573,0.576356,0.248217


In [38]:
tournament_weight = {
    'FIFA World Cup': 3.0,
    'Copa América': 2.5,
    'UEFA Euro': 2.5,
    'FIFA World Cup qualification': 2.0,
    'UEFA Euro qualification': 2.0,
    'CONCACAF Nations League': 1.5,
    'UEFA Nations League': 1.5,
}

# Le pongo 1.0 a torneos no listados
matches['tournament_importance'] = matches['tournament'].map(tournament_weight).fillna(1.0)


In [39]:
model_data = matches[[
    'home_power',
    'away_power',
    'power_diff',
    'tournament_importance',
    'home_win'
]].copy()

model_data.head()


Unnamed: 0,home_power,away_power,power_diff,tournament_importance,home_win
478,0.335802,0.576356,-0.240553,2.5,0
480,0.768308,0.335802,0.432506,2.5,1
481,0.824573,0.335802,0.48877,2.5,0
482,0.768308,0.824573,-0.056264,2.5,0
484,0.824573,0.576356,0.248217,2.5,0


In [40]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

X = model_data[['home_power', 'away_power', 'power_diff', 'tournament_importance']]
y = model_data['home_win']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
accuracy


0.7138839171567374

In [41]:
# Ordeno todo por fecha (la convierto a datetime por las dudas)
df_serio['date'] = pd.to_datetime(df_serio['date'])
df_serio = df_serio.sort_values('date')


In [42]:
local = df_serio[['date', 'home_team', 'resultado_local']].rename(columns={
    'home_team': 'team',
    'resultado_local': 'result'
})

visit = df_serio[['date', 'away_team', 'resultado_visit']].rename(columns={
    'away_team': 'team',
    'resultado_visit': 'result'
})

df_team_results = pd.concat([local, visit], ignore_index=True)
df_team_results = df_team_results.sort_values(['team','date'])
df_team_results.head()


Unnamed: 0,date,team,result
21685,1975-04-02,Afghanistan,loss
21689,1975-04-04,Afghanistan,loss
21694,1975-04-06,Afghanistan,loss
21696,1975-04-10,Afghanistan,loss
21697,1975-04-12,Afghanistan,draw


In [43]:
df_team_results['points'] = df_team_results['result'].map({
    'win': 3,
    'draw': 1,
    'loss': 0
})


In [44]:
df_team_results['form_5'] = (
    df_team_results.groupby('team')['points']
    .rolling(window=5, min_periods=1)
    .mean()
    .reset_index(level=0, drop=True)
)


In [46]:
matches['date'] = pd.to_datetime(matches['date'])


In [47]:
# Agrego form_5 al equipo local
matches = matches.merge(
    df_team_results[['team','date','form_5']],
    left_on=['home_team','date'],
    right_on=['team','date'],
    how='left'
).rename(columns={'form_5': 'home_form_5'})

# Agrego form_5 al visitante
matches = matches.merge(
    df_team_results[['team','date','form_5']],
    left_on=['away_team','date'],
    right_on=['team','date'],
    how='left'
).rename(columns={'form_5': 'away_form_5'})


In [48]:
matches = matches.drop(columns=['team_x','team_y'], errors='ignore')


In [49]:
model_data = matches[[
    'home_power',
    'away_power',
    'power_diff',
    'tournament_importance',
    'home_form_5',
    'away_form_5',
    'home_win'
]].dropna()

# Train / test + modelo como ya tenías


In [50]:
X = model_data.drop(columns=['home_win'])
y = model_data['home_win']


In [51]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [52]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

model = LogisticRegression(max_iter=2000)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

accuracy


0.8056618209640398

## Construcción del dataset moderno (2002–2025)
En esta parte me quedo únicamente con los partidos oficiales desde 2002, que representan el fútbol moderno. De esta forma reduzco ruido, elimino datos antiguos que no son representativos, y preparo un dataset más realista para predecir el Mundial 2026.


In [54]:
df_moderno = df_serio[df_serio['date'] >= '2002-01-01'].copy()
df_moderno.shape


(11182, 15)

In [55]:
df_moderno['tournament'].value_counts().head(20)


tournament
FIFA World Cup qualification            4984
UEFA Euro qualification                 1532
African Cup of Nations qualification    1253
UEFA Nations League                      658
AFC Asian Cup qualification              470
African Cup of Nations                   441
CONCACAF Nations League                  422
FIFA World Cup                           384
Gold Cup                                 340
UEFA Euro                                246
AFC Asian Cup                            230
Copa América                             222
Name: count, dtype: int64

In [56]:
# Local
local = df_moderno[['date', 'home_team', 'home_score', 'away_score']].rename(columns={
    'home_team': 'team',
    'home_score': 'goals_for',
    'away_score': 'goals_against'
})

# Visitante
visit = df_moderno[['date', 'away_team', 'home_score', 'away_score']].rename(columns={
    'away_team': 'team',
    'away_score': 'goals_for',
    'home_score': 'goals_against'
})

df_equipos_moderno = pd.concat([local, visit], ignore_index=True)


# 4. Estadísticas modernas por selección

En esta sección calculo: 
- promedio de goles a favor
- promedio de goles en contra
- win rate 
- power_score_moderno
Los guardo en `stats_moderno`.


In [57]:
stats_moderno = df_equipos_moderno.groupby('team').agg({
    'goals_for': 'mean',
    'goals_against': 'mean'
}).rename(columns={
    'goals_for': 'avg_goals_for',
    'goals_against': 'avg_goals_against'
})


In [58]:
# resultados locales
df_moderno['resultado_local'] = df_moderno.apply(
    lambda x: 'win' if x['home_score'] > x['away_score']
              else 'loss' if x['home_score'] < x['away_score']
              else 'draw', axis=1
)

# resultados visitantes
df_moderno['resultado_visit'] = df_moderno.apply(
    lambda x: 'win' if x['away_score'] > x['home_score']
              else 'loss' if x['away_score'] < x['home_score']
              else 'draw', axis=1
)

# unifico
local_r = df_moderno[['home_team', 'resultado_local']].rename(columns={'home_team':'team', 'resultado_local':'result'})
visit_r = df_moderno[['away_team', 'resultado_visit']].rename(columns={'away_team':'team', 'resultado_visit':'result'})

df_resultados_mod = pd.concat([local_r, visit_r], ignore_index=True)

win_rate_moderno = df_resultados_mod.groupby('team')['result'].apply(lambda x: (x=='win').mean())
win_rate_moderno = win_rate_moderno.to_frame(name='win_rate')


In [59]:
stats_moderno = stats_moderno.join(win_rate_moderno, how='left')
stats_moderno.head()


Unnamed: 0_level_0,avg_goals_for,avg_goals_against,win_rate
team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Afghanistan,0.777778,2.2,0.2
Albania,1.014085,1.239437,0.330986
Algeria,1.74359,0.974359,0.519231
American Samoa,0.909091,3.909091,0.272727
Andorra,0.302817,2.521127,0.049296


In [61]:
stats_moderno.shape
stats_moderno.head(10)


Unnamed: 0_level_0,avg_goals_for,avg_goals_against,win_rate
team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Afghanistan,0.777778,2.2,0.2
Albania,1.014085,1.239437,0.330986
Algeria,1.74359,0.974359,0.519231
American Samoa,0.909091,3.909091,0.272727
Andorra,0.302817,2.521127,0.049296
Angola,1.233871,1.0,0.370968
Anguilla,0.25,3.78125,0.03125
Antigua and Barbuda,1.461538,1.980769,0.365385
Argentina,1.688525,0.803279,0.579235
Armenia,1.042857,1.878571,0.257143


In [62]:
stats_moderno.shape

(217, 3)

In [63]:
stats_moderno['power_score_moderno'] = (
    stats_moderno['win_rate'] * 0.6 +
    stats_moderno['avg_goals_for'] * 0.3 -
    stats_moderno['avg_goals_against'] * 0.2
)

stats_moderno.sort_values('power_score_moderno', ascending=False).head(15)


Unnamed: 0_level_0,avg_goals_for,avg_goals_against,win_rate,power_score_moderno
team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
New Zealand,2.625,0.65625,0.65625,1.05
Germany,2.497268,0.863388,0.677596,0.98306
Spain,2.273632,0.676617,0.706468,0.970647
Japan,2.222222,0.703704,0.654321,0.918519
England,2.198953,0.643979,0.643979,0.917277
Netherlands,2.253968,0.84127,0.661376,0.904762
Vanuatu,3.0,1.857143,0.571429,0.871429
Portugal,2.081218,0.80203,0.614213,0.832487
Iran,1.981013,0.677215,0.620253,0.831013
United States,1.995192,0.865385,0.629808,0.803365


In [64]:
top15 = stats_moderno.sort_values('power_score_moderno', ascending=False).head(15)
top15


Unnamed: 0_level_0,avg_goals_for,avg_goals_against,win_rate,power_score_moderno
team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
New Zealand,2.625,0.65625,0.65625,1.05
Germany,2.497268,0.863388,0.677596,0.98306
Spain,2.273632,0.676617,0.706468,0.970647
Japan,2.222222,0.703704,0.654321,0.918519
England,2.198953,0.643979,0.643979,0.917277
Netherlands,2.253968,0.84127,0.661376,0.904762
Vanuatu,3.0,1.857143,0.571429,0.871429
Portugal,2.081218,0.80203,0.614213,0.832487
Iran,1.981013,0.677215,0.620253,0.831013
United States,1.995192,0.865385,0.629808,0.803365


# 5. Cálculo de Elo custom (v1)

Primero implemento un sistema Elo simple que ajusta el rating según victoria/empate/derrota.


In [66]:
import numpy as np

def compute_elo(df, k=30, home_adv=100, initial_rating=1500):
    """
    Calculo un rating Elo simple para cada selección
    a partir de un dataframe de partidos.
    
    df debe tener columnas:
    - date (datetime)
    - home_team
    - away_team
    - home_score
    - away_score
    """
    # Me aseguro de que esté ordenado por fecha
    df = df.sort_values('date').copy()
    
    ratings = {}  # diccionario: team -> rating actual
    
    for _, row in df.iterrows():
        home = row['home_team']
        away = row['away_team']
        hs = row['home_score']
        as_ = row['away_score']
        
        # Inicializo ratings si no existen
        if home not in ratings:
            ratings[home] = initial_rating
        if away not in ratings:
            ratings[away] = initial_rating
        
        Ra = ratings[home]
        Rb = ratings[away]
        
        # Ventaja de local
        dr = (Ra + home_adv) - Rb
        
        # Resultado esperado del local según Elo
        Ea = 1 / (1 + 10 ** (-dr / 400))
        
        # Resultado real
        if hs > as_:
            Sa = 1.0  # gana local
            Sb = 0.0
        elif hs < as_:
            Sa = 0.0  # gana visitante
            Sb = 1.0
        else:
            Sa = 0.5  # empate
            Sb = 0.5
        
        # Actualizo ratings
        Ra_new = Ra + k * (Sa - Ea)
        Rb_new = Rb + k * (Sb - (1 - Ea))
        
        ratings[home] = Ra_new
        ratings[away] = Rb_new
    
    # Lo convierto a DataFrame
    elo_df = pd.DataFrame.from_dict(ratings, orient='index', columns=['elo_custom'])
    return elo_df



In [67]:
elo_moderno = compute_elo(df_moderno)
elo_moderno.head()
elo_moderno.shape


(217, 1)

In [69]:
elo_moderno.head(10)

Unnamed: 0,elo_custom
Martinique,1497.793622
Costa Rica,1699.624425
Haiti,1621.384505
Canada,1710.922914
United States,1728.199605
South Korea,1798.866544
Mali,1660.152775
Liberia,1418.959788
El Salvador,1545.065828
Mexico,1823.243629


In [70]:
elo_moderno.sort_values('elo_custom', ascending=False).head(20)


Unnamed: 0,elo_custom
Spain,1943.118138
France,1890.023272
England,1848.223528
Japan,1837.11191
Mexico,1823.243629
Iran,1818.639434
Portugal,1817.790996
Argentina,1810.413465
South Korea,1798.866544
Senegal,1788.968528


# 6. Elo custom mejorado (v2)

Agrego: 
- importancia de torneos
- recencia de partidos
  
Esto genera `elo_custom_v2`, una métrica más alineada con el rendimiento moderno.


In [71]:
import numpy as np

def compute_elo_v2(df, k_base=20, home_adv=80, initial_rating=1500):
    """
    Calculo un rating Elo mejorado para cada selección a partir
    de partidos oficiales modernos.

    Ajusto el factor K según:
    - importancia del torneo
    - recencia del partido
    """
    # Me aseguro de que la fecha sea datetime y esté ordenada
    df = df.copy()
    df['date'] = pd.to_datetime(df['date'])
    df = df.sort_values('date')
    
    # Peso por torneo (puedo ajustar estos valores si quiero afinar)
    tournament_weight = {
        'FIFA World Cup': 3.0,
        'FIFA World Cup qualification': 2.5,
        'UEFA Euro': 2.5,
        'UEFA Euro qualification': 2.0,
        'Copa América': 2.5,
        'African Cup of Nations': 2.0,
        'African Cup of Nations qualification': 1.8,
        'AFC Asian Cup': 2.0,
        'AFC Asian Cup qualification': 1.8,
        'Gold Cup': 2.0,
        'CONCACAF Nations League': 1.6,
        'UEFA Nations League': 1.6,
    }
    
    ratings = {}
    
    for _, row in df.iterrows():
        home = row['home_team']
        away = row['away_team']
        hs = row['home_score']
        as_ = row['away_score']
        tourn = row['tournament']
        year = row['date'].year
        
        # Inicializo ratings si no existen
        if home not in ratings:
            ratings[home] = initial_rating
        if away not in ratings:
            ratings[away] = initial_rating
        
        Ra = ratings[home]
        Rb = ratings[away]
        
        # Ventaja de local
        dr = (Ra + home_adv) - Rb
        
        # Resultado esperado del local
        Ea = 1 / (1 + 10 ** (-dr / 400))
        
        # Resultado real
        if hs > as_:
            Sa = 1.0
            Sb = 0.0
        elif hs < as_:
            Sa = 0.0
            Sb = 1.0
        else:
            Sa = 0.5
            Sb = 0.5
        
        # Peso por importancia del torneo
        w_tourn = tournament_weight.get(tourn, 1.0)
        
        # Peso por recencia
        if year >= 2018:
            w_time = 1.6   # últimos ciclos pesan más
        elif year >= 2010:
            w_time = 1.3
        else:
            w_time = 1.0   # principio del período moderno
        
        # Factor K efectivo
        K_eff = k_base * w_tourn * w_time
        
        # Actualizo ratings
        Ra_new = Ra + K_eff * (Sa - Ea)
        Rb_new = Rb + K_eff * (Sb - (1 - Ea))
        
        ratings[home] = Ra_new
        ratings[away] = Rb_new
    
    elo_df = pd.DataFrame.from_dict(ratings, orient='index', columns=['elo_custom_v2'])
    return elo_df


In [72]:
elo_moderno_v2 = compute_elo_v2(df_moderno)
elo_moderno_v2.shape
elo_moderno_v2.sort_values('elo_custom_v2', ascending=False).head(15)


Unnamed: 0,elo_custom_v2
Spain,2058.199129
England,1956.11801
France,1955.955115
Mexico,1921.361776
Argentina,1919.217598
Japan,1897.118203
Australia,1885.575201
South Korea,1866.317772
Ecuador,1863.893643
Iran,1860.864348


In [73]:
# Uno el nuevo Elo a stats_moderno
stats_moderno = stats_moderno.join(elo_moderno_v2, how='left')

# Si alguna selección no tiene Elo (rara), relleno con la media
stats_moderno['elo_custom_v2'] = stats_moderno['elo_custom_v2'].fillna(
    stats_moderno['elo_custom_v2'].mean()
)

# Normalizo Elo 2.0 a 0–1
elo_norm_v2 = stats_moderno['elo_custom_v2'] / stats_moderno['elo_custom_v2'].max()

# Nuevo power score híbrido: mezcla de mi métrica y el Elo mejorado
stats_moderno['power_score_elohybrid_v2'] = (
    stats_moderno['power_score_moderno'] * 0.5 +
    elo_norm_v2 * 0.5
)

stats_moderno.sort_values('power_score_elohybrid_v2', ascending=False).head(20)


Unnamed: 0_level_0,avg_goals_for,avg_goals_against,win_rate,power_score_moderno,elo_custom_v2,power_score_elohybrid_v2
team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Spain,2.273632,0.676617,0.706468,0.970647,2058.199129,0.985323
New Zealand,2.625,0.65625,0.65625,1.05,1797.54576,0.961679
Germany,2.497268,0.863388,0.677596,0.98306,1827.648985,0.935522
England,2.198953,0.643979,0.643979,0.917277,1956.11801,0.93384
Japan,2.222222,0.703704,0.654321,0.918519,1897.118203,0.920128
Netherlands,2.253968,0.84127,0.661376,0.904762,1815.262401,0.893364
Iran,1.981013,0.677215,0.620253,0.831013,1860.864348,0.867568
France,1.864583,0.739583,0.609375,0.777083,1955.955115,0.863703
Portugal,2.081218,0.80203,0.614213,0.832487,1825.859412,0.859801
Mexico,1.872727,0.790909,0.618182,0.774545,1921.361776,0.854031


In [74]:
elo_moderno_v2.sort_values('elo_custom_v2', ascending=False).head(15)


Unnamed: 0,elo_custom_v2
Spain,2058.199129
England,1956.11801
France,1955.955115
Mexico,1921.361776
Argentina,1919.217598
Japan,1897.118203
Australia,1885.575201
South Korea,1866.317772
Ecuador,1863.893643
Iran,1860.864348


In [75]:
stats_moderno.sort_values('power_score_elohybrid_v2', ascending=False).head(20)


Unnamed: 0_level_0,avg_goals_for,avg_goals_against,win_rate,power_score_moderno,elo_custom_v2,power_score_elohybrid_v2
team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Spain,2.273632,0.676617,0.706468,0.970647,2058.199129,0.985323
New Zealand,2.625,0.65625,0.65625,1.05,1797.54576,0.961679
Germany,2.497268,0.863388,0.677596,0.98306,1827.648985,0.935522
England,2.198953,0.643979,0.643979,0.917277,1956.11801,0.93384
Japan,2.222222,0.703704,0.654321,0.918519,1897.118203,0.920128
Netherlands,2.253968,0.84127,0.661376,0.904762,1815.262401,0.893364
Iran,1.981013,0.677215,0.620253,0.831013,1860.864348,0.867568
France,1.864583,0.739583,0.609375,0.777083,1955.955115,0.863703
Portugal,2.081218,0.80203,0.614213,0.832487,1825.859412,0.859801
Mexico,1.872727,0.790909,0.618182,0.774545,1921.361776,0.854031


# 7. Ajuste por confederación

Asigno a cada selección su confederación (UEFA, CONMEBOL, AFC, etc.) y aplico un coeficiente de fuerza. Esto corrige inflados regionales (OFC, AFC) y fortalece confeds competitivas.


In [76]:
confeds = {
    # UEFA
    'Spain': 'UEFA', 'England': 'UEFA', 'France': 'UEFA', 'Germany': 'UEFA', 'Italy': 'UEFA',
    'Portugal': 'UEFA', 'Netherlands': 'UEFA', 'Belgium': 'UEFA', 'Croatia': 'UEFA',
    'Switzerland': 'UEFA', 'Denmark': 'UEFA', 'Norway': 'UEFA', 'Sweden': 'UEFA',
    'Poland': 'UEFA', 'Czech Republic': 'UEFA', 'Austria': 'UEFA', 'Hungary': 'UEFA',
    'Serbia': 'UEFA', 'Scotland': 'UEFA', 'Wales': 'UEFA', 'Ukraine': 'UEFA',
    'Russia': 'UEFA', 'Turkey': 'UEFA', 'Bosnia and Herzegovina': 'UEFA',
    'Romania': 'UEFA', 'Slovakia': 'UEFA', 'Slovenia': 'UEFA', 'Greece': 'UEFA',
    'Finland': 'UEFA', 'Ireland': 'UEFA', 'Northern Ireland': 'UEFA',
    'Iceland': 'UEFA', 'Albania': 'UEFA', 'Armenia': 'UEFA', 'Georgia': 'UEFA',
    'Kazakhstan': 'UEFA', 'Azerbaijan': 'UEFA', 'Cyprus': 'UEFA', 'Malta': 'UEFA',
    'Luxembourg': 'UEFA', 'Montenegro': 'UEFA', 'Bulgaria': 'UEFA',
    
    # CONMEBOL
    'Argentina': 'CONMEBOL', 'Brazil': 'CONMEBOL', 'Uruguay': 'CONMEBOL',
    'Colombia': 'CONMEBOL', 'Chile': 'CONMEBOL', 'Ecuador': 'CONMEBOL',
    'Paraguay': 'CONMEBOL', 'Peru': 'CONMEBOL', 'Venezuela': 'CONMEBOL',
    'Bolivia': 'CONMEBOL',
    
    # CONCACAF
    'Mexico': 'CONCACAF', 'United States': 'CONCACAF', 'Canada': 'CONCACAF',
    'Costa Rica': 'CONCACAF', 'Honduras': 'CONCACAF', 'Panama': 'CONCACAF',
    'Jamaica': 'CONCACAF', 'El Salvador': 'CONCACAF', 'Guatemala': 'CONCACAF',
    'Haiti': 'CONCACAF', 'Cuba': 'CONCACAF', 'Trinidad and Tobago': 'CONCACAF',
    'Curacao': 'CONCACAF', 'Martinique': 'CONCACAF', 'Bermuda': 'CONCACAF',
    'Saint Lucia': 'CONCACAF', 'Nicaragua': 'CONCACAF',
    
    # CAF (África)
    'Senegal': 'CAF', 'Morocco': 'CAF', 'Egypt': 'CAF', 'Ivory Coast': 'CAF',
    'Mali': 'CAF', 'Ghana': 'CAF', 'Nigeria': 'CAF', 'South Africa': 'CAF',
    'Tunisia': 'CAF', 'Cameroon': 'CAF', 'Algeria': 'CAF',
    'Burkina Faso': 'CAF', 'Guinea': 'CAF', 'Tanzania': 'CAF',
    
    # AFC (Asia)
    'Japan': 'AFC', 'South Korea': 'AFC', 'Iran': 'AFC', 'Saudi Arabia': 'AFC',
    'Australia': 'AFC', 'Qatar': 'AFC', 'Uzbekistan': 'AFC', 'United Arab Emirates': 'AFC',
    'Iraq': 'AFC', 'Jordan': 'AFC', 'Oman': 'AFC', 'Syria': 'AFC', 'China PR': 'AFC',
    'Kuwait': 'AFC', 'Bahrain': 'AFC', 'Vietnam': 'AFC',
    
    # OFC (Oceanía)
    'New Zealand': 'OFC', 'Vanuatu': 'OFC', 'Fiji': 'OFC', 'Solomon Islands': 'OFC',
    'Papua New Guinea': 'OFC', 'Tahiti': 'OFC'
}


In [77]:
confed_strength = {
    'UEFA': 1.00,
    'CONMEBOL': 1.00,
    'CAF': 0.90,
    'AFC': 0.85,
    'CONCACAF': 0.85,
    'OFC': 0.60
}


# 8. Power Score v3 (métrica moderna definitiva)

Combino: 
- power_score_moderno
- Elo ajustado por torneo+recencia
- fuerza de confederación

Este es el rating final que usaré para el modelo y la simulación del Mundial 2026.

In [79]:
# Asigno confederación
stats_moderno['confed'] = stats_moderno.index.map(lambda x: confeds.get(x, 'OTHER'))

# Asigno peso por confederación
stats_moderno['confed_strength'] = stats_moderno['confed'].map(
    lambda c: confed_strength.get(c, 0.80)  
)


In [80]:
stats_moderno['elo_confed_adjusted'] = (
    stats_moderno['elo_custom_v2'] * stats_moderno['confed_strength']
)


In [81]:
elo_norm_adj = stats_moderno['elo_confed_adjusted'] / stats_moderno['elo_confed_adjusted'].max()


# 9. Ranking final de selecciones (Top 20)

Muestro las selecciones mejor rankeadas según `power_score_v3`.


In [82]:
stats_moderno['power_score_v3'] = (
    stats_moderno['power_score_moderno'] * 0.4 +
    elo_norm_adj * 0.6
)

stats_moderno.sort_values('power_score_v3', ascending=False).head(20)


Unnamed: 0_level_0,avg_goals_for,avg_goals_against,win_rate,power_score_moderno,elo_custom_v2,power_score_elohybrid_v2,confed,confed_strength,elo_confed_adjusted,power_score_v3
team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Spain,2.273632,0.676617,0.706468,0.970647,2058.199129,0.985323,UEFA,1.0,2058.199129,0.988259
England,2.198953,0.643979,0.643979,0.917277,1956.11801,0.93384,UEFA,1.0,1956.11801,0.937153
Germany,2.497268,0.863388,0.677596,0.98306,1827.648985,0.935522,UEFA,1.0,1827.648985,0.926015
Netherlands,2.253968,0.84127,0.661376,0.904762,1815.262401,0.893364,UEFA,1.0,1815.262401,0.891085
France,1.864583,0.739583,0.609375,0.777083,1955.955115,0.863703,UEFA,1.0,1955.955115,0.881027
Portugal,2.081218,0.80203,0.614213,0.832487,1825.859412,0.859801,UEFA,1.0,1825.859412,0.865264
Brazil,1.89881,0.72619,0.589286,0.777976,1809.820228,0.828649,CONMEBOL,1.0,1809.820228,0.838784
Japan,2.222222,0.703704,0.654321,0.918519,1897.118203,0.920128,AFC,0.85,1612.550472,0.837493
Argentina,1.688525,0.803279,0.579235,0.693443,1919.217598,0.812958,CONMEBOL,1.0,1919.217598,0.836862
Belgium,2.137143,0.977143,0.594286,0.802286,1735.116291,0.822656,UEFA,1.0,1735.116291,0.82673


In [83]:
stats_moderno.sort_values('power_score_v3', ascending=False).head(20)


Unnamed: 0_level_0,avg_goals_for,avg_goals_against,win_rate,power_score_moderno,elo_custom_v2,power_score_elohybrid_v2,confed,confed_strength,elo_confed_adjusted,power_score_v3
team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Spain,2.273632,0.676617,0.706468,0.970647,2058.199129,0.985323,UEFA,1.0,2058.199129,0.988259
England,2.198953,0.643979,0.643979,0.917277,1956.11801,0.93384,UEFA,1.0,1956.11801,0.937153
Germany,2.497268,0.863388,0.677596,0.98306,1827.648985,0.935522,UEFA,1.0,1827.648985,0.926015
Netherlands,2.253968,0.84127,0.661376,0.904762,1815.262401,0.893364,UEFA,1.0,1815.262401,0.891085
France,1.864583,0.739583,0.609375,0.777083,1955.955115,0.863703,UEFA,1.0,1955.955115,0.881027
Portugal,2.081218,0.80203,0.614213,0.832487,1825.859412,0.859801,UEFA,1.0,1825.859412,0.865264
Brazil,1.89881,0.72619,0.589286,0.777976,1809.820228,0.828649,CONMEBOL,1.0,1809.820228,0.838784
Japan,2.222222,0.703704,0.654321,0.918519,1897.118203,0.920128,AFC,0.85,1612.550472,0.837493
Argentina,1.688525,0.803279,0.579235,0.693443,1919.217598,0.812958,CONMEBOL,1.0,1919.217598,0.836862
Belgium,2.137143,0.977143,0.594286,0.802286,1735.116291,0.822656,UEFA,1.0,1735.116291,0.82673


In [95]:
import os

os.makedirs("data/processed", exist_ok=True)
print("Carpeta creada (o ya existente).")


Carpeta creada (o ya existente).


In [96]:
# Guardo el dataset completo de ratings modernos por selección
stats_moderno.to_csv("data/processed/team_ratings_modern.csv", index=True)


# Confirmo
print("Archivo guardado correctamente:", "data/processed/team_ratings_modern.csv")


Archivo guardado correctamente: data/processed/team_ratings_modern.csv


In [97]:
import os
os.getcwd()


'C:\\Users\\lmosquen\\Desktop\\Python\\Proyecto mundial\\worldcup-2026-prediction-main\\worldcup-2026-prediction-main\\notebooks'

In [98]:
import os

# Me aseguro de que exista la carpeta correcta fuera de notebooks
os.makedirs("../data/processed", exist_ok=True)

# Guardo el CSV en la ubicación "oficial" del proyecto
stats_moderno.to_csv("../data/processed/team_ratings_modern.csv", index=True)

print("Archivo guardado correctamente en ../data/processed/team_ratings_modern.csv")


Archivo guardado correctamente en ../data/processed/team_ratings_modern.csv


In [99]:
# Me quedo sólo con la métrica final por selección
team_power = stats_moderno[['power_score_v3']].copy()
team_power = team_power.sort_values('power_score_v3', ascending=False)

# Lo renombro para que quede más prolijo en el CSV
team_power.reset_index(inplace=True)
team_power.rename(columns={'team': 'team', 'power_score_v3': 'power_score_v3'}, inplace=True)

# Creo la carpeta (por las dudas) y guardo
import os
os.makedirs("../data/processed", exist_ok=True)

team_power.to_csv("../data/processed/team_power_score_v3.csv", index=False)

print("Archivo guardado correctamente en ../data/processed/team_power_score_v3.csv")
team_power.head()


Archivo guardado correctamente en ../data/processed/team_power_score_v3.csv


Unnamed: 0,team,power_score_v3
0,Spain,0.988259
1,England,0.937153
2,Germany,0.926015
3,Netherlands,0.891085
4,France,0.881027
