In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
import seaborn as sns
import joblib
import lightgbm as lgb

In [2]:
# Apply seaborn's default theme to the figures drawn later in the notebook.
sns.set_theme()

In [3]:
# Load the persisted model. It is only used through predict/predict_proba below,
# so it is presumably a fitted LightGBM classifier — confirm against the training notebook.
# NOTE(review): joblib.load unpickles the file and can execute arbitrary code;
# only load model files that come from a trusted source.
model = joblib.load('lgbm_model.joblib')

In [4]:
def aggregate_teams(df, min_players=23):
    """Aggregate player-level rows into one feature row per (year, team).

    Parameters
    ----------
    df : pandas.DataFrame
        Player table. Must contain ``year``, ``overall``, ``potential``,
        ``value_eur``, ``wage_eur``, ``age``, ``height_cm``, ``weight_kg``,
        ``nationality_name`` and ``nation_team_id``. Every column whose name
        starts with ``attacking_``, ``skill_``, ``movement_``, ``power_``,
        ``mentality_``, ``defending_`` or ``goalkeeping_`` is used as well,
        except ``mentality_composure`` and ``goalkeeping_speed``.
    min_players : int, default 23
        For nationalities that never have a ``nation_team_id``, a (year,
        nationality) group is kept only if it has at least this many players,
        and only its ``min_players`` highest-``overall`` players are used.

    Returns
    -------
    pandas.DataFrame
        One row per (year, team) with ``year``, ``team`` (string dtype) and
        ``<feature>_mean`` / ``_min`` / ``_max`` columns. For ``goalkeeping_*``
        features only the ``_max`` column is kept.
    """
    group_keys = ['year', 'nationality_name']
    stat_prefixes = ('attacking_', 'skill_', 'movement_', 'power_',
                     'mentality_', 'defending_', 'goalkeeping_')
    # Filtering instead of list.remove() so a dataset that lacks one of these
    # columns does not raise ValueError.
    excluded = {'mentality_composure', 'goalkeeping_speed'}

    # Choose columns to keep.
    columns = (['year', 'overall', 'potential', 'value_eur', 'wage_eur', 'age', 'height_cm',
                'weight_kg', 'nationality_name', 'nation_team_id'] +
               [f for f in df.columns if f.startswith(stat_prefixes) and f not in excluded])
    df = df[columns]

    # Players that belong to a defined national team.
    has_nat_team = df['nation_team_id'].notna()
    df = df.drop(columns=['nation_team_id'])
    df_nat_team = df[has_nat_team]

    # Nationalities that never appear with a national team, in any year.
    no_nat_names = (set(df['nationality_name'].unique())
                    .difference(set(df_nat_team['nationality_name'].unique())))

    # For those nationalities build a surrogate squad: keep groups with at least
    # min_players players and take the min_players best by 'overall'.
    # groupby().head() keeps the grouping columns regardless of pandas version
    # (unlike groupby().apply()), and the stable sort makes ties deterministic.
    df_no_nat_team = (
        df[df['nationality_name'].isin(no_nat_names)]
        .groupby(group_keys)
        .filter(lambda g: g['overall'].count() >= min_players)
        .sort_values('overall', ascending=False, kind='mergesort')
        .groupby(group_keys)
        .head(min_players)
    )
    df = pd.concat((df_nat_team, df_no_nat_team), ignore_index=True)

    # Replace every remaining NaN with 0 (this affects all columns, not only
    # value_eur / wage_eur).
    df = df.fillna(0)

    # For each feature, aggregate mean, min, and max per team.
    df_agg = df.groupby(group_keys).agg(['mean', 'min', 'max'])
    df_agg.columns = df_agg.columns.map('_'.join)
    # Goalkeeping attributes are only kept as the squad maximum.
    keep_columns = [f for f in df_agg.columns
                    if not (f.startswith('goalkeeping_') and f.endswith(('mean', 'min')))]
    df_agg = df_agg[keep_columns].reset_index()
    df_agg = df_agg.rename(columns={'nationality_name': 'team'})
    df_agg['team'] = df_agg['team'].astype('string')
    return df_agg

def merge_wc22_teams(res, teams):
    """Attach each side's most recent team features to every match row.

    Parameters
    ----------
    res : pandas.DataFrame
        Matches with ``team1`` and ``team2`` columns.
    teams : pandas.DataFrame
        Per-team features with ``year`` and ``team`` columns; every other
        column is treated as a feature.

    Returns
    -------
    pandas.DataFrame
        One row per row of ``res`` (default integer index), with columns
        ``team1_<feature>...``, ``team2_<feature>...``, ``team1``, ``team2``.
        Features come from each team's row with the largest ``year``.

    Raises
    ------
    KeyError
        If a team in ``res`` has no row in ``teams``.
    """
    val_cols = [c for c in teams.columns if c not in ['year', 'team']]

    # Build the "latest year per team" lookup once instead of re-filtering and
    # re-sorting `teams` for both sides of every match.
    latest = (teams.sort_values('year')
              .drop_duplicates(subset='team', keep='last')
              .set_index('team')[val_cols])

    team1 = list(res['team1'])
    team2 = list(res['team2'])

    known = set(latest.index)
    missing = sorted({t for t in team1 + team2 if t not in known})
    if missing:
        raise KeyError(f'No aggregated features found for team(s): {missing}')

    # Selecting whole rows with .loc keeps each feature column's dtype.
    side1 = latest.loc[team1].add_prefix('team1_').reset_index(drop=True)
    side2 = latest.loc[team2].add_prefix('team2_').reset_index(drop=True)

    merged = pd.concat([side1, side2], axis=1)
    merged['team1'] = team1
    merged['team2'] = team2
    return merged

def transform_wc22(df):
    """Turn a fixture list into a mirrored (team1, team2) table.

    Rows whose ``Team1_Name`` is the placeholder ``'-'`` are dropped. Every
    remaining fixture appears twice in the result: first as listed, then (in
    the second half of the frame) with the two sides swapped. Three team names
    are rewritten: 'IR Iran' -> 'Iran', 'South Korea' -> 'Korea Republic',
    'USA' -> 'United States'.

    Returns a DataFrame with string-dtype columns ``team1`` and ``team2``.
    """
    fixtures = df[df['Team1_Name'] != '-']
    listed_first = fixtures['Team1_Name'].values
    listed_second = fixtures['Team2_Name'].values

    mirrored = pd.DataFrame({
        'team1': np.hstack((listed_first, listed_second)),
        'team2': np.hstack((listed_second, listed_first)),
    })
    for side in ('team1', 'team2'):
        mirrored[side] = mirrored[side].astype('string')

    name_fixes = {'IR Iran': 'Iran', 'South Korea': 'Korea Republic', 'USA': 'United States'}
    return mirrored.replace(name_fixes)

def predict_wc22(model, df, feature_cols):
    """Predict the score of each match from a mirrored match table.

    ``df`` is expected in the layout produced by ``transform_wc22`` followed
    by ``merge_wc22_teams``: the first ``len(df) // 2`` rows are the matches,
    and row ``i + len(df) // 2`` is match ``i`` with the sides swapped. The
    model's prediction for a row is read as the goals of that row's ``team1``.

    Parameters
    ----------
    model
        Object exposing ``predict`` and ``predict_proba``.
    df : pandas.DataFrame
        Mirrored match table containing ``team1``, ``team2`` and
        ``feature_cols``. It is not modified.
    feature_cols : list of str
        Feature columns passed to the model, in order.

    Returns
    -------
    pandas.DataFrame
        One row per match (index taken from the first half of ``df``) with
        ``team1``, ``team2``, ``team1_score_pred``, ``team2_score_pred`` and
        ``pred_proba``, the product of the two sides' top class probabilities.
    """
    n_matches = len(df) // 2
    features = df[feature_cols].values
    y_hat = np.asarray(model.predict(features))
    y_hat_proba_max = np.max(model.predict_proba(features), axis=1)

    as_listed = slice(0, n_matches)
    swapped = slice(n_matches, 2 * n_matches)

    # Work on a copy so the caller's frame gains no prediction columns and no
    # assignment is made into a slice of it.
    out = df.iloc[as_listed][['team1', 'team2']].copy()
    out['team1_score_pred'] = y_hat[as_listed]
    out['team2_score_pred'] = y_hat[swapped].astype(int)
    out['pred_proba'] = y_hat_proba_max[as_listed] * y_hat_proba_max[swapped]
    return out

In [5]:
# Model input columns: for each side (team1_/team2_), the min/mean/max of every
# base attribute followed by two goalkeeping maxima.
# NOTE(review): 'attacking_finishing' is listed twice, so its three aggregate
# columns are passed to the model twice. Kept as-is because the loaded model
# presumably expects exactly this feature layout — confirm before deduplicating.
base_attributes = [
    'overall', 'potential', 'skill_moves', 'attacking_finishing',
    'skill_long_passing', 'movement_sprint_speed', 'movement_agility',
    'value_eur', 'wage_eur', 'attacking_finishing', 'power_stamina',
]
aggregate_suffixes = ('_min', '_mean', '_max')
goalkeeper_columns = ['goalkeeping_positioning_max', 'goalkeeping_reflexes_max']

per_team_columns = [
    attribute + suffix
    for attribute in base_attributes
    for suffix in aggregate_suffixes
] + goalkeeper_columns

x_cols = [
    side + column
    for side in ('team1_', 'team2_')
    for column in per_team_columns
]

In [6]:
from itertools import combinations

# Tournament team list, with three names rewritten to the spellings that
# transform_wc22 also uses.
df_wc_teams = pd.read_csv('Teams.csv')
df_wc_teams = df_wc_teams.replace({'USA': 'United States', 'IR Iran': 'Iran', 'South Korea': 'Korea Republic'})

# Fixture list -> mirrored (team1, team2) table.
df_matches = pd.read_csv('Matches.csv')
res_wc22 = transform_wc22(df_matches)

# low_memory=False parses the file in one pass so each column gets a single
# inferred dtype; pandas' chunked default warns on mixed-type columns.
# The raw player table gets its own name so `df_teams` only ever means the
# aggregated per-team table.
df_players = pd.read_csv('players_all_years.csv', low_memory=False)
# min_players=1: nationalities without a national-team id are represented by
# their single highest-'overall' player.
df_teams = aggregate_teams(df_players, min_players=1)
df_wc22 = merge_wc22_teams(res_wc22, df_teams)

# Every unordered pair of tournament teams; transform_wc22 then adds the
# swapped ordering, so df_wc22_all holds both orderings of each pairing.
df_all_matches = pd.DataFrame(
    list(combinations(df_wc_teams['Team_Name'].values, 2)),
    columns=['Team1_Name', 'Team2_Name'],
)
res_all = transform_wc22(df_all_matches)
df_wc22_all = merge_wc22_teams(res_all, df_teams)

  df_teams = pd.read_csv('players_all_years.csv')


In [43]:
# Show the merged feature table for every pairing of tournament teams (both orderings).
df_wc22_all

Unnamed: 0,team1_overall_mean,team1_overall_min,team1_overall_max,team1_potential_mean,team1_potential_min,team1_potential_max,team1_value_eur_mean,team1_value_eur_min,team1_value_eur_max,team1_wage_eur_mean,...,team2_defending_sliding_tackle_mean,team2_defending_sliding_tackle_min,team2_defending_sliding_tackle_max,team2_goalkeeping_diving_max,team2_goalkeeping_handling_max,team2_goalkeeping_kicking_max,team2_goalkeeping_positioning_max,team2_goalkeeping_reflexes_max,team1,team2
0,68.000000,68,68,77.000000,77,77,1.400000e+06,1400000.0,1400000.0,7000.00000,...,43.043478,9,80,71,79,81,75,73,Qatar,Ecuador
1,68.000000,68,68,77.000000,77,77,1.400000e+06,1400000.0,1400000.0,7000.00000,...,38.000000,38,38,10,10,15,7,14,Qatar,Senegal
2,68.000000,68,68,77.000000,77,77,1.400000e+06,1400000.0,1400000.0,7000.00000,...,53.869565,14,86,80,75,77,77,79,Qatar,Netherlands
3,68.000000,68,68,77.000000,77,77,1.400000e+06,1400000.0,1400000.0,7000.00000,...,55.304348,12,83,83,79,87,80,86,Qatar,England
4,68.000000,68,68,77.000000,77,77,1.400000e+06,1400000.0,1400000.0,7000.00000,...,25.000000,25,25,9,11,9,15,8,Qatar,Iran
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
987,78.217391,75,80,78.217391,75,80,0.000000e+00,0.0,0.0,0.00000,...,55.652174,11,86,84,80,77,83,86,Uruguay,Portugal
988,69.478261,61,78,72.652174,64,81,1.521739e+06,0.0,6500000.0,21521.73913,...,55.652174,11,86,84,80,77,83,86,Korea Republic,Portugal
989,78.217391,75,80,78.217391,75,80,0.000000e+00,0.0,0.0,0.00000,...,80.000000,80,80,11,13,9,9,7,Uruguay,Ghana
990,69.478261,61,78,72.652174,64,81,1.521739e+06,0.0,6500000.0,21521.73913,...,80.000000,80,80,11,13,9,9,7,Korea Republic,Ghana


In [132]:
def get_prob(team1, team2, features):
    """Return the model's class probabilities for both sides of one pairing.

    Reads the notebook-level ``df_wc22_all`` (feature rows for every ordered
    pairing) and ``model``.

    Parameters
    ----------
    team1, team2 : str
        Team names as they appear in ``df_wc22_all``.
    features : list of str
        Feature columns passed to ``model.predict_proba``, in order.

    Returns
    -------
    Result of ``model.predict_proba`` on two rows: row 0 is the
    (team1, team2) ordering, row 1 is the (team2, team1) ordering.

    Raises
    ------
    ValueError
        If either ordering is not present exactly once in ``df_wc22_all``.
        Without this check a missing or duplicated pairing would change which
        rows end up at positions 0 and 1.
    """
    ix1 = (df_wc22_all['team1'] == team1) & (df_wc22_all['team2'] == team2)
    ix2 = (df_wc22_all['team1'] == team2) & (df_wc22_all['team2'] == team1)
    for mask, first, second in ((ix1, team1, team2), (ix2, team2, team1)):
        n_rows = int(mask.sum())
        if n_rows != 1:
            raise ValueError(
                f'Expected exactly one row for {first} vs {second} in df_wc22_all, found {n_rows}'
            )
    x = pd.concat([df_wc22_all[ix1], df_wc22_all[ix2]])[features].values
    return model.predict_proba(x)

def get_wc_probs(team1s, team2s):
    """Compute the joint score distribution for each pairing.

    For every ``(team1, team2)`` pair, the probability of the score ``(i, j)``
    is taken as ``P(team1 -> class i) * P(team2 -> class j)``, i.e. the two
    sides are multiplied as if independent.

    NOTE(review): class index ``i`` is used directly as a goal count; this
    assumes the model's classes are the integers 0..k in order — confirm
    against ``model.classes_``.

    Parameters
    ----------
    team1s, team2s : iterable of str
        Parallel sequences of team names.

    Returns
    -------
    dict
        ``{(team1, team2): (score, most_prob, second_prob)}`` where ``score``
        maps ``(i, j)`` to its probability and ``most_prob`` / ``second_prob``
        are the ``((i, j), probability)`` items with the highest and
        second-highest probability.
    """
    scores = {}
    for (team1, team2) in zip(team1s, team2s):
        prob = get_prob(team1, team2, x_cols)
        prob_team1, prob_team2 = prob[0], prob[1]
        # Use however many classes the model returns instead of assuming six.
        score = {(i, j): prob_team1[i] * prob_team2[j]
                 for i in range(len(prob_team1))
                 for j in range(len(prob_team2))}
        score_sorted = sorted(score.items(), key=lambda kv: kv[1], reverse=True)
        most_prob = score_sorted[0]
        second_prob = score_sorted[1]
        scores[(team1, team2)] = (score, most_prob, second_prob)
    return scores
    

In [133]:
# Score distributions for the fixture list. res_wc22 is mirrored, so each
# fixture is evaluated in both orderings. get_scores reads this module-level
# `scores`, so it must be (re)bound before every get_scores call.
scores = get_wc_probs(res_wc22['team1'].values, res_wc22['team2'].values)

In [134]:
def get_scores(team1s, team2s):
    """Tabulate the two most probable scores for each pairing.

    Looks every ``(team1, team2)`` pair up in the notebook-level ``scores``
    dict (as produced by ``get_wc_probs``), so that dict must already hold
    the requested pairings.

    Returns a DataFrame with one row per pairing and the columns ``team1``,
    ``team2``, ``score1``, ``score2``, ``score_prob`` (most probable score and
    its probability) and ``alt_score1``, ``alt_score2``, ``alt_prob`` (the
    runner-up).
    """
    column_names = ['team1', 'team2', 'score1', 'score2', 'score_prob',
                    'alt_score1', 'alt_score2', 'alt_prob']
    rows = []
    for pairing in zip(team1s, team2s):
        entry = scores[pairing]
        best, runner_up = entry[1], entry[2]
        rows.append((
            pairing[0], pairing[1],
            best[0][0], best[0][1], best[1],
            runner_up[0][0], runner_up[0][1], runner_up[1],
        ))
    return pd.DataFrame(rows, columns=column_names)

In [135]:
# Most probable and runner-up score for every row of the mirrored fixture table.
# Relies on `scores` having been computed for the same pairings just before.
df_groups = get_scores(res_wc22['team1'].values, res_wc22['team2'].values)

In [141]:
# Eight hand-entered pairings (round of 16); team1s8[i] plays team2s8[i].
team1s8 = ['Netherlands', 'Poland', 'Germany', 'Brazil', 'England', 'Denmark', 'Croatia', 'Portugal']
team2s8 = ['Wales', 'France', 'Belgium', 'Ghana', 'Senegal', 'Argentina', 'Spain', 'Switzerland']
# Order matters: get_scores reads the module-level `scores`, so it has to be
# rebound for these pairings first.
scores = get_wc_probs(team1s8, team2s8)
df_eights = get_scores(team1s8, team2s8)

In [138]:
# Export the round-of-16 predictions as a semicolon-separated file (the index is
# written as the first column). Run after the cell that builds df_eights so the
# saved file matches the current predictions.
df_eights.to_csv('eights.csv', sep=';')

In [142]:
# Four hand-entered quarter-final pairings; team1s4[i] plays team2s4[i].
team1s4 = ['Netherlands', 'Germany', 'England', 'Spain']
team2s4 = ['Poland', 'Brazil', 'Argentina', 'Portugal']
# Rebind the module-level `scores` before get_scores reads it.
scores = get_wc_probs(team1s4, team2s4)
df_quarter = get_scores(team1s4, team2s4)

In [143]:
# Show the quarter-final predictions.
df_quarter

Unnamed: 0,team1,team2,score1,score2,score_prob,alt_score1,alt_score2,alt_prob
0,Netherlands,Poland,4,1,0.128051,1,1,0.086311
1,Germany,Brazil,1,1,0.144065,1,0,0.102118
2,England,Argentina,0,0,0.176757,0,1,0.151626
3,Spain,Portugal,1,1,0.167145,1,2,0.114774


In [144]:
# Two hand-entered semi-final pairings; team1s2[i] plays team2s2[i].
team1s2 = ['Netherlands', 'Argentina']
team2s2 = ['Germany', 'Portugal']
# Rebind the module-level `scores` before get_scores reads it.
scores = get_wc_probs(team1s2, team2s2)
df_semi = get_scores(team1s2, team2s2)

In [145]:
# Show the semi-final predictions.
df_semi

Unnamed: 0,team1,team2,score1,score2,score_prob,alt_score1,alt_score2,alt_prob
0,Netherlands,Germany,1,1,0.275037,1,5,0.090524
1,Argentina,Portugal,0,0,0.116443,0,2,0.108299


In [147]:
# Two hand-entered pairings among the four semi-finalists; team1sf[i] plays team2sf[i].
# NOTE(review): presumably one row is the final and the other the third-place
# match — confirm which is which.
team1sf = ['Germany', 'Netherlands']
team2sf = ['Portugal', 'Argentina']
# Rebind the module-level `scores` before get_scores reads it.
scores = get_wc_probs(team1sf, team2sf)
df_finals = get_scores(team1sf, team2sf)

In [148]:
# Show the predictions for the last two matches.
df_finals

Unnamed: 0,team1,team2,score1,score2,score_prob,alt_score1,alt_score2,alt_prob
0,Germany,Portugal,1,1,0.170533,5,1,0.106855
1,Netherlands,Argentina,4,3,0.101398,4,1,0.092784
