# 2022 FIFA World Cup Predictions

Data source (Kaggle dataset providing the `results.csv` file read below): https://www.kaggle.com/datasets/martj42/international-football-results-from-1872-to-2017?resource=download

In [1]:
#import libraries
import numpy as np
import pandas as pd
#import matplotlib.pyplot as plt
#%matplotlib inline

In [2]:
#load the match results a single time; the home and away views below are both derived from it
matches_raw = pd.read_csv('results.csv')
matches_raw['date'] = pd.to_datetime(matches_raw['date'])

#home team data: every match seen from the home side
results_home = (matches_raw
                .rename(columns = {'home_team':'team',
                                   'away_team':'opponent',
                                   'home_score':'team_score',
                                   'away_score':'opponent_score'})
                .assign(h_or_a = 'Home'))

#away team data: the same matches seen from the away side,
#with the columns put back into the same order as the home view
results_away = matches_raw.rename(columns = {'home_team':'opponent',
                                             'away_team':'team',
                                             'home_score':'opponent_score',
                                             'away_score':'team_score'})
results_away = results_away[['date', 'team', 'opponent', 'team_score', 'opponent_score',
                             'tournament', 'city', 'country', 'neutral']].assign(h_or_a = 'Away')

#concatenating home and away team data
#(row labels are kept as they are, so every label appears once per view)
results = pd.concat([results_home, results_away], axis = 0)
results

Unnamed: 0,date,team,opponent,team_score,opponent_score,tournament,city,country,neutral,h_or_a
0,1872-11-30,Scotland,England,0.0,0.0,Friendly,Glasgow,Scotland,False,Home
1,1873-03-08,England,Scotland,4.0,2.0,Friendly,London,England,False,Home
2,1874-03-07,Scotland,England,2.0,1.0,Friendly,Glasgow,Scotland,False,Home
3,1875-03-06,England,Scotland,2.0,2.0,Friendly,London,England,False,Home
4,1876-03-04,Scotland,England,3.0,0.0,Friendly,Glasgow,Scotland,False,Home
...,...,...,...,...,...,...,...,...,...,...
44201,2022-12-01,Morocco,Canada,,,FIFA World Cup,Doha,Qatar,True,Away
44202,2022-12-02,Switzerland,Serbia,,,FIFA World Cup,Doha,Qatar,True,Away
44203,2022-12-02,Brazil,Cameroon,,,FIFA World Cup,Lusail,Qatar,True,Away
44204,2022-12-02,Uruguay,Ghana,,,FIFA World Cup,Al Wakrah,Qatar,True,Away


In [3]:
#number of matches
#NOTE(review): every row gets p = 1, including fixtures whose scores are still missing
results['p'] = 1

#wins, draws and losses as 0/1 flags from the team's point of view.
#The comparisons run on whole columns instead of a row-by-row apply with a helper
#that was redefined for each outcome. A comparison against a missing score is False,
#so fixtures without scores get 0 in all three columns.
results = results.assign(
    w = (results['team_score'] > results['opponent_score']).astype('int64'),
    d = (results['team_score'] == results['opponent_score']).astype('int64'),
    l = (results['team_score'] < results['opponent_score']).astype('int64'))

results

Unnamed: 0,date,team,opponent,team_score,opponent_score,tournament,city,country,neutral,h_or_a,p,w,d,l
0,1872-11-30,Scotland,England,0.0,0.0,Friendly,Glasgow,Scotland,False,Home,1,0,1,0
1,1873-03-08,England,Scotland,4.0,2.0,Friendly,London,England,False,Home,1,1,0,0
2,1874-03-07,Scotland,England,2.0,1.0,Friendly,Glasgow,Scotland,False,Home,1,1,0,0
3,1875-03-06,England,Scotland,2.0,2.0,Friendly,London,England,False,Home,1,0,1,0
4,1876-03-04,Scotland,England,3.0,0.0,Friendly,Glasgow,Scotland,False,Home,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44201,2022-12-01,Morocco,Canada,,,FIFA World Cup,Doha,Qatar,True,Away,1,0,0,0
44202,2022-12-02,Switzerland,Serbia,,,FIFA World Cup,Doha,Qatar,True,Away,1,0,0,0
44203,2022-12-02,Brazil,Cameroon,,,FIFA World Cup,Lusail,Qatar,True,Away,1,0,0,0
44204,2022-12-02,Uruguay,Ghana,,,FIFA World Cup,Al Wakrah,Qatar,True,Away,1,0,0,0


In [4]:
#2022 World Cup teams
wc_teams = ['Qatar', 'Ecuador', 'Senegal', 'Netherlands',
            'England', 'Iran', 'United States', 'Wales',
            'Argentina', 'Saudi Arabia', 'Mexico', 'Poland',
            'France', 'Australia', 'Denmark', 'Tunisia',
            'Spain', 'Costa Rica', 'Germany', 'Japan',
            'Belgium', 'Canada', 'Morocco', 'Croatia',
            'Brazil', 'Serbia', 'Switzerland', 'Cameroon',
            'Portugal', 'Ghana', 'Uruguay', 'South Korea']

#only matches in which both sides are 2022 World Cup teams
both_in_wc = results['team'].isin(wc_teams) & results['opponent'].isin(wc_teams)
results_2022_teams = results[both_in_wc]

#matches since last World Cup (both end dates included)
since_last_wc = results_2022_teams['date'].between('2018-07-16', '2022-11-24')
results_cur_all = results_2022_teams[since_last_wc]

#matches since last World Cup excluding friendlies
not_friendly = results_2022_teams['tournament'] != 'Friendly'
results_cur_cmp = results_2022_teams[since_last_wc & not_friendly]

In [5]:
#Create data for teams in Group Stage
#each group letter maps to its four teams; the flat 'team' and 'group' lists are derived from it
_group_members = {
    'A': ['Qatar', 'Ecuador', 'Senegal', 'Netherlands'],
    'B': ['England', 'Iran', 'United States', 'Wales'],
    'C': ['Argentina', 'Saudi Arabia', 'Mexico', 'Poland'],
    'D': ['France', 'Australia', 'Denmark', 'Tunisia'],
    'E': ['Spain', 'Costa Rica', 'Germany', 'Japan'],
    'F': ['Belgium', 'Canada', 'Morocco', 'Croatia'],
    'G': ['Brazil', 'Serbia', 'Switzerland', 'Cameroon'],
    'H': ['Portugal', 'Ghana', 'Uruguay', 'South Korea'],
}
data = {'team': [name for members in _group_members.values() for name in members],
        'group': [letter for letter, members in _group_members.items() for _ in members]}
groups_team = pd.DataFrame(data)

In [6]:
#Create data for opponents in Group Stage
#(one row per team with its group letter; the name column is called 'team' here as well)
data = {'team': [], 'group': []}
for letter, members in [('A', ['Qatar', 'Ecuador', 'Senegal', 'Netherlands']),
                        ('B', ['England', 'Iran', 'United States', 'Wales']),
                        ('C', ['Argentina', 'Saudi Arabia', 'Mexico', 'Poland']),
                        ('D', ['France', 'Australia', 'Denmark', 'Tunisia']),
                        ('E', ['Spain', 'Costa Rica', 'Germany', 'Japan']),
                        ('F', ['Belgium', 'Canada', 'Morocco', 'Croatia']),
                        ('G', ['Brazil', 'Serbia', 'Switzerland', 'Cameroon']),
                        ('H', ['Portugal', 'Ghana', 'Uruguay', 'South Korea'])]:
    data['team'].extend(members)
    data['group'].extend([letter] * len(members))
groups_opponent = pd.DataFrame(data)

In [7]:
#Create Group Stage groups (one list of four teams per group, A to H)
(group_a, group_b, group_c, group_d,
 group_e, group_f, group_g, group_h) = (
    ['Qatar', 'Ecuador', 'Senegal', 'Netherlands'],
    ['England', 'Iran', 'United States', 'Wales'],
    ['Argentina', 'Saudi Arabia', 'Mexico', 'Poland'],
    ['France', 'Australia', 'Denmark', 'Tunisia'],
    ['Spain', 'Costa Rica', 'Germany', 'Japan'],
    ['Belgium', 'Canada', 'Morocco', 'Croatia'],
    ['Brazil', 'Serbia', 'Switzerland', 'Cameroon'],
    ['Portugal', 'Ghana', 'Uruguay', 'South Korea'],
)

## Poisson Distribution amongst all World Cup teams

In [8]:
#create standings table for each team: per-team totals over the matches in results_cur_all
standings_cols = ['p', 'w', 'd', 'l', 'team_score', 'opponent_score']
table_group = (results_cur_all[['team'] + standings_cols]
               .rename(columns = {'team_score':'gf', 'opponent_score':'ga'})
               .groupby(by = 'team')
               .sum()
               .sort_values('team', ascending = True))

#goals per game scored and conceded
table_group = table_group.assign(gpg_scored = table_group['gf'] / table_group['p'],
                                 gpg_conceded = table_group['ga'] / table_group['p'])

table_group

Unnamed: 0_level_0,p,w,d,l,gf,ga,gpg_scored,gpg_conceded
team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Argentina,45,20,9,16,66.0,51.0,1.466667,1.133333
Australia,7,0,2,5,3.0,11.0,0.428571,1.571429
Belgium,30,16,6,8,52.0,37.0,1.733333,1.233333
Brazil,33,20,7,6,55.0,26.0,1.666667,0.787879
Cameroon,7,0,3,4,0.0,5.0,0.0,0.714286
Canada,16,7,2,7,18.0,18.0,1.125,1.125
Costa Rica,20,2,5,13,11.0,35.0,0.55,1.75
Croatia,22,7,5,10,26.0,38.0,1.181818,1.727273
Denmark,23,8,6,9,29.0,28.0,1.26087,1.217391
Ecuador,19,3,7,9,19.0,33.0,1.0,1.736842


In [9]:
#goals scored per game for all teams: total goals for divided by total matches in the standings table
goals_for_total = table_group['gf'].sum()
matches_total = table_group['p'].sum()
avg_goals = goals_for_total / matches_total
avg_goals

1.2952522255192878

In [10]:
#goals conceded per game for all teams (should be the same as goals scored per game for all teams)
goals_against_total = table_group['ga'].sum()
goals_against_total / table_group['p'].sum()

1.2952522255192878

In [11]:
#calculate strengths: each team's mean goals scored ('att') and conceded ('def') per match,
#both divided by avg_goals.
#Only the two score columns are averaged: a bare groupby(...).mean() over the whole frame
#raises TypeError on the text columns under pandas >= 2.0.
strengths = (results_cur_all
             .groupby(['team'])[['team_score', 'opponent_score']]
             .mean()
             .sort_values('team', ascending = True)
             .reset_index())
strengths['att'] = strengths['team_score'] / avg_goals
strengths['def'] = strengths['opponent_score'] / avg_goals
strengths = strengths[['team', 'att', 'def']]

#the same table under the column names needed for the team-side and the opponent-side merge
strengths_team = strengths.rename(columns = {'att':'team_att',
                                             'def':'team_def'})
strengths_opponent = strengths.rename(columns = {'team':'opponent',
                                                 'att':'opponent_att',
                                                 'def':'opponent_def'})
strengths

Unnamed: 0,team,att,def
0,Argentina,1.132341,0.87499
1,Australia,0.330879,1.213222
2,Belgium,1.338221,0.952195
3,Brazil,1.286751,0.608282
4,Cameroon,0.0,0.551465
5,Canada,0.868557,0.868557
6,Costa Rica,0.424628,1.351088
7,Croatia,0.912423,1.333542
8,Denmark,0.973455,0.939887
9,Ecuador,0.77205,1.34093


World Cup schedule: https://fbref.com/en/comps/1/schedule/World-Cup-Scores-and-Fixtures

In [12]:
#group stage predictions (using both round() and np.floor)

#team name used in the rest of the notebook -> (team name, country code) as written in the
#schedule file; home sides are matched as '<name> <code>' and away sides as '<code> <name>'
_schedule_labels = {
    'Qatar': ('Qatar', 'qa'), 'Ecuador': ('Ecuador', 'ec'),
    'Senegal': ('Senegal', 'sn'), 'Netherlands': ('Netherlands', 'nl'),
    'England': ('England', 'eng'), 'Iran': ('IR Iran', 'ir'),
    'United States': ('United States', 'us'), 'Wales': ('Wales', 'wls'),
    'Argentina': ('Argentina', 'ar'), 'Saudi Arabia': ('Saudi Arabia', 'sa'),
    'Mexico': ('Mexico', 'mx'), 'Poland': ('Poland', 'pl'),
    'France': ('France', 'fr'), 'Australia': ('Australia', 'au'),
    'Denmark': ('Denmark', 'dk'), 'Tunisia': ('Tunisia', 'tn'),
    'Spain': ('Spain', 'es'), 'Costa Rica': ('Costa Rica', 'cr'),
    'Germany': ('Germany', 'de'), 'Japan': ('Japan', 'jp'),
    'Belgium': ('Belgium', 'be'), 'Canada': ('Canada', 'ca'),
    'Morocco': ('Morocco', 'ma'), 'Croatia': ('Croatia', 'hr'),
    'Brazil': ('Brazil', 'br'), 'Serbia': ('Serbia', 'rs'),
    'Switzerland': ('Switzerland', 'ch'), 'Cameroon': ('Cameroon', 'cm'),
    'Portugal': ('Portugal', 'pt'), 'Ghana': ('Ghana', 'gh'),
    'Uruguay': ('Uruguay', 'uy'), 'South Korea': ('Korea Republic', 'kr'),
}
_home_label_to_team = {name + ' ' + code: team
                       for team, (name, code) in _schedule_labels.items()}
_away_label_to_team = {code + ' ' + name: team
                       for team, (name, code) in _schedule_labels.items()}


def _load_schedule(path = '2022_wc_schedule.csv'):
    """Read the schedule columns used here and normalise the team names.

    Rows with a missing value in any of the selected columns are dropped.
    Returns a frame with the columns Wk, Day, Date, Time, Home and Away.
    """
    schedule = pd.read_csv(path, usecols = [0,1,2,3,4,8])
    schedule['Date'] = pd.to_datetime(schedule['Date'])
    schedule = schedule.dropna()
    return schedule.assign(Home = schedule['Home'].replace(_home_label_to_team),
                           Away = schedule['Away'].replace(_away_label_to_team))


def _attach_group_and_strengths(fixtures):
    """Merge the team's group and both sides' strengths onto the fixtures.

    All three merges are inner joins, so a fixture whose team or opponent
    has no matching row is dropped.
    """
    fixtures = pd.merge(fixtures, groups_team, on = 'team')
    fixtures = pd.merge(fixtures, strengths_team, on = 'team')
    return pd.merge(fixtures, strengths_opponent, on = 'opponent')


#the schedule is read and cleaned once; both views below start from the same frame
schedule = _load_schedule()

#home view: the home side is 'team'
group_stage_h = (schedule
                 .rename(columns = {'Home':'team', 'Away':'opponent'})
                 .assign(h_or_a = 'Home'))
group_stage_h = _attach_group_and_strengths(group_stage_h)

#away view: the away side is 'team'; columns reordered to match the home view
group_stage_a = schedule.rename(columns = {'Home':'opponent', 'Away':'team'})
group_stage_a = (group_stage_a[['Wk', 'Day', 'Date', 'Time', 'team', 'opponent']]
                 .assign(h_or_a = 'Away'))
group_stage_a = _attach_group_and_strengths(group_stage_a)

group_stage = pd.concat([group_stage_h, group_stage_a], axis = 0)

#expected goals: own attack x opposing defence x average goals per game
group_stage['team_xG'] = group_stage['team_att'] * group_stage['opponent_def'] * avg_goals
group_stage['opponent_xG'] = group_stage['opponent_att'] * group_stage['team_def'] * avg_goals

#home-side rows only, taken before the prediction columns are added
group_stage_xG = group_stage[(group_stage['h_or_a'] == 'Home')]

#score predictions: *_pred_1 rounds the expected goals, *_pred_2 rounds them down
group_stage['team_pred_1'] = group_stage['team_xG'].round()
group_stage['opponent_pred_1'] = group_stage['opponent_xG'].round()
group_stage['team_pred_2'] = np.floor(group_stage['team_xG'])
group_stage['opponent_pred_2'] = np.floor(group_stage['opponent_xG'])

group_stage = group_stage.drop(['team_att', 'team_def', 'opponent_att', 'opponent_def'], axis = 1)

#chronological order; within the same kick-off the 'Home' row comes before the 'Away' row
group_stage = group_stage.sort_values(by = ['Date','Time','h_or_a'], ascending = [True, True, False])
group_stage.to_csv('group_stage.csv')

In [13]:
#keep only the rows whose Wk value is 3 and save them to their own file
#(group_stage itself is narrowed, so the cells below work on these rows only)
is_week_3 = group_stage['Wk'] == 3
group_stage = group_stage[is_week_3]
group_stage.to_csv('group_stage_week_3.csv')

In [14]:
#mean of the team predictions using round() (group_stage holds only the Wk == 3 rows at this point)
group_stage.loc[:, 'team_pred_1'].mean()

1.28125

In [15]:
#mean of the opponent predictions using round() (group_stage holds only the Wk == 3 rows at this point)
group_stage.loc[:, 'opponent_pred_1'].mean()

1.28125

In [16]:
#mean of the team predictions using np.floor (group_stage holds only the Wk == 3 rows at this point)
group_stage.loc[:, 'team_pred_2'].mean()

0.71875

In [17]:
#mean of the opponent predictions using np.floor (group_stage holds only the Wk == 3 rows at this point)
group_stage.loc[:, 'opponent_pred_2'].mean()

0.71875

In [18]:
#show average goals scored by World Cup since 1974
wc_goals = pd.read_csv('results.csv')
wc_goals['date'] = pd.to_datetime(wc_goals['date'])

#FIFA World Cup matches dated 1972-01-01 to 2018-07-16 (both ends included)
in_window = wc_goals['date'].between('1972-01-01', '2018-07-16')
wc_goals = wc_goals[in_window & (wc_goals['tournament'] == 'FIFA World Cup')]

#the year replaces the full match date so the matches can be grouped per tournament year;
#both_scores is the mean of the two sides' goals in a match
wc_goals = wc_goals.assign(date = wc_goals['date'].dt.year,
                           both_scores = (wc_goals['home_score'] + wc_goals['away_score']) / 2)

#Only the score columns are averaged: a bare groupby(...).mean() over the whole frame
#raises TypeError on the text columns under pandas >= 2.0. Selecting them explicitly
#also leaves out 'neutral', so no separate drop is needed.
wc_goals = (wc_goals
            .groupby(by = 'date')[['home_score', 'away_score', 'both_scores']]
            .mean()
            .sort_values('date')
            .reset_index())
wc_goals

Unnamed: 0,date,home_score,away_score,both_scores
0,1974,1.5,1.052632,1.276316
1,1978,1.473684,1.210526,1.342105
2,1982,1.826923,0.980769,1.403846
3,1986,1.461538,1.076923,1.269231
4,1990,1.269231,0.942308,1.105769
5,1994,1.576923,1.134615,1.355769
6,1998,1.578125,1.09375,1.335938
7,2002,1.421875,1.09375,1.257812
8,2006,1.390625,0.90625,1.148438
9,2010,1.203125,1.0625,1.132812
