# 2022 FIFA World Cup Predictions

Kaggle link: https://www.kaggle.com/datasets/martj42/international-football-results-from-1872-to-2017?resource=download

In [1]:
#import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Load the raw match results and show the first five rows
df = pd.read_csv(filepath_or_buffer='results.csv')
df.head(n=5)

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral
0,1872-11-30,Scotland,England,0,0,Friendly,Glasgow,Scotland,False
1,1873-03-08,England,Scotland,4,2,Friendly,London,England,False
2,1874-03-07,Scotland,England,2,1,Friendly,Glasgow,Scotland,False
3,1875-03-06,England,Scotland,2,2,Friendly,London,England,False
4,1876-03-04,Scotland,England,3,0,Friendly,Glasgow,Scotland,False


In [3]:
# Reshape the match list into "long" format: one row per team per match.
# The CSV is read and date-parsed once; both perspectives are derived from it.
matches_raw = pd.read_csv('results.csv')
matches_raw['date'] = pd.to_datetime(matches_raw['date'])

#home team data (the home side is "team", the away side is "opponent")
results_home = (
    matches_raw
    .rename(columns={'home_team': 'team',
                     'away_team': 'opponent',
                     'home_score': 'team_score',
                     'away_score': 'opponent_score'})
    .assign(h_or_a='Home')
)

#away team data (roles swapped: the away side is "team")
# Columns are re-ordered so they line up with results_home.
away_columns = ['date', 'team', 'opponent', 'team_score', 'opponent_score',
                'tournament', 'city', 'country', 'neutral']
results_away = (
    matches_raw
    .rename(columns={'home_team': 'opponent',
                     'away_team': 'team',
                     'home_score': 'opponent_score',
                     'away_score': 'team_score'})
    [away_columns]
    # .assign returns a new frame, so no column is set on a slice
    # (avoids SettingWithCopyWarning)
    .assign(h_or_a='Away')
)

#concatenating home and away team data
# The original row index is kept, so each match appears twice under the same label.
results = pd.concat([results_home, results_away], axis=0)
results

Unnamed: 0,date,team,opponent,team_score,opponent_score,tournament,city,country,neutral,h_or_a
0,1872-11-30,Scotland,England,0,0,Friendly,Glasgow,Scotland,False,Home
1,1873-03-08,England,Scotland,4,2,Friendly,London,England,False,Home
2,1874-03-07,Scotland,England,2,1,Friendly,Glasgow,Scotland,False,Home
3,1875-03-06,England,Scotland,2,2,Friendly,London,England,False,Home
4,1876-03-04,Scotland,England,3,0,Friendly,Glasgow,Scotland,False,Home
...,...,...,...,...,...,...,...,...,...,...
44342,2022-12-09,Argentina,Netherlands,2,2,FIFA World Cup,Lusail,Qatar,True,Away
44343,2022-12-10,Portugal,Morocco,0,1,FIFA World Cup,Doha,Qatar,True,Away
44344,2022-12-10,France,England,2,1,FIFA World Cup,Al Khor,Qatar,True,Away
44345,2022-12-13,Croatia,Argentina,0,3,FIFA World Cup,Lusail,Qatar,True,Away


In [4]:
# Shape of the long-format data: home and away rows are stacked, so the row
# count should be twice that of the original data, plus one extra column (h_or_a)
results.shape

(88694, 10)

In [5]:
# Shape of the original data as loaded from results.csv (one row per match), for comparison
df.shape

(44347, 9)

In [6]:
# Add per-row counters so that a groupby-sum later yields standings:
#   p = matches played (always 1), w = win, d = draw, l = loss (each 0 or 1),
# all from the point of view of the row's "team".
# The comparisons are vectorised over whole columns instead of applying a
# Python function row by row. Rows with a missing score compare False
# everywhere and therefore get w = d = l = 0.
team_goals = results['team_score']
opponent_goals = results['opponent_score']

results = results.assign(
    p=1,                                                  #number of matches
    w=(team_goals > opponent_goals).astype('int64'),      #wins
    d=(team_goals == opponent_goals).astype('int64'),     #draws
    l=(team_goals < opponent_goals).astype('int64'),      #losses
)

results

Unnamed: 0,date,team,opponent,team_score,opponent_score,tournament,city,country,neutral,h_or_a,p,w,d,l
0,1872-11-30,Scotland,England,0,0,Friendly,Glasgow,Scotland,False,Home,1,0,1,0
1,1873-03-08,England,Scotland,4,2,Friendly,London,England,False,Home,1,1,0,0
2,1874-03-07,Scotland,England,2,1,Friendly,Glasgow,Scotland,False,Home,1,1,0,0
3,1875-03-06,England,Scotland,2,2,Friendly,London,England,False,Home,1,0,1,0
4,1876-03-04,Scotland,England,3,0,Friendly,Glasgow,Scotland,False,Home,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44342,2022-12-09,Argentina,Netherlands,2,2,FIFA World Cup,Lusail,Qatar,True,Away,1,0,1,0
44343,2022-12-10,Portugal,Morocco,0,1,FIFA World Cup,Doha,Qatar,True,Away,1,0,0,1
44344,2022-12-10,France,England,2,1,FIFA World Cup,Al Khor,Qatar,True,Away,1,1,0,0
44345,2022-12-13,Croatia,Argentina,0,3,FIFA World Cup,Lusail,Qatar,True,Away,1,0,0,1


In [7]:
# Keep only FIFA World Cup rows dated 2022-11-20 through 2022-12-18 (both ends inclusive)
is_world_cup = results['tournament'] == 'FIFA World Cup'
in_2022_window = results['date'].between('2022-11-20', '2022-12-18')
wc_matches_2022 = results[is_world_cup & in_2022_window]
wc_matches_2022.head()

Unnamed: 0,date,team,opponent,team_score,opponent_score,tournament,city,country,neutral,h_or_a,p,w,d,l
44285,2022-11-20,Qatar,Ecuador,0,2,FIFA World Cup,Al Khor,Qatar,False,Home,1,0,0,1
44286,2022-11-21,Senegal,Netherlands,0,2,FIFA World Cup,Doha,Qatar,True,Home,1,0,0,1
44287,2022-11-21,England,Iran,6,2,FIFA World Cup,Al Rayyan,Qatar,True,Home,1,1,0,0
44288,2022-11-21,United States,Wales,1,1,FIFA World Cup,Al Rayyan,Qatar,True,Home,1,0,1,0
44289,2022-11-22,Argentina,Saudi Arabia,1,2,FIFA World Cup,Lusail,Qatar,True,Home,1,0,0,1


In [8]:
# Last rows of the 2022 World Cup subset, to see the most recent matches present in the data
wc_matches_2022.tail()

Unnamed: 0,date,team,opponent,team_score,opponent_score,tournament,city,country,neutral,h_or_a,p,w,d,l
44342,2022-12-09,Argentina,Netherlands,2,2,FIFA World Cup,Lusail,Qatar,True,Away,1,0,1,0
44343,2022-12-10,Portugal,Morocco,0,1,FIFA World Cup,Doha,Qatar,True,Away,1,0,0,1
44344,2022-12-10,France,England,2,1,FIFA World Cup,Al Khor,Qatar,True,Away,1,1,0,0
44345,2022-12-13,Croatia,Argentina,0,3,FIFA World Cup,Lusail,Qatar,True,Away,1,0,0,1
44346,2022-12-14,Morocco,France,0,2,FIFA World Cup,Al Khor,Qatar,True,Away,1,0,0,1


## Poisson Distribution for all matches involving World Cup semi-finalists

This time, instead of using historical match data from before the World Cup, I'm going to use actual World Cup match data to make my predictions. Each team has played six games so far. Usually, a team's most recent 6-8 matches are a good barometer of team form.

I'll do one set of strengths for the Third-Place game and another one for the final.

### Third Place

In [9]:
# All 2022 World Cup rows in which either third-place team appears, as team or as opponent
third_teams = ['Croatia', 'Morocco']
third_matches = wc_matches_2022.loc[
    lambda m: m['team'].isin(third_teams) | m['opponent'].isin(third_teams)
]
third_matches

Unnamed: 0,date,team,opponent,team_score,opponent_score,tournament,city,country,neutral,h_or_a,p,w,d,l
44295,2022-11-23,Morocco,Croatia,0,0,FIFA World Cup,Al Khor,Qatar,True,Home,1,0,1,0
44311,2022-11-27,Belgium,Morocco,0,2,FIFA World Cup,Doha,Qatar,True,Home,1,0,0,1
44312,2022-11-27,Croatia,Canada,4,1,FIFA World Cup,Al Rayyan,Qatar,True,Home,1,1,0,0
44327,2022-12-01,Croatia,Belgium,0,0,FIFA World Cup,Al Rayyan,Qatar,True,Home,1,0,1,0
44328,2022-12-01,Canada,Morocco,1,2,FIFA World Cup,Doha,Qatar,True,Home,1,0,0,1
44337,2022-12-05,Japan,Croatia,1,1,FIFA World Cup,Al Wakrah,Qatar,True,Home,1,0,1,0
44339,2022-12-06,Morocco,Spain,0,0,FIFA World Cup,Al Rayyan,Qatar,True,Home,1,0,1,0
44341,2022-12-09,Croatia,Brazil,1,1,FIFA World Cup,Al Rayyan,Qatar,True,Home,1,0,1,0
44343,2022-12-10,Morocco,Portugal,1,0,FIFA World Cup,Doha,Qatar,True,Home,1,1,0,0
44345,2022-12-13,Argentina,Croatia,3,0,FIFA World Cup,Lusail,Qatar,True,Home,1,1,0,0


In [10]:
# Standings table: one row per team with totals summed over its rows in third_matches,
# then goal difference and per-game scoring/conceding rates
table = (
    third_matches[['team', 'p', 'w', 'd', 'l', 'team_score', 'opponent_score']]
    .rename(columns={'team_score': 'gf', 'opponent_score': 'ga'})
    .groupby(by='team')
    .sum()
    .sort_values('team', ascending=True)
    .assign(
        gd=lambda t: t['gf'] - t['ga'],
        gpg_scored=lambda t: t['gf'] / t['p'],
        gpg_conceded=lambda t: t['ga'] / t['p'],
    )
    .reset_index()
)
table

Unnamed: 0,team,p,w,d,l,gf,ga,gd,gpg_scored,gpg_conceded
0,Argentina,1,1,0,0,3,0,3,3.0,0.0
1,Belgium,2,0,1,1,0,2,-2,0.0,1.0
2,Brazil,1,0,1,0,1,1,0,1.0,1.0
3,Canada,2,0,0,2,2,6,-4,1.0,3.0
4,Croatia,6,1,4,1,6,6,0,1.0,1.0
5,France,1,1,0,0,2,0,2,2.0,0.0
6,Japan,1,0,1,0,1,1,0,1.0,1.0
7,Morocco,6,3,2,1,5,3,2,0.833333,0.5
8,Portugal,1,0,0,1,0,1,-1,0.0,1.0
9,Spain,1,0,1,0,0,0,0,0.0,0.0


In [11]:
# Goal difference per team, kept separately as a tiebreaker
table_gd = table.loc[:, ['team', 'gd']]
table_gd

Unnamed: 0,team,gd
0,Argentina,3
1,Belgium,-2
2,Brazil,0
3,Canada,-4
4,Croatia,0
5,France,2
6,Japan,0
7,Morocco,2
8,Portugal,-1
9,Spain,0


In [12]:
# Average goals scored per game across every team in the table
totals = table[['gf', 'p']].sum()
avg_goals = totals['gf'] / totals['p']
avg_goals

0.9090909090909091

In [13]:
# Average goals conceded per game across every team (should be the same as avg_goals)
table['ga'].sum() / table['p'].sum()

0.9090909090909091

In [14]:
#calculate strengths
# Mean goals scored / conceded per team. Only the two score columns are
# aggregated: calling .mean() on the whole frame would also hit the text
# columns (opponent, tournament, city, ...), which raises TypeError on
# pandas >= 2.0. groupby already returns the teams in sorted order.
strengths = (
    third_matches
    .groupby('team')[['team_score', 'opponent_score']]
    .mean()
    .reset_index()
)

# Strength = a team's per-game average divided by the overall average goals per game.
# For 'def', a lower value means fewer goals conceded.
strengths['att'] = strengths['team_score'] / avg_goals
strengths['def'] = strengths['opponent_score'] / avg_goals
strengths = strengths[['team', 'att', 'def']]

# Renamed copies covering every team in the table.
# NOTE(review): not referenced by any cell visible here; kept in case other cells use them.
strengths_team = strengths.rename(columns = {'att':'team_att',
                                             'def':'team_def'})
strengths_opponent = strengths.rename(columns = {'team':'opponent',
                                                 'att':'opponent_att',
                                                 'def':'opponent_def'})

# Keep only the two teams playing the third-place game
strengths = strengths[strengths['team'].isin(third_teams)]
strengths

Unnamed: 0,team,att,def
4,Croatia,1.1,1.1
7,Morocco,0.916667,0.55


In [15]:
#third place game
# Label the strength columns per side up front so the merges need no suffix clean-up
home_side = strengths.rename(columns={'team': 'Home', 'att': 'Home_Att', 'def': 'Home_Def'})
away_side = strengths.rename(columns={'team': 'Away', 'att': 'Away_Att', 'def': 'Away_Def'})

third_place = (
    pd.DataFrame({'Home': ['Croatia'], 'Away': ['Morocco']})
    .merge(home_side, on='Home')
    .merge(away_side, on='Away')
)

# Expected goals = own attack strength x opponent's defence strength x average goals per game;
# the predicted score is the expected goals rounded to a whole number
third_place = third_place.assign(
    Home_xG=lambda g: g['Home_Att'] * g['Away_Def'] * avg_goals,
    Away_xG=lambda g: g['Away_Att'] * g['Home_Def'] * avg_goals,
)
third_place = third_place.assign(
    Home_Score=lambda g: g['Home_xG'].round(),
    Away_Score=lambda g: g['Away_xG'].round(),
)

# The winner is decided on the unrounded expected goals, so it can differ from a level rounded score
home_ahead = third_place['Home_xG'] > third_place['Away_xG']
away_ahead = third_place['Home_xG'] < third_place['Away_xG']
third_place['Winner'] = np.where(home_ahead, third_place['Home'],
                                 np.where(away_ahead, third_place['Away'], 'Draw'))

third_place

Unnamed: 0,Home,Away,Home_Att,Home_Def,Away_Att,Away_Def,Home_xG,Away_xG,Home_Score,Away_Score,Winner
0,Croatia,Morocco,1.1,1.1,0.916667,0.55,0.55,0.916667,1.0,1.0,Morocco


### Final

In [16]:
# All 2022 World Cup rows in which either finalist appears, as team or as opponent
final_teams = ['Argentina', 'France']
final_matches = wc_matches_2022.loc[
    lambda m: m['team'].isin(final_teams) | m['opponent'].isin(final_teams)
]
final_matches

Unnamed: 0,date,team,opponent,team_score,opponent_score,tournament,city,country,neutral,h_or_a,p,w,d,l
44289,2022-11-22,Argentina,Saudi Arabia,1,2,FIFA World Cup,Lusail,Qatar,True,Home,1,0,0,1
44292,2022-11-22,France,Australia,4,1,FIFA World Cup,Al Wakrah,Qatar,True,Home,1,1,0,0
44306,2022-11-26,Argentina,Mexico,2,0,FIFA World Cup,Lusail,Qatar,True,Home,1,1,0,0
44308,2022-11-26,France,Denmark,2,1,FIFA World Cup,Doha,Qatar,True,Home,1,1,0,0
44321,2022-11-30,Poland,Argentina,0,2,FIFA World Cup,Doha,Qatar,True,Home,1,0,0,1
44324,2022-11-30,Tunisia,France,1,0,FIFA World Cup,Al Rayyan,Qatar,True,Home,1,1,0,0
44334,2022-12-03,Argentina,Australia,2,1,FIFA World Cup,Al Rayyan,Qatar,True,Home,1,1,0,0
44335,2022-12-04,France,Poland,3,1,FIFA World Cup,Doha,Qatar,True,Home,1,1,0,0
44342,2022-12-09,Netherlands,Argentina,2,2,FIFA World Cup,Lusail,Qatar,True,Home,1,0,1,0
44344,2022-12-10,England,France,1,2,FIFA World Cup,Al Khor,Qatar,True,Home,1,0,0,1


In [17]:
# Standings table: one row per team with totals summed over its rows in final_matches,
# then goal difference and per-game scoring/conceding rates
table = (
    final_matches[['team', 'p', 'w', 'd', 'l', 'team_score', 'opponent_score']]
    .rename(columns={'team_score': 'gf', 'opponent_score': 'ga'})
    .groupby(by='team')
    .sum()
    .sort_values('team', ascending=True)
    .assign(
        gd=lambda t: t['gf'] - t['ga'],
        gpg_scored=lambda t: t['gf'] / t['p'],
        gpg_conceded=lambda t: t['ga'] / t['p'],
    )
    .reset_index()
)
table

Unnamed: 0,team,p,w,d,l,gf,ga,gd,gpg_scored,gpg_conceded
0,Argentina,6,4,1,1,12,5,7,2.0,0.833333
1,Australia,2,0,0,2,2,6,-4,1.0,3.0
2,Croatia,1,0,0,1,0,3,-3,0.0,3.0
3,Denmark,1,0,0,1,1,2,-1,1.0,2.0
4,England,1,0,0,1,1,2,-1,1.0,2.0
5,France,6,5,0,1,13,5,8,2.166667,0.833333
6,Mexico,1,0,0,1,0,2,-2,0.0,2.0
7,Morocco,1,0,0,1,0,2,-2,0.0,2.0
8,Netherlands,1,0,1,0,2,2,0,2.0,2.0
9,Poland,2,0,0,2,1,5,-4,0.5,2.5


In [18]:
# Goal difference per team, kept separately as a tiebreaker
table_gd = table.loc[:, ['team', 'gd']]
table_gd

Unnamed: 0,team,gd
0,Argentina,7
1,Australia,-4
2,Croatia,-3
3,Denmark,-1
4,England,-1
5,France,8
6,Mexico,-2
7,Morocco,-2
8,Netherlands,0
9,Poland,-4


In [19]:
# Average goals scored per game across every team in the table
totals = table[['gf', 'p']].sum()
avg_goals = totals['gf'] / totals['p']
avg_goals

1.4583333333333333

In [20]:
# Average goals conceded per game across every team (should be the same as avg_goals)
table['ga'].sum() / table['p'].sum()

1.4583333333333333

In [21]:
#calculate strengths
# Mean goals scored / conceded per team. Only the two score columns are
# aggregated: calling .mean() on the whole frame would also hit the text
# columns (opponent, tournament, city, ...), which raises TypeError on
# pandas >= 2.0. groupby already returns the teams in sorted order.
strengths = (
    final_matches
    .groupby('team')[['team_score', 'opponent_score']]
    .mean()
    .reset_index()
)

# Strength = a team's per-game average divided by the overall average goals per game.
# For 'def', a lower value means fewer goals conceded.
strengths['att'] = strengths['team_score'] / avg_goals
strengths['def'] = strengths['opponent_score'] / avg_goals
strengths = strengths[['team', 'att', 'def']]

# Renamed copies covering every team in the table.
# NOTE(review): not referenced by any cell visible here; kept in case other cells use them.
strengths_team = strengths.rename(columns = {'att':'team_att',
                                             'def':'team_def'})
strengths_opponent = strengths.rename(columns = {'team':'opponent',
                                                 'att':'opponent_att',
                                                 'def':'opponent_def'})

# Keep only the two finalists
strengths = strengths[strengths['team'].isin(final_teams)]
strengths

Unnamed: 0,team,att,def
0,Argentina,1.371429,0.571429
5,France,1.485714,0.571429


In [22]:
#final
# Label the strength columns per side up front so the merges need no suffix clean-up
home_side = strengths.rename(columns={'team': 'Home', 'att': 'Home_Att', 'def': 'Home_Def'})
away_side = strengths.rename(columns={'team': 'Away', 'att': 'Away_Att', 'def': 'Away_Def'})

final = (
    pd.DataFrame({'Home': ['Argentina'], 'Away': ['France']})
    .merge(home_side, on='Home')
    .merge(away_side, on='Away')
)

# Expected goals = own attack strength x opponent's defence strength x average goals per game;
# the predicted score is the expected goals rounded to a whole number
final = final.assign(
    Home_xG=lambda g: g['Home_Att'] * g['Away_Def'] * avg_goals,
    Away_xG=lambda g: g['Away_Att'] * g['Home_Def'] * avg_goals,
)
final = final.assign(
    Home_Score=lambda g: g['Home_xG'].round(),
    Away_Score=lambda g: g['Away_xG'].round(),
)

# The winner is decided on the unrounded expected goals, so it can differ from a level rounded score
home_ahead = final['Home_xG'] > final['Away_xG']
away_ahead = final['Home_xG'] < final['Away_xG']
final['Winner'] = np.where(home_ahead, final['Home'],
                           np.where(away_ahead, final['Away'], 'Draw'))

final

Unnamed: 0,Home,Away,Home_Att,Home_Def,Away_Att,Away_Def,Home_xG,Away_xG,Home_Score,Away_Score,Winner
0,Argentina,France,1.371429,0.571429,1.485714,0.571429,1.142857,1.238095,1.0,1.0,France
