# 2022 FIFA World Cup Predictions

Kaggle link: https://www.kaggle.com/datasets/martj42/international-football-results-from-1872-to-2017?resource=download

In [1]:
#import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
#previewing the data
df = pd.read_csv('results.csv')
df.head()

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral
0,1872-11-30,Scotland,England,0,0,Friendly,Glasgow,Scotland,False
1,1873-03-08,England,Scotland,4,2,Friendly,London,England,False
2,1874-03-07,Scotland,England,2,1,Friendly,Glasgow,Scotland,False
3,1875-03-06,England,Scotland,2,2,Friendly,London,England,False
4,1876-03-04,Scotland,England,3,0,Friendly,Glasgow,Scotland,False


In [3]:
#home team data
results_home = pd.read_csv('results.csv')
results_home['date'] = pd.to_datetime(results_home['date'])
results_home = results_home.rename(columns = {'home_team':'team', 
                                              'away_team':'opponent',
                                              'home_score':'team_score',
                                              'away_score':'opponent_score'})
results_home['h_or_a'] = 'Home'

#away team data
results_away = pd.read_csv('results.csv')
results_away['date'] = pd.to_datetime(results_away['date'])
results_away = results_away.rename(columns = {'home_team':'opponent', 
                                              'away_team':'team',
                                              'home_score':'opponent_score',
                                              'away_score':'team_score'})
results_away = results_away[['date', 'team', 'opponent', 'team_score', 'opponent_score', 
                             'tournament', 'city', 'country', 'neutral']]
results_away['h_or_a'] = 'Away'

#concatenating home and away team data
results = pd.concat([results_home, results_away], axis = 0)
results

Unnamed: 0,date,team,opponent,team_score,opponent_score,tournament,city,country,neutral,h_or_a
0,1872-11-30,Scotland,England,0,0,Friendly,Glasgow,Scotland,False,Home
1,1873-03-08,England,Scotland,4,2,Friendly,London,England,False,Home
2,1874-03-07,Scotland,England,2,1,Friendly,Glasgow,Scotland,False,Home
3,1875-03-06,England,Scotland,2,2,Friendly,London,England,False,Home
4,1876-03-04,Scotland,England,3,0,Friendly,Glasgow,Scotland,False,Home
...,...,...,...,...,...,...,...,...,...,...
44340,2022-12-06,Switzerland,Portugal,1,6,FIFA World Cup,Lusail,Qatar,True,Away
44341,2022-12-09,Brazil,Croatia,1,1,FIFA World Cup,Al Rayyan,Qatar,True,Away
44342,2022-12-09,Argentina,Netherlands,2,2,FIFA World Cup,Lusail,Qatar,True,Away
44343,2022-12-10,Portugal,Morocco,0,1,FIFA World Cup,Doha,Qatar,True,Away


In [4]:
#shape of newly formatted data
results.shape

(88690, 10)

In [5]:
#shape of original data
df.shape

(44345, 9)

In [6]:
#number of matches
results['p'] = 1

#wins
def set_result(row):
    if row["team_score"] > row["opponent_score"]:
        return 1
    else:
        return 0
results = results.assign(w = results.apply(set_result, axis=1))

#draws
def set_result(row):
    if row["team_score"] == row["opponent_score"]:
        return 1
    else:
        return 0
results = results.assign(d = results.apply(set_result, axis=1))

#losses
def set_result(row):
    if row["team_score"] < row["opponent_score"]:
        return 1
    else:
        return 0
results = results.assign(l = results.apply(set_result, axis=1))

results

Unnamed: 0,date,team,opponent,team_score,opponent_score,tournament,city,country,neutral,h_or_a,p,w,d,l
0,1872-11-30,Scotland,England,0,0,Friendly,Glasgow,Scotland,False,Home,1,0,1,0
1,1873-03-08,England,Scotland,4,2,Friendly,London,England,False,Home,1,1,0,0
2,1874-03-07,Scotland,England,2,1,Friendly,Glasgow,Scotland,False,Home,1,1,0,0
3,1875-03-06,England,Scotland,2,2,Friendly,London,England,False,Home,1,0,1,0
4,1876-03-04,Scotland,England,3,0,Friendly,Glasgow,Scotland,False,Home,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44340,2022-12-06,Switzerland,Portugal,1,6,FIFA World Cup,Lusail,Qatar,True,Away,1,0,0,1
44341,2022-12-09,Brazil,Croatia,1,1,FIFA World Cup,Al Rayyan,Qatar,True,Away,1,0,1,0
44342,2022-12-09,Argentina,Netherlands,2,2,FIFA World Cup,Lusail,Qatar,True,Away,1,0,1,0
44343,2022-12-10,Portugal,Morocco,0,1,FIFA World Cup,Doha,Qatar,True,Away,1,0,0,1


In [20]:
#2022 World Cup matches
wc_matches_2022 = results[(results['tournament'] == 'FIFA World Cup') &
                          (results['date'] >= '2022-11-20') & 
                          (results['date'] <= '2022-12-18')]
wc_matches_2022.head()

Unnamed: 0,date,team,opponent,team_score,opponent_score,tournament,city,country,neutral,h_or_a,p,w,d,l
44285,2022-11-20,Qatar,Ecuador,0,2,FIFA World Cup,Al Khor,Qatar,False,Home,1,0,0,1
44286,2022-11-21,Senegal,Netherlands,0,2,FIFA World Cup,Doha,Qatar,True,Home,1,0,0,1
44287,2022-11-21,England,Iran,6,2,FIFA World Cup,Al Rayyan,Qatar,True,Home,1,1,0,0
44288,2022-11-21,United States,Wales,1,1,FIFA World Cup,Al Rayyan,Qatar,True,Home,1,0,1,0
44289,2022-11-22,Argentina,Saudi Arabia,1,2,FIFA World Cup,Lusail,Qatar,True,Home,1,0,0,1


In [21]:
wc_matches_2022.tail()

Unnamed: 0,date,team,opponent,team_score,opponent_score,tournament,city,country,neutral,h_or_a,p,w,d,l
44340,2022-12-06,Switzerland,Portugal,1,6,FIFA World Cup,Lusail,Qatar,True,Away,1,0,0,1
44341,2022-12-09,Brazil,Croatia,1,1,FIFA World Cup,Al Rayyan,Qatar,True,Away,1,0,1,0
44342,2022-12-09,Argentina,Netherlands,2,2,FIFA World Cup,Lusail,Qatar,True,Away,1,0,1,0
44343,2022-12-10,Portugal,Morocco,0,1,FIFA World Cup,Doha,Qatar,True,Away,1,0,0,1
44344,2022-12-10,France,England,2,1,FIFA World Cup,Al Khor,Qatar,True,Away,1,1,0,0


## Poisson Distribution for all matches involving World Cup semi-finalists

This time, instead of using historical match data before the World Cup, I'm going to use actual World Cup match data to make my predictions. Each semi-finalist has played five games so far. Usually, a team's most recent 6-8 matches is a good barometer of team form. I'll use what is available (each team's last 5 games).

In [22]:
semi_teams = ['Argentina', 'Croatia', 'France', 'Morocco']
semi_matches = wc_matches_2022[(wc_matches_2022['team'].isin(semi_teams)) | 
                               (wc_matches_2022['opponent'].isin(semi_teams))]
semi_matches

Unnamed: 0,date,team,opponent,team_score,opponent_score,tournament,city,country,neutral,h_or_a,p,w,d,l
44289,2022-11-22,Argentina,Saudi Arabia,1,2,FIFA World Cup,Lusail,Qatar,True,Home,1,0,0,1
44292,2022-11-22,France,Australia,4,1,FIFA World Cup,Al Wakrah,Qatar,True,Home,1,1,0,0
44295,2022-11-23,Morocco,Croatia,0,0,FIFA World Cup,Al Khor,Qatar,True,Home,1,0,1,0
44306,2022-11-26,Argentina,Mexico,2,0,FIFA World Cup,Lusail,Qatar,True,Home,1,1,0,0
44308,2022-11-26,France,Denmark,2,1,FIFA World Cup,Doha,Qatar,True,Home,1,1,0,0
44311,2022-11-27,Belgium,Morocco,0,2,FIFA World Cup,Doha,Qatar,True,Home,1,0,0,1
44312,2022-11-27,Croatia,Canada,4,1,FIFA World Cup,Al Rayyan,Qatar,True,Home,1,1,0,0
44321,2022-11-30,Poland,Argentina,0,2,FIFA World Cup,Doha,Qatar,True,Home,1,0,0,1
44324,2022-11-30,Tunisia,France,1,0,FIFA World Cup,Al Rayyan,Qatar,True,Home,1,1,0,0
44327,2022-12-01,Croatia,Belgium,0,0,FIFA World Cup,Al Rayyan,Qatar,True,Home,1,0,1,0


In [28]:
#create standings table for each team
table = semi_matches[['team','p','w','d','l','team_score','opponent_score']]
table = table.rename(columns = {'team_score':'gf',
                                            'opponent_score':'ga'})
table = table.groupby(by = 'team').sum().sort_values('team', ascending = True)

table['gd'] = table['gf'] - table['ga']
table['gpg_scored'] = table['gf'] / table['p']
table['gpg_conceded'] = table['ga'] / table['p']

table = table.reset_index()
table

Unnamed: 0,team,p,w,d,l,gf,ga,gd,gpg_scored,gpg_conceded
0,Argentina,5,3,1,1,9,5,4,1.8,1.0
1,Australia,2,0,0,2,2,6,-4,1.0,3.0
2,Belgium,2,0,1,1,0,2,-2,0.0,1.0
3,Brazil,1,0,1,0,1,1,0,1.0,1.0
4,Canada,2,0,0,2,2,6,-4,1.0,3.0
5,Croatia,5,1,4,0,6,3,3,1.2,0.6
6,Denmark,1,0,0,1,1,2,-1,1.0,2.0
7,England,1,0,0,1,1,2,-1,1.0,2.0
8,France,5,4,0,1,11,5,6,2.2,1.0
9,Japan,1,0,1,0,1,1,0,1.0,1.0


In [29]:
#tiebreaker
table_gd = table[['team', 'gd']]
table_gd

Unnamed: 0,team,gd
0,Argentina,4
1,Australia,-4
2,Belgium,-2
3,Brazil,0
4,Canada,-4
5,Croatia,3
6,Denmark,-1
7,England,-1
8,France,6
9,Japan,0


In [30]:
#goals scored per game for all teams
avg_goals = table['gf'].sum() / table['p'].sum()
avg_goals

1.1842105263157894

In [31]:
#goals conceded per game for all teams (should be the same as avg_goals)
table['ga'].sum() / table['p'].sum()

1.1842105263157894

In [35]:
#calculate strengths
strengths = semi_matches.groupby(['team']).mean().sort_values('team', ascending = True)
strengths = strengths.reset_index()
strengths = strengths[['team', 'team_score', 'opponent_score']]
strengths['att'] = strengths['team_score'] / avg_goals
strengths['def'] = strengths['opponent_score'] / avg_goals
strengths = strengths[['team', 'att', 'def']]
strengths_team = strengths.rename(columns = {'att':'team_att',
                                             'def':'team_def'})
strengths_opponent = strengths.rename(columns = {'team':'opponent',
                                                 'att':'opponent_att',
                                                 'def':'opponent_def'})
strengths = strengths[(strengths['team'].isin(semi_teams))]
strengths

Unnamed: 0,team,att,def
0,Argentina,1.52,0.844444
5,Croatia,1.013333,0.506667
8,France,1.857778,0.844444
11,Morocco,0.844444,0.168889


## Actual Semifinalists, Poisson for Remaining Games

In [45]:
#semi-finals
semis = {'Match':['61','62'],
         'Home':['Argentina','France'],
         'Away':['Croatia','Morocco']}
semis = pd.DataFrame(semis)

semis = pd.merge(semis, strengths, left_on = 'Home', right_on = 'team')
semis = pd.merge(semis, strengths, left_on = 'Away', right_on = 'team')
semis = semis.drop(['team_x','team_y'], axis = 1)
semis = semis.rename(columns = {'att_x':'Home_Att', 'def_x':'Home_Def',
                                'att_y':'Away_Att', 'def_y':'Away_Def'})

semis['Home_xG'] = semis['Home_Att'] * semis['Away_Def'] * avg_goals
semis['Away_xG'] = semis['Away_Att'] * semis['Home_Def'] * avg_goals
semis['Home_Score'] = semis['Home_xG'].round()
semis['Away_Score'] = semis['Away_xG'].round()

semis['Winner'] = np.where(semis['Home_xG'] > semis['Away_xG'], semis['Home'],
                  np.where(semis['Home_xG'] < semis['Away_xG'], semis['Away'], 'Draw'))

semis['Loser'] = np.where(semis['Home_xG'] < semis['Away_xG'], semis['Home'],
                 np.where(semis['Home_xG'] > semis['Away_xG'], semis['Away'], 'Draw'))

semis

Unnamed: 0,Match,Home,Away,Home_Att,Home_Def,Away_Att,Away_Def,Home_xG,Away_xG,Home_Score,Away_Score,Winner,Loser
0,61,Argentina,Croatia,1.52,0.844444,1.013333,0.506667,0.912,1.013333,1.0,1.0,Croatia,Argentina
1,62,France,Morocco,1.857778,0.844444,0.844444,0.168889,0.371556,0.844444,0.0,1.0,Morocco,France


In [46]:
#third place game
third_place = {'Match':['63'],
               'Home_N':['61'],
               'Away_N':['62']}
third_place = pd.DataFrame(third_place)

third_place = pd.merge(third_place, semis, left_on = 'Home_N', right_on = 'Match')
third_place = pd.merge(third_place, semis, left_on = 'Away_N', right_on = 'Match')
third_place = third_place[['Match_x','Loser_x','Loser_y']]
third_place = third_place.rename(columns = {'Match_x':'Match', 'Loser_x':'Home', 'Loser_y':'Away'})

third_place = pd.merge(third_place, strengths, left_on = 'Home', right_on = 'team')
third_place = pd.merge(third_place, strengths, left_on = 'Away', right_on = 'team')
third_place = third_place.drop(['team_x','team_y'], axis = 1)
third_place = third_place.rename(columns = {'att_x':'Home_Att', 'def_x':'Home_Def',
                                            'att_y':'Away_Att', 'def_y':'Away_Def'})

third_place['Home_xG'] = third_place['Home_Att'] * third_place['Away_Def'] * avg_goals
third_place['Away_xG'] = third_place['Away_Att'] * third_place['Home_Def'] * avg_goals
third_place['Home_Score'] = third_place['Home_xG'].round()
third_place['Away_Score'] = third_place['Away_xG'].round()

third_place['Winner'] = np.where(third_place['Home_xG'] > third_place['Away_xG'], third_place['Home'],
                        np.where(third_place['Home_xG'] < third_place['Away_xG'], third_place['Away'], 'Draw'))

third_place

Unnamed: 0,Match,Home,Away,Home_Att,Home_Def,Away_Att,Away_Def,Home_xG,Away_xG,Home_Score,Away_Score,Winner
0,63,Argentina,France,1.52,0.844444,1.857778,0.844444,1.52,1.857778,2.0,2.0,France


In [47]:
#final
final = {'Match':['63'],
         'Home_N':['61'],
         'Away_N':['62']}
final = pd.DataFrame(final)

final = pd.merge(final, semis, left_on = 'Home_N', right_on = 'Match')
final = pd.merge(final, semis, left_on = 'Away_N', right_on = 'Match')
final = final[['Match_x','Winner_x','Winner_y']]
final = final.rename(columns = {'Match_x':'Match', 'Winner_x':'Home', 'Winner_y':'Away'})

final = pd.merge(final, strengths, left_on = 'Home', right_on = 'team')
final = pd.merge(final, strengths, left_on = 'Away', right_on = 'team')
final = final.drop(['team_x','team_y'], axis = 1)
final = final.rename(columns = {'att_x':'Home_Att', 'def_x':'Home_Def',
                                'att_y':'Away_Att', 'def_y':'Away_Def'})

final['Home_xG'] = final['Home_Att'] * final['Away_Def'] * avg_goals
final['Away_xG'] = final['Away_Att'] * final['Home_Def'] * avg_goals
final['Home_Score'] = final['Home_xG'].round()
final['Away_Score'] = final['Away_xG'].round()

final['Winner'] = np.where(final['Home_xG'] > final['Away_xG'], final['Home'],
                  np.where(final['Home_xG'] < final['Away_xG'], final['Away'], 'Draw'))

final

Unnamed: 0,Match,Home,Away,Home_Att,Home_Def,Away_Att,Away_Def,Home_xG,Away_xG,Home_Score,Away_Score,Winner
0,63,Croatia,Morocco,1.013333,0.506667,0.844444,0.168889,0.202667,0.506667,0.0,1.0,Morocco
