# 2022 FIFA World Cup Predictions

Kaggle link: https://www.kaggle.com/datasets/martj42/international-football-results-from-1872-to-2017?resource=download

In [1]:
#import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
#previewing the data
df = pd.read_csv('results.csv')
df.head()

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral
0,1872-11-30,Scotland,England,0,0,Friendly,Glasgow,Scotland,False
1,1873-03-08,England,Scotland,4,2,Friendly,London,England,False
2,1874-03-07,Scotland,England,2,1,Friendly,Glasgow,Scotland,False
3,1875-03-06,England,Scotland,2,2,Friendly,London,England,False
4,1876-03-04,Scotland,England,3,0,Friendly,Glasgow,Scotland,False


In [3]:
#home team data
results_home = pd.read_csv('results.csv')
results_home['date'] = pd.to_datetime(results_home['date'])
results_home = results_home.rename(columns = {'home_team':'team', 
                                              'away_team':'opponent',
                                              'home_score':'team_score',
                                              'away_score':'opponent_score'})
results_home['h_or_a'] = 'Home'

#away team data
results_away = pd.read_csv('results.csv')
results_away['date'] = pd.to_datetime(results_away['date'])
results_away = results_away.rename(columns = {'home_team':'opponent', 
                                              'away_team':'team',
                                              'home_score':'opponent_score',
                                              'away_score':'team_score'})
results_away = results_away[['date', 'team', 'opponent', 'team_score', 'opponent_score', 
                             'tournament', 'city', 'country', 'neutral']]
results_away['h_or_a'] = 'Away'

#concatenating home and away team data
results = pd.concat([results_home, results_away], axis = 0)
results

Unnamed: 0,date,team,opponent,team_score,opponent_score,tournament,city,country,neutral,h_or_a
0,1872-11-30,Scotland,England,0,0,Friendly,Glasgow,Scotland,False,Home
1,1873-03-08,England,Scotland,4,2,Friendly,London,England,False,Home
2,1874-03-07,Scotland,England,2,1,Friendly,Glasgow,Scotland,False,Home
3,1875-03-06,England,Scotland,2,2,Friendly,London,England,False,Home
4,1876-03-04,Scotland,England,3,0,Friendly,Glasgow,Scotland,False,Home
...,...,...,...,...,...,...,...,...,...,...
44336,2022-12-04,Senegal,England,0,3,FIFA World Cup,Al Khor,Qatar,True,Away
44337,2022-12-05,Croatia,Japan,1,1,FIFA World Cup,Al Wakrah,Qatar,True,Away
44338,2022-12-05,South Korea,Brazil,1,4,FIFA World Cup,Doha,Qatar,True,Away
44339,2022-12-06,Spain,Morocco,0,0,FIFA World Cup,Al Rayyan,Qatar,True,Away


In [4]:
#shape of newly formatted data
results.shape

(88682, 10)

In [5]:
#shape of original data
df.shape

(44341, 9)

In [6]:
#number of matches
results['p'] = 1

#wins
def set_result(row):
    if row["team_score"] > row["opponent_score"]:
        return 1
    else:
        return 0
results = results.assign(w = results.apply(set_result, axis=1))

#draws
def set_result(row):
    if row["team_score"] == row["opponent_score"]:
        return 1
    else:
        return 0
results = results.assign(d = results.apply(set_result, axis=1))

#losses
def set_result(row):
    if row["team_score"] < row["opponent_score"]:
        return 1
    else:
        return 0
results = results.assign(l = results.apply(set_result, axis=1))

results

Unnamed: 0,date,team,opponent,team_score,opponent_score,tournament,city,country,neutral,h_or_a,p,w,d,l
0,1872-11-30,Scotland,England,0,0,Friendly,Glasgow,Scotland,False,Home,1,0,1,0
1,1873-03-08,England,Scotland,4,2,Friendly,London,England,False,Home,1,1,0,0
2,1874-03-07,Scotland,England,2,1,Friendly,Glasgow,Scotland,False,Home,1,1,0,0
3,1875-03-06,England,Scotland,2,2,Friendly,London,England,False,Home,1,0,1,0
4,1876-03-04,Scotland,England,3,0,Friendly,Glasgow,Scotland,False,Home,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44336,2022-12-04,Senegal,England,0,3,FIFA World Cup,Al Khor,Qatar,True,Away,1,0,0,1
44337,2022-12-05,Croatia,Japan,1,1,FIFA World Cup,Al Wakrah,Qatar,True,Away,1,0,1,0
44338,2022-12-05,South Korea,Brazil,1,4,FIFA World Cup,Doha,Qatar,True,Away,1,0,0,1
44339,2022-12-06,Spain,Morocco,0,0,FIFA World Cup,Al Rayyan,Qatar,True,Away,1,0,1,0


In [7]:
#2022 World Cup teams
wc_teams = ['Qatar', 'Ecuador', 'Senegal', 'Netherlands', 
            'England', 'Iran', 'United States', 'Wales',
            'Argentina', 'Saudi Arabia', 'Mexico', 'Poland',
            'France', 'Australia', 'Denmark', 'Tunisia',
            'Spain', 'Costa Rica', 'Germany', 'Japan',
            'Belgium', 'Canada', 'Morocco', 'Croatia',
            'Brazil', 'Serbia', 'Switzerland', 'Cameroon',
            'Portugal', 'Ghana', 'Uruguay', 'South Korea']
results_2022_teams = results[(results['team'].isin(wc_teams)) & 
                             (results['opponent'].isin(wc_teams))]
results_2022_teams

Unnamed: 0,date,team,opponent,team_score,opponent_score,tournament,city,country,neutral,h_or_a,p,w,d,l
10,1879-01-18,England,Wales,2,1,Friendly,London,England,False,Home,1,1,0,0
14,1880-03-15,Wales,England,2,3,Friendly,Wrexham,Wales,False,Home,1,0,0,1
16,1881-02-26,England,Wales,0,1,Friendly,Blackburn,England,False,Home,1,0,0,1
22,1882-03-13,Wales,England,5,3,Friendly,Wrexham,Wales,False,Home,1,1,0,0
24,1883-02-03,England,Wales,5,0,Friendly,London,England,False,Home,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44336,2022-12-04,Senegal,England,0,3,FIFA World Cup,Al Khor,Qatar,True,Away,1,0,0,1
44337,2022-12-05,Croatia,Japan,1,1,FIFA World Cup,Al Wakrah,Qatar,True,Away,1,0,1,0
44338,2022-12-05,South Korea,Brazil,1,4,FIFA World Cup,Doha,Qatar,True,Away,1,0,0,1
44339,2022-12-06,Spain,Morocco,0,0,FIFA World Cup,Al Rayyan,Qatar,True,Away,1,0,1,0


In [8]:
#matches from last World Cup to current World Cup Round of 16
results_cur_all = results_2022_teams[(results_2022_teams['date'] >= '2018-07-16') & 
                                     (results_2022_teams['date'] <= '2022-12-08')]
results_cur_all

Unnamed: 0,date,team,opponent,team_score,opponent_score,tournament,city,country,neutral,h_or_a,p,w,d,l
40446,2018-09-06,Germany,France,0,0,UEFA Nations League,Munich,Germany,False,Home,1,0,1,0
40460,2018-09-06,Portugal,Croatia,1,1,Friendly,Faro-Loulé,Portugal,False,Home,1,0,1,0
40479,2018-09-07,South Korea,Costa Rica,2,0,Friendly,Goyang,South Korea,False,Home,1,1,0,0
40480,2018-09-07,Mexico,Uruguay,1,4,Friendly,Houston,United States,True,Home,1,0,0,1
40484,2018-09-07,United States,Brazil,0,2,Friendly,East Rutherford,United States,False,Home,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44336,2022-12-04,Senegal,England,0,3,FIFA World Cup,Al Khor,Qatar,True,Away,1,0,0,1
44337,2022-12-05,Croatia,Japan,1,1,FIFA World Cup,Al Wakrah,Qatar,True,Away,1,0,1,0
44338,2022-12-05,South Korea,Brazil,1,4,FIFA World Cup,Doha,Qatar,True,Away,1,0,0,1
44339,2022-12-06,Spain,Morocco,0,0,FIFA World Cup,Al Rayyan,Qatar,True,Away,1,0,1,0


In [9]:
#Create Group Stage groups
quarters = ['Netherlands', 'Argentina', 'Croatia', 'Brazil', 'England', 'France', 'Morocco', 'Portugal']
results_q = results_cur_all[(results_cur_all['team'].isin(quarters)) & 
                            (results_cur_all['opponent'].isin(quarters))]
results_q

Unnamed: 0,date,team,opponent,team_score,opponent_score,tournament,city,country,neutral,h_or_a,p,w,d,l
40460,2018-09-06,Portugal,Croatia,1,1,Friendly,Faro-Loulé,Portugal,False,Home,1,0,1,0
40512,2018-09-09,France,Netherlands,2,1,UEFA Nations League,Paris,France,False,Home,1,1,0,0
40603,2018-10-12,Croatia,England,0,0,UEFA Nations League,Rijeka,Croatia,False,Home,1,0,1,0
40710,2018-10-16,Argentina,Brazil,0,1,Superclásico de las Américas,Jeddah,Saudi Arabia,True,Home,1,0,0,1
40748,2018-11-16,Netherlands,France,2,0,UEFA Nations League,Rotterdam,Netherlands,False,Home,1,1,0,0
40797,2018-11-18,England,Croatia,2,1,UEFA Nations League,London,England,False,Home,1,1,0,0
41067,2019-03-26,Morocco,Argentina,0,1,Friendly,Tangier,Morocco,False,Home,1,0,0,1
41146,2019-06-06,England,Netherlands,1,3,UEFA Nations League,Guimarães,Portugal,True,Home,1,0,0,1
41210,2019-06-09,Portugal,Netherlands,1,0,UEFA Nations League,Porto,Portugal,False,Home,1,1,0,0
41376,2019-07-02,Brazil,Argentina,2,0,Copa América,Belo Horizonte,Brazil,False,Home,1,1,0,0


## Poisson Distribution amongst all World Cup Quartefinalteams teams

In [10]:
#create standings table for each team
table = results_q[['team','p','w','d','l','team_score','opponent_score']]
table = table.rename(columns = {'team_score':'gf',
                                            'opponent_score':'ga'})
table = table.groupby(by = 'team').sum().sort_values('team', ascending = True)

table['gd'] = table['gf'] - table['ga']
table['gpg_scored'] = table['gf'] / table['p']
table['gpg_conceded'] = table['ga'] / table['p']

table = table.reset_index()
table

Unnamed: 0,team,p,w,d,l,gf,ga,gd,gpg_scored,gpg_conceded
0,Argentina,6,3,1,2,3,3,0,0.5,0.5
1,Brazil,5,2,1,2,3,2,1,0.6,0.4
2,Croatia,11,1,4,6,10,18,-8,0.909091,1.636364
3,England,4,2,1,1,4,4,0,1.0,1.0
4,France,9,4,3,2,12,10,2,1.333333,1.111111
5,Morocco,2,0,1,1,0,1,-1,0.0,0.5
6,Netherlands,4,2,0,2,6,4,2,1.5,1.0
7,Portugal,7,3,3,1,11,7,4,1.571429,1.0


In [11]:
#tiebreaker
table_gd = table[['team', 'gd']]
table_gd

Unnamed: 0,team,gd
0,Argentina,0
1,Brazil,1
2,Croatia,-8
3,England,0
4,France,2
5,Morocco,-1
6,Netherlands,2
7,Portugal,4


In [12]:
#goals scored per game for all teams
avg_goals = table['gf'].sum() / table['p'].sum()
avg_goals

1.0208333333333333

In [13]:
#goals conceded per game for all teams (should be the same as goals scored per game for all teams)
table['ga'].sum() / table['p'].sum()

1.0208333333333333

In [14]:
#calculate strengths
strengths = results_q.groupby(['team']).mean().sort_values('team', ascending = True)
strengths = strengths.reset_index()
strengths = strengths[['team', 'team_score', 'opponent_score']]
strengths['att'] = strengths['team_score'] / avg_goals
strengths['def'] = strengths['opponent_score'] / avg_goals
strengths = strengths[['team', 'att', 'def']]
strengths_team = strengths.rename(columns = {'att':'team_att',
                                             'def':'team_def'})
strengths_opponent = strengths.rename(columns = {'team':'opponent',
                                                 'att':'opponent_att',
                                                 'def':'opponent_def'})
strengths

Unnamed: 0,team,att,def
0,Argentina,0.489796,0.489796
1,Brazil,0.587755,0.391837
2,Croatia,0.890538,1.602968
3,England,0.979592,0.979592
4,France,1.306122,1.088435
5,Morocco,0.0,0.489796
6,Netherlands,1.469388,0.979592
7,Portugal,1.539359,0.979592


## Actual Round of 16 Teams, Poisson for Remaining Games

In [15]:
#quarter-finals
quarters = {'Match':['57','58','59','60'],
            'Home':['Netherlands', 'Croatia', 'England', 'Morocco'],
            'Away':['Argentina', 'Brazil', 'France', 'Portugal']}
quarters = pd.DataFrame(quarters)

quarters = pd.merge(quarters, strengths, left_on = 'Home', right_on = 'team')
quarters = pd.merge(quarters, strengths, left_on = 'Away', right_on = 'team')
quarters = quarters.drop(['team_x','team_y'], axis = 1)
quarters = quarters.rename(columns = {'att_x':'Home_Att', 'def_x':'Home_Def',
                                      'att_y':'Away_Att', 'def_y':'Away_Def'})

quarters['Home_xG'] = quarters['Home_Att'] * quarters['Away_Def'] * avg_goals
quarters['Away_xG'] = quarters['Away_Att'] * quarters['Home_Def'] * avg_goals
quarters['Home_Score'] = quarters['Home_xG'].round()
quarters['Away_Score'] = quarters['Away_xG'].round()

quarters['Winner'] = np.where(quarters['Home_xG'] > quarters['Away_xG'], quarters['Home'],
                     np.where(quarters['Home_xG'] < quarters['Away_xG'], quarters['Away'], 'Draw'))

quarters

Unnamed: 0,Match,Home,Away,Home_Att,Home_Def,Away_Att,Away_Def,Home_xG,Away_xG,Home_Score,Away_Score,Winner
0,57,Netherlands,Argentina,1.469388,0.979592,0.489796,0.489796,0.734694,0.489796,1.0,0.0,Netherlands
1,58,Croatia,Brazil,0.890538,1.602968,0.587755,0.391837,0.356215,0.961781,0.0,1.0,Brazil
2,59,England,France,0.979592,0.979592,1.306122,1.088435,1.088435,1.306122,1.0,1.0,France
3,60,Morocco,Portugal,0.0,0.489796,1.539359,0.979592,0.0,0.769679,0.0,1.0,Portugal


In [16]:
#semi-finals
semis = {'Match':['61','62'],
         'Home_N':['57','59'],
         'Away_N':['58','60']}
semis = pd.DataFrame(semis)

semis = pd.merge(semis, quarters, left_on = 'Home_N', right_on = 'Match')
semis = pd.merge(semis, quarters, left_on = 'Away_N', right_on = 'Match')
semis = semis[['Match_x','Winner_x','Winner_y']]
semis = semis.rename(columns = {'Match_x':'Match', 'Winner_x':'Home', 'Winner_y':'Away'})

semis = pd.merge(semis, strengths, left_on = 'Home', right_on = 'team')
semis = pd.merge(semis, strengths, left_on = 'Away', right_on = 'team')
#tiebreaker
semis = pd.merge(semis, table_gd, left_on = 'Home', right_on = 'team')
semis = pd.merge(semis, table_gd, left_on = 'Away', right_on = 'team')
semis = semis.drop(['team_x','team_y'], axis = 1)
semis = semis.rename(columns = {'att_x':'Home_Att', 'def_x':'Home_Def',
                                'att_y':'Away_Att', 'def_y':'Away_Def',
                                #tiebreaker
                                'gd_x':'Home_GD', 'gd_y':'Away_GD'})

semis['Home_xG'] = semis['Home_Att'] * semis['Away_Def'] * avg_goals
semis['Away_xG'] = semis['Away_Att'] * semis['Home_Def'] * avg_goals
semis['Home_Score'] = semis['Home_xG'].round()
semis['Away_Score'] = semis['Away_xG'].round()

semis['Winner'] = np.where(semis['Home_xG'] > semis['Away_xG'], semis['Home'],
                  np.where(semis['Home_xG'] < semis['Away_xG'], semis['Away'], 
                  np.where(semis['Home_GD'] > semis['Away_GD'], semis['Home'], 
                  np.where(semis['Home_GD'] < semis['Away_GD'], semis['Away'], 'Draw'))))
semis['Loser'] = np.where(semis['Home_xG'] < semis['Away_xG'], semis['Home'],
                 np.where(semis['Home_xG'] > semis['Away_xG'], semis['Away'],
                 np.where(semis['Home_GD'] < semis['Away_GD'], semis['Home'], 
                 np.where(semis['Home_GD'] > semis['Away_GD'], semis['Away'], 'Draw'))))

semis

Unnamed: 0,Match,Home,Away,Home_Att,Home_Def,Away_Att,Away_Def,Home_GD,Away_GD,Home_xG,Away_xG,Home_Score,Away_Score,Winner,Loser
0,61,Netherlands,Brazil,1.469388,0.979592,0.587755,0.391837,2,1,0.587755,0.587755,1.0,1.0,Netherlands,Brazil
1,62,France,Portugal,1.306122,1.088435,1.539359,0.979592,2,4,1.306122,1.710398,1.0,2.0,Portugal,France


In [17]:
#third place game
third_place = {'Match':['63'],
               'Home_N':['61'],
               'Away_N':['62']}
third_place = pd.DataFrame(third_place)

third_place = pd.merge(third_place, semis, left_on = 'Home_N', right_on = 'Match')
third_place = pd.merge(third_place, semis, left_on = 'Away_N', right_on = 'Match')
third_place = third_place[['Match_x','Loser_x','Loser_y']]
third_place = third_place.rename(columns = {'Match_x':'Match', 'Loser_x':'Home', 'Loser_y':'Away'})

third_place = pd.merge(third_place, strengths, left_on = 'Home', right_on = 'team')
third_place = pd.merge(third_place, strengths, left_on = 'Away', right_on = 'team')
third_place = third_place.drop(['team_x','team_y'], axis = 1)
third_place = third_place.rename(columns = {'att_x':'Home_Att', 'def_x':'Home_Def',
                                            'att_y':'Away_Att', 'def_y':'Away_Def'})

third_place['Home_xG'] = third_place['Home_Att'] * third_place['Away_Def'] * avg_goals
third_place['Away_xG'] = third_place['Away_Att'] * third_place['Home_Def'] * avg_goals
third_place['Home_Score'] = third_place['Home_xG'].round()
third_place['Away_Score'] = third_place['Away_xG'].round()

third_place['Winner'] = np.where(third_place['Home_xG'] > third_place['Away_xG'], third_place['Home'],
                        np.where(third_place['Home_xG'] < third_place['Away_xG'], third_place['Away'], 'Draw'))

third_place

Unnamed: 0,Match,Home,Away,Home_Att,Home_Def,Away_Att,Away_Def,Home_xG,Away_xG,Home_Score,Away_Score,Winner
0,63,Brazil,France,0.587755,0.391837,1.306122,1.088435,0.653061,0.522449,1.0,1.0,Brazil


In [18]:
#final
final = {'Match':['63'],
         'Home_N':['61'],
         'Away_N':['62']}
final = pd.DataFrame(final)

final = pd.merge(final, semis, left_on = 'Home_N', right_on = 'Match')
final = pd.merge(final, semis, left_on = 'Away_N', right_on = 'Match')
final = final[['Match_x','Winner_x','Winner_y']]
final = final.rename(columns = {'Match_x':'Match', 'Winner_x':'Home', 'Winner_y':'Away'})

final = pd.merge(final, strengths, left_on = 'Home', right_on = 'team')
final = pd.merge(final, strengths, left_on = 'Away', right_on = 'team')
final = final.drop(['team_x','team_y'], axis = 1)
final = final.rename(columns = {'att_x':'Home_Att', 'def_x':'Home_Def',
                                'att_y':'Away_Att', 'def_y':'Away_Def'})

final['Home_xG'] = final['Home_Att'] * final['Away_Def'] * avg_goals
final['Away_xG'] = final['Away_Att'] * final['Home_Def'] * avg_goals
final['Home_Score'] = final['Home_xG'].round()
final['Away_Score'] = final['Away_xG'].round()

final['Winner'] = np.where(final['Home_xG'] > final['Away_xG'], final['Home'],
                  np.where(final['Home_xG'] < final['Away_xG'], final['Away'], 'Draw'))

final

Unnamed: 0,Match,Home,Away,Home_Att,Home_Def,Away_Att,Away_Def,Home_xG,Away_xG,Home_Score,Away_Score,Winner
0,63,Netherlands,Portugal,1.469388,0.979592,1.539359,0.979592,1.469388,1.539359,1.0,2.0,Portugal
