# 2022 FIFA World Cup Predictions

Kaggle link: https://www.kaggle.com/datasets/martj42/international-football-results-from-1872-to-2017?resource=download

In [1]:
#import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Load the raw match results (one row per match) into `df` and preview
# the first rows to check the column layout.
df = pd.read_csv('results.csv')
df.head()

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral
0,1872-11-30,Scotland,England,0,0,Friendly,Glasgow,Scotland,False
1,1873-03-08,England,Scotland,4,2,Friendly,London,England,False
2,1874-03-07,Scotland,England,2,1,Friendly,Glasgow,Scotland,False
3,1875-03-06,England,Scotland,2,2,Friendly,London,England,False
4,1876-03-04,Scotland,England,3,0,Friendly,Glasgow,Scotland,False


In [3]:
# Reshape the match list into a "long" table with one row per team per match:
# every fixture appears twice, once from each side's point of view.
# The CSV is read and date-parsed a single time; both perspectives are then
# derived from that one frame instead of re-reading the file for each side.
matches = pd.read_csv('results.csv')
matches['date'] = pd.to_datetime(matches['date'])

# Common column layout so the two perspectives line up when concatenated.
team_view_cols = ['date', 'team', 'opponent', 'team_score', 'opponent_score',
                  'tournament', 'city', 'country', 'neutral']

#home team data: the listed home side is `team`
results_home = (
    matches
    .rename(columns={'home_team': 'team',
                     'away_team': 'opponent',
                     'home_score': 'team_score',
                     'away_score': 'opponent_score'})
    [team_view_cols]
    .assign(h_or_a='Home')
)

#away team data: the listed away side is `team`, so names and scores swap
# `assign` returns a new frame, which avoids setting a column on the result
# of a column-subset selection.
results_away = (
    matches
    .rename(columns={'home_team': 'opponent',
                     'away_team': 'team',
                     'home_score': 'opponent_score',
                     'away_score': 'team_score'})
    [team_view_cols]
    .assign(h_or_a='Away')
)

#concatenating home and away team data
# The original row labels are kept, so each label occurs once per perspective.
results = pd.concat([results_home, results_away], axis = 0)
results

Unnamed: 0,date,team,opponent,team_score,opponent_score,tournament,city,country,neutral,h_or_a
0,1872-11-30,Scotland,England,0,0,Friendly,Glasgow,Scotland,False,Home
1,1873-03-08,England,Scotland,4,2,Friendly,London,England,False,Home
2,1874-03-07,Scotland,England,2,1,Friendly,Glasgow,Scotland,False,Home
3,1875-03-06,England,Scotland,2,2,Friendly,London,England,False,Home
4,1876-03-04,Scotland,England,3,0,Friendly,Glasgow,Scotland,False,Home
...,...,...,...,...,...,...,...,...,...,...
44342,2022-12-09,Argentina,Netherlands,2,2,FIFA World Cup,Lusail,Qatar,True,Away
44343,2022-12-10,Portugal,Morocco,0,1,FIFA World Cup,Doha,Qatar,True,Away
44344,2022-12-10,France,England,2,1,FIFA World Cup,Al Khor,Qatar,True,Away
44345,2022-12-13,Croatia,Argentina,0,3,FIFA World Cup,Lusail,Qatar,True,Away


In [4]:
# Shape of the reshaped table built by concatenating the home and away views
# (expect twice as many rows as the original match list).
results.shape

(88694, 10)

In [5]:
# Shape of the original match-level data, for comparison with the reshaped table.
df.shape

(44347, 9)

In [6]:
# Add per-row tallies that are summed later to build standings:
#   p         - matches played (always 1, so a sum counts matches)
#   w / d / l - 1 when the row's team won / drew / lost, otherwise 0
#
# Vectorised comparisons replace three row-wise `apply` passes, each of which
# redefined a function called `set_result` with a different meaning.
# Building all columns in one `assign` also leaves the input frame untouched,
# so re-running the cell is safe.
team_goals = results['team_score']
opponent_goals = results['opponent_score']

results = results.assign(
    p=1,
    #wins
    w=(team_goals > opponent_goals).astype('int64'),
    #draws
    d=(team_goals == opponent_goals).astype('int64'),
    #losses
    l=(team_goals < opponent_goals).astype('int64'),
)

results

Unnamed: 0,date,team,opponent,team_score,opponent_score,tournament,city,country,neutral,h_or_a,p,w,d,l
0,1872-11-30,Scotland,England,0,0,Friendly,Glasgow,Scotland,False,Home,1,0,1,0
1,1873-03-08,England,Scotland,4,2,Friendly,London,England,False,Home,1,1,0,0
2,1874-03-07,Scotland,England,2,1,Friendly,Glasgow,Scotland,False,Home,1,1,0,0
3,1875-03-06,England,Scotland,2,2,Friendly,London,England,False,Home,1,0,1,0
4,1876-03-04,Scotland,England,3,0,Friendly,Glasgow,Scotland,False,Home,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44342,2022-12-09,Argentina,Netherlands,2,2,FIFA World Cup,Lusail,Qatar,True,Away,1,0,1,0
44343,2022-12-10,Portugal,Morocco,0,1,FIFA World Cup,Doha,Qatar,True,Away,1,0,0,1
44344,2022-12-10,France,England,2,1,FIFA World Cup,Al Khor,Qatar,True,Away,1,1,0,0
44345,2022-12-13,Croatia,Argentina,0,3,FIFA World Cup,Lusail,Qatar,True,Away,1,0,0,1


In [7]:
# Teams taking part in the 2022 World Cup (32 names, four per line).
wc_teams = [
    'Qatar', 'Ecuador', 'Senegal', 'Netherlands',
    'England', 'Iran', 'United States', 'Wales',
    'Argentina', 'Saudi Arabia', 'Mexico', 'Poland',
    'France', 'Australia', 'Denmark', 'Tunisia',
    'Spain', 'Costa Rica', 'Germany', 'Japan',
    'Belgium', 'Canada', 'Morocco', 'Croatia',
    'Brazil', 'Serbia', 'Switzerland', 'Cameroon',
    'Portugal', 'Ghana', 'Uruguay', 'South Korea',
]

# Keep only rows where both the team and its opponent are in the list above.
team_in_wc = results['team'].isin(wc_teams)
opponent_in_wc = results['opponent'].isin(wc_teams)
results_2022_teams = results[team_in_wc & opponent_in_wc]
results_2022_teams

Unnamed: 0,date,team,opponent,team_score,opponent_score,tournament,city,country,neutral,h_or_a,p,w,d,l
10,1879-01-18,England,Wales,2,1,Friendly,London,England,False,Home,1,1,0,0
14,1880-03-15,Wales,England,2,3,Friendly,Wrexham,Wales,False,Home,1,0,0,1
16,1881-02-26,England,Wales,0,1,Friendly,Blackburn,England,False,Home,1,0,0,1
22,1882-03-13,Wales,England,5,3,Friendly,Wrexham,Wales,False,Home,1,1,0,0
24,1883-02-03,England,Wales,5,0,Friendly,London,England,False,Home,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44342,2022-12-09,Argentina,Netherlands,2,2,FIFA World Cup,Lusail,Qatar,True,Away,1,0,1,0
44343,2022-12-10,Portugal,Morocco,0,1,FIFA World Cup,Doha,Qatar,True,Away,1,0,0,1
44344,2022-12-10,France,England,2,1,FIFA World Cup,Al Khor,Qatar,True,Away,1,1,0,0
44345,2022-12-13,Croatia,Argentina,0,3,FIFA World Cup,Lusail,Qatar,True,Away,1,0,0,1


In [18]:
# Current-era sample: matches in the listed competitions dated from 2018-08-03
# (when Scaloni took charge) through 2022-12-17, both bounds inclusive.
competitions = ['Copa América', 'FIFA World Cup', 'FIFA World Cup qualification', 'UEFA Euro', 'UEFA Nations League']

in_date_window = results_2022_teams['date'].between('2018-08-03', '2022-12-17')
in_competitions = results_2022_teams['tournament'].isin(competitions)

results_cur_all = results_2022_teams[in_date_window & in_competitions]
results_cur_all

Unnamed: 0,date,team,opponent,team_score,opponent_score,tournament,city,country,neutral,h_or_a,p,w,d,l
505,2019-05-13,Argentina,Uruguay,2,3,Copa América,Rio de Janeiro,Brazil,True,Home,1,0,0,1
508,2019-05-18,Brazil,Argentina,3,1,Copa América,Rio de Janeiro,Brazil,False,Home,1,1,0,0
512,2019-05-26,Brazil,Uruguay,2,2,Copa América,Rio de Janeiro,Brazil,False,Home,1,0,1,0
513,2019-05-29,Brazil,Uruguay,1,0,Copa América,Rio de Janeiro,Brazil,False,Home,1,1,0,0
556,2020-09-12,Uruguay,Argentina,1,1,Copa América,Viña del Mar,Chile,True,Home,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44342,2022-12-09,Argentina,Netherlands,2,2,FIFA World Cup,Lusail,Qatar,True,Away,1,0,1,0
44343,2022-12-10,Portugal,Morocco,0,1,FIFA World Cup,Doha,Qatar,True,Away,1,0,0,1
44344,2022-12-10,France,England,2,1,FIFA World Cup,Al Khor,Qatar,True,Away,1,1,0,0
44345,2022-12-13,Croatia,Argentina,0,3,FIFA World Cup,Lusail,Qatar,True,Away,1,0,0,1


## Poisson Distribution amongst all World Cup teams

In [19]:
# Standings table: one row per team with summed played/won/drawn/lost counts,
# goals for (gf) and against (ga), goal difference and per-game scoring rates.
table = (
    results_cur_all[['team', 'p', 'w', 'd', 'l', 'team_score', 'opponent_score']]
    .rename(columns={'team_score': 'gf', 'opponent_score': 'ga'})
    .groupby('team')
    .sum()
    .sort_index()
    .assign(
        gd=lambda t: t['gf'] - t['ga'],
        gpg_scored=lambda t: t['gf'] / t['p'],
        gpg_conceded=lambda t: t['ga'] / t['p'],
    )
    .reset_index()
)
table

Unnamed: 0,team,p,w,d,l,gf,ga,gd,gpg_scored,gpg_conceded
0,Argentina,24,14,4,6,33,18,15,1.375,0.75
1,Australia,8,2,1,5,5,11,-6,0.625,1.375
2,Belgium,20,11,3,6,34,26,8,1.7,1.3
3,Brazil,22,10,6,6,29,21,8,1.318182,0.954545
4,Cameroon,3,1,1,1,4,4,0,1.333333,1.333333
5,Canada,9,3,2,4,9,11,-2,1.0,1.222222
6,Costa Rica,9,3,1,5,7,15,-8,0.777778,1.666667
7,Croatia,20,5,6,9,24,37,-13,1.2,1.85
8,Denmark,16,6,2,8,19,18,1,1.1875,1.125
9,Ecuador,13,2,5,6,12,20,-8,0.923077,1.538462


In [20]:
# Overall scoring rate: total goals scored across all teams divided by the
# total number of team-matches played.
goals_scored_total = table['gf'].sum()
matches_played_total = table['p'].sum()
avg_goals = goals_scored_total / matches_played_total
avg_goals

1.3121951219512196

In [21]:
# Cross-check: the overall conceded rate should equal the overall scoring rate
# (`avg_goals`), since the same goals are counted from the other side.
goals_conceded_total = table['ga'].sum()
goals_conceded_total / table['p'].sum()

1.3121951219512196

In [22]:
# Attack / defence strength per team, relative to the overall scoring rate:
#   att = team's mean goals scored   / avg_goals
#   def = team's mean goals conceded / avg_goals
#
# Only the two score columns are averaged. Calling `.mean()` on the whole
# grouped frame would also try to average the string columns (opponent,
# tournament, city, ...), which raises a TypeError on pandas >= 2.0.
strengths = (
    results_cur_all
    .groupby('team')[['team_score', 'opponent_score']]
    .mean()
    .sort_index()
    .reset_index()
)
strengths['att'] = strengths['team_score'] / avg_goals
strengths['def'] = strengths['opponent_score'] / avg_goals
strengths = strengths[['team', 'att', 'def']]

# The same table relabelled for joining onto the `team` side or the
# `opponent` side of a fixture.
strengths_team = strengths.rename(columns = {'att':'team_att',
                                             'def':'team_def'})
strengths_opponent = strengths.rename(columns = {'team':'opponent',
                                                 'att':'opponent_att',
                                                 'def':'opponent_def'})
strengths

Unnamed: 0,team,att,def
0,Argentina,1.047862,0.571561
1,Australia,0.476301,1.047862
2,Belgium,1.295539,0.990706
3,Brazil,1.004562,0.727442
4,Cameroon,1.016109,1.016109
5,Canada,0.762082,0.931433
6,Costa Rica,0.59273,1.270136
7,Croatia,0.914498,1.409851
8,Denmark,0.904972,0.857342
9,Ecuador,0.70346,1.172434


In [24]:
# Final: a single fixture, with each side's strengths joined on and expected
# goals computed from them.
final = pd.DataFrame({'Home': ['Argentina'],
                      'Away': ['France']})

# Relabel the strengths table once per side so the joins produce the final
# column names directly.
home_strengths = strengths.rename(columns={'team': 'Home',
                                           'att': 'Home_Att',
                                           'def': 'Home_Def'})
away_strengths = strengths.rename(columns={'team': 'Away',
                                           'att': 'Away_Att',
                                           'def': 'Away_Def'})
final = final.merge(home_strengths, on='Home').merge(away_strengths, on='Away')

# Expected goals = own attack strength * opponent defence strength * overall rate.
home_xg = final['Home_Att'] * final['Away_Def'] * avg_goals
away_xg = final['Away_Att'] * final['Home_Def'] * avg_goals
final = final.assign(Home_xG=home_xg,
                     Away_xG=away_xg,
                     Home_Score=home_xg.round(),
                     Away_Score=away_xg.round())

# The winner is picked from the unrounded expected goals, so it can name a
# side even when the rounded scoreline is level.
home_ahead = final['Home_xG'] > final['Away_xG']
away_ahead = final['Home_xG'] < final['Away_xG']
final['Winner'] = np.where(home_ahead, final['Home'],
                           np.where(away_ahead, final['Away'], 'Draw'))

final

Unnamed: 0,Home,Away,Home_Att,Home_Def,Away_Att,Away_Def,Home_xG,Away_xG,Home_Score,Away_Score,Winner
0,Argentina,France,1.047862,0.571561,1.225958,0.861484,1.18454,0.919468,1.0,1.0,Argentina
