# 2022 FIFA World Cup Predictions

Kaggle link: https://www.kaggle.com/datasets/martj42/international-football-results-from-1872-to-2017?resource=download

In [1]:
#import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
#load the raw match results (one row per match: date, home/away team and score,
#tournament, city, country, neutral flag) and preview the first five rows
df = pd.read_csv('results.csv')
df.head()

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral
0,1872-11-30,Scotland,England,0,0,Friendly,Glasgow,Scotland,False
1,1873-03-08,England,Scotland,4,2,Friendly,London,England,False
2,1874-03-07,Scotland,England,2,1,Friendly,Glasgow,Scotland,False
3,1875-03-06,England,Scotland,2,2,Friendly,London,England,False
4,1876-03-04,Scotland,England,3,0,Friendly,Glasgow,Scotland,False


In [3]:
#read and parse the match file once; both perspectives below are derived from
#this single frame instead of re-reading 'results.csv' for each side
matches = pd.read_csv('results.csv')
matches['date'] = pd.to_datetime(matches['date'])

#home team data: the home side becomes 'team', the away side 'opponent'
results_home = (matches
                .rename(columns = {'home_team':'team',
                                   'away_team':'opponent',
                                   'home_score':'team_score',
                                   'away_score':'opponent_score'})
                .assign(h_or_a = 'Home'))

#away team data: same matches with the roles swapped; columns are re-ordered
#so that both perspectives share the same layout
results_away = (matches
                .rename(columns = {'home_team':'opponent',
                                   'away_team':'team',
                                   'home_score':'opponent_score',
                                   'away_score':'team_score'})
                [['date', 'team', 'opponent', 'team_score', 'opponent_score',
                  'tournament', 'city', 'country', 'neutral']]
                .assign(h_or_a = 'Away'))

#concatenating home and away team data: every match now appears twice, once per side
#(index labels are kept as-is, so each original row label occurs in both halves)
results = pd.concat([results_home, results_away], axis = 0)
results

Unnamed: 0,date,team,opponent,team_score,opponent_score,tournament,city,country,neutral,h_or_a
0,1872-11-30,Scotland,England,0,0,Friendly,Glasgow,Scotland,False,Home
1,1873-03-08,England,Scotland,4,2,Friendly,London,England,False,Home
2,1874-03-07,Scotland,England,2,1,Friendly,Glasgow,Scotland,False,Home
3,1875-03-06,England,Scotland,2,2,Friendly,London,England,False,Home
4,1876-03-04,Scotland,England,3,0,Friendly,Glasgow,Scotland,False,Home
...,...,...,...,...,...,...,...,...,...,...
44342,2022-12-09,Argentina,Netherlands,2,2,FIFA World Cup,Lusail,Qatar,True,Away
44343,2022-12-10,Portugal,Morocco,0,1,FIFA World Cup,Doha,Qatar,True,Away
44344,2022-12-10,France,England,2,1,FIFA World Cup,Al Khor,Qatar,True,Away
44345,2022-12-13,Croatia,Argentina,0,3,FIFA World Cup,Lusail,Qatar,True,Away


In [4]:
#shape of the reshaped data: one row per team per match (home rows stacked on away rows),
#with the extra h_or_a column
results.shape

(88694, 10)

In [5]:
#shape of the original data (one row per match), for comparison with the reshaped frame
df.shape

(44347, 9)

In [6]:
#outcome flags from the point of view of `team`, computed column-wise rather than
#row by row:
#  p - matches played (always 1 per row)
#  w - 1 if the team outscored its opponent, else 0
#  d - 1 if the scores are equal, else 0
#  l - 1 if the team was outscored, else 0
#comparisons involving a missing score evaluate to False, so such rows get 0 in w, d and l
team_goals = results['team_score']
opponent_goals = results['opponent_score']

results = results.assign(p = 1,
                         w = (team_goals > opponent_goals).astype('int64'),
                         d = (team_goals == opponent_goals).astype('int64'),
                         l = (team_goals < opponent_goals).astype('int64'))

results

Unnamed: 0,date,team,opponent,team_score,opponent_score,tournament,city,country,neutral,h_or_a,p,w,d,l
0,1872-11-30,Scotland,England,0,0,Friendly,Glasgow,Scotland,False,Home,1,0,1,0
1,1873-03-08,England,Scotland,4,2,Friendly,London,England,False,Home,1,1,0,0
2,1874-03-07,Scotland,England,2,1,Friendly,Glasgow,Scotland,False,Home,1,1,0,0
3,1875-03-06,England,Scotland,2,2,Friendly,London,England,False,Home,1,0,1,0
4,1876-03-04,Scotland,England,3,0,Friendly,Glasgow,Scotland,False,Home,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44342,2022-12-09,Argentina,Netherlands,2,2,FIFA World Cup,Lusail,Qatar,True,Away,1,0,1,0
44343,2022-12-10,Portugal,Morocco,0,1,FIFA World Cup,Doha,Qatar,True,Away,1,0,0,1
44344,2022-12-10,France,England,2,1,FIFA World Cup,Al Khor,Qatar,True,Away,1,1,0,0
44345,2022-12-13,Croatia,Argentina,0,3,FIFA World Cup,Lusail,Qatar,True,Away,1,0,0,1


In [7]:
#2022 World Cup teams (32 entries)
wc_teams = [
    'Qatar', 'Ecuador', 'Senegal', 'Netherlands',
    'England', 'Iran', 'United States', 'Wales',
    'Argentina', 'Saudi Arabia', 'Mexico', 'Poland',
    'France', 'Australia', 'Denmark', 'Tunisia',
    'Spain', 'Costa Rica', 'Germany', 'Japan',
    'Belgium', 'Canada', 'Morocco', 'Croatia',
    'Brazil', 'Serbia', 'Switzerland', 'Cameroon',
    'Portugal', 'Ghana', 'Uruguay', 'South Korea',
]

#keep only matches in which BOTH sides are in the list above
team_in_wc = results['team'].isin(wc_teams)
opponent_in_wc = results['opponent'].isin(wc_teams)
results_2022_teams = results[team_in_wc & opponent_in_wc]
results_2022_teams

Unnamed: 0,date,team,opponent,team_score,opponent_score,tournament,city,country,neutral,h_or_a,p,w,d,l
10,1879-01-18,England,Wales,2,1,Friendly,London,England,False,Home,1,1,0,0
14,1880-03-15,Wales,England,2,3,Friendly,Wrexham,Wales,False,Home,1,0,0,1
16,1881-02-26,England,Wales,0,1,Friendly,Blackburn,England,False,Home,1,0,0,1
22,1882-03-13,Wales,England,5,3,Friendly,Wrexham,Wales,False,Home,1,1,0,0
24,1883-02-03,England,Wales,5,0,Friendly,London,England,False,Home,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44342,2022-12-09,Argentina,Netherlands,2,2,FIFA World Cup,Lusail,Qatar,True,Away,1,0,1,0
44343,2022-12-10,Portugal,Morocco,0,1,FIFA World Cup,Doha,Qatar,True,Away,1,0,0,1
44344,2022-12-10,France,England,2,1,FIFA World Cup,Al Khor,Qatar,True,Away,1,1,0,0
44345,2022-12-13,Croatia,Argentina,0,3,FIFA World Cup,Lusail,Qatar,True,Away,1,0,0,1


In [8]:
#matches between 2018-07-16 and 2022-12-14 (both bounds inclusive)
#NOTE(review): the original comment said "Round of 16", but this window also admits the
#World Cup matches dated 2022-12-09 to 2022-12-13 shown in the output - confirm the intended cut-off
#NOTE(review): the first output rows carry index labels in the 490s yet dates in 2018, while
#the file starts in 1872 and ends in 2022 at label 44345 - possibly years parsed into the
#wrong century; verify the raw 'date' values
window_start = '2018-07-16'
window_end = '2022-12-14'
in_window = results_2022_teams['date'].between(window_start, window_end)
results_cur_all = results_2022_teams[in_window]
results_cur_all

Unnamed: 0,date,team,opponent,team_score,opponent_score,tournament,city,country,neutral,h_or_a,p,w,d,l
491,2018-07-18,Uruguay,Argentina,1,1,Copa Premio Honor Uruguayo,Montevideo,Uruguay,False,Home,1,0,1,0
492,2018-07-28,Uruguay,Argentina,3,1,Copa Premio Honor Uruguayo,Montevideo,Uruguay,False,Home,1,1,0,0
493,2018-08-15,Argentina,Uruguay,0,0,Copa Premio Honor Argentino,Buenos Aires,Argentina,False,Home,1,0,1,0
494,2018-08-25,Argentina,Uruguay,2,1,Copa Premio Honor Argentino,Buenos Aires,Argentina,False,Home,1,1,0,0
496,2018-09-20,Uruguay,Argentina,1,1,Copa Lipton,Montevideo,Uruguay,False,Home,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44342,2022-12-09,Argentina,Netherlands,2,2,FIFA World Cup,Lusail,Qatar,True,Away,1,0,1,0
44343,2022-12-10,Portugal,Morocco,0,1,FIFA World Cup,Doha,Qatar,True,Away,1,0,0,1
44344,2022-12-10,France,England,2,1,FIFA World Cup,Al Khor,Qatar,True,Away,1,1,0,0
44345,2022-12-13,Croatia,Argentina,0,3,FIFA World Cup,Lusail,Qatar,True,Away,1,0,0,1


## Poisson Distribution amongst all World Cup teams

In [9]:
#standings table: per-team totals of played/won/drawn/lost and goals for (gf) / against (ga)
standing_cols = ['team', 'p', 'w', 'd', 'l', 'team_score', 'opponent_score']
totals = (results_cur_all[standing_cols]
          .rename(columns = {'team_score':'gf', 'opponent_score':'ga'})
          .groupby(by = 'team')
          .sum()
          .sort_values('team', ascending = True))

#derived columns: goal difference and per-game scoring / conceding rates
table = (totals
         .assign(gd = totals['gf'] - totals['ga'],
                 gpg_scored = totals['gf'] / totals['p'],
                 gpg_conceded = totals['ga'] / totals['p'])
         .reset_index())
table

Unnamed: 0,team,p,w,d,l,gf,ga,gd,gpg_scored,gpg_conceded
0,Argentina,51,24,10,17,77,55,22,1.509804,1.078431
1,Australia,10,2,2,6,6,13,-7,0.6,1.3
2,Belgium,32,16,7,9,52,39,13,1.625,1.21875
3,Brazil,37,22,8,7,61,29,32,1.648649,0.783784
4,Cameroon,9,1,4,4,4,8,-4,0.444444,0.888889
5,Canada,18,7,2,9,20,24,-4,1.111111,1.333333
6,Costa Rica,22,3,5,14,14,39,-25,0.636364,1.772727
7,Croatia,27,8,8,11,32,44,-12,1.185185,1.62963
8,Denmark,25,8,6,11,30,31,-1,1.2,1.24
9,Ecuador,21,3,8,10,21,36,-15,1.0,1.714286


In [10]:
#overall scoring rate: total goals scored divided by total team-matches played, over all teams
goals_scored_total = table['gf'].sum()
team_matches_total = table['p'].sum()
avg_goals = goals_scored_total / team_matches_total
avg_goals

1.296875

In [11]:
#sanity check: goals conceded per game over all teams; every match is counted once per side,
#so this should equal the goals scored per game computed above
conceded_and_played = table[['ga', 'p']].sum()
conceded_and_played['ga'] / conceded_and_played['p']

1.296875

In [12]:
#calculate strengths
#average goals scored / conceded per game for each team; only the two score columns are
#aggregated, because taking the mean of the text, date and boolean columns raises a
#TypeError on pandas >= 2.0 (and a FutureWarning on 1.x)
per_game = (results_cur_all
            .groupby(['team'])[['team_score', 'opponent_score']]
            .mean()
            .sort_values('team', ascending = True)
            .reset_index())

#strengths are expressed relative to the overall scoring rate:
#  att > 1 -> scores more than average; def > 1 -> concedes more than average
strengths = per_game.assign(**{'att': per_game['team_score'] / avg_goals,
                               'def': per_game['opponent_score'] / avg_goals})
strengths = strengths[['team', 'att', 'def']]

#role-specific copies of the same table (team side / opponent side)
strengths_team = strengths.rename(columns = {'att':'team_att',
                                             'def':'team_def'})
strengths_opponent = strengths.rename(columns = {'team':'opponent',
                                                 'att':'opponent_att',
                                                 'def':'opponent_def'})
strengths

Unnamed: 0,team,att,def
0,Argentina,1.164186,0.831562
1,Australia,0.462651,1.00241
2,Belgium,1.253012,0.939759
3,Brazil,1.271247,0.604363
4,Cameroon,0.342704,0.685408
5,Canada,0.85676,1.028112
6,Costa Rica,0.49069,1.366922
7,Croatia,0.913878,1.256582
8,Denmark,0.925301,0.956145
9,Ecuador,0.771084,1.321859


In [13]:
#third place game
third_place = pd.DataFrame({'Home':['Croatia'],
                            'Away':['Morocco']})

#attach each side's strengths, with the columns already named for the role they play
home_side = strengths.rename(columns = {'team':'Home', 'att':'Home_Att', 'def':'Home_Def'})
away_side = strengths.rename(columns = {'team':'Away', 'att':'Away_Att', 'def':'Away_Def'})
third_place = third_place.merge(home_side, on = 'Home').merge(away_side, on = 'Away')

#expected goals = own attack strength x opponent's defence strength x overall scoring rate
third_place = third_place.assign(
    Home_xG = third_place['Home_Att'].mul(third_place['Away_Def']).mul(avg_goals),
    Away_xG = third_place['Away_Att'].mul(third_place['Home_Def']).mul(avg_goals))

#predicted scoreline is the rounded xG; the winner is decided on the unrounded xG,
#so a level scoreline can still come with a named winner
third_place = third_place.assign(Home_Score = third_place['Home_xG'].round(),
                                 Away_Score = third_place['Away_xG'].round())

home_ahead = third_place['Home_xG'] > third_place['Away_xG']
away_ahead = third_place['Home_xG'] < third_place['Away_xG']
third_place['Winner'] = np.where(home_ahead, third_place['Home'],
                                 np.where(away_ahead, third_place['Away'], 'Draw'))

third_place

Unnamed: 0,Home,Away,Home_Att,Home_Def,Away_Att,Away_Def,Home_xG,Away_xG,Home_Score,Away_Score,Winner
0,Croatia,Morocco,0.913878,1.256582,0.771084,0.44062,0.522216,1.256582,1.0,1.0,Morocco


In [14]:
#final
final = pd.DataFrame({'Home':['Argentina'],
                      'Away':['France']})

#look up attack/defence strengths for each finalist, named by the side they are listed on
home_strengths = strengths.rename(columns = {'team':'Home', 'att':'Home_Att', 'def':'Home_Def'})
away_strengths = strengths.rename(columns = {'team':'Away', 'att':'Away_Att', 'def':'Away_Def'})
final = final.merge(home_strengths, on = 'Home').merge(away_strengths, on = 'Away')

#expected goals = own attack strength x opponent's defence strength x overall scoring rate
final = final.assign(
    Home_xG = final['Home_Att'].mul(final['Away_Def']).mul(avg_goals),
    Away_xG = final['Away_Att'].mul(final['Home_Def']).mul(avg_goals))

#rounded xG gives the predicted scoreline; the winner is taken from the unrounded xG
final = final.assign(Home_Score = final['Home_xG'].round(),
                     Away_Score = final['Away_xG'].round())

home_favoured = final['Home_xG'] > final['Away_xG']
away_favoured = final['Home_xG'] < final['Away_xG']
final['Winner'] = np.where(home_favoured, final['Home'],
                           np.where(away_favoured, final['Away'], 'Draw'))

final

Unnamed: 0,Home,Away,Home_Att,Home_Def,Away_Att,Away_Def,Home_xG,Away_xG,Home_Score,Away_Score,Winner
0,Argentina,France,1.164186,0.831562,1.204819,1.012048,1.527994,1.299315,2.0,1.0,Argentina
