# 2022 FIFA World Cup Predictions

Kaggle link: https://www.kaggle.com/datasets/martj42/international-football-results-from-1872-to-2017?resource=download

In [1]:
#import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
#previewing the data
df = pd.read_csv('results.csv')
df.head()

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral
0,1872-11-30,Scotland,England,0,0,Friendly,Glasgow,Scotland,False
1,1873-03-08,England,Scotland,4,2,Friendly,London,England,False
2,1874-03-07,Scotland,England,2,1,Friendly,Glasgow,Scotland,False
3,1875-03-06,England,Scotland,2,2,Friendly,London,England,False
4,1876-03-04,Scotland,England,3,0,Friendly,Glasgow,Scotland,False


In [3]:
#home team data
results_home = pd.read_csv('results.csv')
results_home['date'] = pd.to_datetime(results_home['date'])
results_home = results_home.rename(columns = {'home_team':'team', 
                                              'away_team':'opponent',
                                              'home_score':'team_score',
                                              'away_score':'opponent_score'})
results_home['h_or_a'] = 'Home'

#away team data
results_away = pd.read_csv('results.csv')
results_away['date'] = pd.to_datetime(results_away['date'])
results_away = results_away.rename(columns = {'home_team':'opponent', 
                                              'away_team':'team',
                                              'home_score':'opponent_score',
                                              'away_score':'team_score'})
results_away = results_away[['date', 'team', 'opponent', 'team_score', 'opponent_score', 
                             'tournament', 'city', 'country', 'neutral']]
results_away['h_or_a'] = 'Away'

#concatenating home and away team data
results = pd.concat([results_home, results_away], axis = 0)
results

Unnamed: 0,date,team,opponent,team_score,opponent_score,tournament,city,country,neutral,h_or_a
0,1872-11-30,Scotland,England,0,0,Friendly,Glasgow,Scotland,False,Home
1,1873-03-08,England,Scotland,4,2,Friendly,London,England,False,Home
2,1874-03-07,Scotland,England,2,1,Friendly,Glasgow,Scotland,False,Home
3,1875-03-06,England,Scotland,2,2,Friendly,London,England,False,Home
4,1876-03-04,Scotland,England,3,0,Friendly,Glasgow,Scotland,False,Home
...,...,...,...,...,...,...,...,...,...,...
44342,2022-12-09,Argentina,Netherlands,2,2,FIFA World Cup,Lusail,Qatar,True,Away
44343,2022-12-10,Portugal,Morocco,0,1,FIFA World Cup,Doha,Qatar,True,Away
44344,2022-12-10,France,England,2,1,FIFA World Cup,Al Khor,Qatar,True,Away
44345,2022-12-13,Croatia,Argentina,0,3,FIFA World Cup,Lusail,Qatar,True,Away


In [4]:
#shape of newly formatted data
results.shape

(88694, 10)

In [5]:
#shape of original data
df.shape

(44347, 9)

In [6]:
#number of matches
results['p'] = 1

#wins
def set_result(row):
    if row["team_score"] > row["opponent_score"]:
        return 1
    else:
        return 0
results = results.assign(w = results.apply(set_result, axis=1))

#draws
def set_result(row):
    if row["team_score"] == row["opponent_score"]:
        return 1
    else:
        return 0
results = results.assign(d = results.apply(set_result, axis=1))

#losses
def set_result(row):
    if row["team_score"] < row["opponent_score"]:
        return 1
    else:
        return 0
results = results.assign(l = results.apply(set_result, axis=1))

results

Unnamed: 0,date,team,opponent,team_score,opponent_score,tournament,city,country,neutral,h_or_a,p,w,d,l
0,1872-11-30,Scotland,England,0,0,Friendly,Glasgow,Scotland,False,Home,1,0,1,0
1,1873-03-08,England,Scotland,4,2,Friendly,London,England,False,Home,1,1,0,0
2,1874-03-07,Scotland,England,2,1,Friendly,Glasgow,Scotland,False,Home,1,1,0,0
3,1875-03-06,England,Scotland,2,2,Friendly,London,England,False,Home,1,0,1,0
4,1876-03-04,Scotland,England,3,0,Friendly,Glasgow,Scotland,False,Home,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44342,2022-12-09,Argentina,Netherlands,2,2,FIFA World Cup,Lusail,Qatar,True,Away,1,0,1,0
44343,2022-12-10,Portugal,Morocco,0,1,FIFA World Cup,Doha,Qatar,True,Away,1,0,0,1
44344,2022-12-10,France,England,2,1,FIFA World Cup,Al Khor,Qatar,True,Away,1,1,0,0
44345,2022-12-13,Croatia,Argentina,0,3,FIFA World Cup,Lusail,Qatar,True,Away,1,0,0,1


In [7]:
#matches from when Scaloni took charge to 2022 World Cup Final (certain competitions)
results_cmp = results[(results['date'] >= '2018-08-03') & 
                      (results['date'] <= '2022-12-17')]
results_cmp

Unnamed: 0,date,team,opponent,team_score,opponent_score,tournament,city,country,neutral,h_or_a,p,w,d,l
493,2018-08-15,Argentina,Uruguay,0,0,Copa Premio Honor Argentino,Buenos Aires,Argentina,False,Home,1,0,1,0
494,2018-08-25,Argentina,Uruguay,2,1,Copa Premio Honor Argentino,Buenos Aires,Argentina,False,Home,1,1,0,0
495,2018-09-15,Norway,Sweden,2,1,Friendly,Kristiania,Norway,False,Home,1,1,0,0
496,2018-09-20,Uruguay,Argentina,1,1,Copa Lipton,Montevideo,Uruguay,False,Home,1,0,1,0
497,2018-09-29,Argentina,Uruguay,2,0,Copa Newton,Buenos Aires,Argentina,False,Home,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44342,2022-12-09,Argentina,Netherlands,2,2,FIFA World Cup,Lusail,Qatar,True,Away,1,0,1,0
44343,2022-12-10,Portugal,Morocco,0,1,FIFA World Cup,Doha,Qatar,True,Away,1,0,0,1
44344,2022-12-10,France,England,2,1,FIFA World Cup,Al Khor,Qatar,True,Away,1,1,0,0
44345,2022-12-13,Croatia,Argentina,0,3,FIFA World Cup,Lusail,Qatar,True,Away,1,0,0,1


In [8]:
#finalists
arg = results_cmp[(results_cmp['team'] == 'Argentina')]
fra = results_cmp[(results_cmp['team'] == 'France')]

## Poisson Distribution amongst all World Cup teams

In [9]:
#create standings table for each team
table_arg = arg[['team','p','w','d','l','team_score','opponent_score']]
table_arg = table_arg.rename(columns = {'team_score':'gf', 'opponent_score':'ga'})
table_arg = table_arg.groupby(by = 'team').sum().sort_values('team', ascending = True)

table_arg['gd'] = table_arg['gf'] - table_arg['ga']
table_arg['gpg_scored'] = table_arg['gf'] / table_arg['p']
table_arg['gpg_conceded'] = table_arg['ga'] / table_arg['p']

table_arg = table_arg.reset_index()
table_arg

Unnamed: 0,team,p,w,d,l,gf,ga,gd,gpg_scored,gpg_conceded
0,Argentina,93,53,21,19,172,81,91,1.849462,0.870968


In [10]:
#goals scored per game for all teams
avg_goals_arg = table_arg['gf'].sum() / table_arg['p'].sum()
avg_goals_arg

1.8494623655913978

In [11]:
#goals conceded per game for all teams
table_arg['ga'].sum() / table_arg['p'].sum()

0.8709677419354839

In [12]:
#calculate strengths
strengths_arg = arg.groupby(['team']).mean().sort_values('team', ascending = True)
strengths_arg = strengths_arg.reset_index()
strengths_arg = strengths_arg[['team', 'team_score', 'opponent_score']]
strengths_arg['att'] = strengths_arg['team_score'] / avg_goals_arg
strengths_arg['def'] = strengths_arg['opponent_score'] / avg_goals_arg
strengths_arg = strengths_arg[['team', 'att', 'def']]
strengths_team_arg = strengths_arg.rename(columns = {'att':'team_att', 'def':'team_def'})
strengths_opponent_arg = strengths_arg.rename(columns = {'team':'opponent',
                                                         'att':'opponent_att',
                                                         'def':'opponent_def'})
strengths_arg

Unnamed: 0,team,att,def
0,Argentina,1.0,0.47093


In [13]:
#create standings table for each team
table_fra = fra[['team','p','w','d','l','team_score','opponent_score']]
table_fra = table_fra.rename(columns = {'team_score':'gf', 'opponent_score':'ga'})
table_fra = table_fra.groupby(by = 'team').sum().sort_values('team', ascending = True)

table_fra['gd'] = table_fra['gf'] - table_fra['ga']
table_fra['gpg_scored'] = table_fra['gf'] / table_fra['p']
table_fra['gpg_conceded'] = table_fra['ga'] / table_fra['p']

table_fra = table_fra.reset_index()
table_fra

Unnamed: 0,team,p,w,d,l,gf,ga,gd,gpg_scored,gpg_conceded
0,France,65,39,13,13,128,79,49,1.969231,1.215385


In [14]:
#goals scored per game for all teams
avg_goals_fra = table_fra['gf'].sum() / table_fra['p'].sum()
avg_goals_fra

1.9692307692307693

In [15]:
#goals conceded per game for all teams (should be the same as goals scored per game for all teams)
table_fra['ga'].sum() / table_fra['p'].sum()

1.2153846153846153

In [16]:
#calculate strengths
strengths_fra = fra.groupby(['team']).mean().sort_values('team', ascending = True)
strengths_fra = strengths_fra.reset_index()
strengths_fra = strengths_fra[['team', 'team_score', 'opponent_score']]
strengths_fra['att'] = strengths_fra['team_score'] / avg_goals_fra
strengths_fra['def'] = strengths_fra['opponent_score'] / avg_goals_fra
strengths_fra = strengths_fra[['team', 'att', 'def']]
strengths_team_fra = strengths_fra.rename(columns = {'att':'team_att', 'def':'team_def'})
strengths_opponent_fra = strengths_fra.rename(columns = {'team':'opponent', 
                                                         'att':'opponent_att',
                                                         'def':'opponent_def'})
strengths_fra

Unnamed: 0,team,att,def
0,France,1.0,0.617187


In [17]:
table = pd.concat([table_arg, table_fra], axis = 0)
table

Unnamed: 0,team,p,w,d,l,gf,ga,gd,gpg_scored,gpg_conceded
0,Argentina,93,53,21,19,172,81,91,1.849462,0.870968
0,France,65,39,13,13,128,79,49,1.969231,1.215385


In [18]:
#goals scored per game for all teams
avg_goals = table['gf'].sum() / table['p'].sum()
avg_goals

1.8987341772151898

In [19]:
#goals conceded per game for all teams
table['ga'].sum() / table['p'].sum()

1.0126582278481013

In [20]:
strengths = pd.concat([strengths_arg, strengths_fra], axis = 0)
strengths

Unnamed: 0,team,att,def
0,Argentina,1.0,0.47093
0,France,1.0,0.617187


In [21]:
#final
final = {'Home':['Argentina'],
         'Away':['France']}
final = pd.DataFrame(final)

final = pd.merge(final, strengths, left_on = 'Home', right_on = 'team')
final = pd.merge(final, strengths, left_on = 'Away', right_on = 'team')
final = final.drop(['team_x','team_y'], axis = 1)
final = final.rename(columns = {'att_x':'Home_Att', 'def_x':'Home_Def',
                                'att_y':'Away_Att', 'def_y':'Away_Def'})

final['Home_xG'] = final['Home_Att'] * final['Away_Def'] * avg_goals
final['Away_xG'] = final['Away_Att'] * final['Home_Def'] * avg_goals
final['Home_Score'] = final['Home_xG'].round()
final['Away_Score'] = final['Away_xG'].round()

final['Winner'] = np.where(final['Home_xG'] > final['Away_xG'], final['Home'],
                  np.where(final['Home_xG'] < final['Away_xG'], final['Away'], 'Draw'))

final

Unnamed: 0,Home,Away,Home_Att,Home_Def,Away_Att,Away_Def,Home_xG,Away_xG,Home_Score,Away_Score,Winner
0,Argentina,France,1.0,0.47093,1.0,0.617187,1.171875,0.894171,1.0,1.0,Argentina
