In [2]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import poisson
# Load the match results into a DataFrame; the path is relative to the working directory.
data = pd.read_csv('World Football Results 2018 to 2022 - data.csv')

In [3]:
# Inspect column names, non-null counts and dtypes to spot empty or mistyped columns.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4096 entries, 0 to 4095
Data columns (total 23 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   year         4096 non-null   int64  
 1   date         4096 non-null   object 
 2   tournament   4096 non-null   object 
 3   h_team       4096 non-null   object 
 4   a_team       4096 non-null   object 
 5   h_elo        4096 non-null   int64  
 6   a_elo        4096 non-null   int64  
 7   h_score      4096 non-null   int64  
 8   a_score      4096 non-null   int64  
 9   location     4096 non-null   object 
 10  neutral      4096 non-null   object 
 11  Unnamed: 11  0 non-null      float64
 12  Unnamed: 12  0 non-null      float64
 13  Unnamed: 13  0 non-null      float64
 14  Unnamed: 14  0 non-null      float64
 15  Unnamed: 15  0 non-null      float64
 16  Unnamed: 16  0 non-null      float64
 17  Unnamed: 17  0 non-null      float64
 18  Unnamed: 18  0 non-null      float64
 19  Unname

In [4]:
# Remove only the columns that are completely empty (the trailing "Unnamed: N"
# columns reported by data.info() with 0 non-null values). how='all' keeps any
# real column that merely contains a few missing values instead of silently
# discarding it; the next cell's isnull().sum() then reports those gaps.
data = data.dropna(axis=1, how='all')

In [5]:
# Count the missing values remaining in each column after the cleanup.
data.isnull().sum()

year          0
date          0
tournament    0
h_team        0
a_team        0
h_elo         0
a_elo         0
h_score       0
a_score       0
location      0
neutral       0
dtype: int64

In [6]:
# Display the cleaned frame (pandas elides the middle rows when rendering).
data

Unnamed: 0,year,date,tournament,h_team,a_team,h_elo,a_elo,h_score,a_score,location,neutral
0,2018,January 2,Gulf Cup,Iraq,United Arab Emirates,1570,1560,0,0,in Kuwait,Yes
1,2018,January 2,Gulf Cup,Oman,Bahrain,1511,1418,1,0,in Kuwait,Yes
2,2018,January 5,Gulf Cup,United Arab Emirates,Oman,1561,1526,0,0,in Kuwait,Yes
3,2018,January 7,Friendly,Sweden,Estonia,1825,1508,1,1,in the United Arab Emirates,Yes
4,2018,January 11,Friendly,Finland,Jordan,1595,1480,2,1,in the United Arab Emirates,Yes
...,...,...,...,...,...,...,...,...,...,...,...
4091,2022,December 23,Southeast Asian Championship,Indonesia,Cambodia,1271,873,2,1,in Indonesia,No
4092,2022,December 24,Southeast Asian Championship,Singapore,Myanmar,1127,918,3,2,in Singapore,No
4093,2022,December 24,Southeast Asian Championship,Malaysia,Laos,1223,651,5,0,in Malaysia,No
4094,2022,December 26,Southeast Asian Championship,Thailand,Philippines,1374,1068,4,0,in Thailand,No


In [7]:
# Build two views of every match: one keyed by the home team, one keyed by the
# away team. Both carry the two score columns so goals for/against can be
# derived for either side.
data_home = data.loc[:, ['h_team', 'h_score', 'a_score']]
data_away = data.loc[:, ['a_team', 'h_score', 'a_score']]

In [8]:
# Give both views the same schema (Team, GoalsScored, GoalsConceded).
# From the home side, h_score is what the team scored; from the away side,
# the roles of the two score columns are swapped.
data_home = data_home.rename(
    {'h_team': 'Team', 'h_score': 'GoalsScored', 'a_score': 'GoalsConceded'},
    axis='columns',
)
data_away = data_away.rename(
    {'a_team': 'Team', 'h_score': 'GoalsConceded', 'a_score': 'GoalsScored'},
    axis='columns',
)

In [9]:
# Stack the home and away views into one long table (one row per team per
# match), then average goals scored and conceded for each team.
team_strength = (
    pd.concat(objs=[data_home, data_away], ignore_index=True)
    .groupby(by='Team')
    .mean()
)
team_strength

Unnamed: 0_level_0,GoalsScored,GoalsConceded
Team,Unnamed: 1_level_1,Unnamed: 2_level_1
Afghanistan,0.947368,1.947368
Albania,1.066667,1.200000
Algeria,2.018868,0.849057
Andorra,0.488889,1.711111
Angola,1.205882,0.735294
...,...,...
Vietnam,1.710526,1.026316
Wales,1.000000,1.098039
Yemen,0.454545,2.090909
Zambia,1.384615,1.442308


In [10]:
def predict_point(home, away, strength=None, max_goals=10):
    """Estimate expected points and outcome probabilities for one match.

    Each side's goal count is modelled as an independent Poisson variable whose
    rate is the team's average goals scored multiplied by the opponent's
    average goals conceded. The probabilities of every scoreline from 0-0 up to
    ``max_goals``-``max_goals`` are accumulated into home win / draw / away win.

    Parameters
    ----------
    home, away : str
        Team names, looked up in the index of ``strength``.
    strength : pandas.DataFrame, optional
        Table indexed by team name with ``GoalsScored`` and ``GoalsConceded``
        columns. Defaults to the notebook-level ``team_strength`` table.
    max_goals : int, optional
        Highest goal count considered per team (inclusive). Scorelines above
        this are ignored, so the three probabilities can sum to slightly less
        than 100.

    Returns
    -------
    list
        ``[(points_home, points_away), (home_win_pct, draw_pct, away_win_pct)]``
        where points use 3 for a win and 1 for a draw, and the percentages are
        on a 0-100 scale. If either team is missing from ``strength`` the
        result is ``[(0, 0), (0, 0, 0)]``.
    """
    if strength is None:
        strength = team_strength

    # Unknown team: there is nothing to base an estimate on. Return an explicit
    # all-zero result instead of referencing probabilities that were never
    # computed (which previously raised UnboundLocalError).
    if home not in strength.index or away not in strength.index:
        return [(0, 0), (0, 0, 0)]

    lamb_home = strength.at[home, 'GoalsScored'] * strength.at[away, 'GoalsConceded']
    lamb_away = strength.at[away, 'GoalsScored'] * strength.at[home, 'GoalsConceded']

    # Evaluate each Poisson pmf once per goal count rather than once per scoreline.
    goal_counts = np.arange(max_goals + 1)
    pmf_home = poisson.pmf(goal_counts, lamb_home)
    pmf_away = poisson.pmf(goal_counts, lamb_away)

    prob_home, prob_away, prob_draw = 0, 0, 0
    for x in range(max_goals + 1):  # number of goals home team
        for y in range(max_goals + 1):  # number of goals away team
            p = pmf_home[x] * pmf_away[y]
            if x == y:
                prob_draw += p
            elif x > y:
                prob_home += p
            else:
                prob_away += p

    points_home = 3 * prob_home + prob_draw
    points_away = 3 * prob_away + prob_draw
    return [(points_home, points_away), (prob_home * 100, prob_draw * 100, prob_away * 100)]

In [11]:
# Example: Vietnam as the home side against Singapore.
predict_point('Vietnam', 'Singapore')

[(2.1109892380372846, 0.7216473605554833),
 (64.80390629075727, 16.687204931456627, 18.492510374697236)]

Vietnam is the predicted winner of the match.
Expected points: Vietnam 2.11 pts, Singapore 0.72 pts


Outcome probabilities: Vietnam win 64.8%, draw 16.7%, Singapore win 18.5%