# Vanilla Goals Poisson Model

This model was adapted using the blog post from: https://pena.lt/y/2021/06/18/predicting-football-results-using-the-poisson-distribution/

and using data from https://beatthebookie.blog/

In [2]:
import pandas as pd
import requests
import pandas as pd
import json
import os

API_KEY = os.getenv("API_KEY")
url = 'https://data-service.beatthebookie.blog/data'
headers = {"x-api-key": API_KEY}
params = {'division':'Premier League', 'season': ['2023_2024']}
response = requests.get(url, headers=headers, params=params)
json_str = response.content.decode('utf-8')
df = pd.read_json(json_str)
print(df[["match_date", "home_team", "away_team", "home_goals", "home_xgoals", "away_goals", "away_xgoals"]].tail())

     match_date       home_team      away_team  home_goals  home_xgoals  \
375  2024-05-19        Man City       West Ham           3      2.31504   
376  2024-05-19       Liverpool         Wolves           2      5.65937   
377  2024-05-19         Burnley  Nott'm Forest           1      1.08222   
378  2024-05-19  Crystal Palace    Aston Villa           5      2.23189   
379  2024-05-19           Luton         Fulham           2      2.06743   

     away_goals  away_xgoals  
375           1     0.240159  
376           0     0.731254  
377           2     1.579620  
378           0     0.598846  
379           4     1.115980  


  df = pd.read_json(json_str)


In [3]:
import numpy as np
from scipy.stats import poisson


def log_likelihood(
        goals_home_observed,
        goals_away_observed,
        home_attack,
        home_defence,
        away_attack,
        away_defence,
        home_advantage
):
    
    # Using exponential to convert log-liklihood back to real number
    goal_expectation_home = np.exp(home_attack + away_defence + home_advantage)
    goal_expectation_away = np.exp(away_attack + home_defence)

    # If expectation is below 0 then tell model that it's bad by increasing log likelihood
    if goal_expectation_home < 0 or goal_expectation_away < 0: 
        return 10000
    
    # Poisson model to calculate probability that team scores goals observed given the model's expectation
    home_llk = poisson.pmf(goals_home_observed, goal_expectation_home)
    away_llk = poisson.pmf(goals_away_observed, goal_expectation_away)

    # Calculates total log liklihood for the match, the higher the value the better
    log_llk = np.log(home_llk) + np.log(away_llk)

    # Returning negative as we are using minimiser function later.
    return -log_llk

In [4]:
from scipy.optimize import minimize

def fit_poisson_model():
    # Generate list of unique teams in dataframe and get number of teams
    teams = np.sort(np.unique(np.concatenate([df["home_team"], df["away_team"]])))
    n_teams = len(teams)

    # Generate random model default values for attack and defence strength for each team and add a 0.25 value for home advantage
    params = np.concatenate(
        (
            np.random.uniform(0.5, 1.5, (n_teams)), #attack strength
            np.random.uniform(0.5, 1.5, (n_teams)), #defence strength
            [0.25], #home advantage
        )
    )

    def _fit(params, df, teams):
        # Function to fit the model

        # Creates dictionary to attach each team to their attack score (first 20 items), and defence score (next 20 items), and home adv. (last item)
        attack_params = dict(zip(teams, params[:n_teams]))
        defence_params = dict(zip(teams, params[n_teams : (2*n_teams)]))
        home_advantage = params[-1]

        # Create empty array, then loop through dataframe, and apply log_likelihood function with parameters from each row, append to array
        llk = list()
        for idx, row in df.iterrows():
            tmp = log_likelihood(
                row["home_goals"],
                row["away_goals"],
                attack_params[row["home_team"]],
                defence_params[row["home_team"]],
                attack_params[row["away_team"]],
                defence_params[row["away_team"]],
                home_advantage,
            )
            llk.append(tmp)
        
        # Return sum of log likelihood of all games
        return np.sum(llk)

    # Settings for optimization process, 100 iterations, do not display messages during run
    options = {
        "maxiter": 100,
        "disp": False,
    }

    # equality constraint, this maintains baseline or standard scale across all teams
    constraints = [{"type":"eq", "fun": lambda x: sum(x[:n_teams]) - n_teams}]

    # Train model on functions above and data
    res = minimize(
        _fit,
        params,
        args=(df, teams),
        constraints=constraints,
        options=options
    )

    # Processing model results
    model_params = dict(
        zip(
            ["attack_" + team for team in teams]
            + ["defence_" + team for team in teams]
            + ["home_adv"],
            res["x"]
        )
    )

    return model_params

model_params = fit_poisson_model()

  log_llk = np.log(home_llk) + np.log(away_llk)


In [5]:
from pprint import pprint

pprint(model_params)

{'attack_Arsenal': 1.3899655622119784,
 'attack_Aston Villa': 1.2366178633697997,
 'attack_Bournemouth': 0.898953386890275,
 'attack_Brentford': 0.9337334585656074,
 'attack_Brighton': 0.9131433215339007,
 'attack_Burnley': 0.6321821063513595,
 'attack_Chelsea': 1.2514588801510302,
 'attack_Crystal Palace': 0.9455354598542196,
 'attack_Everton': 0.5849274202440973,
 'attack_Fulham': 0.9123508970432253,
 'attack_Liverpool': 1.3435065122244272,
 'attack_Luton': 0.87647956077264,
 'attack_Man City': 1.4478166012833789,
 'attack_Man United': 0.9455774845613523,
 'attack_Newcastle': 1.349806055816141,
 "attack_Nott'm Forest": 0.8015980361062903,
 'attack_Sheffield United': 0.49571468949385095,
 'attack_Tottenham': 1.2098857462355457,
 'attack_West Ham': 1.0105976244795385,
 'attack_Wolves': 0.82014933281134,
 'defence_Arsenal': -1.3865700510862706,
 'defence_Aston Villa': -0.6538570863887184,
 'defence_Bournemouth': -0.5784773017107556,
 'defence_Brentford': -0.6072104685076984,
 'defence_B

In [6]:
def predict(home_team, away_team, params, max_goals=10):

    home_attack = params["attack_" + home_team]
    home_defence = params["defence_" + home_team]
    away_attack = params["attack_" + away_team]
    away_defence = params["defence_" + away_team]
    home_advantage = params["home_adv"]

    home_goal_expectation = np.exp(home_attack + away_defence + home_advantage)
    away_goal_expectation = np.exp(away_attack + home_defence)

    home_probs = poisson.pmf(list(range(max_goals + 1)), home_goal_expectation)
    away_probs = poisson.pmf(range(max_goals + 1), away_goal_expectation)

    # outer product of home_probs and away_probs vectors to create matrix where each entry is product of both probabilities for each scoreline
    probability_matrix = np.outer(home_probs, away_probs)

    return probability_matrix


In [7]:
probs = predict("Tottenham", "Arsenal", model_params)

np.set_printoptions(formatter={'float': '{:0.5f}'.format})
pprint(probs)

array([[0.04486, 0.09350, 0.09744, 0.06769, 0.03527, 0.01470, 0.00511,
        0.00152, 0.00040, 0.00009, 0.00002],
       [0.04575, 0.09536, 0.09938, 0.06905, 0.03598, 0.01500, 0.00521,
        0.00155, 0.00040, 0.00009, 0.00002],
       [0.02333, 0.04864, 0.05068, 0.03521, 0.01835, 0.00765, 0.00266,
        0.00079, 0.00021, 0.00005, 0.00001],
       [0.00793, 0.01654, 0.01723, 0.01197, 0.00624, 0.00260, 0.00090,
        0.00027, 0.00007, 0.00002, 0.00000],
       [0.00202, 0.00422, 0.00439, 0.00305, 0.00159, 0.00066, 0.00023,
        0.00007, 0.00002, 0.00000, 0.00000],
       [0.00041, 0.00086, 0.00090, 0.00062, 0.00032, 0.00014, 0.00005,
        0.00001, 0.00000, 0.00000, 0.00000],
       [0.00007, 0.00015, 0.00015, 0.00011, 0.00006, 0.00002, 0.00001,
        0.00000, 0.00000, 0.00000, 0.00000],
       [0.00001, 0.00002, 0.00002, 0.00002, 0.00001, 0.00000, 0.00000,
        0.00000, 0.00000, 0.00000, 0.00000],
       [0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000,
 

In [8]:
print(f"Probability of a home win: {np.sum(np.tril(probs, -1))}")
print(f"Probability of a draw: {np.sum(np.diag(probs))}")
print(f"Probability of an away win: {np.sum(np.triu(probs, 1))}")

Probability of a home win: 0.17687557402474685
Probability of a draw: 0.20461428613644356
Probability of an away win: 0.6184980046565599
