# Vanilla Goals Poisson Model

This model was adapted using the blog post from: https://pena.lt/y/2021/06/18/predicting-football-results-using-the-poisson-distribution/

and using data from https://beatthebookie.blog/

In [1]:
import pandas as pd
import requests
import pandas as pd
import json
import os

API_KEY = os.getenv("API_KEY")
url = 'https://data-service.beatthebookie.blog/data'
headers = {"x-api-key": API_KEY}
params = {'division':'Premier League', 'season': ['2023_2024']}
response = requests.get(url, headers=headers, params=params)
json_str = response.content.decode('utf-8')
df = pd.read_json(json_str)
print(df[["match_date", "home_team", "away_team", "home_goals", "home_xgoals", "away_goals", "away_xgoals"]].tail())

     match_date       home_team      away_team  home_goals  home_xgoals  \
375  2024-05-19        Man City       West Ham           3      2.31504   
376  2024-05-19       Liverpool         Wolves           2      5.65937   
377  2024-05-19         Burnley  Nott'm Forest           1      1.08222   
378  2024-05-19  Crystal Palace    Aston Villa           5      2.23189   
379  2024-05-19           Luton         Fulham           2      2.06743   

     away_goals  away_xgoals  
375           1     0.240159  
376           0     0.731254  
377           2     1.579620  
378           0     0.598846  
379           4     1.115980  


  df = pd.read_json(json_str)


In [2]:
import numpy as np
from scipy.stats import poisson


def log_likelihood(
        goals_home_observed,
        goals_away_observed,
        home_attack,
        home_defence,
        away_attack,
        away_defence,
        home_advantage
):
    
    # Using exponential to convert log-liklihood back to real number
    goal_expectation_home = np.exp(home_attack + away_defence + home_advantage)
    goal_expectation_away = np.exp(away_attack + home_defence)

    # If expectation is below 0 then tell model that it's bad by increasing log likelihood
    if goal_expectation_home < 0 or goal_expectation_away < 0: 
        return 10000
    
    # Poisson model to calculate probability that team scores goals observed given the model's expectation
    home_llk = poisson.pmf(goals_home_observed, goal_expectation_home)
    away_llk = poisson.pmf(goals_away_observed, goal_expectation_away)

    # Calculates total log liklihood for the match, the higher the value the better
    log_llk = np.log(home_llk) + np.log(away_llk)

    # Returning negative as we are using minimiser function later.
    return -log_llk

In [3]:
from scipy.optimize import minimize

def fit_poisson_model():
    # Generate list of unique teams in dataframe and get number of teams
    teams = np.sort(np.unique(np.concatenate([df["home_team"], df["away_team"]])))
    n_teams = len(teams)

    # Generate random model default values for attack and defence strength for each team and add a 0.25 value for home advantage
    params = np.concatenate(
        (
            np.random.uniform(0.5, 1.5, (n_teams)), #attack strength
            np.random.uniform(0.5, 1.5, (n_teams)), #defence strength
            [0.25], #home advantage
        )
    )

    def _fit(params, df, teams):
        # Function to fit the model

        # Creates dictionary to attach each team to their attack score (first 20 items), and defence score (next 20 items), and home adv. (last item)
        attack_params = dict(zip(teams, params[:n_teams]))
        defence_params = dict(zip(teams, params[n_teams : (2*n_teams)]))
        home_advantage = params[-1]

        # Create empty array, then loop through dataframe, and apply log_likelihood function with parameters from each row, append to array
        llk = list()
        for idx, row in df.iterrows():
            tmp = log_likelihood(
                row["home_goals"],
                row["away_goals"],
                attack_params[row["home_team"]],
                defence_params[row["home_team"]],
                attack_params[row["away_team"]],
                defence_params[row["away_team"]],
                home_advantage,
            )
            llk.append(tmp)
        
        # Return sum of log likelihood of all games
        return np.sum(llk)

    # Settings for optimization process, 100 iterations, do not display messages during run
    options = {
        "maxiter": 100,
        "disp": False,
    }

    # equality constraint, this maintains baseline or standard scale across all teams
    constraints = [{"type":"eq", "fun": lambda x: sum(x[:n_teams]) - n_teams}]

    # Train model on functions above and data
    res = minimize(
        _fit,
        params,
        args=(df, teams),
        constraints=constraints,
        options=options
    )

    # Processing model results
    model_params = dict(
        zip(
            ["attack_" + team for team in teams]
            + ["defence_" + team for team in teams]
            + ["home_adv"],
            res["x"]
        )
    )

    return model_params

model_params = fit_poisson_model()

  log_llk = np.log(home_llk) + np.log(away_llk)


In [None]:
from pprint import pprint

pprint(model_params)

{'attack_Arsenal': 1.3899716350875886,
 'attack_Aston Villa': 1.2366280435882093,
 'attack_Bournemouth': 0.898991905827364,
 'attack_Brentford': 0.9337552978328465,
 'attack_Brighton': 0.9131792454920687,
 'attack_Burnley': 0.6321950742964448,
 'attack_Chelsea': 1.2514620417343691,
 'attack_Crystal Palace': 0.9455659727310841,
 'attack_Everton': 0.58493678130215,
 'attack_Fulham': 0.9122861775739516,
 'attack_Liverpool': 1.3435062214735645,
 'attack_Luton': 0.8764782206650822,
 'attack_Man City': 1.4478199197433328,
 'attack_Man United': 0.9455378321900526,
 'attack_Newcastle': 1.3498141262155285,
 "attack_Nott'm Forest": 0.8015870091868306,
 'attack_Sheffield United': 0.4957225473670446,
 'attack_Tottenham': 1.2098749956482573,
 'attack_West Ham': 1.0105592880895355,
 'attack_Wolves': 0.8201276639546942,
 'defence_Arsenal': -1.386554394744862,
 'defence_Aston Villa': -0.6538088362036165,
 'defence_Bournemouth': -0.5784861810039897,
 'defence_Brentford': -0.6071690800957636,
 'defence_

In [None]:
def predict(home_team, away_team, params, max_goals=10):

    home_attack = params["attack_" + home_team]
    home_defence = params["defence_" + home_team]
    away_attack = params["attack_" + away_team]
    away_defence = params["defence_" + away_team]
    home_advantage = params["home_adv"]

    home_goal_expectation = np.exp(home_attack + away_defence + home_advantage)
    away_goal_expectation = np.exp(away_attack + home_defence)

    home_probs = poisson.pmf(list(range(max_goals + 1)), home_goal_expectation)
    away_probs = poisson.pmf(range(max_goals + 1), away_goal_expectation)

    # outer product of home_probs and away_probs vectors to create matrix where each entry is product of both probabilities for each scoreline
    probability_matrix = np.outer(home_probs, away_probs)

    return probability_matrix


In [None]:
probs = predict("Arsenal", "Tottenham", model_params)

np.set_printoptions(formatter={'float': '{:0.5f}'.format})
pprint(probs)

array([[0.03422, 0.02868, 0.01202, 0.00336, 0.00070, 0.00012, 0.00002,
        0.00000, 0.00000, 0.00000, 0.00000],
       [0.08682, 0.07276, 0.03049, 0.00852, 0.00178, 0.00030, 0.00004,
        0.00001, 0.00000, 0.00000, 0.00000],
       [0.11012, 0.09229, 0.03867, 0.01080, 0.00226, 0.00038, 0.00005,
        0.00001, 0.00000, 0.00000, 0.00000],
       [0.09312, 0.07804, 0.03270, 0.00913, 0.00191, 0.00032, 0.00004,
        0.00001, 0.00000, 0.00000, 0.00000],
       [0.05905, 0.04949, 0.02074, 0.00579, 0.00121, 0.00020, 0.00003,
        0.00000, 0.00000, 0.00000, 0.00000],
       [0.02996, 0.02511, 0.01052, 0.00294, 0.00062, 0.00010, 0.00001,
        0.00000, 0.00000, 0.00000, 0.00000],
       [0.01267, 0.01062, 0.00445, 0.00124, 0.00026, 0.00004, 0.00001,
        0.00000, 0.00000, 0.00000, 0.00000],
       [0.00459, 0.00385, 0.00161, 0.00045, 0.00009, 0.00002, 0.00000,
        0.00000, 0.00000, 0.00000, 0.00000],
       [0.00146, 0.00122, 0.00051, 0.00014, 0.00003, 0.00001, 0.00000,
 

In [None]:
print(f"Probability of a home win: {np.sum(np.tril(probs, -1))}")
print(f"Probability of a draw: {np.sum(np.diag(probs))}")
print(f"Probability of an away win: {np.sum(np.triu(probs, 1))}")

Probability of a home win: 0.741746144836529
Probability of a draw: 0.15610944406975652
Probability of an away win: 0.10207439507613661
