# Football Betting using FiveThirtyEight's Soccer Power Index
## Goal
Backtest a strategy of using the FiveThirtyEight's Soccer Power Index for betting.

## Imports

In [1]:
import concurrent.futures
import io
import json
import os
import warnings
from datetime import datetime

import numpy as np
import optuna
import optuna.logging
import pandas as pd
import requests

warnings.filterwarnings("ignore")
optuna.logging.set_verbosity(optuna.logging.WARN)

ROOT_DIR = os.getcwd()
DATA_DIR = os.path.join(ROOT_DIR, "data")


  from .autonotebook import tqdm as notebook_tqdm


## FiveThirtyEight
Download Soccer Power Index dataset.

In [2]:
SPI_SOCCER_URL = "https://projects.fivethirtyeight.com/soccer-api/club/spi_matches.csv"

content = requests.get(SPI_SOCCER_URL, verify=False).content
df_spi = pd.read_csv(io.StringIO(content.decode()))
df_spi["date"] = pd.to_datetime(df_spi["date"], format="%Y-%m-%d")

assert df_spi["date"].isna().sum() == 0

df_spi.sample(5)


Unnamed: 0,season,date,league_id,league,team1,team2,spi1,spi2,prob1,prob2,...,importance1,importance2,score1,score2,xg1,xg2,nsxg1,nsxg2,adj_score1,adj_score2
45940,2021,2021-09-13,2411,Barclays Premier League,Everton,Burnley,75.0,63.65,0.5412,0.2121,...,32.3,36.2,3.0,1.0,1.66,1.92,1.18,1.19,3.15,1.05
13539,2018,2018-09-22,1832,Belgian Jupiler League,St. Truidense,Antwerp,43.91,51.11,0.3584,0.3289,...,6.2,19.3,2.0,0.0,,,,,,
53840,2021,2022-04-16,1844,French Ligue 2,Dunkerque,Grenoble,19.1,22.56,0.3263,0.3127,...,86.0,73.1,0.0,3.0,,,,,,
54972,2021,2022-05-11,1837,Danish SAS-Ligaen,Randers FC,AaB,38.95,43.13,0.3768,0.3538,...,0.0,0.0,2.0,2.0,,,,,,
22486,2018,2019-06-04,1871,Spanish Segunda Division,Albacete,Málaga,39.29,40.95,0.4136,0.2572,...,20.7,37.3,1.0,2.0,,,,,,


## Football-Data.co.uk
[football-data.co.uk](https://www.football-data.co.uk) is a website that provides historical betting odds for many soccer leagues.

In [4]:
FOOTBALL_DATA_MAIN_URL = "https://www.football-data.co.uk/mmz4281/{season}/{league}.csv"
LEAGUES = {
    "E0": ["Barclays Premier League"],
    "E1": ["English League Championship"],
    "E2": ["English League One"],
    "E3": ["English League Two"],
    "SC0": ["Scottish Premiership"],
    "D1": ["German Bundesliga"],
    "D2": ["German 2. Bundesliga"],
    "I1": ["Italy Serie A"],
    "I2": ["Italy Serie B"],
    "SP1": ["Spanish Primera Division"],
    "SP2": ["Spanish Segunda Division"],
    "F1": ["French Ligue 1"],
    "F2": ["French Ligue 2"],
    "N1": ["Dutch Eredivisie"],
    "B1": ["Belgian Jupiler League"],
    "P1": ["Portuguese Liga"],
    "T1": ["Turkish Turkcell Super Lig"],
    "G1": ["Greek Super League"],
}

FOOTBALL_DATA_OTHER_URL = "https://www.football-data.co.uk/new/{league}.csv"
OTHER_LEAGUES = {
    "ARG": ["Argentina Primera Division"],
    "AUT": ["Austrian T-Mobile Bundesliga"],
    "BRA": ["Brasileiro Série A"],
    "CHN": ["Chinese Super League"],
    "DNK": ["Danish SAS-Ligaen"],
    "JPN": ["Japanese J League"],
    "MEX": [
        "Mexican Primera Division Torneo Apertura",
        "Mexican Primera Division Torneo Clausura",
    ],
    "NOR": ["Norwegian Tippeligaen"],
    "RUS": ["Russian Premier Liga"],
    "SWE": ["Swedish Allsvenskan"],
    "SWZ": ["Swiss Raiffeisen Super League"],
}


def url_to_pandas(url):
    """Download URL content to a pandas dataframe."""
    content = requests.get(url, verify=False).content
    data = pd.read_csv(io.StringIO(content.decode(encoding="latin1")))
    data = data.dropna(how="all", axis=0)
    data = data.dropna(how="all", axis=1)
    data["URL"] = url
    return data


def get_football_data_main(year, league):
    """Get football data."""
    season = str(year - 1)[-2:] + str(year)[-2:]
    url = FOOTBALL_DATA_MAIN_URL.format(season=season, league=league)
    data = url_to_pandas(url)
    data["Season"] = season
    return data


def get_football_data_other(league):
    """Get football data."""
    url = FOOTBALL_DATA_OTHER_URL.format(league=league)
    data = url_to_pandas(url)
    data["Div"] = league
    return data


with concurrent.futures.ThreadPoolExecutor() as executor:
    futures = [
        executor.submit(get_football_data_main, year=year, league=league)
        for league in LEAGUES.keys()
        for year in range(2023, 2015, -1)
    ]
    results_main = [
        future.result() for future in concurrent.futures.as_completed(futures)
    ]

    futures = [
        executor.submit(get_football_data_other, league=league)
        for league in OTHER_LEAGUES.keys()
    ]
    results_other = [
        future.result() for future in concurrent.futures.as_completed(futures)
    ]

df_bet_main = pd.concat(results_main)
df_bet_other = pd.concat(results_other)


Rename columns and fill gaps to match main and other leagues.

In [5]:
df_bet_main["MaxH"] = df_bet_main["MaxH"].fillna(df_bet_main["BbMxH"])
df_bet_main["MaxD"] = df_bet_main["MaxD"].fillna(df_bet_main["BbMxD"])
df_bet_main["MaxA"] = df_bet_main["MaxA"].fillna(df_bet_main["BbMxA"])
df_bet_main["AvgH"] = df_bet_main["AvgH"].fillna(df_bet_main["BbAvH"])
df_bet_main["AvgD"] = df_bet_main["AvgD"].fillna(df_bet_main["BbAvD"])
df_bet_main["AvgA"] = df_bet_main["AvgA"].fillna(df_bet_main["BbAvA"])

df_bet_other = df_bet_other.rename(
    columns={
        "Home": "HomeTeam",
        "Away": "AwayTeam",
        "HG": "FTHG",
        "AG": "FTAG",
        "Res": "FTR",
        "PH": "PSH",
        "PD": "PSD",
        "PA": "PSA",
    }
)

df_bet = pd.concat([df_bet_main, df_bet_other])


Dates comes in differents formats depending on the year.

In [6]:
date1 = pd.to_datetime(df_bet["Date"], format="%d/%m/%Y", errors="coerce")
date2 = pd.to_datetime(df_bet["Date"], format="%d/%m/%y", errors="coerce")
df_bet["Date"] = date1.fillna(date2)

assert df_bet["Date"].isna().sum() == 0

df_bet.sample(5)


Unnamed: 0,Div,Date,Time,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,...,BbAvAHH,BbMxAHA,BbAvAHA,LBH,LBD,LBA,HFKC,AFKC,Country,League
51,E3,2021-08-28,15:00,Leyton Orient,Bradford,2.0,0.0,H,0.0,0.0,...,,,,,,,,,,
154,B1,2020-12-20,15:00,Oud-Heverlee Leuven,Mechelen,1.0,2.0,A,0.0,2.0,...,,,,,,,,,,
93,E1,2017-09-16,,Sheffield United,Norwich,0.0,1.0,A,0.0,1.0,...,2.1,1.81,1.77,2.45,3.4,2.75,,,,
399,E1,2017-02-25,,Preston,QPR,2.0,1.0,H,1.0,1.0,...,2.06,1.83,1.79,2.05,3.25,3.75,,,,
165,N1,2016-01-23,,Heerenveen,Willem II,3.0,1.0,H,0.0,1.0,...,2.02,1.87,1.82,2.0,3.5,3.5,,,,


Each data source uses different ways of writing the same clubs names. I will use a dict to make names the same.

In [7]:
# from thefuzz import fuzz
# import networkx as nx

# # Uncomment this block to generate empty names dict

# frames = [df_spi["team1"], df_spi["team2"], df_bet["HomeTeam"], df_bet["AwayTeam"]]
# names = pd.concat(frames).drop_duplicates()

# pairs = [
#     (name, other_name)
#     for name in names
#     for other_name in names
#     if fuzz.partial_ratio(name, other_name) > 90
# ]

# graph = nx.Graph()
# graph.add_edges_from(pairs)

# clusters = [list(cluster) for cluster in nx.connected_components(graph)]
# clusters = {cluster[0]: cluster for cluster in clusters}
# (
#     pd.Series(clusters)
#     .sort_index()
#     .to_json(os.path.join(DATA_DIR, "names.json"), force_ascii=False)
# )


In [8]:
with open(os.path.join(DATA_DIR, "names.json"), encoding="utf-8") as file:
    names_dict = json.load(file)


names_dict = {
    name.strip(): i.strip() for i, name_list in names_dict.items() for name in name_list
}


def clean(series, translate_dict):
    """Clean text in pandas series."""
    return series.str.strip().apply(
        lambda x: translate_dict[x] if x in translate_dict else x
    )


df_spi["home"] = clean(df_spi["team1"], names_dict)
df_spi["away"] = clean(df_spi["team2"], names_dict)
df_bet["home"] = clean(df_bet["HomeTeam"], names_dict)
df_bet["away"] = clean(df_bet["AwayTeam"], names_dict)


Odds dates are no consistent as FiveThirtyEight, the dates sometimes does not match due to timezone.

To fix this, I will make copies changing the date for the date after and before. This will make sure that it find the right match.

In [9]:
df_bet_before = df_bet.copy()
df_bet_after = df_bet.copy()

df_bet_before["Date"] = df_bet_before["Date"] - pd.Timedelta(days=1)
df_bet_after["Date"] = df_bet_after["Date"] + pd.Timedelta(days=1)

df_bet_expanded = pd.concat((df_bet, df_bet_before, df_bet_after))


Merge datasets.

In [10]:
df_bet_expanded["dt"] = df_bet_expanded["Date"].dt.date
df_spi["dt"] = df_spi["date"].dt.date
df = df_spi.merge(
    df_bet_expanded,
    how="inner",
    on=["dt", "home", "away"],
    validate="1:1",
)
df = df.drop(columns=["HomeTeam", "AwayTeam", "team1", "team2"])


Add each results point of view to the dataset.

In [11]:
df_home = df.copy()
df_home["club"] = df["home"]
df_home["opponent"] = df["away"]
df_home["pov"] = "home"
df_home["outcome"] = df["FTR"] == "H"
df_home["prob"] = df["prob1"]
df_home["Avg"] = df["AvgH"]
df_home["Max"] = df["MaxH"]
df_home["B365"] = df["B365H"]
df_home["PS"] = df["PSH"]


In [12]:
df_away = df.copy()
df_away["club"] = df["away"]
df_away["opponent"] = df["home"]
df_away["pov"] = "away"
df_away["outcome"] = df["FTR"] == "A"
df_away["prob"] = df["prob2"]
df_away["Avg"] = df["AvgA"]
df_away["Max"] = df["MaxA"]
df_away["B365"] = df["B365A"]
df_away["PS"] = df["PSA"]


In [13]:
df_draw = df.copy()
df_draw["club"] = df["home"]
df_draw["opponent"] = df["away"]
df_draw["pov"] = "draw"
df_draw["outcome"] = df["FTR"] == "D"
df_draw["prob"] = df["probtie"]
df_draw["Avg"] = df["AvgD"]
df_draw["Max"] = df["MaxD"]
df_draw["B365"] = df["B365D"]
df_draw["PS"] = df["PSD"]


In [14]:
df = pd.concat((df_home, df_away, df_draw))[
    [
        "date",
        "league",
        "club",
        "opponent",
        "pov",
        "outcome",
        "prob",
        "Avg",
        "Max",
        "B365",
        "PS",
    ]
]

df.sample(5)


Unnamed: 0,date,league,club,opponent,pov,outcome,prob,Avg,Max,B365,PS
23872,2019-12-21,German 2. Bundesliga,Arminia,FC St. Pauli,away,False,0.4411,2.22,2.3,2.2,2.26
30428,2020-11-24,English League One,Peterborough United,Plymouth,draw,False,0.2126,3.75,4.0,3.8,3.98
33385,2021-02-20,Austrian T-Mobile Bundesliga,Ried,Hartberg,away,False,0.2415,2.75,2.92,,2.83
4774,2017-11-03,Italy Serie B,Palermo,Pescara,away,False,0.3551,3.05,3.3,3.1,3.3
5465,2017-12-02,Turkish Turkcell Super Lig,Yeni Malatyaspor,Akhisarspor,away,False,0.1886,4.55,5.03,4.75,5.03


In [16]:
df["KellyAvg"] = (((df["Avg"] - 1) * df["prob"]) - (1 - df["prob"])) / (df["Avg"] - 1)
df["KellyMax"] = (((df["Max"] - 1) * df["prob"]) - (1 - df["prob"])) / (df["Avg"] - 1)
df["KellyB365"] = (((df["B365"] - 1) * df["prob"]) - (1 - df["prob"])) / (df["Avg"] - 1)
df["KellyPS"] = (((df["PS"] - 1) * df["prob"]) - (1 - df["prob"])) / (df["Avg"] - 1)

# +EV Betting

In [21]:
df["EVAvg"] = df["prob"] * (df["Avg"] - 1) - (1 - df["prob"])
df["EVMax"] = df["prob"] * (df["Max"] - 1) - (1 - df["prob"])
df["EVB365"] = df["prob"] * (df["B365"] - 1) - (1 - df["prob"])
df["EVPS"] = df["prob"] * (df["PS"] - 1) - (1 - df["prob"])

df.sample(5)


Unnamed: 0,date,league,club,opponent,pov,outcome,prob,Avg,Max,B365,PS,KellyAvg,KellyMax,KellyB365,KellyPS,EVAvg,EVMax,EVB365,EVPS
38531,2021-09-20,Portuguese Liga,Tondela,Sp Braga,away,False,0.1473,8.94,10.0,8.0,9.75,0.039907,0.059572,0.022469,0.054934,0.316862,0.473,0.1784,0.436175
36728,2021-07-10,Swedish Allsvenskan,Varberg,AIK,away,False,0.2232,6.18,6.84,,6.78,0.073239,0.101677,,0.099092,0.379376,0.526688,,0.513296
42581,2022-02-05,French Ligue 2,Quevilly Rouen,Grenoble,home,True,0.3038,2.5,2.63,2.5,2.6,-0.160333,-0.134004,-0.160333,-0.14008,-0.2405,-0.201006,-0.2405,-0.21012
46441,2022-07-15,German 2. Bundesliga,Hannover 96,1. FCK,away,False,0.4228,2.8,2.9,2.8,2.88,0.102133,0.125622,0.102133,0.120924,0.18384,0.22612,0.18384,0.217664
19193,2019-07-14,Brasileiro Série A,Cruzeiro,Botafogo,draw,True,0.2443,3.6,3.8,,3.69,-0.046354,-0.027562,,-0.037897,-0.12052,-0.07166,,-0.098533


In [29]:
def simulate(data, bookmaker="Avg", threshold=0.0):
    """Simulate +EV betting."""
    return (data[f"EV{bookmaker}"] > threshold) * (data[f"Kelly{bookmaker}"] * data["outcome"] * data[bookmaker] - data[f"Kelly{bookmaker}"])


results = simulate(df, bookmaker="Avg", threshold=0.0)
results.sum()


-230.77491205367573

Now let's find out the best threshold and leagues combination that maximizes profits.

In [30]:
def objective(trial):
    """Optuna objective."""
    threshold = trial.suggest_float("threshold", 0.0, 1.0)
    leagues = [
        l
        for l in df["league"].sort_values().unique()
        if trial.suggest_categorical(l, [True, False])
    ]
    return simulate(
        df.query(f"league in {leagues}").dropna(how="all", axis=0).dropna(how="all", axis=1),
        bookmaker="Avg",
        threshold=threshold,
    ).sum()


study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=2500, show_progress_bar=True)

print(f"Profit: {study.best_value:.2f}")
print(f"Threshold: {study.best_params['threshold']:.3f}")
print(
    f"Leagues:{json.dumps([k for k, v in study.best_params.items() if k not in ['threshold'] and v], indent=2, ensure_ascii=False)}"
)


100%|██████████| 500/500 [03:50<00:00,  2.17it/s]

Profit: 17.12
Threshold: 0.809
Leagues:[
  "Austrian T-Mobile Bundesliga",
  "Belgian Jupiler League",
  "Brasileiro Série A",
  "Chinese Super League",
  "Dutch Eredivisie",
  "English League Championship",
  "English League Two",
  "German 2. Bundesliga",
  "Greek Super League",
  "Italy Serie A",
  "Italy Serie B",
  "Mexican Primera Division Torneo Clausura",
  "Spanish Segunda Division",
  "Swedish Allsvenskan",
  "Swiss Raiffeisen Super League",
  "Turkish Turkcell Super Lig"
]





# Conclusion
This strategy would be break even against the average betting site. However, it should be able to have a small margin agains some specific websites that offers good odds.
