# Football Betting using FiveThirtyEight's Soccer Power Index
## Goal
Backtest a strategy of using the FiveThirtyEight's Soccer Power Index for betting.

## Imports

In [1]:
import concurrent.futures
import io
import json
import os
import warnings
from datetime import datetime

import numpy as np
import optuna
import optuna.logging
import pandas as pd
import requests

warnings.filterwarnings("ignore")
optuna.logging.set_verbosity(optuna.logging.WARN)

ROOT_DIR = os.getcwd()
DATA_DIR = os.path.join(ROOT_DIR, "data")


  from .autonotebook import tqdm as notebook_tqdm


## FiveThirtyEight
Download Soccer Power Index dataset.

In [2]:
SPI_SOCCER_URL = "https://projects.fivethirtyeight.com/soccer-api/club/spi_matches.csv"

content = requests.get(SPI_SOCCER_URL, verify=False).content
df_spi = pd.read_csv(io.StringIO(content.decode()))
df_spi["date"] = pd.to_datetime(df_spi["date"], format="%Y-%m-%d")

assert df_spi["date"].isna().sum() == 0

df_spi.sample(5)


Unnamed: 0,season,date,league_id,league,team1,team2,spi1,spi2,prob1,prob2,...,importance1,importance2,score1,score2,xg1,xg2,nsxg1,nsxg2,adj_score1,adj_score2
63548,2022,2023-03-04,1845,German Bundesliga,Borussia Monchengladbach,SC Freiburg,67.65,72.97,0.4007,0.3537,...,,,,,,,,,,
30326,2019,2020-02-22,2413,English League One,Lincoln City,Gillingham,19.75,23.13,0.3368,0.3667,...,0.4,15.0,0.0,0.0,,,,,,
54219,2022,2022-04-23,2160,United Soccer League,Tulsa Roughnecks,Colorado Springs Switchbacks FC,13.57,20.1,0.3461,0.3973,...,21.7,13.3,0.0,2.0,0.65,1.25,1.16,2.21,0.0,2.1
7234,2017,2018-01-16,1844,French Ligue 2,Auxerre,Niort,26.11,31.79,0.3512,0.324,...,28.9,4.6,5.0,0.0,,,,,,
42336,2020,2021-04-25,1882,Turkish Turkcell Super Lig,Yeni Malatyaspor,Ankaragucu,34.9,36.39,0.4548,0.2788,...,60.6,41.2,2.0,1.0,,,,,,


## Football-Data.co.uk
[football-data.co.uk](https://www.football-data.co.uk) is a website that provides historical betting odds for many soccer leagues.

In [3]:
FOOTBALL_DATA_MAIN_URL = "https://www.football-data.co.uk/mmz4281/{season}/{league}.csv"
LEAGUES = {
    "E0": ["Barclays Premier League"],
    "E1": ["English League Championship"],
    "E2": ["English League One"],
    "E3": ["English League Two"],
    "SC0": ["Scottish Premiership"],
    "D1": ["German Bundesliga"],
    "D2": ["German 2. Bundesliga"],
    "I1": ["Italy Serie A"],
    "I2": ["Italy Serie B"],
    "SP1": ["Spanish Primera Division"],
    "SP2": ["Spanish Segunda Division"],
    "F1": ["French Ligue 1"],
    "F2": ["French Ligue 2"],
    "N1": ["Dutch Eredivisie"],
    "B1": ["Belgian Jupiler League"],
    "P1": ["Portuguese Liga"],
    "T1": ["Turkish Turkcell Super Lig"],
    "G1": ["Greek Super League"],
}

FOOTBALL_DATA_OTHER_URL = "https://www.football-data.co.uk/new/{league}.csv"
OTHER_LEAGUES = {
    "ARG": ["Argentina Primera Division"],
    "AUT": ["Austrian T-Mobile Bundesliga"],
    "BRA": ["Brasileiro Série A"],
    "CHN": ["Chinese Super League"],
    "DNK": ["Danish SAS-Ligaen"],
    "JPN": ["Japanese J League"],
    "MEX": [
        "Mexican Primera Division Torneo Apertura",
        "Mexican Primera Division Torneo Clausura",
    ],
    "NOR": ["Norwegian Tippeligaen"],
    "RUS": ["Russian Premier Liga"],
    "SWE": ["Swedish Allsvenskan"],
    "SWZ": ["Swiss Raiffeisen Super League"],
}


def url_to_pandas(url):
    """Download URL content to a pandas dataframe."""
    content = requests.get(url, verify=False).content
    data = pd.read_csv(io.StringIO(content.decode(encoding="latin1")))
    data = data.dropna(how="all", axis=0)
    data = data.dropna(how="all", axis=1)
    data["URL"] = url
    return data


def get_football_data_main(year, league):
    """Get football data."""
    season = str(year - 1)[-2:] + str(year)[-2:]
    url = FOOTBALL_DATA_MAIN_URL.format(season=season, league=league)
    data = url_to_pandas(url)
    data["Season"] = season
    return data


def get_football_data_other(league):
    """Get football data."""
    url = FOOTBALL_DATA_OTHER_URL.format(league=league)
    data = url_to_pandas(url)
    data["Div"] = league
    return data


with concurrent.futures.ThreadPoolExecutor() as executor:
    futures = [
        executor.submit(get_football_data_main, year=year, league=league)
        for league in LEAGUES.keys()
        for year in range(2023, 2015, -1)
    ]
    results_main = [
        future.result() for future in concurrent.futures.as_completed(futures)
    ]

    futures = [
        executor.submit(get_football_data_other, league=league)
        for league in OTHER_LEAGUES.keys()
    ]
    results_other = [
        future.result() for future in concurrent.futures.as_completed(futures)
    ]

df_bet_main = pd.concat(results_main)
df_bet_other = pd.concat(results_other)


Rename columns and fill gaps to match main and other leagues.

In [4]:
df_bet_main["MaxH"] = df_bet_main["MaxH"].fillna(df_bet_main["BbMxH"])
df_bet_main["MaxD"] = df_bet_main["MaxD"].fillna(df_bet_main["BbMxD"])
df_bet_main["MaxA"] = df_bet_main["MaxA"].fillna(df_bet_main["BbMxA"])
df_bet_main["AvgH"] = df_bet_main["AvgH"].fillna(df_bet_main["BbAvH"])
df_bet_main["AvgD"] = df_bet_main["AvgD"].fillna(df_bet_main["BbAvD"])
df_bet_main["AvgA"] = df_bet_main["AvgA"].fillna(df_bet_main["BbAvA"])

df_bet_other = df_bet_other.rename(
    columns={
        "Home": "HomeTeam",
        "Away": "AwayTeam",
        "HG": "FTHG",
        "AG": "FTAG",
        "Res": "FTR",
        "PH": "PSH",
        "PD": "PSD",
        "PA": "PSA",
    }
)

df_bet = pd.concat([df_bet_main, df_bet_other])


Dates comes in differents formats depending on the year.

In [5]:
date1 = pd.to_datetime(df_bet["Date"], format="%d/%m/%Y", errors="coerce")
date2 = pd.to_datetime(df_bet["Date"], format="%d/%m/%y", errors="coerce")
df_bet["Date"] = date1.fillna(date2)

assert df_bet["Date"].isna().sum() == 0

df_bet.sample(5)


Unnamed: 0,Div,Date,Time,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,...,BbAH,BbAHh,BbMxAHH,BbAvAHH,BbMxAHA,BbAvAHA,HFKC,AFKC,Country,League
3103,ARG,2020-11-04,00:00,River Plate,Banfield,1.0,3.0,A,,,...,,,,,,,,,Argentina,Copa de la Liga Profesional
100,SP1,2016-11-04,,Malaga,Sp Gijon,3.0,2.0,H,1.0,1.0,...,31.0,-0.75,1.97,1.92,2.01,1.94,,,,
507,E2,2016-04-19,,Coventry,Bradford,1.0,0.0,H,0.0,0.0,...,25.0,0.25,1.83,1.79,2.13,2.05,,,,
645,DNK,2015-09-20,15:00,FC Copenhagen,Hobro,1.0,0.0,H,,,...,,,,,,,,,Denmark,Superliga
3169,BRA,2020-10-08,00:30,Goias,Fluminense,2.0,4.0,A,,,...,,,,,,,,,Brazil,Serie A


Each data source uses different ways of writing the same clubs names. I will use a dict to make names the same.

In [6]:
# from thefuzz import fuzz
# import networkx as nx

# # Uncomment this block to generate empty names dict

# frames = [df_spi["team1"], df_spi["team2"], df_bet["HomeTeam"], df_bet["AwayTeam"]]
# names = pd.concat(frames).drop_duplicates()

# pairs = [
#     (name, other_name)
#     for name in names
#     for other_name in names
#     if fuzz.partial_ratio(name, other_name) > 90
# ]

# graph = nx.Graph()
# graph.add_edges_from(pairs)

# clusters = [list(cluster) for cluster in nx.connected_components(graph)]
# clusters = {cluster[0]: cluster for cluster in clusters}
# (
#     pd.Series(clusters)
#     .sort_index()
#     .to_json(os.path.join(DATA_DIR, "names.json"), force_ascii=False)
# )


In [7]:
with open(os.path.join(DATA_DIR, "names.json"), encoding="utf-8") as file:
    names_dict = json.load(file)


names_dict = {
    name.strip(): i.strip() for i, name_list in names_dict.items() for name in name_list
}


def clean(series, translate_dict):
    """Clean text in pandas series."""
    return series.str.strip().apply(
        lambda x: translate_dict[x] if x in translate_dict else x
    )


df_spi["home"] = clean(df_spi["team1"], names_dict)
df_spi["away"] = clean(df_spi["team2"], names_dict)
df_bet["home"] = clean(df_bet["HomeTeam"], names_dict)
df_bet["away"] = clean(df_bet["AwayTeam"], names_dict)


Odds dates are no consistent as FiveThirtyEight, the dates sometimes does not match due to timezone.

To fix this, I will make copies changing the date for the date after and before. This will make sure that it find the right match.

In [8]:
df_bet_before = df_bet.copy()
df_bet_after = df_bet.copy()

df_bet_before["Date"] = df_bet_before["Date"] - pd.Timedelta(days=1)
df_bet_after["Date"] = df_bet_after["Date"] + pd.Timedelta(days=1)

df_bet_expanded = pd.concat((df_bet, df_bet_before, df_bet_after))


Merge datasets.

In [9]:
df_bet_expanded["dt"] = df_bet_expanded["Date"].dt.date
df_spi["dt"] = df_spi["date"].dt.date
df = df_spi.merge(
    df_bet_expanded,
    how="inner",
    on=["dt", "home", "away"],
    validate="1:1",
)
df = df.drop(columns=["HomeTeam", "AwayTeam", "team1", "team2"])


Add each results point of view to the dataset.

In [10]:
df_home = df.copy()
df_home["club"] = df["home"]
df_home["opponent"] = df["away"]
df_home["pov"] = "home"
df_home["outcome"] = df["FTR"] == "H"
df_home["prob"] = df["prob1"]
df_home["Avg"] = df["AvgH"]
df_home["Max"] = df["MaxH"]
df_home["B365"] = df["B365H"]
df_home["PS"] = df["PSH"]


In [11]:
df_away = df.copy()
df_away["club"] = df["away"]
df_away["opponent"] = df["home"]
df_away["pov"] = "away"
df_away["outcome"] = df["FTR"] == "A"
df_away["prob"] = df["prob2"]
df_away["Avg"] = df["AvgA"]
df_away["Max"] = df["MaxA"]
df_away["B365"] = df["B365A"]
df_away["PS"] = df["PSA"]


In [12]:
df_draw = df.copy()
df_draw["club"] = df["home"]
df_draw["opponent"] = df["away"]
df_draw["pov"] = "draw"
df_draw["outcome"] = df["FTR"] == "D"
df_draw["prob"] = df["probtie"]
df_draw["Avg"] = df["AvgD"]
df_draw["Max"] = df["MaxD"]
df_draw["B365"] = df["B365D"]
df_draw["PS"] = df["PSD"]


In [13]:
df = pd.concat((df_home, df_away, df_draw))[
    [
        "date",
        "league",
        "club",
        "opponent",
        "pov",
        "outcome",
        "prob",
        "Avg",
        "Max",
        "B365",
        "PS",
    ]
]

df.sample(5)


Unnamed: 0,date,league,club,opponent,pov,outcome,prob,Avg,Max,B365,PS
10223,2018-08-18,English League One,Bristol Rovers,Wycombe Wanderers,away,True,0.323,2.58,2.75,2.62,2.64
16659,2019-03-16,English League One,Sunderland,Walsall,home,True,0.7101,1.5,1.55,1.5,1.48
35983,2021-05-08,French Ligue 2,Rodez,Nancy,draw,True,0.2934,2.91,3.12,3.1,2.88
7738,2018-03-17,Italy Serie B,Empoli,F.B.C Unione Venezia,draw,False,0.2389,3.57,3.75,3.6,3.71
44043,2022-03-19,Spanish Segunda Division,Oviedo,Lugo,away,False,0.3704,2.73,2.95,2.75,2.82


# +EV Betting

In [14]:
df["EVAvg"] = df["prob"] * (df["Avg"] - 1) - (1 - df["prob"])
df["EVMax"] = df["prob"] * (df["Max"] - 1) - (1 - df["prob"])
df["EVB365"] = df["prob"] * (df["B365"] - 1) - (1 - df["prob"])
df["EVPS"] = df["prob"] * (df["PS"] - 1) - (1 - df["prob"])

df.sample(5)


Unnamed: 0,date,league,club,opponent,pov,outcome,prob,Avg,Max,B365,PS,EVAvg,EVMax,EVB365,EVPS
18791,2019-05-19,Brasileiro Série A,Bahía,São Paulo,away,False,0.1825,4.91,5.5,,5.1,-0.103925,0.00375,,-0.06925
21026,2019-09-22,Norwegian Tippeligaen,Kristiansund,Lille,away,False,0.3771,3.3,3.5,,3.44,0.24443,0.31985,,0.297224
21241,2019-09-28,Austrian T-Mobile Bundesliga,St. Polten,LASK,home,False,0.1265,8.74,10.5,,9.86,0.10561,0.32825,,0.24729
1378,2017-03-11,Barclays Premier League,West Ham United,Bournemouth,away,False,0.3272,2.78,2.88,2.8,2.84,-0.090384,-0.057664,-0.08384,-0.070752
15450,2019-02-09,German 2. Bundesliga,Greuther Furth,MSV Duisburg,draw,False,0.2823,3.3,3.4,3.4,3.37,-0.06841,-0.04018,-0.04018,-0.048649


In [15]:
def simulate(data, bookmaker="Avg", threshold=0.0):
    """ "Simulate +EV betting."""
    return (df[f"EV{bookmaker}"] > threshold) * (data["outcome"] * data[bookmaker] - 1)


results = simulate(df, bookmaker="Avg", threshold=0.0)
results.sum()


-4351.710000000001

Now let's find out the best threshold and leagues combination that maximizes profits.

In [20]:
def objective(trial):
    """Optuna objective."""
    threshold = trial.suggest_float("threshold", 0.0, 1.0)
    leagues = [
        l
        for l in df["league"].sort_values().unique()
        if trial.suggest_categorical(l, [True, False])
    ]
    return simulate(
        df.query(f"league in {leagues}").dropna(how="all", axis=0).dropna(how="all", axis=1),
        bookmaker="Avg",
        threshold=threshold,
    ).sum()


study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=1000, show_progress_bar=True)

print(f"Profit: {study.best_value:.2f}")
print(f"Threshold: {study.best_params['threshold']:.3f}")
print(
    f"Leagues:{json.dumps([k for k, v in study.best_params.items() if k not in ['threshold'] and v], indent=2, ensure_ascii=False)}"
)


100%|██████████| 1000/1000 [05:07<00:00,  3.25it/s]

Profit: 25.51
Threshold: 0.827
Leagues:[
  "Argentina Primera Division",
  "Austrian T-Mobile Bundesliga",
  "Barclays Premier League",
  "Brasileiro Série A",
  "English League One",
  "French Ligue 1",
  "Greek Super League",
  "Italy Serie A",
  "Mexican Primera Division Torneo Clausura",
  "Scottish Premiership",
  "Swiss Raiffeisen Super League"
]





# Conclusion
This strategy would be break even against the average betting site. However, it should be able to have a small margin agains some specific websites that offers good odds.
