# Football Betting using FiveThirtyEight's Soccer Power Index
## Goal
Backtest a strategy of using the FiveThirtyEight's Soccer Power Index for betting.

## Imports

In [1]:
import concurrent.futures
import io
import json
import os
import warnings
from datetime import datetime

import numpy as np
import optuna
import optuna.logging
import pandas as pd
import requests

warnings.filterwarnings("ignore")
optuna.logging.set_verbosity(optuna.logging.WARN)

ROOT_DIR = os.getcwd()
DATA_DIR = os.path.join(ROOT_DIR, "data")


  from .autonotebook import tqdm as notebook_tqdm


## FiveThirtyEight
Download Soccer Power Index dataset.

In [2]:
SPI_SOCCER_URL = "https://projects.fivethirtyeight.com/soccer-api/club/spi_matches.csv"

content = requests.get(SPI_SOCCER_URL, verify=False).content
df_spi = pd.read_csv(io.StringIO(content.decode()))
df_spi["date"] = pd.to_datetime(df_spi["date"], format="%Y-%m-%d")

assert df_spi["date"].isna().sum() == 0

df_spi.sample(5)


Unnamed: 0,season,date,league_id,league,team1,team2,spi1,spi2,prob1,prob2,...,importance1,importance2,score1,score2,xg1,xg2,nsxg1,nsxg2,adj_score1,adj_score2
65075,2022,2023-04-23,1843,French Ligue 1,Lyon,Marseille,71.75,75.12,0.4099,0.3306,...,,,,,,,,,,
16624,2018,2018-12-09,1837,Danish SAS-Ligaen,FC Copenhagen,Esbjerg,65.54,32.74,0.8416,0.0396,...,84.8,14.8,3.0,0.0,,,,,,
56241,2022,2022-07-23,1859,Norwegian Tippeligaen,Rosenborg,Tromso,44.68,24.46,0.6404,0.1433,...,12.3,28.3,3.0,0.0,,,,,,
14674,2018,2018-10-20,2160,United Soccer League,Pittsburgh Riverhounds,Bethlehem Steel FC,29.09,30.17,0.6594,0.3406,...,94.1,77.3,2.0,2.0,,,,,,
45582,2021,2021-09-03,2160,United Soccer League,Orange County SC,El Paso Locomotive FC,17.13,23.08,0.3512,0.3621,...,18.5,8.3,2.0,2.0,1.32,0.71,1.96,0.58,2.1,2.1


## Football-Data.co.uk
[football-data.co.uk](https://www.football-data.co.uk) is a website that provides historical betting odds for many soccer leagues.

In [3]:
FOOTBALL_DATA_MAIN_URL = "https://www.football-data.co.uk/mmz4281/{season}/{league}.csv"
LEAGUES = {
    "E0": ["Barclays Premier League"],
    "E1": ["English League Championship"],
    "E2": ["English League One"],
    "E3": ["English League Two"],
    "SC0": ["Scottish Premiership"],
    "D1": ["German Bundesliga"],
    "D2": ["German 2. Bundesliga"],
    "I1": ["Italy Serie A"],
    "I2": ["Italy Serie B"],
    "SP1": ["Spanish Primera Division"],
    "SP2": ["Spanish Segunda Division"],
    "F1": ["French Ligue 1"],
    "F2": ["French Ligue 2"],
    "N1": ["Dutch Eredivisie"],
    "B1": ["Belgian Jupiler League"],
    "P1": ["Portuguese Liga"],
    "T1": ["Turkish Turkcell Super Lig"],
    "G1": ["Greek Super League"],
}

FOOTBALL_DATA_OTHER_URL = "https://www.football-data.co.uk/new/{league}.csv"
OTHER_LEAGUES = {
    "ARG": ["Argentina Primera Division"],
    "AUT": ["Austrian T-Mobile Bundesliga"],
    "BRA": ["Brasileiro Série A"],
    "CHN": ["Chinese Super League"],
    "DNK": ["Danish SAS-Ligaen"],
    "JPN": ["Japanese J League"],
    "MEX": [
        "Mexican Primera Division Torneo Apertura",
        "Mexican Primera Division Torneo Clausura",
    ],
    "NOR": ["Norwegian Tippeligaen"],
    "RUS": ["Russian Premier Liga"],
    "SWE": ["Swedish Allsvenskan"],
    "SWZ": ["Swiss Raiffeisen Super League"],
}


def url_to_pandas(url):
    """Download URL content to a pandas dataframe."""
    content = requests.get(url, verify=False).content
    data = pd.read_csv(io.StringIO(content.decode(encoding="latin1")))
    data = data.dropna(how="all", axis=0)
    data = data.dropna(how="all", axis=1)
    data["URL"] = url
    return data


def get_football_data_main(year, league):
    """Get football data."""
    season = str(year - 1)[-2:] + str(year)[-2:]
    url = FOOTBALL_DATA_MAIN_URL.format(season=season, league=league)
    data = url_to_pandas(url)
    data["Season"] = season
    return data


def get_football_data_other(league):
    """Get football data."""
    url = FOOTBALL_DATA_OTHER_URL.format(league=league)
    data = url_to_pandas(url)
    data["Div"] = league
    return data


with concurrent.futures.ThreadPoolExecutor() as executor:
    futures = [
        executor.submit(get_football_data_main, year=year, league=league)
        for league in LEAGUES.keys()
        for year in range(2023, 2015, -1)
    ]
    results_main = [
        future.result() for future in concurrent.futures.as_completed(futures)
    ]

    futures = [
        executor.submit(get_football_data_other, league=league)
        for league in OTHER_LEAGUES.keys()
    ]
    results_other = [
        future.result() for future in concurrent.futures.as_completed(futures)
    ]

df_bet_main = pd.concat(results_main)
df_bet_other = pd.concat(results_other)


Rename columns and fill gaps to match main and other leagues.

In [4]:
df_bet_main["MaxH"] = df_bet_main["MaxH"].fillna(df_bet_main["BbMxH"])
df_bet_main["MaxD"] = df_bet_main["MaxD"].fillna(df_bet_main["BbMxD"])
df_bet_main["MaxA"] = df_bet_main["MaxA"].fillna(df_bet_main["BbMxA"])
df_bet_main["AvgH"] = df_bet_main["AvgH"].fillna(df_bet_main["BbAvH"])
df_bet_main["AvgD"] = df_bet_main["AvgD"].fillna(df_bet_main["BbAvD"])
df_bet_main["AvgA"] = df_bet_main["AvgA"].fillna(df_bet_main["BbAvA"])

df_bet_other = df_bet_other.rename(
    columns={
        "Home": "HomeTeam",
        "Away": "AwayTeam",
        "HG": "FTHG",
        "AG": "FTAG",
        "Res": "FTR",
        "PH": "PSH",
        "PD": "PSD",
        "PA": "PSA",
    }
)

df_bet = pd.concat([df_bet_main, df_bet_other])


Dates comes in differents formats depending on the year.

In [5]:
date1 = pd.to_datetime(df_bet["Date"], format="%d/%m/%Y", errors="coerce")
date2 = pd.to_datetime(df_bet["Date"], format="%d/%m/%y", errors="coerce")
df_bet["Date"] = date1.fillna(date2)

assert df_bet["Date"].isna().sum() == 0

df_bet.sample(5)


Unnamed: 0,Div,Date,Time,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,...,BbAH,BbAHh,BbMxAHH,BbAvAHH,BbMxAHA,BbAvAHA,HFKC,AFKC,Country,League
71,I1,2015-10-17,,Torino,Milan,1.0,1.0,D,0.0,0.0,...,25.0,0.0,1.7,1.65,2.43,2.29,,,,
100,I1,2020-12-12,17:00,Torino,Udinese,2.0,3.0,A,0.0,1.0,...,,,,,,,,,,
93,F2,2019-10-04,19:00,Lorient,Ajaccio,0.0,0.0,D,0.0,0.0,...,,,,,,,,,,
138,G1,2020-01-23,17:00,Lamia,PAOK,0.0,1.0,A,0.0,1.0,...,,,,,,,,,,
251,E0,2017-02-25,,Crystal Palace,Middlesbrough,1.0,0.0,H,1.0,0.0,...,25.0,-0.25,1.8,1.76,2.2,2.13,,,,


Each data source uses different ways of writing the same clubs names. I will use a dict to make names the same.

In [6]:
# from thefuzz import fuzz
# import networkx as nx

# # Uncomment this block to generate empty names dict

# frames = [df_spi["team1"], df_spi["team2"], df_bet["HomeTeam"], df_bet["AwayTeam"]]
# names = pd.concat(frames).drop_duplicates()

# pairs = [
#     (name, other_name)
#     for name in names
#     for other_name in names
#     if fuzz.partial_ratio(name, other_name) > 90
# ]

# graph = nx.Graph()
# graph.add_edges_from(pairs)

# clusters = [list(cluster) for cluster in nx.connected_components(graph)]
# clusters = {cluster[0]: cluster for cluster in clusters}
# (
#     pd.Series(clusters)
#     .sort_index()
#     .to_json(os.path.join(DATA_DIR, "names.json"), force_ascii=False)
# )


In [7]:
with open(os.path.join(DATA_DIR, "names.json"), encoding="utf-8") as file:
    names_dict = json.load(file)


names_dict = {
    name.strip(): i.strip() for i, name_list in names_dict.items() for name in name_list
}


def clean(series, translate_dict):
    """Clean text in pandas series."""
    return series.str.strip().apply(
        lambda x: translate_dict[x] if x in translate_dict else x
    )


df_spi["home"] = clean(df_spi["team1"], names_dict)
df_spi["away"] = clean(df_spi["team2"], names_dict)
df_bet["home"] = clean(df_bet["HomeTeam"], names_dict)
df_bet["away"] = clean(df_bet["AwayTeam"], names_dict)


Odds dates are no consistent as FiveThirtyEight, the dates sometimes does not match due to timezone.

To fix this, I will make copies changing the date for the date after and before. This will make sure that it find the right match.

In [8]:
df_bet_before = df_bet.copy()
df_bet_after = df_bet.copy()

df_bet_before["Date"] = df_bet_before["Date"] - pd.Timedelta(days=1)
df_bet_after["Date"] = df_bet_after["Date"] + pd.Timedelta(days=1)

df_bet_expanded = pd.concat((df_bet, df_bet_before, df_bet_after))


Merge datasets.

In [9]:
df_bet_expanded["dt"] = df_bet_expanded["Date"].dt.date
df_spi["dt"] = df_spi["date"].dt.date
df = df_spi.merge(
    df_bet_expanded,
    how="inner",
    on=["dt", "home", "away"],
    validate="1:1",
)
df = df.drop(columns=["HomeTeam", "AwayTeam", "team1", "team2"])


Add each results point of view to the dataset.

In [10]:
df_home = df.copy()
df_home["club"] = df["home"]
df_home["opponent"] = df["away"]
df_home["pov"] = "home"
df_home["outcome"] = df["FTR"] == "H"
df_home["prob"] = df["prob1"]
df_home["Avg"] = df["AvgH"]
df_home["Max"] = df["MaxH"]
df_home["B365"] = df["B365H"]
df_home["PS"] = df["PSH"]


In [11]:
df_away = df.copy()
df_away["club"] = df["away"]
df_away["opponent"] = df["home"]
df_away["pov"] = "away"
df_away["outcome"] = df["FTR"] == "A"
df_away["prob"] = df["prob2"]
df_away["Avg"] = df["AvgA"]
df_away["Max"] = df["MaxA"]
df_away["B365"] = df["B365A"]
df_away["PS"] = df["PSA"]


In [12]:
df_draw = df.copy()
df_draw["club"] = df["home"]
df_draw["opponent"] = df["away"]
df_draw["pov"] = "draw"
df_draw["outcome"] = df["FTR"] == "D"
df_draw["prob"] = df["probtie"]
df_draw["Avg"] = df["AvgD"]
df_draw["Max"] = df["MaxD"]
df_draw["B365"] = df["B365D"]
df_draw["PS"] = df["PSD"]


In [13]:
df = pd.concat((df_home, df_away, df_draw))[
    [
        "date",
        "league",
        "club",
        "opponent",
        "pov",
        "outcome",
        "prob",
        "Avg",
        "Max",
        "B365",
        "PS",
    ]
]

df.sample(5)


Unnamed: 0,date,league,club,opponent,pov,outcome,prob,Avg,Max,B365,PS
6120,2017-12-30,English League Championship,Nottingham Forest,Sunderland,home,False,0.4047,2.01,2.1,2.04,2.03
34833,2021-04-05,German 2. Bundesliga,Eintracht Braunschweig,FC St. Pauli,away,False,0.2793,4.08,4.47,4.2,4.42
29958,2020-11-07,Italy Serie A,Cagliari,Sampdoria,draw,False,0.2602,3.56,3.76,3.5,3.67
11870,2018-10-01,Turkish Turkcell Super Lig,Ankaragucu,Antalyaspor,home,False,0.4314,2.16,2.28,2.1,2.22
27265,2020-07-15,Italy Serie A,Fiorentina,Lecce,away,True,0.4618,2.0,2.1,1.95,2.05


# +EV Betting

In [14]:
df["EVAvg"] = df["prob"] * (df["Avg"] - 1) - (1 - df["prob"])
df["EVMax"] = df["prob"] * (df["Max"] - 1) - (1 - df["prob"])
df["EVB365"] = df["prob"] * (df["B365"] - 1) - (1 - df["prob"])
df["EVPS"] = df["prob"] * (df["PS"] - 1) - (1 - df["prob"])

df.sample(5)


Unnamed: 0,date,league,club,opponent,pov,outcome,prob,Avg,Max,B365,PS,EVAvg,EVMax,EVB365,EVPS
9380,2018-05-14,Brasileiro Série A,Ceará,América Mineiro,draw,True,0.27,3.25,3.42,,3.42,-0.1225,-0.0766,,-0.0766
17968,2019-04-25,Spanish Primera Division,Sevilla,Vallecano,home,True,0.7026,1.33,1.38,1.33,1.33,-0.065542,-0.030412,-0.065542,-0.065542
20062,2019-08-24,English League One,Oxford United,Bristol Rovers,away,False,0.3967,2.55,2.65,2.55,2.58,0.011585,0.051255,0.011585,0.023486
25445,2020-02-22,German 2. Bundesliga,Heidenheim,Holstein Kiel,away,True,0.2848,3.74,3.95,3.8,3.93,0.065152,0.12496,0.08224,0.119264
37163,2021-08-08,Japanese J League,Hokkaido Consadole Sapporo,Urawa Reds,draw,False,0.2649,3.34,3.52,,3.45,-0.115234,-0.067552,,-0.086095


In [15]:
def simulate(data, bookmaker="Avg", threshold=0.0):
    """ "Simulate +EV betting."""
    return (df[f"EV{bookmaker}"] > threshold) * (data["outcome"] * data[bookmaker] - 1)


results = simulate(df, bookmaker="Avg", threshold=0.0)
results.sum()


-4351.710000000001

Now let's find out the best threshold and leagues combination that maximizes profits.

In [16]:
def objective(trial):
    """Optuna objective."""
    threshold = trial.suggest_float("threshold", 0.0, 1.0)
    leagues = [
        l
        for l in df["league"].sort_values().unique()
        if trial.suggest_categorical(l, [True, False])
    ]
    return simulate(
        df.query(f"league in {leagues}").dropna(how="all", axis=0).dropna(how="all", axis=1),
        bookmaker="Avg",
        threshold=threshold,
    ).sum()


study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=2500, show_progress_bar=True)

print(f"Profit: {study.best_value:.2f}")
print(f"Threshold: {study.best_params['threshold']:.3f}")
print(
    f"Leagues:{json.dumps([k for k, v in study.best_params.items() if k not in ['threshold'] and v], indent=2, ensure_ascii=False)}"
)


100%|██████████| 2500/2500 [34:54<00:00,  1.19it/s]

Profit: 78.02
Threshold: 0.795
Leagues:[
  "Austrian T-Mobile Bundesliga",
  "Barclays Premier League",
  "Brasileiro Série A",
  "Chinese Super League",
  "Dutch Eredivisie",
  "English League One",
  "English League Two",
  "German 2. Bundesliga",
  "Italy Serie B",
  "Spanish Segunda Division"
]





# Conclusion
This strategy would be break even against the average betting site. However, it should be able to have a small margin agains some specific websites that offers good odds.
