# Football Betting using FiveThirtyEight's Soccer Power Index
## Goal
Backtest a strategy of using the FiveThirtyEight's Soccer Power Index for betting.

## Imports

In [1]:
import io
import os

import numpy as np
import pandas as pd
import requests

ROOT_DIR = os.path.dirname(os.getcwd())
DATA_DIR = os.path.join(ROOT_DIR, "data")

## FiveThirtyEight
Download Soccer Power Index dataset.

In [2]:
SPI_SOCCER_URL = "https://projects.fivethirtyeight.com/soccer-api/club/spi_matches.csv"

content = requests.get(SPI_SOCCER_URL).content
df_spi = pd.read_csv(io.StringIO(content.decode()))

df_spi.sample(5)

Unnamed: 0,season,date,league_id,league,team1,team2,spi1,spi2,prob1,prob2,...,importance1,importance2,score1,score2,xg1,xg2,nsxg1,nsxg2,adj_score1,adj_score2
29677,2019,2020-02-02,2417,Scottish Premiership,Hamilton Academical,Celtic,18.3,67.31,0.0467,0.8448,...,52.4,83.2,1.0,4.0,,,,,,
21781,2018,2019-05-10,2414,English League Two,Tranmere Rovers,Forest Green Rovers,13.35,17.56,0.3567,0.3494,...,0.0,0.0,1.0,0.0,,,,,,
6812,2017,2017-12-16,1849,Dutch Eredivisie,PSV,ADO Den Haag,66.74,33.06,0.8116,0.0588,...,,,3.0,0.0,,,,,,
23954,2019,2019-08-24,2411,Barclays Premier League,Manchester United,Crystal Palace,81.86,70.22,0.6352,0.1538,...,56.1,22.6,1.0,2.0,2.54,0.73,2.5,0.24,1.05,2.1
22621,2019,2019-06-22,2160,United Soccer League,Indy Eleven,Atlanta United 2,30.77,6.67,0.8433,0.0277,...,10.9,7.1,1.0,0.0,,,,,,


We'll be using Premier League only. This is the most popular league on the world.

In [3]:
df_spi["date"] = pd.to_datetime(df_spi["date"], format="%Y-%m-%d")

df_spi = df_spi[df_spi["league"] == "Barclays Premier League"]

df_spi.sample(5)

Unnamed: 0,season,date,league_id,league,team1,team2,spi1,spi2,prob1,prob2,...,importance1,importance2,score1,score2,xg1,xg2,nsxg1,nsxg2,adj_score1,adj_score2
960,2016,2016-12-26,2411,Barclays Premier League,Manchester United,Sunderland,80.36,50.44,0.7464,0.0693,...,58.5,51.2,3.0,1.0,2.07,0.52,3.28,0.75,2.47,1.05
46939,2021,2021-10-02,2411,Barclays Premier League,Wolverhampton,Newcastle,72.93,62.84,0.4887,0.2507,...,22.5,43.8,2.0,1.0,1.01,0.98,0.59,0.35,2.1,1.05
46239,2021,2021-09-19,2411,Barclays Premier League,West Ham United,Manchester United,75.65,84.65,0.2955,0.4589,...,41.9,63.1,1.0,2.0,1.76,2.47,1.1,1.69,1.05,2.1
54059,2021,2022-05-07,2411,Barclays Premier League,Norwich City,West Ham United,54.08,76.22,0.1979,0.5621,...,,,,,,,,,,
6988,2017,2017-12-26,2411,Barclays Premier League,Watford,Leicester City,57.56,62.86,0.3927,0.3475,...,5.8,5.8,2.0,1.0,1.42,1.4,1.26,1.74,2.1,1.05


## Football-Data.co.uk
[football-data.co.uk](https://www.football-data.co.uk) is a website that provides historical betting odds for many soccer leagues.

In [4]:
FOOTBALL_DATA_URL = "https://www.football-data.co.uk/mmz4281/{season}/E0.csv"

df_bet = pd.DataFrame()
for year in range(2022, 2015, -1):

    season = str(year - 1)[-2:] + str(year)[-2:]
    content = requests.get(FOOTBALL_DATA_URL.format(season=season)).content
    df_year = pd.read_csv(io.StringIO(content.decode()))
    df_bet = df_bet.append(df_year)

df_bet.sample(5)

Unnamed: 0,Div,Date,Time,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,...,BbAv<2.5,BbAH,BbAHh,BbMxAHH,BbAvAHH,BbMxAHA,BbAvAHA,LBH,LBD,LBA
133,E0,01/12/2018,,Leicester,Watford,2,0,H,2,0,...,1.83,21.0,-0.25,1.97,1.93,1.99,1.94,,,
60,E0,02/10/2021,12:30,Man United,Everton,1,1,D,1,0,...,,,,,,,,,,
148,E0,03/12/2017,,Bournemouth,Southampton,1,1,D,1,0,...,1.69,18.0,0.25,1.81,1.78,2.2,2.13,3.0,3.1,2.45
263,E0,04/03/17,,Watford,Southampton,3,4,A,1,2,...,1.65,21.0,0.25,2.03,1.98,1.94,1.88,3.4,3.25,2.15
121,E0,21/11/2015,,Everton,Aston Villa,4,0,H,3,0,...,1.89,26.0,-1.0,2.07,1.99,1.92,1.88,1.6,4.0,6.0


Dates comes in differents formats depending on the year.

In [5]:
df_bet["Date"] = pd.to_datetime(df_bet["Date"], format="%d/%m/%Y", errors="coerce")
df_bet["Date"] = df_bet["Date"].fillna(pd.to_datetime(df_bet["Date"], format="%d/%m/%y", errors="coerce"))

df_bet = df_bet[df_bet["Date"] >= df_spi["date"].min()]

df_bet.sample(5)

Unnamed: 0,Div,Date,Time,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,...,BbAv<2.5,BbAH,BbAHh,BbMxAHH,BbAvAHH,BbMxAHA,BbAvAHA,LBH,LBD,LBA
187,E0,2017-12-23,,Stoke,West Brom,3,1,H,2,0,...,1.6,20.0,-0.25,2.01,1.95,1.95,1.92,2.25,3.1,3.4
168,E0,2019-12-15,16:30,Arsenal,Man City,0,3,A,0,3,...,,,,,,,,,,
165,E0,2021-01-12,20:15,Burnley,Man United,0,1,A,0,0,...,,,,,,,,,,
134,E0,2020-12-20,19:15,West Brom,Aston Villa,0,3,A,0,1,...,,,,,,,,,,
74,E0,2018-10-06,,Man United,Newcastle,3,2,H,0,2,...,1.92,21.0,-1.0,1.81,1.75,2.2,2.12,,,


Each data source uses different ways of writing the same clubs names. I will use a dict to make names the same.

In [6]:
names_dict = {
    "Bournemouth": ["Bournemouth", "AFC Bournemouth"],
    "Brighton": ["Brighton", "Brighton and Hove Albion"],
    "Cardiff": ["Cardiff City"],
    "Huddersfield": ["Huddersfield Town"],
    "Hull City": ["Hull City", "Hull"],
    "Leeds": ["Leeds", "Leeds United"],
    "Leicester": ["Leicester", "Leicester City"],
    "Manchester City": ["Manchester City", "Man City"],
    "Manchester United": ["Manchester United", "Man United"],
    "Norwich": ["Norwich", "Norwich City"],
    "Stoke City": ["Stoke City", "Stoke"],
    "Tottenham": ["Tottenham", "Tottenham Hotspur"],
    "West Brom": ["West Brom", "West Bromwich Albion"],
    "West Ham": ["West Ham", "West Ham United"],
    "Wolverhampton": ["Wolverhampton", "Wolves"],
    "Middlesbrough": ["Middlesbrough"],
    "Sunderland": ["Sunderland"],
    "Swansea City": ["Swansea City", "Swansea"],
}

names_dict = {
    name.lower().strip(): i.lower().strip()
    for i, name_list in names_dict.items()
    for name in name_list
}


def clean(series, translate_dict):
    """Clean text inpandas series."""
    return (
        series.str.normalize("NFKD")
        .str.encode("ascii", errors="ignore")
        .str.decode("utf-8")
        .str.lower()
        .str.strip()
        .apply(lambda x: translate_dict[x] if x in translate_dict else x)
    )


df_spi["team1"] = clean(df_spi["team1"], names_dict)
df_spi["team2"] = clean(df_spi["team2"], names_dict)
df_bet["HomeTeam"] = clean(df_bet["HomeTeam"], names_dict)
df_bet["AwayTeam"] = clean(df_bet["AwayTeam"], names_dict)

spi_teams = set(list(df_spi["team1"]) + list(df_spi["team2"]))
bet_teams = set(list(df_bet["HomeTeam"]) + list(df_bet["AwayTeam"]))

sym_diff = bet_teams.symmetric_difference(spi_teams)
print(sorted(sym_diff))


['hull city', 'middlesbrough', 'sunderland']


There are only a few clubs left that are unmatch. Maybe one dataset has more games than the other.

Now that names are fixed, it is able to be merged.

In [7]:
df_bet = df_bet.rename({"Date": "date", "HomeTeam": "team1", "AwayTeam": "team2"}, axis=1)

df_bet["date"] = df_bet["date"].dt.date
df_spi["date"] = df_spi["date"].dt.date

# df_bet_minus = df_bet.copy()
# df_bet_minus["date"] = df_bet_minus["date"] - pd.Timedelta(days=1)

# df_bet_plus = df_bet.copy()
# df_bet_plus["date"] = df_bet_plus["date"] + pd.Timedelta(days=1)

# df_bet = df_bet.append(df_bet_minus)
# df_bet = df_bet.append(df_bet_plus)

df = df_spi.merge(df_bet, how="inner", on=["date", "team1", "team2"])

print(df_spi.shape[0] - df.shape[0])

584


Add each results point of view to the dataset.

In [8]:
df["win"] = df["score1"] > df["score2"]
df["draw"] = df["score1"] == df["score2"]
df["loss"] = df["score1"] < df["score2"]

df_inv = df.copy()

df_inv["team2"], df_inv["team1"] = df["team1"], df["team2"]
df_inv["spi2"], df_inv["spi1"] = df["spi1"], df["spi2"]
df_inv["prob2"], df_inv["prob1"] = df["prob1"], df["prob2"]
df_inv["proj_score2"], df_inv["proj_score1"] = df["proj_score1"], df["proj_score2"]
df_inv["importance2"], df_inv["importance1"] = df["importance1"], df["importance2"]
df_inv["score2"], df_inv["score1"] = df["score1"], df["score2"]
df_inv["xg2"], df_inv["xg1"] = df["xg1"], df["xg2"]
df_inv["nsxg2"], df_inv["nsxg1"] = df["nsxg1"], df["nsxg2"]
df_inv["adj_score2"], df_inv["adj_score1"] = df["adj_score1"], df["adj_score2"]
df_inv["B365A"], df_inv["B365H"] = df["B365H"], df["B365A"]
df_inv["MaxA"], df_inv["MaxH"] = df["MaxH"], df["MaxA"]
df_inv["AvgA"], df_inv["AvgH"] = df["AvgH"], df["AvgA"]
df_inv["loss"], df_inv["win"] = df["win"], df["loss"]

df_draw = df.copy()
df_draw["team1"] = "draw"
df_draw["team2"] = np.nan
df_draw["prob1"] = df["probtie"]
df_draw["B365H"] = df["B365D"]
df_draw["MaxH"] = df["MaxD"]
df_draw["AvgH"] = df["AvgD"]
df_draw["win"] = df["draw"]

df = df.append(df_inv).reset_index(drop=True)
df = df.append(df_draw).reset_index(drop=True)

print(df.shape)

df.sample(5)

(5088, 150)


Unnamed: 0,season,date,league_id,league,team1,team2,spi1,spi2,prob1,prob2,...,BbMxAHH,BbAvAHH,BbMxAHA,BbAvAHA,LBH,LBD,LBA,win,draw,loss
2363,2018,2019-03-03,2411,Barclays Premier League,chelsea,fulham,84.02,56.99,0.6604,0.1354,...,2.09,2.02,1.9,1.84,,,,True,False,False
2870,2020,2020-10-04,2411,Barclays Premier League,sheffield united,arsenal,66.93,78.69,0.1772,0.5815,...,,,,,,,,False,False,True
2126,2018,2018-09-22,2411,Barclays Premier League,watford,fulham,69.59,61.89,0.3389,0.3858,...,2.17,2.12,1.82,1.77,,,,False,True,False
4263,2019,2019-11-09,2411,Barclays Premier League,draw,,86.49,70.4,0.1774,0.1056,...,,,,,,,,False,False,False
1143,2020,2020-09-12,2411,Barclays Premier League,west ham,newcastle,70.38,64.72,0.4887,0.2556,...,,,,,,,,False,False,True


## Results
### ROI Lines

In [9]:
def roi_lines(data, odds_col):
    """Calculate ROI."""
    data["ev"] = data["prob1"] * (data[odds_col] - 1) - (1 - data["prob1"])
    data["bet"] = data["ev"] > 0
    data["balance"] = data["bet"].astype(int) * (data["win"].astype(int) * data[odds_col] - 1)

    return data["balance"].sum() / data["bet"].sum()


print(f"Avg ROI = {roi_lines(df, 'AvgH') * 100:.2g}%")
print(f"B365 ROI = {roi_lines(df, 'B365H') * 100:.2g}%")
print(f"Max ROI = {roi_lines(df, 'MaxH') * 100:.2g}%")

Avg ROI = -1.7%
B365 ROI = 2.1%
Max ROI = 3.6%


### ROI Over/Under

In [10]:
df[">2.5"] = df["score1"] + df["score2"] > 2.5
df["<2.5"] = df["score1"] + df["score2"] < 2.5


def roi_over_under(data, odds_col):
    """Calculate ROI."""
    pd.options.mode.chained_assignment = None
    data = data.dropna(subset=[f"{odds_col}<2.5", f"{odds_col}>2.5", "team2"])
    data["bet_over"] = data["x>2.5"].astype(int)
    data["bet_under"] = data["x<2.5"].astype(int)
    data["balance_over"] = data["bet_over"] * (data[">2.5"].astype(int) * df[f"{odds_col}>2.5"] - 1)
    data["balance_under"] = data["bet_under"] * (data["<2.5"].astype(int) * df[f"{odds_col}<2.5"] - 1)
    data["balance"] = data["balance_over"] + data["balance_under"]
    return data["balance"].sum() / data["bet"].sum()


df["x>2.5"] = df["proj_score1"] + df["proj_score1"] > 2.5
df["x<2.5"] = df["proj_score1"] + df["proj_score1"] < 2.5

print(f"Avg ROI = {roi_over_under(df, 'Avg') * 100:.2g}%")
print(f"B365 ROI = {roi_over_under(df, 'B365') * 100:.2g}%")
print(f"Max ROI = {roi_over_under(df, 'Max') * 100:.2g}%")

Avg ROI = -7.3%
B365 ROI = -8.1%
Max ROI = -0.29%


## Conclusion
### Lines
This strategy would lose money against the average betting site. However, it is able to have a small margin agains some specific websites that offers good odds.

### Over/Under
The over/under strategy is not profitable, even considering the best odds available.