In [1]:
from IPython.display import clear_output
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import sys
sys.path.insert(0, '../..')
import ScraperFC as sfc
import traceback
from xgboost.sklearn import XGBRegressor

In [2]:
# Load football-data.co.uk data
# One CSV per season, read from the current working directory. All files are
# read first and stacked row-wise in a single concat, instead of re-copying a
# growing frame on every iteration.
filenames = ['2019-20.csv', '2020-21.csv', '2021-22.csv']
odds_df = pd.concat(
    [pd.read_csv(filename) for filename in filenames],
    axis=0,
    ignore_index=True,
)

# Load fivethirtyeight data
scraper = sfc.FiveThirtyEight()
try:
    years = range(2020, 2023)
    # Scrape every season, then stack the results row-wise in one concat.
    fte_df = pd.concat(
        [scraper.scrape_matches(year, 'EPL') for year in years],
        axis=0,
        ignore_index=True,
    )
    # Clear the cell's accumulated output once every season has been scraped.
    clear_output()
finally:
    # Always release the scraper, whether or not scraping succeeded. Errors
    # (including a kernel interrupt) are deliberately left to propagate so the
    # notebook stops here instead of continuing with missing or partial data.
    scraper.close()

# Convert date columns to datetime objects
# Each source stores its match date under a different column name and in a
# different string layout, so the column and format are paired per frame.
for frame, column, date_format in (
    (fte_df, 'date', '%Y-%m-%d'),
    (odds_df, 'Date', '%d/%m/%Y'),
):
    frame[column] = pd.to_datetime(frame[column], format=date_format)

# Translation table: fivethirtyeight team name -> football-data.co.uk team name.
# Only teams whose spelling differs between the two sources are listed here.
team_mappings = dict([
    ('AFC Bournemouth', 'Bournemouth'),
    ('Brighton and Hove Albion', 'Brighton'),
    ('Cardiff City', 'Cardiff'),
    ('Huddersfield Town', 'Huddersfield'),
    ('Leeds United', 'Leeds'),
    ('Leicester City', 'Leicester'),
    ('Manchester City', 'Man City'),
    ('Manchester United', 'Man United'),
    ('Norwich City', 'Norwich'),
    ('Stoke City', 'Stoke'),
    ('Swansea City', 'Swansea'),
    ('Tottenham Hotspur', 'Tottenham'),
    ('West Bromwich Albion', 'West Brom'),
    ('West Ham United', 'West Ham'),
    ('Wolverhampton', 'Wolves'),
])
# Rewrite the team names in both team columns using the same lookup table.
fte_df = fte_df.replace(dict.fromkeys(('team1', 'team2'), team_mappings))

# Merge dataframes
# Default (inner) join keyed on match date plus the two team names, so only
# rows that line up in both sources end up in the merged frame.
df = fte_df.merge(
    odds_df,
    left_on=['date', 'team1', 'team2'],
    right_on=['Date', 'HomeTeam', 'AwayTeam'],
)

# FiveThirtyEight columns used as model inputs.
fte_cols = (
    ['prob1', 'prob2', 'probtie']        # FTE probas
    + ['proj_score1', 'proj_score2']     # FTE projected scores
    + ['spi1', 'spi2']                   # FTE SPI index
    + ['importance1', 'importance2']     # FTE match importance values
)

# football-data.co.uk odds columns used as model inputs. The names follow a
# "<source><market suffix>" pattern, so they are generated instead of typed out.
# Sources: B365 = Bet365, BW = Bet&Win, IW = Interwetten, VC = VC Bet,
# WH = William Hill, P = Pinnacle, Max / Avg = market maximum / average.
# Not used: Blue Square, Gamebookers, Ladbrokes, Sporting Odds, Sportingbet,
# Stan James, Stanleybet, the BetBrain aggregates, Pinnacle outcome odds and
# the Bet365 handicap size.
odds_cols = (
    # Outcomes odds: home win (H), draw (D), away win (A)
    [
        f'{source}{outcome}'
        for source in ('B365', 'BW', 'IW', 'VC', 'WH', 'Max', 'Avg')
        for outcome in 'HDA'
    ]
    # Total goals odds: over (>) then under (<) 2.5 goals
    + [
        f'{source}{side}2.5'
        for source in ('B365', 'P', 'Max', 'Avg')
        for side in '><'
    ]
    # Asian handicap odds: market size of handicap (home team) first,
    # then the home (H) / away (A) team odds per source
    + ['AHh']
    + [
        f'{source}AH{team}'
        for source in ('B365', 'P', 'Max', 'Avg')
        for team in 'HA'
    ]
)

# Full model feature list: FTE columns first, then the odds columns.
input_cols = fte_cols + odds_cols

# Add hits columns for actual outcomes
# Every label is derived from the final score columns and stored as a float
# flag (1.0 when the outcome occurred, 0.0 otherwise).
label_cols = ['home win', 'away win', 'draw', 'over 2.5 goals', 'under 2.5 goals']
df['home win'] = df['score1'].gt(df['score2'])
df['away win'] = df['score2'].gt(df['score1'])
df['draw'] = df['score1'].eq(df['score2'])
df['over 2.5 goals'] = df['score1'].add(df['score2']).gt(2.5)
df['under 2.5 goals'] = df['score1'].add(df['score2']).lt(2.5)
df[label_cols] = df[label_cols].astype(float)

# Remove '<' and '>' from columns names
# NOTE(review): presumably needed because the model rejects these characters
# in feature names -- confirm.
# The feature list is updated in place: a name containing '<' gets 'under',
# otherwise any '>' becomes 'over'.
input_cols[:] = [
    name.replace('<', 'under') if '<' in name else name.replace('>', 'over')
    for name in input_cols
]
# Apply the same substitutions to every column label of the merged frame.
df.columns = df.columns.str.replace('<', 'under').str.replace('>', 'over')

In [8]:
# Train-validation-test split
# Seed shared by both split calls, and the fraction of rows given to each set.
random_state = 18
train, valid, test = 0.7, 0.15, 0.15
# Compare with a tolerance: the fractions are binary floats, so their sum is
# not guaranteed to equal 1 exactly for every valid choice of values.
assert np.isclose(train + valid + test, 1.0), 'split fractions must sum to 1'
# Hold out the test rows first, then split what is left. The second split
# uses valid / (train + valid) so the validation set is `valid` of the full data.
train_valid_df, test_df = train_test_split(
    df,
    random_state=random_state,
    test_size=test,
)
train_df, valid_df = train_test_split(
    train_valid_df,
    random_state=random_state,
    test_size=valid / (train + valid),
)

# Train
# A single regressor is fitted against all label columns at once.
# NOTE(review): tree_method='gpu_hist' presumably needs a GPU-enabled XGBoost
# build -- confirm for the target environment.
xgb = XGBRegressor(tree_method='gpu_hist')
X, y = train_df[input_cols], train_df[label_cols]
xgb.fit(X, y)

XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=None, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=None, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             n_estimators=100, n_jobs=None, num_parallel_tree=None,
             predictor=None, random_state=None, ...)

In [34]:
# A bet is placed on an outcome when its predicted value exceeds this threshold.
bet_thresh = 0.5

# Predict one value per label column for every validation match.
yhat = xgb.predict(valid_df[input_cols])
yhat_df = pd.DataFrame(data=yhat, columns=label_cols)

# Flags for bets placed
bets = (yhat_df > bet_thresh).astype(float).reset_index(drop=True)
# Flags for outcomes occurring (index reset so rows align with `bets`)
outcomes = valid_df[label_cols].reset_index(drop=True)
# AND of bets placed and outcomes occurred
wins = bets * outcomes

# Compute payouts
# The odds are multiplied positionally (via .values), so this column order
# must match label_cols: home, away, draw, over 2.5, under 2.5.
avg_odds = valid_df[['AvgH', 'AvgA', 'AvgD', 'Avgover2.5', 'Avgunder2.5']]
# One unit is staked per bet: a win returns the odds, every bet costs 1.
payouts = np.multiply(wins, avg_odds.values) - bets
payouts = np.round(payouts, 2)

print(f'{int(bets.values.sum())} bets placed.')
print(f'{int(wins.values.sum())} bets won.')
# Format the total to 2 decimals; a raw float sum can print binary noise
# (long trailing digits) instead of a clean currency amount.
print(f'${np.sum(payouts.values):.2f} won.')

312 bets placed.
149 bets won.
$-35.72 won.
