In [9]:
import sys
sys.path.insert(0, '../..')

from IPython.display import clear_output
import numpy as np
import pandas as pd
import ScraperFC as sfc
from sklearn.model_selection import train_test_split
import traceback
from xgboost.sklearn import XGBRegressor, XGBClassifier

In [10]:
# Load fivethirtyeight data
scraper = sfc.FiveThirtyEight()
try:
    fte_df = pd.DataFrame()
    years = [2022,]
    for year in years:
        temp = scraper.scrape_matches(year, 'EPL')
        fte_df = pd.concat([fte_df, temp], axis=0, ignore_index=True)
    clear_output()
except:
    traceback.print_exc()
finally:
    scraper.close()
print('FTE DF:', fte_df.shape)

# Load pre-scraped Oddsportal data
odds_df = pd.DataFrame()
filenames = ['epl_2022_odds.pkl']
for filename in filenames:
    t = pd.read_pickle(filename)
    odds_df = pd.concat([odds_df, t], axis=0, ignore_index=True)
print('Odds DF:', odds_df.shape)

# Convert date column to datetime objects
fte_df['date'] = pd.to_datetime(fte_df['date'], format='%Y-%m-%d')

# Drop kickoff times from Oddsportal date
odds_df['Date'] = pd.DatetimeIndex(odds_df['Date']).normalize()

# Remap FTE team names to match Oddsportal team names
fte_to_odds_names = {
    'Leicester City': 'Leicester',
    'Brighton and Hove Albion': 'Brighton',
    'Leeds United': 'Leeds',
    'Manchester United': 'Manchester Utd',
    'Norwich City': 'Norwich',
    'Tottenham Hotspur': 'Tottenham',
    'West Ham United': 'West Ham',
    'Wolverhampton': 'Wolves',
}
fte_df = fte_df.replace({'team1': fte_to_odds_names, 'team2': fte_to_odds_names})
assert np.all(np.sort(fte_df['team1'].unique()) == np.sort(odds_df['Team1'].unique()))

# Merge dataframes
df = pd.merge(fte_df, odds_df, left_on=['date','team1','team2'], right_on=['Date','Team1','Team2'])
assert df.shape[0] == odds_df.shape[0]
assert df.shape[0] == fte_df.shape[0]
print('DF:', df.shape)

# De-fragment dataframe
df = df.copy()

# Input cols
fte_input_cols = [
    'prob1', 'prob2', 'probtie', # FTE probas
    'proj_score1', 'proj_score2', # FTE projected scores
    'spi1', 'spi2', # FTE SPI index
    'importance1', 'importance2', # FTE match importance values
]
odds_input_cols = (
    [c for c in odds_df.columns if c.endswith(' 1') or c.endswith(' X') or c.endswith(' 2')] + 
    [c for c in odds_df.columns if c.endswith(' over') or c.endswith(' under')]
)
input_cols = fte_input_cols + odds_input_cols

# Target cols
df['1'] = df['score1'] > df['score2']
df['X'] = df['score1'] == df['score2']
df['2'] = df['score2'] > df['score1']
ou_goals = [c.split('+')[1].split(' ')[0] for c in odds_df.columns if c.endswith(' over') or c.endswith(' under')]
ou_goals = np.unique(ou_goals)
for ou in ou_goals:
    df[f'+{ou} over'] = (df['score1']+df['score2']) > float(ou)
    df[f'+{ou} under'] = (df['score1']+df['score2']) < float(ou)
target_cols = ['1', 'X', '2'] + [f'+{ou} over' for ou in ou_goals] + [f'+{ou} under' for ou in ou_goals]
df[target_cols] = df[target_cols].astype(int)

FTE DF: (380, 23)
Odds DF: (380, 1524)
DF: (380, 1547)


In [11]:
# Train-validation-test split
random_state = 18
train, valid, test = 0.7, 0.15, 0.15
assert (train + valid + test) == 1
train_valid_df, test_df = train_test_split(df, test_size=test, random_state=random_state)
train_df, valid_df = train_test_split(train_valid_df, test_size=valid/(train+valid), random_state=random_state)
print('Training', train_df.shape)
print('Validation', valid_df.shape)
print('Test', test_df.shape)

Training (265, 1616)
Validation (58, 1616)
Test (57, 1616)


In [12]:
# Train
X = train_df[input_cols]
y = train_df[target_cols]
# xgb = XGBRegressor(tree_method='gpu_hist')
xgb = XGBClassifier(tree_method='gpu_hist')
xgb.fit(X, y)

XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=None, num_parallel_tree=None,
              predictor=None, random_state=None, ...)

In [13]:
bet_thresh = 0.5

if type(xgb) is XGBClassifier:
    yhat = xgb.predict_proba(valid_df[input_cols])
elif type(xgb) is XGBRegressor:
    yhat = xgb.predict(valid_df[input_cols])
yhat_df = pd.DataFrame(data=yhat, columns=target_cols)

# Flags for bets placed
bets_placed = (yhat_df > bet_thresh).astype(float).reset_index(drop=True)
# Flags for outcomes occurring
actual_outcomes = valid_df[target_cols].reset_index(drop=True)
not_actual_outcomes = (~(actual_outcomes.astype(bool))).astype(float)
# AND of bets placed and outcomes occurred
won = bets_placed * actual_outcomes
lost = bets_placed * not_actual_outcomes

print(f'{int(bets_placed.values.sum())} bets placed.')
print(f'{int(won.values.sum())} bets won.')
print(f'{int(lost.values.sum())} bets lost.')

1909 bets placed.
1512 bets won.
397 bets lost.


In [15]:
def american_to_decimal(american):
    decimal = np.zeros(american.shape)
    decimal[american<0] = 1 - (100 / -american[american<0]) # Negative american odds
    decimal[american>0] = (american[american>0] / 100) + 1 # Positive american odds
    decimal[np.isnan(american)] = np.nan # NaN american odds
    return decimal

# Compute payouts
dec_odds_df = pd.DataFrame(
    data=american_to_decimal(valid_df[odds_input_cols].values),
    columns=odds_input_cols
)
bookies = list(np.unique([
    ' '.join(c.split(' ')[:-1]) for c in odds_input_cols 
    if (c.__contains__('1') or c.__contains__('X') or c.__contains__('2')) and ('over' not in c and 'under' not in c)
]))
bookies.remove('Average')
bookies.remove('Highest')
print('Payouts if betting with the following bookies:')
for bookie in bookies:
    bookie_net = 0
    for tcol in target_cols:
        # Payout = payout for each win
        wins_mask = bets_placed[tcol] * actual_outcomes[tcol]
        payouts = dec_odds_df[f'{bookie} {tcol}'] * wins_mask
        total_payout = np.sum(payouts)

        # Debt = bet placed for each loss
        loss_mask = bets_placed[tcol] * not_actual_outcomes[tcol]
        debts = bets_placed[tcol] * loss_mask
        total_debt = np.sum(bets_placed)

        bookie_net = bookie_net + total_payout - total_debt
    print(f'{bookie}: {round(bookie_net, 2)}')

Payouts if betting with the following bookies:


KeyError: '10x10bet +6.75 over'