In [1]:
import sys
sys.path.insert(0, '../..')

from IPython.display import clear_output
import numpy as np
import pandas as pd
import ScraperFC as sfc
from sklearn.model_selection import train_test_split
import traceback
from xgboost.sklearn import XGBRegressor, XGBClassifier

In [2]:
# Load fivethirtyeight data
scraper = sfc.FiveThirtyEight()
try:
    fte_df = pd.DataFrame()
    years = [2022,]
    for year in years:
        temp = scraper.scrape_matches(year, 'EPL')
        fte_df = pd.concat([fte_df, temp], axis=0, ignore_index=True)
    clear_output()
except:
    traceback.print_exc()
finally:
    scraper.close()
print('FiveThirtyEight DF:', fte_df.shape)

# Load pre-scraped Oddsportal data
odds_df = pd.DataFrame()
filenames = ['epl_2022_odds.pkl']
for filename in filenames:
    t = pd.read_pickle(filename)
    odds_df = pd.concat([odds_df, t], axis=0, ignore_index=True)
print('Oddsportal DF:', odds_df.shape)

# Convert date column to datetime objects
fte_df['date'] = pd.to_datetime(fte_df['date'], format='%Y-%m-%d')

# Drop kickoff times from Oddsportal date
odds_df['Date'] = pd.DatetimeIndex(odds_df['Date']).normalize()

# Remap FTE team names to match Oddsportal team names
fte_to_odds_names = {
    'Leicester City': 'Leicester',
    'Brighton and Hove Albion': 'Brighton',
    'Leeds United': 'Leeds',
    'Manchester United': 'Manchester Utd',
    'Norwich City': 'Norwich',
    'Tottenham Hotspur': 'Tottenham',
    'West Ham United': 'West Ham',
    'Wolverhampton': 'Wolves',
}
fte_df = fte_df.replace({'team1': fte_to_odds_names, 'team2': fte_to_odds_names})
assert np.all(np.sort(fte_df['team1'].unique()) == np.sort(odds_df['Team1'].unique()))

# Merge dataframes
df = pd.merge(fte_df, odds_df, left_on=['date','team1','team2'], right_on=['Date','Team1','Team2'])
assert df.shape[0] == odds_df.shape[0]
assert df.shape[0] == fte_df.shape[0]
print('Merged DF:', df.shape)

# De-fragment dataframe
df = df.copy()

# Input cols
fte_input_cols = [
    'prob1', 'prob2', 'probtie', # FTE probas
    'proj_score1', 'proj_score2', # FTE projected scores
    'spi1', 'spi2', # FTE SPI index
    'importance1', 'importance2', # FTE match importance values
]
moneyline_odds_cols = [c for c in odds_df.columns if c.endswith(' 1') or c.endswith(' X') or c.endswith(' 2')]
ou_odds_cols = [c for c in odds_df.columns if c.endswith(' over') or c.endswith(' under')]
odds_input_cols = moneyline_odds_cols + ou_odds_cols
input_cols = fte_input_cols + odds_input_cols

# Target cols
df['1'] = df['score1'] > df['score2']
df['X'] = df['score1'] == df['score2']
df['2'] = df['score2'] > df['score1']
ou_goals = [c.split('+')[1].split(' ')[0] for c in odds_df.columns if c.endswith(' over') or c.endswith(' under')]
ou_goals = np.unique(ou_goals)
for ou in ou_goals:
    df[f'+{ou} over'] = (df['score1']+df['score2']) > float(ou)
    df[f'+{ou} under'] = (df['score1']+df['score2']) < float(ou)
target_cols = ['1', 'X', '2'] + [f'+{ou} over' for ou in ou_goals] + [f'+{ou} under' for ou in ou_goals]
df[target_cols] = df[target_cols].astype(int)
print('Final DF:', df.shape)

FiveThirtyEight DF: (380, 23)
Oddsportal DF: (380, 1524)
Merged DF: (380, 1547)
Final DF: (380, 1616)


In [3]:
# Train-validation-test split
random_state = 18
train, valid, test = 0.7, 0.15, 0.15
assert (train + valid + test) == 1
train_valid_df, test_df = train_test_split(df, test_size=test, random_state=random_state)
train_df, valid_df = train_test_split(train_valid_df, test_size=valid/(train+valid), random_state=random_state)
print('Training', train_df.shape)
print('Validation', valid_df.shape)
print('Test', test_df.shape)

Training (265, 1616)
Validation (58, 1616)
Test (57, 1616)


In [4]:
# Train
X = train_df[input_cols]
y = train_df[target_cols]
# xgb = XGBRegressor(tree_method='gpu_hist')
xgb = XGBClassifier(tree_method='gpu_hist')
xgb = xgb.fit(X, y)

In [5]:
bet_thresh = 0.5

if type(xgb) is XGBClassifier:
    yhat = xgb.predict_proba(valid_df[input_cols])
elif type(xgb) is XGBRegressor:
    yhat = xgb.predict(valid_df[input_cols])
yhat_df = pd.DataFrame(data=yhat, columns=target_cols)

# Flags for bets placed
bets_placed = (yhat_df > bet_thresh).astype(float).reset_index(drop=True)
# Flags for outcomes occurring
actual_outcomes = valid_df[target_cols].reset_index(drop=True)
not_actual_outcomes = (~(actual_outcomes.astype(bool))).astype(float)
# AND of bets placed and outcomes occurred
wins_mask = bets_placed * actual_outcomes
loss_mask = bets_placed * not_actual_outcomes

print(f'{int(bets_placed.values.sum())} bets placed.')
print(f'{int(wins_mask.values.sum())} bets won.')
print(f'{int(loss_mask.values.sum())} bets lost.')

1900 bets placed.
1496 bets won.
404 bets lost.


In [6]:
def american_to_decimal(american):
    decimal = np.zeros(american.shape)
    decimal[american<0] = 1 - (100 / american[american<0]) # Negative american odds
    decimal[american>0] = (american[american>0] / 100) + 1 # Positive american odds
    decimal[np.isnan(american)] = np.nan # NaN american odds
    return decimal

# Compute payouts
dec_odds_df = pd.DataFrame(
    data=american_to_decimal(valid_df[odds_input_cols].values),
    columns=odds_input_cols
)
bookies = list(np.unique([
    ' '.join(c.split(' ')[:-1]) for c in odds_input_cols 
    if (c.__contains__('1') or c.__contains__('X') or c.__contains__('2')) and ('over' not in c and 'under' not in c)
]))
bookies.remove('Average')
bookies.remove('Highest')
print('Payouts if betting with the following bookies:')
for bookie in bookies:
    bookie_net = 0
    for tcol in target_cols:
        # Check if the bookie even offers odds on the bet
        bookie_odds_col = f'{bookie} {tcol}'
        if bookie_odds_col not in dec_odds_df.columns:
            continue

        # Determine bets actually placed with bookie
        # Don't bet on NaN bets from bookie
        bets = ~dec_odds_df[bookie_odds_col].isna() * bets_placed[tcol]

        # Payout = payout for each win
        wins_mask = bets_placed[tcol] * actual_outcomes[tcol]
        payouts = dec_odds_df[bookie_odds_col] * wins_mask
        total_payout = np.nansum(payouts.values)

        # Debt = bet placed for each loss
        loss_mask = bets_placed[tcol] * not_actual_outcomes[tcol]
        debts = bets_placed[tcol] * loss_mask
        total_debt = np.nansum(debts.values)


        if np.isnan(total_payout) or np.isnan(total_debt):
            print(bookie_odds_col)

        bookie_net = bookie_net + total_payout - total_debt
    print(f'{bookie}: ${round(bookie_net, 2)}')

Payouts if betting with the following bookies:
10x10bet: $4945.11
1xBet: $7567.97
Curebet: $4243.49
GGBET: $5595.88
Interwetten: $3185.57
Lasbet: $4470.22
Marathonbet: $4539.05
Marsbet: $1784.12
Pinnacle: $29.85
Unibet: $5545.14
VOBET: $-198.08
William Hill: $4051.31
bet-at-home: $5451.09
bet365: $7187.38
bwin: $6035.63


In [7]:
# tcol = '1'
# bookie = 'William Hill'
# bookie_odds_col = f'{bookie} {tcol}'

# # Determine bets actually placed with bookie
# # Don't bet on NaN bets from bookie
# bets = ~dec_odds_df[bookie_odds_col].isna() * bets_placed[tcol]

# # Payout = payout for each win
# wins_mask = bets_placed[tcol] * actual_outcomes[tcol]
# payouts = dec_odds_df[bookie_odds_col] * wins_mask
# total_payout = np.nansum(payouts.values).round(2)

# # Debt = bet placed for each loss
# loss_mask = bets_placed[tcol] * not_actual_outcomes[tcol]
# debts = bets_placed[tcol] * loss_mask
# total_debt = np.nansum(debts.values).round(2)

# print(f'{bets.values.sum()} bets placed')
# print(f'{wins_mask.values.sum()} bets won')
# print(f'${total_payout} paid out')
# print(f'${total_debt} lost')

# temp = pd.concat(
#     [
#         valid_df[['Date', 'Team1', 'Team2', bookie_odds_col]].reset_index(drop=True),
#         pd.DataFrame(data=actual_outcomes['1'].values, columns=['Actual']),
#         pd.DataFrame(data=yhat_df['1'].values, columns=['yhat']),
#         pd.DataFrame(data=bets.astype(int), columns=['Bet?']),
#         pd.DataFrame(data=wins_mask.values, columns=['1 Wins'], dtype=int),
#         pd.DataFrame(data=dec_odds_df[bookie_odds_col].values, columns=['Dec. odds']),
#         pd.DataFrame(data=payouts.round(2), columns=['Payout'])
#     ],
#     axis=1,
# )
# temp[temp['Bet?']==1]