In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
from fastai.tabular.all import *

# Kaggle analysis

## Data preparation

In [4]:
data_dir = Path('../data/kaggle')
data_dir.ls()

(#1) [Path('../data/kaggle/closing_odds.feather')]

In [5]:
raw_df = pd.read_feather(data_dir/'closing_odds.feather')
raw_df.head()

Unnamed: 0,match_id,league,match_date,home_team,home_score,away_team,away_score,avg_odds_home_win,avg_odds_draw,avg_odds_away_win,max_odds_home_win,max_odds_draw,max_odds_away_win
0,170088,England: Premier League,2005-01-01,Liverpool,0,Chelsea,1,2.9944,3.1944,2.2256,3.2,3.25,2.29
1,170125,England: League Two,2005-01-01,Lincoln City,1,Bury,0,1.8667,3.2222,3.6922,1.91,3.3,3.93
2,170126,England: League Two,2005-01-01,Macclesfield,1,Chester,2,1.7822,3.3,4.0,1.85,3.5,4.34
3,170127,England: League Two,2005-01-01,Oxford Utd,2,Wycombe,1,2.3122,3.1967,2.7067,2.38,3.27,2.85
4,170128,England: League Two,2005-01-01,Scunthorpe,0,Darlington,1,1.6411,3.3922,4.7078,1.67,3.5,5.5


In [6]:
raw_df['result'] = 0
raw_df.loc[raw_df.home_score>raw_df.away_score, 'result'] = -1
raw_df.loc[raw_df.home_score<raw_df.away_score, 'result'] = 1

## Odds only

To see if this method is viable at all we'll use the maximum odds for betting.

In [9]:
odds_df = pd.DataFrame()

odds_df['x_home'] = raw_df.max_odds_home_win
odds_df['x_draw'] = raw_df.max_odds_draw
odds_df['x_away'] = raw_df.max_odds_away_win

odds_df['y_home'] = -1
odds_df['y_away'] = -1
odds_df['y_draw'] = -1
odds_df['y_none'] = 0

odds_df.loc[raw_df.result==-1, 'y_home'] = odds_df.x_home[raw_df.result==-1] - 1
odds_df.loc[raw_df.result==1, 'y_away'] = odds_df.x_away[raw_df.result==1] - 1
odds_df.loc[raw_df.result==0, 'y_draw'] = odds_df.x_away[raw_df.result==0] - 1

### Add league info

Due to a bug(?) in fastai we need at least one categorical column for the model to work, so we'll add the league name and the country (or 'category' in case of 'world' and 'europe').

In [11]:
odds_df['country'] = raw_df.apply(lambda r: r.league.split(': ')[0], axis=1)
odds_df['league'] = raw_df.apply(lambda r: r.league.split(': ')[1], axis=1)

In [12]:
odds_df.head()

Unnamed: 0,x_home,x_draw,x_away,y_home,y_away,y_draw,y_none,country,league
0,3.2,3.25,2.29,-1.0,1.29,-1.0,0,England,Premier League
1,1.91,3.3,3.93,0.91,-1.0,-1.0,0,England,League Two
2,1.85,3.5,4.34,-1.0,3.34,-1.0,0,England,League Two
3,2.38,3.27,2.85,1.38,-1.0,-1.0,0,England,League Two
4,1.67,3.5,5.5,-1.0,4.5,-1.0,0,England,League Two


### Standardization

We want the three input odds to be standardized according to their (total) mean and standard deviation, so we'll do this by hand.

In [14]:
odds_mean = odds_df.iloc[:,:3].values.mean()
odds_std = odds_df.iloc[:,:3].values.std()
print('Odds mean: ', odds_mean)
print('Odds std: ', odds_std)

Odds mean:  3.767972801601869
Odds std:  3.1308143325062425


In [15]:
odds_df.iloc[:, :3] = (odds_df.iloc[:, :3] - odds_mean) / odds_std

odds_df.head()

Unnamed: 0,x_home,x_draw,x_away,y_home,y_away,y_draw,y_none,country,league
0,-0.181414,-0.165443,-0.472073,-1.0,1.29,-1.0,0,England,Premier League
1,-0.593447,-0.149473,0.051752,0.91,-1.0,-1.0,0,England,League Two
2,-0.612611,-0.085592,0.182709,-1.0,3.34,-1.0,0,England,League Two
3,-0.443326,-0.159055,-0.293206,1.38,-1.0,-1.0,0,England,League Two
4,-0.670105,-0.085592,0.553219,-1.0,4.5,-1.0,0,England,League Two


### Dataloaders

In [16]:
to = TabularPandas(odds_df, cat_names=['league', 'country'], procs=[Categorify],
                   cont_names = ['x_home', 'x_draw', 'x_away'],
                   y_names=['y_home', 'y_draw', 'y_away', 'y_none'],
                   splits=RandomSplitter(valid_pct=0.2)(range_of(odds_df)))

In [17]:
dls = to.dataloaders(bs=256)
dls.show_batch()

Unnamed: 0,league,country,x_home,x_draw,x_away,y_home,y_draw,y_away,y_none
0,Premier League,Bosnia and Herzegovina,-0.788285,0.122022,1.463526,0.3,-1.0,-1.0,0.0
1,Blue Square Bet Premier,England,-0.65094,0.054946,0.553219,-1.0,-1.0,4.5,0.0
2,Premier,Wales,-0.564701,-0.117533,-0.101562,1.0,-1.0,-1.0,0.0
3,T-League,Australia,0.074111,0.553219,-0.686075,-1.0,-1.0,0.62,0.0
4,Premier League,Armenia,-0.692463,0.106051,0.808744,-1.0,-1.0,5.3,0.0
5,Championship,England,-0.436938,-0.165443,-0.293206,-1.0,1.85,-1.0,0.0
6,Persian Gulf Pro League,Iran,0.14438,-0.165443,-0.50082,-1.0,-1.0,1.2,0.0
7,Club Friendly,World,-0.676493,0.150768,0.633071,0.65,-1.0,-1.0,0.0
8,Division 2,Poland,0.185903,-0.085592,-0.628582,-1.0,-1.0,0.8,0.0
9,Euro U21,Europe,-0.778702,0.553219,2.565475,0.33,-1.0,-1.0,0.0


### Loss function, profit

Applying softmax to the output of the network lets us interpret the values as ratios. These ratios can be understood as the ratio of a fixed amount of money that is bet on one of the outcomes (home win, draw, away win). Multiplying these ratios with the odds(which already have 1 selected to account for the cost of taking the bet) and summing across that row gives the bank roll after the bet.

Optimization algorithms in machine learning usually minimize, so we'll use minus the bank roll as the loss function. More precise: The mean of the bank rolls in the current batch.

When watching the training progress keep in mind that a negative loss is good in this case. Additionally, the actual profit for each batch is shown too (fastai takes the mean across all validation batches?)

In [21]:
def odds_loss(actual, target):
    probs = F.softmax(actual, dim=1)
    return -(probs*target).sum(dim=1).mean()

def odds_profit(actual, target):
    probs = F.softmax(actual, dim=1)
    return (probs*target).sum()

In [22]:
learn = tabular_learner(dls, loss_func=odds_loss, metrics=odds_profit)
learn.model = learn.model.to('cuda')

In [23]:
learn.fit_one_cycle(3, lr_max=1e-3)

epoch,train_loss,valid_loss,odds_profit,time
0,-0.176046,-0.158153,40.45826,00:19
1,-0.167066,-0.168299,43.053513,00:19
2,-0.187183,-0.176055,45.05534,00:20


In [27]:
profit = 0.
for x1,x2,y in dls.valid:
    preds = learn.model(x1,x2)
    profit += odds_profit(preds, y)

print('Total profit on the validation set: ', profit.item())
print('Samples in the validation set: ', len(dls.valid_ds))

Total profit on the validation set:  16881.53125
Samples in the validation set:  95888


Looking promising. One apparent 'bug' is that we're using random split for validation. Since this is timeseries data, we really should use the latest (date-wise) rows for validation

### Better validation, seasons

The easiest way to get better validation data is to transform the dates into seasons. First as string ('0506' for the season 2005/2006) for storage, then as (continue)