In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from fastai.tabular.all import *

# Kaggle analysis

## Data preparation

In [3]:
data_dir = Path('../data/kaggle')
data_dir.ls()

(#1) [Path('../data/kaggle/closing_odds.feather')]

In [4]:
raw_df = pd.read_feather(data_dir/'closing_odds.feather')
raw_df.head()

Unnamed: 0,match_id,league,match_date,home_team,home_score,away_team,away_score,avg_odds_home_win,avg_odds_draw,avg_odds_away_win,max_odds_home_win,max_odds_draw,max_odds_away_win
0,170088,England: Premier League,2005-01-01,Liverpool,0,Chelsea,1,2.9944,3.1944,2.2256,3.2,3.25,2.29
1,170125,England: League Two,2005-01-01,Lincoln City,1,Bury,0,1.8667,3.2222,3.6922,1.91,3.3,3.93
2,170126,England: League Two,2005-01-01,Macclesfield,1,Chester,2,1.7822,3.3,4.0,1.85,3.5,4.34
3,170127,England: League Two,2005-01-01,Oxford Utd,2,Wycombe,1,2.3122,3.1967,2.7067,2.38,3.27,2.85
4,170128,England: League Two,2005-01-01,Scunthorpe,0,Darlington,1,1.6411,3.3922,4.7078,1.67,3.5,5.5


In [5]:
raw_df['result'] = 0
raw_df.loc[raw_df.home_score>raw_df.away_score, 'result'] = -1
raw_df.loc[raw_df.home_score<raw_df.away_score, 'result'] = 1

## Odds only

To see if this method is viable at all we'll use the maximum odds for betting.

In [6]:
odds_df = pd.DataFrame()

odds_df['x_home'] = raw_df.max_odds_home_win
odds_df['x_draw'] = raw_df.max_odds_draw
odds_df['x_away'] = raw_df.max_odds_away_win

odds_df['y_home'] = -1
odds_df['y_away'] = -1
odds_df['y_draw'] = -1
odds_df['y_none'] = 0

odds_df.loc[raw_df.result==-1, 'y_home'] = odds_df.x_home[raw_df.result==-1] - 1
odds_df.loc[raw_df.result==1, 'y_away'] = odds_df.x_away[raw_df.result==1] - 1
odds_df.loc[raw_df.result==0, 'y_draw'] = odds_df.x_away[raw_df.result==0] - 1

### Add league info

Due to a bug(?) in fastai we need at least one categorical column for the model to work, so we'll add the league name and the country (or 'category' in case of 'world' and 'europe').

In [7]:
league_df = raw_df.league.str.split(pat=': ', expand=True)
league_df.columns = ['country', 'league']

In [8]:
odds_df['country'] = league_df.country
odds_df['league'] = league_df.league

In [9]:
odds_df.head()

Unnamed: 0,x_home,x_draw,x_away,y_home,y_away,y_draw,y_none,country,league
0,3.2,3.25,2.29,-1.0,1.29,-1.0,0,England,Premier League
1,1.91,3.3,3.93,0.91,-1.0,-1.0,0,England,League Two
2,1.85,3.5,4.34,-1.0,3.34,-1.0,0,England,League Two
3,2.38,3.27,2.85,1.38,-1.0,-1.0,0,England,League Two
4,1.67,3.5,5.5,-1.0,4.5,-1.0,0,England,League Two


### Standardization

We want the three input odds to be standardized according to their (total) mean and standard deviation, so we'll do this by hand.

In [10]:
odds_mean = odds_df.iloc[:,:3].values.mean()
odds_std = odds_df.iloc[:,:3].values.std()
print('Odds mean: ', odds_mean)
print('Odds std: ', odds_std)

Odds mean:  3.767972801601869
Odds std:  3.1308143325062425


In [11]:
odds_df.iloc[:, :3] = (odds_df.iloc[:, :3] - odds_mean) / odds_std

odds_df.head()

Unnamed: 0,x_home,x_draw,x_away,y_home,y_away,y_draw,y_none,country,league
0,-0.181414,-0.165443,-0.472073,-1.0,1.29,-1.0,0,England,Premier League
1,-0.593447,-0.149473,0.051752,0.91,-1.0,-1.0,0,England,League Two
2,-0.612611,-0.085592,0.182709,-1.0,3.34,-1.0,0,England,League Two
3,-0.443326,-0.159055,-0.293206,1.38,-1.0,-1.0,0,England,League Two
4,-0.670105,-0.085592,0.553219,-1.0,4.5,-1.0,0,England,League Two


### Dataloaders

In [12]:
to = TabularPandas(odds_df, cat_names=['league', 'country'], procs=[Categorify],
                   cont_names = ['x_home', 'x_draw', 'x_away'],
                   y_names=['y_home', 'y_draw', 'y_away', 'y_none'],
                   splits=RandomSplitter(valid_pct=0.2)(range_of(odds_df)))

In [13]:
dls = to.dataloaders(bs=1024)
dls.show_batch()

Unnamed: 0,league,country,x_home,x_draw,x_away,y_home,y_draw,y_away,y_none
0,Division 2,Czech Republic,-0.852166,1.351734,2.948762,0.1,-1.0,-1.0,0.0
1,3. Liga,Germany,-0.612611,-0.012129,0.438233,0.85,-1.0,-1.0,0.0
2,Youth League,Czech Republic,-0.357087,0.01023,-0.277235,-1.0,-1.0,1.9,0.0
3,Northern Premier League,England,-0.65094,-0.005741,0.137992,-1.0,3.2,-1.0,0.0
4,Premier League,Ireland,-0.443326,-0.165443,-0.213354,-1.0,2.1,-1.0,0.0
5,Ryman League,England,-0.325146,-0.021711,-0.443326,1.75,-1.0,-1.0,0.0
6,Division 2 - Center,Russia,-0.692463,0.01023,0.537249,0.6,-1.0,-1.0,0.0
7,Serbian Cup,Serbia,0.36477,-0.085592,-0.628582,-1.0,-1.0,0.8,0.0
8,Football League,Greece,-0.772314,0.313665,2.309951,-1.0,10.0,-1.0,0.0
9,S.League,Singapore,-0.436938,-0.149473,-0.277235,1.4,-1.0,-1.0,0.0


### Loss function, profit

Applying softmax to the output of the network lets us interpret the values as ratios. These ratios can be understood as the ratio of a fixed amount of money that is bet on one of the outcomes (home win, draw, away win). Multiplying these ratios with the odds(which already have 1 selected to account for the cost of taking the bet) and summing across that row gives the bank roll after the bet.

Optimization algorithms in machine learning usually minimize, so we'll use minus the bank roll as the loss function. More precise: The mean of the bank rolls in the current batch.

When watching the training progress keep in mind that a negative loss is good in this case. Additionally, the actual profit for each batch is shown too (fastai takes the mean across all validation batches?)

In [14]:
def odds_loss(actual, target):
    probs = F.softmax(actual, dim=1)
    return -(probs*target).sum(dim=1).mean()

def odds_profit(actual, target):
    probs = F.softmax(actual, dim=1)
    return (probs*target).sum()

In [15]:
learn = tabular_learner(dls, loss_func=odds_loss, metrics=odds_profit, layers=[500,250, 100])
learn.model = learn.model.to('cuda')

In [16]:
learn.fit_one_cycle(5, lr_max=1e-3)

epoch,train_loss,valid_loss,odds_profit,time
0,-0.163496,-0.159577,162.417938,00:07
1,-0.173421,-0.165823,168.985794,00:07
2,-0.177111,-0.178063,181.36113,00:07
3,-0.191212,-0.182701,186.220673,00:08
4,-0.192067,-0.188173,191.792068,00:07


In [17]:
profit = 0.
for x1,x2,y in dls.valid:
    preds = learn.model(x1,x2)
    profit += odds_profit(preds, y)

print('Total profit on the validation set: ', profit.item())
print('Samples in the validation set: ', len(dls.valid_ds))

Total profit on the validation set:  18043.5234375
Samples in the validation set:  95888


Looking promising. One apparent 'bug' is that we're using random split for validation. Since this is timeseries data, we really should use the latest (date-wise) rows for validation

### Test without categories

In [18]:
without_cats = odds_df.copy()
without_cats.drop(columns=['league'], inplace=True)
without_cats.country = 0

In [51]:
to = TabularPandas(without_cats, cat_names=['country'], procs=[Categorify],
                   cont_names = ['x_home', 'x_draw', 'x_away'],
                   y_names=['y_home', 'y_draw', 'y_away', 'y_none'],
                   splits=RandomSplitter(valid_pct=0.2)(range_of(odds_df)))
dls = to.dataloaders(bs=1014)

In [52]:
learn = tabular_learner(dls, loss_func=odds_loss, metrics=odds_profit, layers=[500,250, 100])
learn.model = learn.model.to('cuda')
learn.fit_one_cycle(5, lr_max=1e-3)

epoch,train_loss,valid_loss,odds_profit,time
0,-0.175149,-0.189389,191.569122,00:07
1,-0.170207,-0.186377,188.516266,00:07
2,-0.1848,-0.187636,189.837128,00:07
3,-0.174793,-0.190697,192.924408,00:07
4,-0.192104,-0.190026,192.196274,00:07


### Better validation, seasons

The easiest way to get better validation data is to transform the dates into seasons. First as string ('0506' for the season 2005/2006) for storage, then as category.

In [53]:
def season_from_row(row):
    date = row.match_date
    if date.month<=7:
        season = date.year-1
    else:
        season = date.year
    return season

In [54]:
season_from_row(raw_df.iloc[0])

2004

In [55]:
raw_df['season'] = raw_df.apply(season_from_row, axis=1)
raw_df.head()

Unnamed: 0,match_id,league,match_date,home_team,home_score,away_team,away_score,avg_odds_home_win,avg_odds_draw,avg_odds_away_win,max_odds_home_win,max_odds_draw,max_odds_away_win,result,season
0,170088,England: Premier League,2005-01-01,Liverpool,0,Chelsea,1,2.9944,3.1944,2.2256,3.2,3.25,2.29,1,2004
1,170125,England: League Two,2005-01-01,Lincoln City,1,Bury,0,1.8667,3.2222,3.6922,1.91,3.3,3.93,-1,2004
2,170126,England: League Two,2005-01-01,Macclesfield,1,Chester,2,1.7822,3.3,4.0,1.85,3.5,4.34,1,2004
3,170127,England: League Two,2005-01-01,Oxford Utd,2,Wycombe,1,2.3122,3.1967,2.7067,2.38,3.27,2.85,-1,2004
4,170128,England: League Two,2005-01-01,Scunthorpe,0,Darlington,1,1.6411,3.3922,4.7078,1.67,3.5,5.5,1,2004


In [56]:
class ColumnNormalizer:
    def __init__(self, column):
        self.mean = column.mean()
        self.std = column.std()
        
    def __call__(self, col):
        return (col-self.mean) / self.std
    
    def __str__(self): return f'Mean: {self.mean} | Std: {self.std}'
    def __repr__(self): return str(self)

In [57]:
season_norm = ColumnNormalizer(raw_df.season)
season_norm

Mean: 2010.4186446687802 | Std: 2.7880621822507554

In [58]:
odds_df['season'] = season_norm(raw_df.season)
odds_df.head()

Unnamed: 0,x_home,x_draw,x_away,y_home,y_away,y_draw,y_none,country,league,season,valid
0,-0.181414,-0.165443,-0.472073,-1.0,1.29,-1.0,0,England,Premier League,-2.302188,False
1,-0.593447,-0.149473,0.051752,0.91,-1.0,-1.0,0,England,League Two,-2.302188,False
2,-0.612611,-0.085592,0.182709,-1.0,3.34,-1.0,0,England,League Two,-2.302188,False
3,-0.443326,-0.159055,-0.293206,1.38,-1.0,-1.0,0,England,League Two,-2.302188,False
4,-0.670105,-0.085592,0.553219,-1.0,4.5,-1.0,0,England,League Two,-2.302188,False


In [59]:
odds_df['valid'] = raw_df.season>=2013
odds_df.head()

Unnamed: 0,x_home,x_draw,x_away,y_home,y_away,y_draw,y_none,country,league,season,valid
0,-0.181414,-0.165443,-0.472073,-1.0,1.29,-1.0,0,England,Premier League,-2.302188,False
1,-0.593447,-0.149473,0.051752,0.91,-1.0,-1.0,0,England,League Two,-2.302188,False
2,-0.612611,-0.085592,0.182709,-1.0,3.34,-1.0,0,England,League Two,-2.302188,False
3,-0.443326,-0.159055,-0.293206,1.38,-1.0,-1.0,0,England,League Two,-2.302188,False
4,-0.670105,-0.085592,0.553219,-1.0,4.5,-1.0,0,England,League Two,-2.302188,False


In [65]:
to = TabularPandas(odds_df, cat_names=['country', 'league'], procs=[Categorify],
                   cont_names = ['x_home', 'x_draw', 'x_away', 'season'],
                   y_names=['y_home', 'y_draw', 'y_away', 'y_none'],
                   splits=ColSplitter('valid')(odds_df))
dls = to.dataloaders(bs=1024)

In [66]:
learn = tabular_learner(dls, loss_func=odds_loss, metrics=odds_profit, layers=[500,250, 100, 50])
learn.model = learn.model.to('cuda')
learn.fit_one_cycle(8, lr_max=1e-3)

epoch,train_loss,valid_loss,odds_profit,time
0,-0.157196,-0.13469,137.903198,00:08
1,-0.17548,-0.143944,147.31662,00:08
2,-0.167195,-0.14264,145.988068,00:07
3,-0.175182,-0.13572,138.777374,00:08
4,-0.183399,-0.150847,154.375427,00:07
5,-0.194536,-0.158576,162.284927,00:08
6,-0.203045,-0.159377,163.143448,00:08
7,-0.215906,-0.160316,164.079315,00:07


In [75]:
save_path = Path('../data/kaggle/closed_preprocess.feather')
odds_df.to_feather(save_path)