In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from fastai.tabular.all import *

# Kaggle analysis

## Data preparation

In [3]:
data_dir = Path('../data/kaggle')
data_dir.ls()

(#1) [Path('../data/kaggle/closing_odds.feather')]

In [4]:
raw_df = pd.read_feather(data_dir/'closing_odds.feather')
raw_df.head()

Unnamed: 0,match_id,league,match_date,home_team,home_score,away_team,away_score,avg_odds_home_win,avg_odds_draw,avg_odds_away_win,max_odds_home_win,max_odds_draw,max_odds_away_win
0,170088,England: Premier League,2005-01-01,Liverpool,0,Chelsea,1,2.9944,3.1944,2.2256,3.2,3.25,2.29
1,170125,England: League Two,2005-01-01,Lincoln City,1,Bury,0,1.8667,3.2222,3.6922,1.91,3.3,3.93
2,170126,England: League Two,2005-01-01,Macclesfield,1,Chester,2,1.7822,3.3,4.0,1.85,3.5,4.34
3,170127,England: League Two,2005-01-01,Oxford Utd,2,Wycombe,1,2.3122,3.1967,2.7067,2.38,3.27,2.85
4,170128,England: League Two,2005-01-01,Scunthorpe,0,Darlington,1,1.6411,3.3922,4.7078,1.67,3.5,5.5


In [5]:
raw_df['result'] = 0
raw_df.loc[raw_df.home_score>raw_df.away_score, 'result'] = -1
raw_df.loc[raw_df.home_score<raw_df.away_score, 'result'] = 1

## Odds only

In [6]:
odds_df = pd.DataFrame()

odds_df['x_home'] = raw_df.max_odds_home_win
odds_df['x_draw'] = raw_df.max_odds_draw
odds_df['x_away'] = raw_df.max_odds_away_win

odds_df.head()

Unnamed: 0,x_home,x_draw,x_away
0,2.9944,3.1944,2.2256
1,1.8667,3.2222,3.6922
2,1.7822,3.3,4.0
3,2.3122,3.1967,2.7067
4,1.6411,3.3922,4.7078


In [7]:
odds_df['y_home'] = -1
odds_df['y_away'] = -1
odds_df['y_draw'] = -1
odds_df['y_none'] = 0

odds_df.loc[raw_df.result==-1, 'y_home'] = odds_df.x_home[raw_df.result==-1] - 1
odds_df.loc[raw_df.result==1, 'y_away'] = odds_df.x_away[raw_df.result==1] - 1
odds_df.loc[raw_df.result==0, 'y_draw'] = odds_df.x_away[raw_df.result==0] - 1

odds_df['league'] = raw_df.league

odds_df.head()

Unnamed: 0,x_home,x_draw,x_away,y_home,y_away,y_draw,y_none,league
0,2.9944,3.1944,2.2256,-1.0,1.2256,-1.0,0,England: Premier League
1,1.8667,3.2222,3.6922,0.8667,-1.0,-1.0,0,England: League Two
2,1.7822,3.3,4.0,-1.0,3.0,-1.0,0,England: League Two
3,2.3122,3.1967,2.7067,1.3122,-1.0,-1.0,0,England: League Two
4,1.6411,3.3922,4.7078,-1.0,3.7078,-1.0,0,England: League Two


In [8]:
odds_mean = odds_df.iloc[:,:3].values.mean()
odds_std = odds_df.iloc[:,:3].values.std()
odds_mean, odds_std

(3.389858971230326, 2.1492889127618686)

In [9]:
odds_df.iloc[:, :3] = (odds_df.iloc[:, :3] - odds_mean) / odds_std

In [10]:
odds_df.head()

Unnamed: 0,x_home,x_draw,x_away,y_home,y_away,y_draw,y_none,league
0,-0.183995,-0.090941,-0.541695,-1.0,1.2256,-1.0,0,England: Premier League
1,-0.70868,-0.078007,0.14067,0.8667,-1.0,-1.0,0,England: League Two
2,-0.747996,-0.041809,0.28388,-1.0,3.0,-1.0,0,England: League Two
3,-0.501403,-0.089871,-0.317853,1.3122,-1.0,-1.0,0,England: League Two
4,-0.813645,0.001089,0.613199,-1.0,3.7078,-1.0,0,England: League Two


### Dataloaders

In [11]:
to = TabularPandas(odds_df, cat_names=['league'], procs=[Categorify],
                   cont_names = ['x_home', 'x_draw', 'x_away'],
                   y_names=['y_home', 'y_draw', 'y_away', 'y_none'],
                   splits=RandomSplitter(valid_pct=0.2)(range_of(odds_df)))

In [12]:
dls = to.dataloaders(bs=256)
dls.show_batch()

Unnamed: 0,league,x_home,x_draw,x_away,y_home,y_draw,y_away,y_none
0,Serbia: Super Liga,-0.857009,-0.000679,0.985461,0.5479,-1.0,-1.0,0.0
1,World: Club Friendly,-0.222752,-0.042507,-0.557375,-1.0,-1.0,1.1919,0.0
2,Germany: 2. Bundesliga,-0.699887,-0.052138,0.129829,-1.0,-1.0,2.6689,0.0
3,Germany: Oberliga Niederrhein,0.291511,0.344691,-0.832442,-1.0,-1.0,0.6007,0.0
4,Germany: Regionalliga North,1.152307,0.314821,-0.893718,-1.0,0.469,-1.0,0.0
5,Ireland: Division 1,1.46669,0.277832,-0.908421,-1.0,0.4374,-1.0,0.0
6,Scotland: Division 2,-0.913399,0.378982,0.977645,-1.0,4.4911,-1.0,0.0
7,England: Vanarama Conference North,-0.823649,0.177427,0.551039,-1.0,-1.0,3.5742,0.0
8,Chile: Primera B,0.077812,-0.055115,-0.69984,-1.0,0.8857,-1.0,0.0
9,Venezuela: Primera Division,-0.956065,0.341481,1.982116,-1.0,-1.0,6.65,0.0


In [13]:
def odds_loss(actual, target):
    probs = F.softmax(actual, dim=1)
    return -(probs*target).sum(dim=1).mean()

def odds_profit(actual, target):
    probs = F.softmax(actual, dim=1)
    return (probs*target).sum()

In [14]:
learn = tabular_learner(dls, loss_func=odds_loss, metrics=odds_profit)
learn.model = learn.model.to('cuda')

In [15]:
learn.fit_one_cycle(10, lr_max=1e-5)

epoch,train_loss,valid_loss,odds_profit,time
0,0.076025,0.072771,-18.606413,00:16
1,0.053508,0.051151,-13.076734,00:17
2,0.006017,0.01516,-3.86958,00:16
3,-0.011368,-0.020713,5.309167,00:17
4,-0.041465,-0.044193,11.318804,00:16
5,-0.063444,-0.056178,14.386627,00:16
6,-0.081621,-0.061381,15.718001,00:17
7,-0.080551,-0.064154,16.427647,00:16
8,-0.065254,-0.064802,16.593849,00:16
9,-0.076853,-0.06492,16.624287,00:16


In [16]:
len(dls.valid_ds)

95888

In [17]:
profit = 0.
for x1,x2,y in dls.valid:
    preds = learn.model(x1,x2)
    profit += odds_profit(preds, y)

profit.item()

6225.01123046875

In [18]:
F.softmax(preds, dim=1).argmin(dim=0)

tensor([91, 91, 60, 60], device='cuda:0')