In [1]:
import os
import sys
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score, RandomizedSearchCV
from sklearn.externals import joblib
from scipy import stats

# Parent of the current working directory; the project-local `server` package
# imported below is resolved relative to it.
PATH = os.path.join(os.getcwd(), '../')

# Register the path only once so re-running the cell does not add duplicates.
if sys.path.count(PATH) == 0:
    sys.path.append(PATH)
    
from server.ml_estimators import BenchmarkEstimator, BaggingEstimator
from server.ml_data import JoinedMLData
from server.data_readers import FitzroyDataReader

class YearSplit:
    """Cross-validation splitter that holds out one whole year per fold.

    The test folds are the ``n_splits`` consecutive years ending at the latest
    year found in ``X[year_col]``. For each test year, the training fold is
    every row whose year is strictly earlier, so a fold never trains on data
    from its own test year or later.

    Exposes the ``get_n_splits``/``split`` pair that scikit-learn accepts as a
    custom ``cv`` object.
    """

    def __init__(self, n_splits=3, year_col='year'):
        """
        Args:
            n_splits: Number of yearly test folds to generate; must be at least 2.
            year_col: Name of the column in ``X`` that holds each row's year.

        Raises:
            ValueError: If ``n_splits`` is less than 2.
        """
        if n_splits < 2:
            # The check accepts 2, so the message must say so.
            raise ValueError('n_splits must be >= 2.')

        self.n_splits = n_splits
        self.year_col = year_col

    def get_n_splits(self, X=None, y=None, groups=None):
        """Return the number of folds; all arguments are ignored."""
        return self.n_splits

    def split(self, X, y=None, groups=None):
        """Yield ``(train_positions, test_positions)`` for each held-out year.

        Args:
            X: DataFrame containing the ``year_col`` column.
            y: Ignored; present for splitter-interface compatibility.
            groups: Ignored; present for splitter-interface compatibility.

        Yields:
            Tuples of two integer arrays holding the positional (0-based) row
            numbers of the training rows and the test rows, ordered from the
            earliest test year to the latest.
        """
        years = X[self.year_col]
        max_test_year = int(years.max())
        min_test_year = int(max_test_year - self.n_splits + 1)
        # Positional row numbers. Built directly rather than via reset_index(),
        # which copies the frame and fails when an index level shares a name
        # with a column.
        row_positions = np.arange(len(X))

        for year in range(min_test_year, max_test_year + 1):
            train_mask = (years < year).values
            test_mask = (years == year).values
            yield row_positions[train_mask], row_positions[test_mask]
            
# Run configuration, kept together so the search can be tuned in one place.
N_ITER = 100   # passed as n_iter to the randomized search
N_SPLITS = 5   # number of yearly folds passed to YearSplit
SEED = 42      # seeds NumPy here and the randomized search's random_state

np.random.seed(SEED)

In [2]:
# Data set shared by the rest of the notebook: train years up to 2015, test year 2016.
# NOTE(review): presumably `None` means "no lower bound" on the training years — confirm
# against JoinedMLData. With fetch_data=True the recorded run downloaded fresh data
# (see the R console output below), so this cell needs network access.
data = JoinedMLData(train_years=(None, 2015), test_years=(2016, 2016), fetch_data=True)
# Display the joined frame as a sanity check.
data.data

R[write to console]: Returning data from 1965-01-01 to 2019-03-17

R[write to console]: Error in open.connection(x, "rb") : HTTP error 404.
Calls: <Anonymous> ... <Anonymous> -> map -> .Call -> .f -> read_html.default

R[write to console]: Returning data from 1965-01-01 to 2018-12-31



end_date of 2019-03-17 is in a year for which AFLTables has no data. Retrying with an end_date of the end of last year (2018-12-31).


R[write to console]: Downloading data


R[write to console]: 
Finished downloading data. Processing XMLs


R[write to console]: Finished getting afltables data



Unnamed: 0,Unnamed: 1,Unnamed: 2,team,oppo_team,round_type,venue,win_odds,line_odds,oppo_win_odds,oppo_line_odds,betting_pred_win,rolling_betting_pred_win_rate,...,oppo_rolling_mean_score_by_venue,cum_percent,ladder_position,elo_pred_win,rolling_elo_pred_win_rate,elo_rating_divided_by_ladder_position,oppo_cum_percent,oppo_ladder_position,elo_rating_divided_by_win_odds,win_odds_multiplied_by_ladder_position
Adelaide,1991.0,1.0,Adelaide,Hawthorn,Regular,Football Park,0.00,0.0,0.00,0.0,0.0,0.000000,...,0.000000,0.000000,8.0,1.0,1.000000,125.000000,0.876712,4.0,0.000000,0.00
Adelaide,1991.0,2.0,Adelaide,Carlton,Regular,Football Park,0.00,0.0,0.00,0.0,0.0,0.000000,...,0.000000,2.246377,2.0,1.0,1.000000,502.732476,0.904000,8.0,0.000000,0.00
Adelaide,1991.0,3.0,Adelaide,Sydney,Regular,S.C.G.,0.00,0.0,0.00,0.0,0.0,0.000000,...,91.000000,1.364162,2.0,1.0,1.000000,500.426152,0.603279,14.0,0.000000,0.00
Adelaide,1991.0,4.0,Adelaide,Essendon,Regular,Windy Hill,0.00,0.0,0.00,0.0,0.0,0.000000,...,114.625000,1.309609,2.0,0.0,0.750000,499.891415,1.099315,6.0,0.000000,0.00
Adelaide,1991.0,5.0,Adelaide,West Coast,Regular,Subiaco,0.00,0.0,0.00,0.0,0.0,0.000000,...,118.125000,1.112601,6.0,0.0,0.600000,165.944176,1.452756,3.0,0.000000,0.00
Adelaide,1991.0,6.0,Adelaide,Western Bulldogs,Regular,Football Park,0.00,0.0,0.00,0.0,0.0,0.000000,...,0.000000,0.954274,10.0,1.0,0.666667,98.917578,0.872302,6.0,0.000000,0.00
Adelaide,1991.0,7.0,Adelaide,St Kilda,Regular,Moorabbin Oval,0.00,0.0,0.00,0.0,0.0,0.000000,...,109.500000,1.013333,7.0,1.0,0.714286,141.398951,0.909361,9.0,0.000000,0.00
Adelaide,1991.0,9.0,Adelaide,North Melbourne,Regular,M.C.G.,0.00,0.0,0.00,0.0,0.0,0.000000,...,123.875000,0.838583,12.0,0.0,0.625000,80.926845,0.875962,8.0,0.000000,0.00
Adelaide,1991.0,10.0,Adelaide,Melbourne,Regular,Football Park,0.00,0.0,0.00,0.0,0.0,0.000000,...,0.000000,0.858277,12.0,0.0,0.555556,81.001462,1.233261,2.0,0.000000,0.00
Adelaide,1991.0,11.0,Adelaide,Geelong,Regular,Kardinia Park,0.00,0.0,0.00,0.0,0.0,0.000000,...,104.000000,0.904612,12.0,1.0,0.600000,81.335584,1.100665,7.0,0.000000,0.00


## Get best params for BaggingEstimator

In [3]:
# Training window currently configured on `data`; the labels are handed to the
# pipeline's correlation selector below.
X_train, y_train = data.train_data()

bag_model = BaggingEstimator()
# NOTE(review): the selector is given labels for the whole training set, including
# the rows that act as validation folds during the search — confirm this is intended.
bag_model.set_params(pipeline__correlationselector__labels=y_train)

# Search space. scipy's uniform(loc, scale) samples from [loc, loc + scale];
# randint(low, high) samples integers from [low, high).
bag_params = {
    # --- base estimator inside the bagging regressor ---
    'pipeline__baggingregressor__base_estimator__booster': ['gbtree', 'gblinear', 'dart'],
    'pipeline__baggingregressor__base_estimator__colsample_bylevel': stats.uniform(0.8, 0.2),
    'pipeline__baggingregressor__base_estimator__colsample_bytree': stats.uniform(0.8, 0.2),
    'pipeline__baggingregressor__base_estimator__learning_rate': stats.uniform(0.1, 0.04),
    'pipeline__baggingregressor__base_estimator__max_depth': stats.randint(2, 10),
    'pipeline__baggingregressor__base_estimator__n_estimators': stats.randint(75, 150),
    'pipeline__baggingregressor__base_estimator__reg_alpha': stats.uniform(0.05, 0.05),
    'pipeline__baggingregressor__base_estimator__reg_lambda': stats.uniform(0.1, 0.04),
    'pipeline__baggingregressor__base_estimator__subsample': stats.uniform(0.8, 0.2),
    # --- bagging ensemble size ---
    'pipeline__baggingregressor__n_estimators': stats.randint(5, 15),
    # --- feature selection ---
    'pipeline__correlationselector__threshold': stats.uniform(0.03, 0.02),
}

# Randomized search scored by negative mean absolute error, validated on the
# last N_SPLITS years one year at a time.
bag_cv = RandomizedSearchCV(
    estimator=bag_model,
    param_distributions=bag_params,
    n_iter=N_ITER,
    cv=YearSplit(n_splits=N_SPLITS),
    scoring='neg_mean_absolute_error',
    random_state=SEED,
    n_jobs=-1,
    verbose=2,
)

bag_cv

RandomizedSearchCV(cv=<__main__.YearSplit object at 0x7fa132ad66a0>,
          error_score='raise-deprecating',
          estimator=BaggingEstimator(name='BaggingEstimator',
         pipeline=Pipeline(memory=None,
     steps=[('correlationselector', CorrelationSelector(cols_to_keep=['team', 'oppo_team', 'round_type', 'venue'],
          labels=Adelaide          1991.0  1.0      86.0
                          2.0     -2...ators=10, n_jobs=None, oob_score=False,
         random_state=None, verbose=0, warm_start=False))])),
          fit_params=None, iid='warn', n_iter=100, n_jobs=-1,
          param_distributions={'pipeline__baggingregressor__base_estimator__booster': ['gbtree', 'gblinear', 'dart'], 'pipeline__baggingregressor__base_estimator__colsample_bylevel': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7fa133d2f128>, 'pipeline__baggingregressor__base_estimator__colsample_...elationselector__threshold': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7fa132ad6588

In [4]:
# Run the randomized search: 100 candidates x 5 yearly folds = 500 fits.
# Expensive — the recorded run took about 280 minutes on 4 workers.
bag_cv.fit(*data.train_data())

Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed: 22.5min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed: 104.3min
[Parallel(n_jobs=-1)]: Done 357 tasks      | elapsed: 204.0min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed: 279.6min finished


RandomizedSearchCV(cv=<__main__.YearSplit object at 0x7fa132ad66a0>,
          error_score='raise-deprecating',
          estimator=BaggingEstimator(name='BaggingEstimator',
         pipeline=Pipeline(memory=None,
     steps=[('correlationselector', CorrelationSelector(cols_to_keep=['team', 'oppo_team', 'round_type', 'venue'],
          labels=Adelaide          1991.0  1.0      86.0
                          2.0     -2...ators=10, n_jobs=None, oob_score=False,
         random_state=None, verbose=0, warm_start=False))])),
          fit_params=None, iid='warn', n_iter=100, n_jobs=-1,
          param_distributions={'pipeline__baggingregressor__base_estimator__booster': ['gbtree', 'gblinear', 'dart'], 'pipeline__baggingregressor__base_estimator__colsample_bylevel': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7fa133d2f128>, 'pipeline__baggingregressor__base_estimator__colsample_...elationselector__threshold': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7fa132ad6588

In [5]:
# Best mean cross-validated score (negative MAE, so closer to 0 is better),
# the time spent refitting the best candidate, and the winning parameters.
bag_cv.best_score_, bag_cv.refit_time_, bag_cv.best_params_

(-28.81475174024802,
 296.55789399147034,
 {'pipeline__baggingregressor__base_estimator__booster': 'dart',
  'pipeline__baggingregressor__base_estimator__colsample_bylevel': 0.9593085973720467,
  'pipeline__baggingregressor__base_estimator__colsample_bytree': 0.8366869579732328,
  'pipeline__baggingregressor__base_estimator__learning_rate': 0.13118764001091077,
  'pipeline__baggingregressor__base_estimator__max_depth': 6,
  'pipeline__baggingregressor__base_estimator__n_estimators': 149,
  'pipeline__baggingregressor__base_estimator__reg_alpha': 0.07296244459829336,
  'pipeline__baggingregressor__base_estimator__reg_lambda': 0.11334834444556088,
  'pipeline__baggingregressor__base_estimator__subsample': 0.8285733635843882,
  'pipeline__baggingregressor__n_estimators': 7,
  'pipeline__correlationselector__threshold': 0.030411689885916048})

## Compare to Footy Tipper

In [6]:
# Historical match results; the 2018 rows are used below as ground truth when
# scoring Footy Tipper's predictions.
fitzroy = FitzroyDataReader()
match_results = fitzroy.match_results()

# Display the frame to check its columns (note `date` is numeric here, not a timestamp).
match_results

Unnamed: 0,game,date,round,home_team,home_goals,home_behinds,home_points,away_team,away_goals,away_behinds,away_points,venue,margin,season,round_type,round_number
1,1.0,-26535.0,R1,Fitzroy,6,13,49,Carlton,2,4,16,Brunswick St,33,1897.0,Regular,1
2,2.0,-26535.0,R1,Collingwood,5,11,41,St Kilda,2,4,16,Victoria Park,25,1897.0,Regular,1
3,3.0,-26535.0,R1,Geelong,3,6,24,Essendon,7,5,47,Corio Oval,-23,1897.0,Regular,1
4,4.0,-26535.0,R1,Sydney,3,9,27,Melbourne,6,8,44,Lake Oval,-17,1897.0,Regular,1
5,5.0,-26528.0,R2,Sydney,6,4,40,Carlton,5,6,36,Lake Oval,4,1897.0,Regular,2
6,6.0,-26528.0,R2,Essendon,4,6,30,Collingwood,8,2,50,East Melbourne,-20,1897.0,Regular,2
7,7.0,-26528.0,R2,St Kilda,3,8,26,Fitzroy,10,6,66,Junction Oval,-40,1897.0,Regular,2
8,8.0,-26528.0,R2,Melbourne,9,10,64,Geelong,3,1,19,M.C.G.,45,1897.0,Regular,2
9,9.0,-26521.0,R3,Collingwood,6,5,41,Geelong,5,7,37,Victoria Park,4,1897.0,Regular,3
10,10.0,-26521.0,R3,Fitzroy,5,9,39,Melbourne,7,8,50,Brunswick St,-11,1897.0,Regular,3


In [7]:
def _snake_case(col):
    """Lower-case a column label and replace its spaces with underscores."""
    return col.lower().replace(' ', '_')


# Footy Tipper's 2018 predictions, with normalised column names and `date`
# parsed into timestamps so it can be joined against the match results.
ft_predictions = (
    pd.read_csv(os.path.join(PATH, '..', 'data', 'footy_tipper_predictions_2018.csv'))
    .rename(columns=_snake_case)
    .assign(date=lambda df: pd.to_datetime(df['date']))
)

ft_predictions

Unnamed: 0,date,home_team,away_team,home_win_predicted
0,2018-08-24,Port Adelaide,Essendon,1.0
1,2018-08-25,Carlton,Adelaide,0.0
2,2018-08-25,Fremantle,Collingwood,0.0
3,2018-08-25,Geelong,Gold Coast,1.0
4,2018-08-25,Richmond,Western Bulldogs,1.0
5,2018-08-25,Sydney,Hawthorn,1.0
6,2018-08-26,Brisbane,West Coast,0.0
7,2018-08-26,Melbourne,GWS,1.0
8,2018-08-26,St Kilda,North Melbourne,0.0
9,2018-08-17,Richmond,Essendon,1.0


In [8]:
# Score every Footy Tipper prediction against the actual 2018 result.
# A tip counts as correct when the predicted winner won, or when the match was drawn.
ft_accuracy_df = (
    ft_predictions
    .merge(
        # 2018 results only; `date` is stored as a day count, hence unit='D'.
        match_results
        .loc[lambda df: df['season'].eq(2018)]
        .assign(date=lambda df: pd.to_datetime(df['date'], unit='D'))
        .loc[:, ['date', 'home_team', 'away_team', 'home_points', 'away_points', 'round_number']],
        how='inner',
        on=['date', 'home_team', 'away_team'],
    )
    .assign(
        home_win=lambda df: df['home_points'].gt(df['away_points']).astype(float),
        draw=lambda df: df['home_points'].eq(df['away_points']).astype(float),
    )
    .assign(
        correct=lambda df: (
            df['draw'].astype(bool) | df['home_win_predicted'].eq(df['home_win'])
        ).astype(float)
    )
    .drop(columns=['home_points', 'away_points'])
)

ft_accuracy_df

Unnamed: 0,date,home_team,away_team,home_win_predicted,round_number,home_win,draw,correct
0,2018-08-24,Port Adelaide,Essendon,1.0,23,0.0,0.0,0.0
1,2018-08-25,Carlton,Adelaide,0.0,23,0.0,0.0,1.0
2,2018-08-25,Fremantle,Collingwood,0.0,23,0.0,0.0,1.0
3,2018-08-25,Geelong,Gold Coast,1.0,23,1.0,0.0,1.0
4,2018-08-25,Richmond,Western Bulldogs,1.0,23,1.0,0.0,1.0
5,2018-08-25,Sydney,Hawthorn,1.0,23,0.0,0.0,0.0
6,2018-08-26,Brisbane,West Coast,0.0,23,0.0,0.0,1.0
7,2018-08-26,Melbourne,GWS,1.0,23,1.0,0.0,1.0
8,2018-08-26,St Kilda,North Melbourne,0.0,23,0.0,0.0,1.0
9,2018-08-17,Richmond,Essendon,1.0,22,1.0,0.0,1.0


In [9]:
# Summarise Footy Tipper's 2018 season: number of correct tips and share of correct tips.
ft_tips = ft_accuracy_df['correct']
ft_tip_points = int(ft_tips.sum())
ft_tip_accuracy = round(ft_tips.mean() * 100, 2)

print('2018 Footy Tipper Performance')
print('Tipping points:', ft_tip_points)
print('Tipping accuracy:', f"{ft_tip_accuracy}%")

2018 Footy Tipper Performance
Tipping points: 140
Tipping accuracy: 70.71%


In [10]:
# Move the window forward: train through 2017 and hold out 2018, the season
# for which Footy Tipper's predictions are available.
# NOTE: this mutates the shared `data` object in place; cells above that call
# data.train_data() will presumably use this new window if re-run afterwards.
data.train_years = (None, 2017)
data.test_years = (2018, 2018)

# Refit the estimator selected by the search on the new training window.
bag_cv.best_estimator_.fit(*data.train_data())

BaggingEstimator(name='BaggingEstimator',
         pipeline=Pipeline(memory=None,
     steps=[('correlationselector', CorrelationSelector(cols_to_keep=['team', 'oppo_team', 'round_type', 'venue'],
          labels=Adelaide          1991.0  1.0      86.0
                          2.0     -23.0
                          3.0      24.0
                          4.0     -45.0
             ...timators=7, n_jobs=None, oob_score=False,
         random_state=None, verbose=0, warm_start=False))]))

In [13]:
def _tip_is_correct(df):
    """Flag matches where the tipped side won; drawn matches always count as correct.

    The tipped side is whichever team has the larger predicted margin. Equal
    predictions tip neither side and only count when the match was drawn.
    """
    home_margin = df['home_margin']
    tipped_home = df['predicted_home_margin'].gt(df['predicted_away_margin'])
    tipped_away = df['predicted_home_margin'].lt(df['predicted_away_margin'])

    return home_margin.eq(0) | (home_margin.lt(0) & tipped_away) | (home_margin.gt(0) & tipped_home)


X_train, y_train = data.train_data()
X_test, y_test = data.test_data()
y_pred = bag_cv.best_estimator_.predict(X_test)

# One row per team per match: actual margin next to the predicted margin.
pred_df = (
    pd.concat(
        [y_test, pd.Series(y_pred, index=y_test.index, name='predicted_margin')],
        axis=1,
    )
    .assign(at_home=X_test['at_home'], oppo_team=X_test['oppo_team'])
    .reset_index()
    .rename(columns={'level_0': 'team', 'level_1': 'year', 'level_2': 'round_number'})
)

# Home-team view of rounds 1-23 (later rounds are excluded from the comparison).
home_df = (
    pred_df
    .loc[pred_df['at_home'].eq(1) & pred_df['round_number'].le(23)]
    .rename(
        columns={
            'team': 'home_team',
            'oppo_team': 'away_team',
            'margin': 'home_margin',
            'predicted_margin': 'predicted_home_margin',
        }
    )
)

# Away-team view of the same rounds, reduced to the join keys and the away prediction.
away_df = (
    pred_df
    .loc[pred_df['at_home'].eq(0) & pred_df['round_number'].le(23)]
    .rename(
        columns={
            'team': 'away_team',
            'oppo_team': 'home_team',
            'predicted_margin': 'predicted_away_margin',
        }
    )
    .loc[:, ['home_team', 'year', 'round_number', 'away_team', 'predicted_away_margin']]
)

# One row per match with both sides' predictions and whether the tip was right.
accuracy_df = (
    home_df
    .merge(away_df, how='outer', on=['home_team', 'away_team', 'year', 'round_number'])
    .assign(correct=_tip_is_correct)
    .sort_values(['year', 'round_number'])
    .drop(columns='at_home')
)

accuracy_df

Unnamed: 0,home_team,year,round_number,home_margin,predicted_home_margin,away_team,predicted_away_margin,correct
44,Essendon,2018.0,1.0,12.0,8.751776,Adelaide,-4.301050,True
66,GWS,2018.0,1.0,82.0,24.073713,Western Bulldogs,-19.112989,True
88,Gold Coast,2018.0,1.0,16.0,-0.581312,North Melbourne,13.033484,False
99,Hawthorn,2018.0,1.0,34.0,25.603014,Collingwood,-14.570397,True
110,Melbourne,2018.0,1.0,-3.0,-6.163960,Geelong,4.811374,True
132,Port Adelaide,2018.0,1.0,50.0,45.313618,Fremantle,-47.893978,True
143,Richmond,2018.0,1.0,26.0,28.599710,Carlton,-29.713343,True
154,St Kilda,2018.0,1.0,25.0,23.891201,Brisbane,-21.670206,True
176,West Coast,2018.0,1.0,-29.0,-47.496193,Sydney,43.213036,True
0,Adelaide,2018.0,2.0,36.0,5.272498,Richmond,-5.845197,True


In [14]:
# Summarise the tuned model's 2018 season: number of correct tips and share of correct tips.
tipresias_tips = accuracy_df['correct']
tipresias_tip_points = int(tipresias_tips.sum())
tipresias_tip_accuracy = round(tipresias_tips.mean() * 100, 2)

print('2018 Tipresias Performance')
print('Tipping points:', tipresias_tip_points)
print('Tipping accuracy:', f"{tipresias_tip_accuracy}%")

2018 Tipresias Performance
Tipping points: 141
Tipping accuracy: 71.21%


### Save final version of the model

In [15]:
# Extend the training window through 2018 so the saved model is fitted on the
# latest training window before it is saved.
# NOTE: mutates the shared `data` object again; `test_years` is left at (2018, 2018).
data.train_years = (None, 2018)

# Final fit of the estimator selected by the search.
bag_cv.best_estimator_.fit(*data.train_data())

BaggingEstimator(name='BaggingEstimator',
         pipeline=Pipeline(memory=None,
     steps=[('correlationselector', CorrelationSelector(cols_to_keep=['team', 'oppo_team', 'round_type', 'venue'],
          labels=Adelaide          1991.0  1.0      86.0
                          2.0     -23.0
                          3.0      24.0
                          4.0     -45.0
             ...timators=7, n_jobs=None, oob_score=False,
         random_state=None, verbose=0, warm_start=False))]))

In [16]:
# Persist the final fitted estimator inside the server package
# (presumably where the application loads it from — confirm).
bag_cv.best_estimator_.dump(os.path.join(PATH, 'server/ml_estimators/bagging_estimator/tipresias.pkl'))

In [17]:
# Fit the benchmark estimator on the same training window (through 2018) and
# save it alongside the tuned model.
benchmark = BenchmarkEstimator(name='benchmark_estimator')
benchmark.fit(*data.train_data())
benchmark.dump(os.path.join(PATH, 'server/ml_estimators/benchmark_estimator/benchmark_estimator.pkl'))