In [1]:
import os
import sys
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import Lasso

project_path = os.path.abspath(os.path.join(os.getcwd(), '../'))
if project_path not in sys.path:
    sys.path.append(project_path)
    
# Our own code
from src.data.data_builder import DataBuilder, BettingData, MatchData
from src.data.data_transformer import DataTransformer
from src.data.feature_builder import FeatureBuilder
from src.model.metrics import measure_estimators, regression_accuracy

np.random.seed(42)

In [2]:
# Set up data and create cumulative features

csv_paths = ('data/afl_betting.csv', 'data/ft_match_list.csv')
data_classes = (BettingData, MatchData)

raw_df = DataBuilder(data_classes, csv_paths).concat()
model_df = DataTransformer(raw_df).stack_teams()
fb = FeatureBuilder(model_df)
fb.transform()
team_df = fb.df.dropna()
team_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,team,oppo_team,win_odds,line_odds,oppo_win_odds,oppo_line_odds,score,oppo_score,round_number,year,...,oppo_cum_percent,oppo_cum_win_points,rolling_pred_win_rate,rolling_last_week_win_rate,oppo_rolling_pred_win_rate,oppo_rolling_last_week_win_rate,ladder_position,oppo_ladder_position,win_streak,oppo_win_streak
team,year,round_number,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
Adelaide,2010,2,Adelaide,Sydney,1.54,-12.5,2.49,12.5,75.0,118.0,2,2010,...,0.916667,0.0,1.000000,0.000000,0.000000,0.000000,15,9,-1.0,-1.0
Adelaide,2010,3,Adelaide,Melbourne,1.52,-13.5,2.55,13.5,41.0,57.0,3,2010,...,0.719212,0.0,1.000000,0.000000,0.000000,0.000000,14,12,-2.0,-2.0
Adelaide,2010,4,Adelaide,Carlton,1.81,-2.5,2.01,2.5,55.0,103.0,4,2010,...,1.063910,4.0,1.000000,0.000000,0.500000,0.333333,15,10,-3.0,-2.0
Adelaide,2010,5,Adelaide,Western Bulldogs,7.50,40.5,1.09,-40.5,72.0,121.0,5,2010,...,1.082873,8.0,0.800000,0.000000,1.000000,0.500000,15,8,-4.0,-1.0
Adelaide,2010,6,Adelaide,Port Adelaide,2.55,12.5,1.52,-12.5,74.0,97.0,6,2010,...,0.801670,12.0,0.666667,0.000000,0.500000,0.600000,15,10,-5.0,1.0
Adelaide,2010,7,Adelaide,Richmond,1.20,-29.5,4.66,29.5,104.0,54.0,7,2010,...,0.491413,0.0,0.714286,0.000000,0.000000,0.000000,15,16,-6.0,-6.0
Adelaide,2010,8,Adelaide,North Melbourne,2.38,9.5,1.59,-9.5,75.0,84.0,8,2010,...,0.780952,12.0,0.625000,0.142857,0.250000,0.428571,15,11,1.0,-1.0
Adelaide,2010,9,Adelaide,Brisbane,2.14,5.5,1.72,-5.5,93.0,81.0,9,2010,...,0.920792,16.0,0.555556,0.125000,0.666667,0.500000,15,9,-1.0,-4.0
Adelaide,2010,10,Adelaide,St Kilda,4.25,26.5,1.23,-26.5,76.0,123.0,10,2010,...,1.172144,24.0,0.500000,0.222222,0.900000,0.666667,15,5,1.0,1.0
Adelaide,2010,11,Adelaide,Fremantle,2.89,17.5,1.42,-17.5,105.0,82.0,11,2010,...,1.278221,32.0,0.454545,0.200000,0.363636,0.800000,15,2,-1.0,2.0


In [4]:
# Set up & split data for models

team_features = pd.get_dummies(team_df.drop(['score', 'oppo_score'], axis=1))
team_labels = pd.Series(team_df['score'] - team_df['oppo_score'], name='score_diff')

In [5]:
team_features

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,win_odds,line_odds,oppo_win_odds,oppo_line_odds,round_number,year,at_home,last_week_score,last_week_oppo_score,cum_percent,...,oppo_team_Gold Coast,oppo_team_Hawthorn,oppo_team_Melbourne,oppo_team_North Melbourne,oppo_team_Port Adelaide,oppo_team_Richmond,oppo_team_St Kilda,oppo_team_Sydney,oppo_team_West Coast,oppo_team_Western Bulldogs
team,year,round_number,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
Adelaide,2010,2,1.54,-12.5,2.49,12.5,2,2010,1.0,62.0,118.0,0.525424,...,0,0,0,0,0,0,0,1,0,0
Adelaide,2010,3,1.52,-13.5,2.55,13.5,3,2010,0.0,75.0,118.0,0.580508,...,0,0,1,0,0,0,0,0,0,0
Adelaide,2010,4,1.81,-2.5,2.01,2.5,4,2010,1.0,41.0,57.0,0.607509,...,0,0,0,0,0,0,0,0,0,0
Adelaide,2010,5,7.50,40.5,1.09,-40.5,5,2010,0.0,55.0,103.0,0.588384,...,0,0,0,0,0,0,0,0,0,1
Adelaide,2010,6,2.55,12.5,1.52,-12.5,6,2010,1.0,72.0,121.0,0.589942,...,0,0,0,0,1,0,0,0,0,0
Adelaide,2010,7,1.20,-29.5,4.66,29.5,7,2010,1.0,74.0,97.0,0.617264,...,0,0,0,0,0,1,0,0,0,0
Adelaide,2010,8,2.38,9.5,1.59,-9.5,8,2010,0.0,104.0,54.0,0.723054,...,0,0,0,1,0,0,0,0,0,0
Adelaide,2010,9,2.14,5.5,1.72,-5.5,9,2010,1.0,75.0,84.0,0.742021,...,0,0,0,0,0,0,0,0,0,0
Adelaide,2010,10,4.25,26.5,1.23,-26.5,10,2010,0.0,93.0,81.0,0.781513,...,0,0,0,0,0,0,1,0,0,0
Adelaide,2010,11,2.89,17.5,1.42,-17.5,11,2010,1.0,76.0,123.0,0.760460,...,0,0,0,0,0,0,0,0,0,0


In [11]:
estimator = make_pipeline(StandardScaler(), Lasso())

predictions = []

for year in range(2011, 2017):
    X_train = team_features[team_features['year'] < year]
    X_test = team_features[team_features['year'] == year]
    y_train = team_labels.loc[X_train.index]
    y_test = team_labels.loc[X_test.index]
    
    estimator.fit(X_train, y_train)
    y_pred = estimator.predict(X_test)
    
    predictions.extend(y_pred)
        
predictions

[4.120010244589008,
 14.216497226826135,
 12.737937120304206,
 -22.629997479103064,
 7.484273208525358,
 -7.152197645394417,
 29.87799874043915,
 -55.819800296607546,
 31.32279411762868,
 -10.238322622771857,
 9.265881999063904,
 -26.616028563960626,
 -49.43210777113004,
 -6.998811993896782,
 11.377108439019027,
 -32.130650002730626,
 16.088595686437348,
 1.408648725163797,
 -40.977298714619245,
 18.35884320037108,
 24.62064120391849,
 -19.226033025675942,
 -5.657114471933867,
 -43.680686134335176,
 -37.96531131386334,
 -20.586726826961332,
 -20.317514530616688,
 17.46357280693845,
 -13.389650355058698,
 -4.743415017309672,
 -31.595972389964487,
 -3.357834882318836,
 -50.451568598789365,
 -2.9993646010041957,
 -37.616062570896844,
 14.10733174414391,
 -41.82351676314214,
 -45.463823343983016,
 -40.98578795490698,
 -1.048547023916926,
 7.8187060554537,
 -89.41729562627172,
 -9.049609544821381,
 -40.16486324940919,
 24.54188436423791,
 -35.33483251172128,
 14.827213853177888,
 21.7025858

In [16]:
pred_df = team_df[team_df['year'] > 2010].assign(predicted_margin=predictions)
home_df = pred_df[pred_df['at_home'] == 1]

In [29]:
betting_df = (home_df.loc[:, ['year', 'round_number', 'team', 'oppo_team']]
             .rename(columns={'team': 'home_team', 'oppo_team': 'away_team'})
              .assign(model='tipresias_betting',
                      predicted_home_margin=home_df['predicted_margin'].round(),
                      home_margin=home_df['score'] - home_df['oppo_score'],
                      predicted_home_win=(home_df['predicted_margin'] > 0).astype(int),
                      home_win=(home_df['score'] >
                                home_df['score']).astype(int),
                      draw=(home_df['score'] == home_df['score']).astype(int))
              .assign(tip_point=lambda x: ((x['predicted_home_win'] == x['home_win']) |
                                           (x['draw'])).astype(int))
             .reset_index(drop=True))
betting_df

Unnamed: 0,year,round_number,home_team,away_team,model,predicted_home_margin,home_margin,predicted_home_win,home_win,draw,tip_point
0,2011,1,Adelaide,Hawthorn,tipresias_betting,4.0,20.0,1,0,1,1
1,2011,3,Adelaide,Fremantle,tipresias_betting,14.0,-25.0,1,0,1,1
2,2011,6,Adelaide,St Kilda,tipresias_betting,7.0,19.0,1,0,1,1
3,2011,8,Adelaide,Gold Coast,tipresias_betting,30.0,57.0,1,0,1,1
4,2011,10,Adelaide,Brisbane,tipresias_betting,31.0,-40.0,1,0,1,1
5,2011,12,Adelaide,West Coast,tipresias_betting,9.0,-39.0,1,0,1,1
6,2011,15,Adelaide,Sydney,tipresias_betting,-7.0,7.0,0,0,1,1
7,2011,17,Adelaide,Essendon,tipresias_betting,11.0,-11.0,1,0,1,1
8,2011,19,Adelaide,Port Adelaide,tipresias_betting,16.0,32.0,1,0,1,1
9,2011,21,Adelaide,Geelong,tipresias_betting,-41.0,-11.0,0,0,1,1


In [31]:
df = DataTransformer(raw_df).clean()

odds_df = (df.loc[:, ['year', 'round_number', 'home_team', 'away_team']]
              .assign(model='oddsmakers',
                      predicted_home_margin=df['home_line_odds'].round() * -1,
                      home_margin=df['home_score'] - df['away_score'],
                      predicted_home_win=((df['home_win_odds'] < df['away_win_odds']) |
                                          (df['home_line_odds'] < df['away_line_odds']) |
                                          # If odds are all equal, predict home team
                                          ((df['home_win_odds'] == df['away_win_odds']) &
                                           (df['home_line_odds'] == df['away_line_odds']))).astype(int),
                      home_win=(df['home_score'] >
                                df['away_score']).astype(int),
                      draw=(df['home_score'] == df['away_score']).astype(int))
              .assign(tip_point=lambda x: ((x['predicted_home_win'] == x['home_win']) |
                                           (x['draw'])).astype(int)))
odds_df

Unnamed: 0,year,round_number,home_team,away_team,model,predicted_home_margin,home_margin,predicted_home_win,home_win,draw,tip_point
0,2010,1,Richmond,Carlton,oddsmakers,-16.0,-56.0,0,0,0,1
1,2010,1,Geelong,Essendon,oddsmakers,28.0,31.0,1,1,0,1
2,2010,1,Sydney,St Kilda,oddsmakers,-14.0,-8.0,0,0,0,1
3,2010,1,Brisbane,West Coast,oddsmakers,22.0,32.0,1,1,0,1
4,2010,1,Melbourne,Hawthorn,oddsmakers,-32.0,-56.0,0,0,0,1
5,2010,1,Port Adelaide,North Melbourne,oddsmakers,16.0,14.0,1,1,0,1
6,2010,1,Fremantle,Adelaide,oddsmakers,-2.0,56.0,0,1,0,0
7,2010,1,Western Bulldogs,Collingwood,oddsmakers,8.0,-36.0,1,0,0,0
8,2010,2,Brisbane,Carlton,oddsmakers,20.0,19.0,1,1,0,1
9,2010,2,West Coast,Port Adelaide,oddsmakers,10.0,-3.0,1,0,0,0


In [30]:
pd.concat([betting_df, odds_df]).reset_index()

Unnamed: 0,index,year,round_number,home_team,away_team,model,predicted_home_margin,home_margin,predicted_home_win,home_win,draw,tip_point
0,0,2011,1,Adelaide,Hawthorn,tipresias_betting,4.0,20.0,1,0,1,1
1,1,2011,3,Adelaide,Fremantle,tipresias_betting,14.0,-25.0,1,0,1,1
2,2,2011,6,Adelaide,St Kilda,tipresias_betting,7.0,19.0,1,0,1,1
3,3,2011,8,Adelaide,Gold Coast,tipresias_betting,30.0,57.0,1,0,1,1
4,4,2011,10,Adelaide,Brisbane,tipresias_betting,31.0,-40.0,1,0,1,1
5,5,2011,12,Adelaide,West Coast,tipresias_betting,9.0,-39.0,1,0,1,1
6,6,2011,15,Adelaide,Sydney,tipresias_betting,-7.0,7.0,0,0,1,1
7,7,2011,17,Adelaide,Essendon,tipresias_betting,11.0,-11.0,1,0,1,1
8,8,2011,19,Adelaide,Port Adelaide,tipresias_betting,16.0,32.0,1,0,1,1
9,9,2011,21,Adelaide,Geelong,tipresias_betting,-41.0,-11.0,0,0,1,1
