In [1]:
import os
import sys
import warnings
from functools import reduce, partial
import pandas as pd
import numpy as np
from sklearn.exceptions import DataConversionWarning
from sklearn.metrics import mean_absolute_error
import featuretools as ft
import featuretools.variable_types as vtypes

# Put the parent of the current working directory on the import path; the
# project-local `server` and `src` imports below rely on it being there.
PROJECT_PATH = os.path.join(os.getcwd(), '../')

# Append only when missing so re-running the cell does not add duplicates.
sys.path.extend([] if PROJECT_PATH in sys.path else [PROJECT_PATH])
    
from server.ml_models.all_model import AllModelData
from server.ml_models.match_model import MatchModelData
from server.ml_models.player_model import PlayerModelData
from server.ml_models.betting_model import BettingModelData
from server.ml_models import EnsembleModel

from src.model.metrics import yearly_performance_scores
from src.model.charts import graph_yearly_model_performance
from src.data.feature_engineering import (match_id, ladder_position, add_elo_rating, city_lat_long,
                                          playing_for_team_match_id, player_team_match_id, home_away_df)
from server.ml_models.data_config import TEAM_CITIES, VENUE_CITIES

# Seed NumPy's global random state so stochastic steps repeat between runs.
SEED = 42
np.random.seed(SEED)

# Suppress DataConversionWarning for the rest of the session.
warnings.simplefilter(action="ignore", category=DataConversionWarning)

## Prepare raw data for featuretools
featuretools handles a lot of the data transformation that I was doing myself, and things got messy when I tried to use ft after doing all my own aggregations/transformations, so I'm taking a step back: I pass the raw data to ft and let it take things from there.

In [2]:
# Shared constructor arguments: no data transformers (featuretools gets the raw
# data) and a common set of index columns.
data_kwargs = dict(data_transformers=[], index_cols=['home_team', 'year', 'round_number'])

betting_data, player_data, match_data = BettingModelData, PlayerModelData, MatchModelData

# Instantiate the three data loaders in the order betting, player, match.
bd, pld, md = (
    model_data(**data_kwargs)
    for model_data in (betting_data, player_data, match_data)
)

  res = PandasDataFrame.from_items(items)


In [3]:
# Columns used as the join key between the match data and the betting data.
SHARED_COLS = ['away_score', 'away_team', 'home_score', 'home_team', 'round_number', 'year']

# Left-join the betting data onto the match data (matches without a betting row
# are kept), order by season, round and home team, then keep only rows dated
# strictly between 2010-01-01 and 2015-12-31. The index is reset *before* the
# date filter, so the surviving rows keep their positions from the full frame.
raw_df = (
    md.data
    .merge(bd.data, on=SHARED_COLS, how='left')
    .sort_values(by=['year', 'round_number', 'home_team'])
    .reset_index(drop=True)
    .loc[lambda df: (df['date'] > '2010-01-01') & (df['date'] < '2015-12-31')]
)

raw_df

Unnamed: 0,date,home_team,home_goals,home_behinds,home_score,away_team,away_goals,away_behinds,away_score,venue,home_margin,year,round_type,round_number,home_win_odds,home_line_odds,away_win_odds,away_line_odds
13565,2010-03-27,Brisbane,16,18,114,West Coast,12,10,82,Gabba,32,2010,Regular,1,1.32,-21.5,3.42,21.5
13566,2010-03-28,Fremantle,17,16,118,Adelaide,9,8,62,Subiaco,56,2010,Regular,1,1.96,1.5,1.85,-1.5
13567,2010-03-26,Geelong,19,11,125,Essendon,13,16,94,M.C.G.,31,2010,Regular,1,1.21,-28.5,4.50,28.5
13568,2010-03-27,Melbourne,8,13,61,Hawthorn,17,15,117,M.C.G.,-56,2010,Regular,1,4.95,31.5,1.18,-31.5
13569,2010-03-28,Port Adelaide,13,18,96,North Melbourne,12,10,82,Football Park,14,2010,Regular,1,1.42,-16.5,2.89,16.5
13570,2010-03-25,Richmond,9,10,64,Carlton,18,12,120,M.C.G.,-56,2010,Regular,1,2.89,16.5,1.42,-16.5
13571,2010-03-27,Sydney,13,10,88,St Kilda,15,6,96,Stadium Australia,-8,2010,Regular,1,2.64,14.5,1.49,-14.5
13572,2010-03-28,Western Bulldogs,13,15,93,Collingwood,19,15,129,Docklands,-36,2010,Regular,1,1.62,-8.5,2.31,8.5
13573,2010-04-04,Adelaide,11,9,75,Sydney,18,10,118,Football Park,-43,2010,Regular,2,1.54,-12.5,2.49,12.5
13574,2010-04-01,Brisbane,16,11,107,Carlton,12,16,88,Gabba,19,2010,Regular,2,1.34,-19.5,3.30,19.5


In [4]:
# Summarise the dtypes and non-null counts of the merged match/betting frame.
raw_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1209 entries, 13565 to 14773
Data columns (total 18 columns):
date              1209 non-null datetime64[ns]
home_team         1209 non-null object
home_goals        1209 non-null int32
home_behinds      1209 non-null int32
home_score        1209 non-null int32
away_team         1209 non-null object
away_goals        1209 non-null int32
away_behinds      1209 non-null int32
away_score        1209 non-null int32
venue             1209 non-null object
home_margin       1209 non-null int32
year              1209 non-null int64
round_type        1209 non-null object
round_number      1209 non-null int32
home_win_odds     1192 non-null float64
home_line_odds    1192 non-null float64
away_win_odds     1192 non-null float64
away_line_odds    1192 non-null float64
dtypes: datetime64[ns](1), float64(4), int32(8), int64(1), object(4)
memory usage: 141.7+ KB


In [5]:
# By default dates w/o time have 00:00:00 as their timestamp, so adding this
# offset moves a date to the last second of that same day.
END_OF_DAY_OFFSET = pd.Timedelta(hours=23, minutes=59, seconds=59)

# Earliest match date in each (year, round_number).
round_start = (raw_df.groupby(['year', 'round_number'])['date']
               .min()
               .rename('round_start_date')
               .reset_index())

# Last second of the latest match date in each (year, round_number).
end_of_round = ((raw_df.groupby(['year', 'round_number'])['date'].max() + END_OF_DAY_OFFSET)
                .rename('end_of_round')
                .reset_index())

# Latest end_of_round within each year.
end_of_season = (end_of_round.groupby('year')['end_of_round']
                 .max()
                 .rename('end_of_season')
                 .reset_index())

# A stray `prev_df = raw_df.groupby('team')` used to sit here. raw_df has no
# 'team' column, so it raised KeyError on a fresh run, and its result was never
# used: prev_df is rebuilt from the per-team frame further down the notebook.

clean_df = (raw_df
            # NOTE(review): in raw_df only the betting-odds columns contain nulls,
            # so this turns missing odds into 0 - confirm downstream features
            # treat 0 as "no odds available".
            .fillna(0)
            .assign(
                match_id=match_id,
                end_of_day=lambda df: df['date'] + END_OF_DAY_OFFSET,
            )
            .merge(round_start, on=['year', 'round_number'], how='left')
            .merge(end_of_round, on=['year', 'round_number'], how='left')
            .merge(end_of_season, on=['year'], how='left')
            # Sort by date and drop duplicates to get rid of finals replays due to draws
            # (keep="last" retains the later of two rows sharing a match_id).
            .sort_values('date')
            .drop_duplicates(subset='match_id', keep="last"))

clean_df

Unnamed: 0,date,home_team,home_goals,home_behinds,home_score,away_team,away_goals,away_behinds,away_score,venue,...,round_number,home_win_odds,home_line_odds,away_win_odds,away_line_odds,match_id,end_of_day,round_start_date,end_of_round,end_of_season
5,2010-03-25,Richmond,9,10,64,Carlton,18,12,120,M.C.G.,...,1,2.89,16.5,1.42,-16.5,2010.1.Carlton.Richmond,2010-03-25 23:59:59,2010-03-25,2010-03-28 23:59:59,2010-10-02 23:59:59
2,2010-03-26,Geelong,19,11,125,Essendon,13,16,94,M.C.G.,...,1,1.21,-28.5,4.50,28.5,2010.1.Essendon.Geelong,2010-03-26 23:59:59,2010-03-25,2010-03-28 23:59:59,2010-10-02 23:59:59
0,2010-03-27,Brisbane,16,18,114,West Coast,12,10,82,Gabba,...,1,1.32,-21.5,3.42,21.5,2010.1.Brisbane.West Coast,2010-03-27 23:59:59,2010-03-25,2010-03-28 23:59:59,2010-10-02 23:59:59
3,2010-03-27,Melbourne,8,13,61,Hawthorn,17,15,117,M.C.G.,...,1,4.95,31.5,1.18,-31.5,2010.1.Hawthorn.Melbourne,2010-03-27 23:59:59,2010-03-25,2010-03-28 23:59:59,2010-10-02 23:59:59
6,2010-03-27,Sydney,13,10,88,St Kilda,15,6,96,Stadium Australia,...,1,2.64,14.5,1.49,-14.5,2010.1.St Kilda.Sydney,2010-03-27 23:59:59,2010-03-25,2010-03-28 23:59:59,2010-10-02 23:59:59
1,2010-03-28,Fremantle,17,16,118,Adelaide,9,8,62,Subiaco,...,1,1.96,1.5,1.85,-1.5,2010.1.Adelaide.Fremantle,2010-03-28 23:59:59,2010-03-25,2010-03-28 23:59:59,2010-10-02 23:59:59
4,2010-03-28,Port Adelaide,13,18,96,North Melbourne,12,10,82,Football Park,...,1,1.42,-16.5,2.89,16.5,2010.1.North Melbourne.Port Adelaide,2010-03-28 23:59:59,2010-03-25,2010-03-28 23:59:59,2010-10-02 23:59:59
7,2010-03-28,Western Bulldogs,13,15,93,Collingwood,19,15,129,Docklands,...,1,1.62,-8.5,2.31,8.5,2010.1.Collingwood.Western Bulldogs,2010-03-28 23:59:59,2010-03-25,2010-03-28 23:59:59,2010-10-02 23:59:59
9,2010-04-01,Brisbane,16,11,107,Carlton,12,16,88,Gabba,...,2,1.34,-19.5,3.30,19.5,2010.2.Brisbane.Carlton,2010-04-01 23:59:59,2010-04-01,2010-04-05 23:59:59,2010-10-02 23:59:59
15,2010-04-03,West Coast,12,14,86,Port Adelaide,13,11,89,Subiaco,...,2,1.63,-10.5,2.30,10.5,2010.2.Port Adelaide.West Coast,2010-04-03 23:59:59,2010-04-01,2010-04-05 23:59:59,2010-10-02 23:59:59


In [67]:
# Per-team columns that are swapped for their previous-match values below.
MATCH_COLS = ['team_behinds', 'team_goals', 'match_points', 'match_result', 'score',
              'elo_rating', 'ladder_position']

# Stack the two home_away_df views of clean_df (flag True first, then False -
# presumably the home and away sides; confirm against home_away_df) and derive
# the team-level columns.
team_df = (
    pd.concat([home_away_df(at_home, clean_df) for at_home in (True, False)], sort=True)
    .sort_index()
    .rename(columns={'goals': 'team_goals', 'behinds': 'team_behinds'})
    .assign(home_city=lambda df: df['team'].map(TEAM_CITIES),
            ladder_position=ladder_position,
            elo_rating=add_elo_rating,
            end_of_day=lambda df: df['date'] + pd.Timedelta(hours=23, minutes=59, seconds=59))
    # Separate assign because it reads the home_city column created just above.
    .assign(home_lat_long=lambda df: df['home_city'].map(city_lat_long))
    .merge(end_of_round, how='left', on=['year', 'round_number'])
    # Dropping shared columns with match data frame (except match_id)
    .drop(columns=['date', 'year', 'round_number', 'oppo_score'])
    .set_index('team_match_id', drop=False)
    .rename_axis(None)
)

# Shift every team's rows down by one so each row carries the values from that
# team's preceding row; a team's first row has no predecessor and becomes NaN.
prev_df = (
    team_df.groupby('team').shift()[MATCH_COLS + ['margin']]
    .add_prefix('prev_')
)

# Replace the current-match columns with their prev_ counterparts (margin itself
# is kept) and zero-fill the NaNs left by the shift.
team_match_df = pd.concat([team_df.drop(columns=MATCH_COLS), prev_df], axis='columns').fillna(0)

team_match_df

distributed.core - INFO - Event loop was unresponsive in Nanny for 5.35s.  This is often caused by long-running GIL-holding functions or moving large chunks of data. This can cause timeouts and instability.
distributed.core - INFO - Event loop was unresponsive in Nanny for 5.59s.  This is often caused by long-running GIL-holding functions or moving large chunks of data. This can cause timeouts and instability.
distributed.core - INFO - Event loop was unresponsive in Nanny for 5.60s.  This is often caused by long-running GIL-holding functions or moving large chunks of data. This can cause timeouts and instability.


Unnamed: 0,at_home,line_odds,margin,match_id,round_start_date,team,team_match_id,win_odds,home_city,end_of_day,home_lat_long,end_of_round,prev_team_behinds,prev_team_goals,prev_match_points,prev_match_result,prev_score,prev_elo_rating,prev_ladder_position,prev_margin
2010.1.Adelaide,False,-1.5,-56,2010.1.Adelaide.Fremantle,2010-03-25,Adelaide,2010.1.Adelaide,1.85,Adelaide,2010-03-28 23:59:59,"(-34.9285, 138.6007)",2010-03-28 23:59:59,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
2010.2.Adelaide,True,-12.5,-43,2010.2.Adelaide.Sydney,2010-04-01,Adelaide,2010.2.Adelaide,1.54,Adelaide,2010-04-04 23:59:59,"(-34.9285, 138.6007)",2010-04-05 23:59:59,8.0,9.0,0.0,0.0,62.0,994.462124,15.0,-56.0
2010.3.Adelaide,False,-13.5,-16,2010.3.Adelaide.Melbourne,2010-04-09,Adelaide,2010.3.Adelaide,1.52,Adelaide,2010-04-11 23:59:59,"(-34.9285, 138.6007)",2010-04-11 23:59:59,9.0,11.0,0.0,0.0,75.0,989.411656,14.0,-43.0
2010.4.Adelaide,True,-2.5,-48,2010.4.Adelaide.Carlton,2010-04-16,Adelaide,2010.4.Adelaide,1.81,Adelaide,2010-04-17 23:59:59,"(-34.9285, 138.6007)",2010-04-18 23:59:59,11.0,5.0,0.0,0.0,41.0,988.965446,15.0,-16.0
2010.5.Adelaide,False,40.5,-49,2010.5.Adelaide.Western Bulldogs,2010-04-23,Adelaide,2010.5.Adelaide,7.50,Adelaide,2010-04-23 23:59:59,"(-34.9285, 138.6007)",2010-04-26 23:59:59,19.0,6.0,0.0,0.0,55.0,983.893355,15.0,-48.0
2010.6.Adelaide,True,12.5,-23,2010.6.Adelaide.Port Adelaide,2010-04-30,Adelaide,2010.6.Adelaide,2.55,Adelaide,2010-05-01 23:59:59,"(-34.9285, 138.6007)",2010-05-02 23:59:59,12.0,10.0,0.0,0.0,72.0,980.670924,15.0,-49.0
2010.7.Adelaide,True,-29.5,50,2010.7.Adelaide.Richmond,2010-05-07,Adelaide,2010.7.Adelaide,1.20,Adelaide,2010-05-09 23:59:59,"(-34.9285, 138.6007)",2010-05-10 23:59:59,14.0,10.0,0.0,0.0,74.0,978.295575,15.0,-23.0
2010.8.Adelaide,False,9.5,-9,2010.8.Adelaide.North Melbourne,2010-05-14,Adelaide,2010.8.Adelaide,2.38,Adelaide,2010-05-15 23:59:59,"(-34.9285, 138.6007)",2010-05-16 23:59:59,14.0,15.0,4.0,1.0,104.0,980.489801,15.0,50.0
2010.9.Adelaide,True,5.5,12,2010.9.Adelaide.Brisbane,2010-05-21,Adelaide,2010.9.Adelaide,2.14,Adelaide,2010-05-23 23:59:59,"(-34.9285, 138.6007)",2010-05-23 23:59:59,9.0,11.0,0.0,0.0,75.0,980.784839,15.0,-9.0
2010.10.Adelaide,False,26.5,-47,2010.10.Adelaide.St Kilda,2010-05-28,Adelaide,2010.10.Adelaide,4.25,Adelaide,2010-05-29 23:59:59,"(-34.9285, 138.6007)",2010-05-30 23:59:59,15.0,13.0,4.0,1.0,93.0,981.967112,15.0,12.0


In [63]:
# Summarise dtypes and non-null counts of the team-match frame after fillna(0).
team_match_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2416 entries, 2010.1.Adelaide to 2015.24.Western Bulldogs
Data columns (total 19 columns):
at_home                 2416 non-null bool
line_odds               2416 non-null float64
margin                  2416 non-null int32
match_id                2416 non-null object
round_start_date        2416 non-null datetime64[ns]
team                    2416 non-null object
win_odds                2416 non-null float64
home_city               2416 non-null object
end_of_day              2416 non-null datetime64[ns]
home_lat_long           2416 non-null object
end_of_round            2416 non-null datetime64[ns]
prev_team_behinds       2416 non-null float64
prev_team_goals         2416 non-null float64
prev_match_points       2416 non-null float64
prev_match_result       2416 non-null float64
prev_score              2416 non-null float64
prev_elo_rating         2416 non-null float64
prev_ladder_position    2416 non-null float64
prev_margin             

In [8]:
# Columns whose names start with home_ or away_; they are dropped from the
# match-level frame.
team_cols = clean_df.filter(regex='^(home_|away_)').columns

# Look up each venue's city, then map the city through city_lat_long.
venue_cities = clean_df['venue'].map(VENUE_CITIES)

match_df = clean_df.drop(columns=team_cols).assign(
    venue_city=venue_cities,
    venue_lat_long=venue_cities.map(city_lat_long),
)

match_df

Unnamed: 0,date,venue,year,round_type,round_number,match_id,end_of_day,round_start_date,end_of_round,end_of_season,venue_city,venue_lat_long
5,2010-03-25,M.C.G.,2010,Regular,1,2010.1.Carlton.Richmond,2010-03-25 23:59:59,2010-03-25,2010-03-28 23:59:59,2010-10-02 23:59:59,Melbourne,"(-37.8136, 144.9631)"
2,2010-03-26,M.C.G.,2010,Regular,1,2010.1.Essendon.Geelong,2010-03-26 23:59:59,2010-03-25,2010-03-28 23:59:59,2010-10-02 23:59:59,Melbourne,"(-37.8136, 144.9631)"
0,2010-03-27,Gabba,2010,Regular,1,2010.1.Brisbane.West Coast,2010-03-27 23:59:59,2010-03-25,2010-03-28 23:59:59,2010-10-02 23:59:59,Brisbane,"(-27.4698, 153.0251)"
3,2010-03-27,M.C.G.,2010,Regular,1,2010.1.Hawthorn.Melbourne,2010-03-27 23:59:59,2010-03-25,2010-03-28 23:59:59,2010-10-02 23:59:59,Melbourne,"(-37.8136, 144.9631)"
6,2010-03-27,Stadium Australia,2010,Regular,1,2010.1.St Kilda.Sydney,2010-03-27 23:59:59,2010-03-25,2010-03-28 23:59:59,2010-10-02 23:59:59,Sydney,"(-33.8688, 151.2093)"
1,2010-03-28,Subiaco,2010,Regular,1,2010.1.Adelaide.Fremantle,2010-03-28 23:59:59,2010-03-25,2010-03-28 23:59:59,2010-10-02 23:59:59,Perth,"(-31.9505, 115.8605)"
4,2010-03-28,Football Park,2010,Regular,1,2010.1.North Melbourne.Port Adelaide,2010-03-28 23:59:59,2010-03-25,2010-03-28 23:59:59,2010-10-02 23:59:59,Adelaide,"(-34.9285, 138.6007)"
7,2010-03-28,Docklands,2010,Regular,1,2010.1.Collingwood.Western Bulldogs,2010-03-28 23:59:59,2010-03-25,2010-03-28 23:59:59,2010-10-02 23:59:59,Melbourne,"(-37.8136, 144.9631)"
9,2010-04-01,Gabba,2010,Regular,2,2010.2.Brisbane.Carlton,2010-04-01 23:59:59,2010-04-01,2010-04-05 23:59:59,2010-10-02 23:59:59,Brisbane,"(-27.4698, 153.0251)"
15,2010-04-03,Subiaco,2010,Regular,2,2010.2.Port Adelaide.West Coast,2010-04-03 23:59:59,2010-04-01,2010-04-05 23:59:59,2010-10-02 23:59:59,Perth,"(-31.9505, 115.8605)"


In [84]:
# Per-player columns that are swapped for their previous-row values below.
PLAYER_MATCH_COLS = [
    'kicks', 'marks', 'handballs', 'goals', 'behinds', 'hit_outs', 'tackles',
    'rebounds', 'inside_50s', 'clearances', 'clangers', 'frees_for', 'frees_against',
    'contested_possessions', 'uncontested_possessions', 'contested_marks',
    'marks_inside_50', 'one_percenters', 'bounces', 'goal_assists', 'time_on_ground',
]

# Date columns borrowed from the team-match frame, keyed by team_match_id.
player_dates = team_match_df[['end_of_day', 'round_start_date', 'team_match_id']]

player_df = (
    pld.data
    .assign(team_match_id=playing_for_team_match_id,
            player_team_match_id=player_team_match_id)
    .merge(player_dates, how='left', on='team_match_id')
    .merge(end_of_season, how='left', on='year')
    .drop(columns=SHARED_COLS + ['player_name'])
    # Normally, there wouldn't be NaNs, but since we filter team_match_df by date,
    # player_df has a lot more rows
    .dropna()
    .set_index('player_team_match_id', drop=False)
    .rename_axis(None)
)

# Shift every player's rows down by one so each row carries the stats from that
# player's preceding row; rows without a predecessor are zero-filled.
prev_player_df = (
    player_df.groupby('player_id').shift()[PLAYER_MATCH_COLS]
    .add_prefix('prev_')
    .fillna(0)
)

# Swap the current-row stats for their prev_ counterparts, then keep rows whose
# end_of_day lies strictly between 2010-01-01 and 2015-12-31.
player_match_df = (
    pd.concat([player_df, prev_player_df], axis='columns')
    .drop(columns=PLAYER_MATCH_COLS)
    .loc[lambda df: (df['end_of_day'] > '2010-01-01') & (df['end_of_day'] < '2015-12-31')]
)

player_match_df

Unnamed: 0,player_id,playing_for,brownlow_votes,match_id,team_match_id,player_team_match_id,end_of_day,round_start_date,end_of_season,prev_kicks,...,prev_frees_for,prev_frees_against,prev_contested_possessions,prev_uncontested_possessions,prev_contested_marks,prev_marks_inside_50,prev_one_percenters,prev_bounces,prev_goal_assists,prev_time_on_ground
2010.2.Sydney.1012,1012,Sydney,0.0,13590,2010.2.Sydney,2010.2.Sydney.1012,2010-04-04 23:59:59,2010-04-01,2010-10-02 23:59:59,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2010.2.Sydney.1013,1013,Sydney,0.0,13590,2010.2.Sydney,2010.2.Sydney.1013,2010-04-04 23:59:59,2010-04-01,2010-10-02 23:59:59,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2010.2.Sydney.1015,1015,Sydney,0.0,13590,2010.2.Sydney,2010.2.Sydney.1015,2010-04-04 23:59:59,2010-04-01,2010-10-02 23:59:59,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2010.2.Sydney.1024,1024,Sydney,2.0,13590,2010.2.Sydney,2010.2.Sydney.1024,2010-04-04 23:59:59,2010-04-01,2010-10-02 23:59:59,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2010.2.Sydney.1025,1025,Sydney,0.0,13590,2010.2.Sydney,2010.2.Sydney.1025,2010-04-04 23:59:59,2010-04-01,2010-10-02 23:59:59,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2010.2.Sydney.1090,1090,Sydney,0.0,13590,2010.2.Sydney,2010.2.Sydney.1090,2010-04-04 23:59:59,2010-04-01,2010-10-02 23:59:59,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2010.2.Sydney.11546,11546,Sydney,0.0,13590,2010.2.Sydney,2010.2.Sydney.11546,2010-04-04 23:59:59,2010-04-01,2010-10-02 23:59:59,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2010.2.Adelaide.11633,11633,Adelaide,0.0,13590,2010.2.Adelaide,2010.2.Adelaide.11633,2010-04-04 23:59:59,2010-04-01,2010-10-02 23:59:59,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2010.2.Adelaide.11634,11634,Adelaide,0.0,13590,2010.2.Adelaide,2010.2.Adelaide.11634,2010-04-04 23:59:59,2010-04-01,2010-10-02 23:59:59,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2010.2.Sydney.11656,11656,Sydney,0.0,13590,2010.2.Sydney,2010.2.Sydney.11656,2010-04-04 23:59:59,2010-04-01,2010-10-02 23:59:59,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [87]:
# Build the featuretools EntitySet with the match entity as its base.
# Entities created from data frames: matches, team_matches, player_matches.
# Entities normalised out of those: teams, venues, years, round_numbers, players.

es = ft.EntitySet('Matches')

# Match entity
es = es.entity_from_dataframe(
    entity_id='matches',
    dataframe=match_df,
    index='match_id',
    # Most of the fixture data is known at the beginning of the season, but not all,
    # so setting it to the start of the round simplifies things
    time_index='round_start_date',
    variable_types={
        'venue_city': vtypes.Categorical,
        'venue_lat_long': vtypes.LatLong,
        'date': vtypes.Datetime,
        'venue': vtypes.Categorical,
        'year': vtypes.Ordinal,
        'round_type': vtypes.Categorical,
        'round_number': vtypes.Ordinal,
    },
)

# TeamMatch entity
es = es.entity_from_dataframe(
    entity_id='team_matches',
    dataframe=team_match_df,
    index='team_match_id',
    # Same reasoning as for matches: rows become visible at the start of the round
    time_index='round_start_date',
    variable_types={
        'at_home': vtypes.Boolean,
        'team': vtypes.Categorical,
        'home_city': vtypes.Categorical,
        'prev_ladder_position': vtypes.Ordinal,
        'home_lat_long': vtypes.LatLong,
    },
    secondary_time_index={
        # prev_ladder_position is listed only under end_of_round. It used to appear
        # under both keys (ladder_position is in MATCH_COLS), which left it ambiguous
        # which timestamp governs it; end_of_round is never earlier than end_of_day.
        'end_of_day': ['prev_' + col for col in MATCH_COLS if col != 'ladder_position'],
        'end_of_round': ['prev_ladder_position'],
    },
)

# Relationship between matches and team matches
es = es.add_relationship(
    ft.Relationship(es['matches']['match_id'], es['team_matches']['match_id'])
)

# Team entity
es.normalize_entity('team_matches', 'teams', 'team',
                    make_time_index=False,
                    make_secondary_time_index=False,
                    additional_variables=['home_city', 'home_lat_long'])

# Venue entity
es.normalize_entity('matches', 'venues', 'venue',
                    make_time_index=False,
                    make_secondary_time_index=False,
                    additional_variables=['venue_city', 'venue_lat_long'])

# Add year entity
es.normalize_entity('matches', 'years', 'year', make_time_index=False, make_secondary_time_index=False)

# Add round_number entity
es.normalize_entity('matches', 'round_numbers', 'round_number',
                    additional_variables=['round_type'],
                    make_time_index=False,
                    make_secondary_time_index=False)

# PlayerMatch entity
es = es.entity_from_dataframe(
    entity_id='player_matches',
    # Use player_match_df, not player_df: only player_match_df has the prev_* columns
    # named in secondary_time_index below. player_df still holds the current-match
    # stats, which would be exposed at round_start_date, i.e. before the match.
    dataframe=player_match_df,
    index='player_team_match_id',
    # Same reasoning as for matches: rows become visible at the start of the round
    time_index='round_start_date',
    variable_types={
        'playing_for': vtypes.Categorical,
    },
    secondary_time_index={
        'end_of_day': ['prev_' + col for col in PLAYER_MATCH_COLS],
        'end_of_season': ['brownlow_votes'],
    },
)

# Relationship between team matches and player matches
es = es.add_relationship(
    ft.Relationship(es['team_matches']['team_match_id'], es['player_matches']['team_match_id'])
)

# Add player entity
es.normalize_entity('player_matches', 'players', 'player_id', make_time_index=False)

es

Entityset: Matches
  Entities:
    matches [Rows: 1208, Columns: 9]
    team_matches [Rows: 2416, Columns: 18]
    teams [Rows: 18, Columns: 3]
    venues [Rows: 19, Columns: 3]
    years [Rows: 6, Columns: 1]
    round_numbers [Rows: 28, Columns: 2]
    player_matches [Rows: 53152, Columns: 30]
    players [Rows: 1121, Columns: 1]
  Relationships:
    team_matches.match_id -> matches.match_id
    team_matches.team -> teams.team
    matches.venue -> venues.venue
    matches.year -> years.year
    matches.round_number -> round_numbers.round_number
    player_matches.team_match_id -> team_matches.team_match_id
    player_matches.player_id -> players.player_id

In [88]:
# One cutoff row per team match: the instance id, the time at which features may
# be computed (the start of the round) and the margin column carried alongside.
cutoff_times = (
    es['team_matches'].df
    .loc[:, ['team_match_id', 'round_start_date', 'margin']]
    .rename({'round_start_date': 'cutoff_time'}, axis='columns')
)

cutoff_times

Unnamed: 0,team_match_id,cutoff_time,margin
2010.1.Adelaide,2010.1.Adelaide,2010-03-25,-56
2010.1.Brisbane,2010.1.Brisbane,2010-03-25,32
2010.1.Carlton,2010.1.Carlton,2010-03-25,56
2010.1.Collingwood,2010.1.Collingwood,2010-03-25,36
2010.1.Essendon,2010.1.Essendon,2010-03-25,-31
2010.1.Fremantle,2010.1.Fremantle,2010-03-25,56
2010.1.Geelong,2010.1.Geelong,2010-03-25,31
2010.1.Hawthorn,2010.1.Hawthorn,2010-03-25,56
2010.1.Melbourne,2010.1.Melbourne,2010-03-25,-56
2010.1.North Melbourne,2010.1.North Melbourne,2010-03-25,-14


In [95]:
# Generate features using the constructed entityset.
# features_only=True makes dfs return the feature definitions without computing
# a feature matrix.
dfs_options = dict(
    entityset=es,
    target_entity='team_matches',
    agg_primitives=['sum', 'trend', 'count', 'max', 'min', 'last', 'skew'],
    trans_primitives=[
        'subtract_numeric', 'divide_numeric', 'haversine', 'add_numeric',
        'greater_than', 'less_than', 'month',
    ],
    max_depth=2,
    cutoff_time=cutoff_times,
    cutoff_time_in_index=True,
    n_jobs=-1,
    chunk_size=0.1,
    training_window=ft.Timedelta(2, 'observations', entity='years'),
    features_only=True,
    ignore_entities=['player_matches'],
    ignore_variables={'team_matches': ['match_id', 'margin']},
    verbose=True,
)

features = ft.dfs(**dfs_options)

Built 55453 features


In [96]:
# Display the feature definitions returned by ft.dfs.
features

[<Feature: line_odds>,
 <Feature: win_odds>,
 <Feature: prev_team_behinds>,
 <Feature: prev_team_goals>,
 <Feature: prev_match_points>,
 <Feature: prev_match_result>,
 <Feature: prev_score>,
 <Feature: prev_elo_rating>,
 <Feature: prev_margin>,
 <Feature: at_home>,
 <Feature: team>,
 <Feature: prev_ladder_position>,
 <Feature: prev_elo_rating - prev_team_behinds>,
 <Feature: prev_margin - prev_team_behinds>,
 <Feature: prev_elo_rating - prev_match_points>,
 <Feature: prev_match_result - prev_team_behinds>,
 <Feature: line_odds - prev_margin>,
 <Feature: prev_elo_rating - prev_margin>,
 <Feature: prev_match_result - win_odds>,
 <Feature: prev_margin - prev_score>,
 <Feature: prev_match_points - prev_score>,
 <Feature: line_odds - prev_elo_rating>,
 <Feature: prev_score - prev_team_behinds>,
 <Feature: prev_score - prev_team_goals>,
 <Feature: prev_team_behinds - prev_team_goals>,
 <Feature: line_odds - prev_score>,
 <Feature: prev_team_goals - win_odds>,
 <Feature: prev_team_behinds - w

In [97]:
# NOTE(review): `fm` is not assigned in any cell visible in this notebook; the
# ft.dfs call runs with features_only=True and returns only feature definitions.
# Presumably `fm` is a feature matrix left over from an earlier kernel run -
# confirm, and create it explicitly before this cell so Run All works.
fm.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 2416 entries, (2010.1.Adelaide, 2010-03-25 00:00:00) to (2015.27.West Coast, 2015-10-03 00:00:00)
Columns: 802 entries, team_behinds to teams.COUNT(player_matches)
dtypes: bool(1), float64(788), int32(1), int64(6), object(6)
memory usage: 14.8+ MB


In [38]:
# Show the columns of `fm` whose names match the regex 'matches.'.
# NOTE(review): `fm` is not assigned in any visible cell (see the execution
# counts), and the '.' in the pattern is unescaped, so it matches any character
# rather than only a literal dot - confirm both are intended.
fm.filter(regex='matches.')

Unnamed: 0_level_0,Unnamed: 1_level_0,SUM(player_matches.kicks),SUM(player_matches.marks),SUM(player_matches.handballs),SUM(player_matches.goals),SUM(player_matches.behinds),SUM(player_matches.hit_outs),SUM(player_matches.tackles),SUM(player_matches.rebounds),SUM(player_matches.inside_50s),SUM(player_matches.clearances),...,teams.SUM(player_matches.brownlow_votes),teams.SUM(player_matches.contested_possessions),teams.SUM(player_matches.uncontested_possessions),teams.SUM(player_matches.contested_marks),teams.SUM(player_matches.marks_inside_50),teams.SUM(player_matches.one_percenters),teams.SUM(player_matches.bounces),teams.SUM(player_matches.goal_assists),teams.SUM(player_matches.time_on_ground),teams.COUNT(player_matches)
team_match_id,time,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2010.1.Adelaide,2010-03-25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,22
2010.1.Brisbane,2010-03-25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,22
2010.1.Carlton,2010-03-25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,22
2010.1.Collingwood,2010-03-25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,22
2010.1.Essendon,2010-03-25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,22
2010.1.Fremantle,2010-03-25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,22
2010.1.Geelong,2010-03-25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,22
2010.1.Hawthorn,2010-03-25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,22
2010.1.Melbourne,2010-03-25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,22
2010.1.North Melbourne,2010-03-25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,22


In [34]:
# List the primitives featuretools offers and show the transform ones by name.
prim = ft.primitives.list_primitives()
prim.loc[prim['type'].eq('transform')].sort_values(by='name')

Unnamed: 0,name,type,description
33,absolute,transform,
65,add_numeric,transform,
38,add_numeric_scalar,transform,
40,and,transform,
37,characters,transform,
28,day,transform,
55,days_since,transform,
34,diff,transform,
19,divide_by_feature,transform,
24,divide_numeric,transform,
