# Footy tipping!

We'll start by using a logistic regression to tip footy games.

We want to tip for the **whole season**. No updating mid year!


In [149]:
# Render our plots inline
%matplotlib inline
from IPython.display import Image
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Make the graphs a bit prettier.
# The pandas option 'display.mpl_style' is deprecated and absent from later pandas
# releases, where set_option() rejects it. matplotlib's own style sheets are the
# replacement; 'ggplot' is used here as the closest built-in look.
plt.style.use('ggplot')
# Set after the style sheet so the figure size is not overridden by it.
plt.rcParams['figure.figsize'] = (15, 5)
# Limit the size of the dataframe html output in the ipython notebook.
pd.set_option("display.max_rows", 10)
pd.set_option("display.max_columns", 10)
# Loading the data (paths are relative to the notebook's working directory)
teams = pd.read_csv('data/teams.csv')
seasons = pd.read_csv('data/seasons.csv')
unplayed = pd.read_csv('data/unplayed.csv')

So the variables we want are:
+ Team id
+ Last year's percentage for both teams
+ Last year's points for both teams

In [150]:
# Simplifying the data set: keep only the match columns used in the rest of the notebook.
# .copy() makes df_train an independent frame, so the column assignments in later
# cells modify df_train itself rather than a slice of `seasons`
# (avoids SettingWithCopyWarning and silently lost writes).
df_train = seasons[[
    "mid", 'season', 'round', 'tid1_loc', 'tid1', 'tid2', 'win_tid',
    'h_tid', 'h_score', 'a_tid', 'a_score', 'margin',
]].copy()

In [151]:
# Number of matches in the training frame.
sLength = len(df_train['mid'])
# Create the per-team score columns, zero-filled; they are populated in the next cell.
for score_col in ('tid1_score', 'tid2_score'):
    df_train.loc[:, score_col] = 0

In [152]:
# Boolean masks: is tid1 the home side / the away side of each match?
home_is_tid1 = (df_train['h_tid'] == df_train['tid1'])
away_is_tid1 = (df_train['a_tid'] == df_train['tid1'])

# .loc replaces the .ix indexer, which no longer exists in current pandas.
# The right-hand sides are restricted with the same mask so each assignment
# copies exactly the rows it targets.
# tid1's own score: the home score when tid1 is at home, the away score otherwise.
df_train.loc[home_is_tid1, 'tid1_score'] = df_train.loc[home_is_tid1, 'h_score']
df_train.loc[away_is_tid1, 'tid1_score'] = df_train.loc[away_is_tid1, 'a_score']
# tid2's score is the other side of the same match.
df_train.loc[away_is_tid1, 'tid2_score'] = df_train.loc[away_is_tid1, 'h_score']
df_train.loc[home_is_tid1, 'tid2_score'] = df_train.loc[home_is_tid1, 'a_score']

In [153]:
# Calculating last year's percentage (points scored / points conceded) per (season, team).
# A team can appear as tid1 in some matches and as tid2 in others, so every total
# is the sum of a "listed as tid1" part and a "listed as tid2" part.
by_season_tid1 = df_train.groupby(['season', 'tid1'])
by_season_tid2 = df_train.groupby(['season', 'tid2'])

# Points scored BY each team. Selecting the column before .sum() avoids
# aggregating every other (including non-numeric) column of df_train.
tid1_scores = by_season_tid1['tid1_score'].sum()
tid2_scores = by_season_tid2['tid2_score'].sum()
scores_for = pd.concat([tid1_scores, tid2_scores], axis=1)
# fillna(0): a (season, team) pair missing from one side contributes nothing from it.
scores_for['scores_for'] = scores_for['tid1_score'].fillna(0) + scores_for['tid2_score'].fillna(0)

# Points scored AGAINST each team: the opponent's score in the same matches.
tid1_game_tid2_opponent_score = by_season_tid1['tid2_score'].sum()
tid2_game_tid1_opponent_score = by_season_tid2['tid1_score'].sum()
scores_against = pd.concat([tid1_game_tid2_opponent_score, tid2_game_tid1_opponent_score], axis=1)
# Fix: the original added tid2_score to itself, so points conceded in matches
# where the team was listed as tid2 (the tid1_score column) were never counted.
scores_against['scores_against'] = (
    scores_against['tid2_score'].fillna(0) + scores_against['tid1_score'].fillna(0)
)

last_year = pd.concat([scores_for[['scores_for']], scores_against[['scores_against']]], axis=1)
last_year['percentage'] = last_year['scores_for'] / last_year['scores_against']
# Only the ratio is needed downstream; rows are indexed by (season, team id).
last_year = last_year[['percentage']]

In [154]:
# Keep only the seasons we model. `last_year` was computed before this filter,
# so it still holds the earlier seasons needed for the look-ups below.
# .copy() gives an independent frame, so the assignments below write to df_train
# itself (no SettingWithCopyWarning).
df_train = df_train[df_train['season'] >= 2010].copy()
# Float placeholders, overwritten below with each team's previous-season percentage.
df_train['tid1_last_season_percentage'] = 0.0
df_train['tid2_last_season_percentage'] = 0.0

teams_list = teams['tid'].unique()
# Positions 7 and 8 of teams_list are dropped; presumably these are the expansion
# clubs 108 (Gold Coast) and 109 (GWS) handled separately below -- TODO confirm
# against data/teams.csv.
teams_list_pre_2011 = np.delete(teams_list, [7, 8])
teams_list_pre_2012 = np.delete(teams_list, [8])
# The original read `df.season`, but no `df` is defined anywhere in this notebook
# (NameError on a fresh kernel); df_train is assumed to be the intended frame.
seasons_list = df_train['season'].unique()
base_season = seasons_list[0]


def set_last_season_percentage(frame, season, team, value):
    """Store `value` as `team`'s last-season percentage for every match in `season`.

    Writes to 'tid1_last_season_percentage' on rows where the team is tid1 and to
    'tid2_last_season_percentage' on rows where it is tid2. Modifies `frame` in place.
    """
    in_season = frame['season'] == season
    for side in ('tid1', 'tid2'):
        frame.loc[in_season & (frame[side] == team), side + '_last_season_percentage'] = value


# Established clubs: look up the previous season's percentage for every modelled year.
for year in range(2010, 2015 + 1):
    for team in teams_list_pre_2011:
        set_last_season_percentage(
            df_train, year, team, last_year.loc[(year - 1, team), 'percentage'])

# Bloody Gold Coast (108) and bloody GWS (109): there is no previous season to look up
# in their first year (2011 and 2012 respectively), so that year gets a neutral
# percentage of 1; later years use the normal look-up.
for team, debut in ((108, 2011), (109, 2012)):
    set_last_season_percentage(df_train, debut, team, 1)
    for year in range(debut + 1, 2015 + 1):
        set_last_season_percentage(
            df_train, year, team, last_year.loc[(year - 1, team), 'percentage'])

A column indicating whether *tid1* won

In [155]:
def get_tid1_prob(r):
    """Return the training label for one match row.

    `r` is any mapping-like row exposing "win_tid" and "tid1".
    Returns 1.0 when the winning team id equals tid1, and 0.0 for every other
    value of "win_tid".
    """
    return 1.0 if r["win_tid"] == r["tid1"] else 0.0

# Target column: 1.0 where tid1 won the match, 0.0 otherwise.
# Vectorised form of applying get_tid1_prob to each row: same values, but no
# Python-level loop over rows, and it still yields a float column on an empty frame.
df_train["prob"] = (df_train["win_tid"] == df_train["tid1"]).astype(float)

Generating dummy variables for each team, and for whether tid1 played at home or away

In [156]:
# One-hot encode both team ids and tid1's location; all other columns pass through unchanged.
categorical_cols = ["tid1", "tid2", "tid1_loc"]
df_train_dum = pd.get_dummies(df_train, columns=categorical_cols)

Extracting the round from the data

In [157]:
# Drop the first character of each round label and convert the rest to an integer.
# The dtype guard makes the cell safe to re-run: once the column is already integer,
# the .str accessor would raise, so the conversion is skipped.
if df_train["round"].dtype.kind not in "iu":
    df_train["round"] = df_train["round"].str.slice(1).astype(int)

Developing the training, cross validation and test data

(The test data will need to be added after round 23)

In [158]:
# Constructing the training, validation and hold-out sets by season:
#   train = seasons up to and including 2013, validation = 2014, hold-out = 2015.
season_of_match = df_train["season"]
is_cv = season_of_match == 2014
is_test = season_of_match == 2015
is_train = season_of_match <= 2013

df_cv = df_train[is_cv].reset_index(drop=True)
df_test = df_train[is_test].reset_index(drop=True)
# df_train is overwritten last because the two splits above read from the full frame.
# NOTE: re-running this cell afterwards would leave df_cv and df_test empty.
df_train = df_train[is_train].reset_index(drop=True)

In [159]:
# Persist the three splits as pickles in the working directory.
# DataFrame.save no longer exists in pandas; to_pickle is its replacement and
# writes to the same paths.
df_train.to_pickle('afl_train_full_season')
df_cv.to_pickle('afl_cval_full_season')
df_test.to_pickle('afl_test_full_season')

