#Footy tipping! 

Predicting a whole season's games

In [1]:
%matplotlib inline
from IPython.display import Image
import matplotlib.pyplot as plt
import statsmodels.formula.api as sm
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn import svm, tree, ensemble
from sklearn.naive_bayes import GaussianNB
from scipy.optimize import minimize

In [2]:
df_train = pd.read_pickle('afl_train_full_season.pkl')
df_train = df_train[df_train['season']>=2001]
X_train = df_train.drop(['mid', 'win_tid', 'tid1_points', 'tid2_points', 
                         'h_tid', 'h_score', 'a_tid', 'a_score', 'margin', 
                         'tid1_score', 'tid2_score', 'prob'], axis=1)
y_train = df_train['prob']

In [3]:
df_cv = pd.read_pickle('afl_cval_full_season.pkl')
X_cv = df_cv.drop(['mid', 'win_tid', 'tid1_points', 'tid2_points', 'h_tid', 
                   'h_score', 'a_tid', 'a_score', 'margin', 'tid1_score', 
                   'tid2_score', 'prob'], axis=1)
y_cv = df_cv['prob']

In [4]:
df_test = pd.read_pickle('afl_test_full_season.pkl')
X_test = df_test.drop(['mid', 'win_tid', 'tid1_points', 'tid2_points', 
                       'h_tid', 'h_score', 'a_tid', 'a_score', 'margin', 
                       'tid1_score', 'tid2_score', 'prob'], axis=1)
y_test = df_test['prob']

### the error function

$\textrm{Score} = - \frac{1}{n} \sum_{i=1}^n \left[ y_i \log(\hat{y}_i) + (1 - y_i) \log(1 - \hat{y}_i)\right]$

In [5]:
def error_function(predicted, actual):
    """
    Computes the error function given error probabilities (predicted)
    and actual win loss
    """
    a = actual * np.log(predicted) 
    b = (1 - actual) * np.log(1-predicted)
    n = float(len(a))
    return - (sum(a) + sum(b)) / n

# Collecting predictions for stacking model

Plan: Logistic regression stacking of a bunch of models
+ SVM
+ Logistic regression
+ Decision trees
+ Gradient boosting


###1 a) SVM
Using Radial Basis Functions

In [6]:
clf_1a = svm.SVC(kernel='rbf', probability = True, C=1)
clf_1a.fit(X_train, y_train)
pred_prob_1a = clf_1a.predict_proba(X_train)[:,1]
pred_1a = clf_1a.predict(X_train)
e_1a = error_function(pred_prob_1a, y_train)

In [7]:
pred_prob_1a_cv = clf_1a.predict_proba(X_cv)[:,1]
pred_1a_cv = clf_1a.predict(X_cv)
e_1a_cv = error_function(pred_prob_1a_cv, y_cv)
e_1a_cv

0.60331954558542222

In [8]:
pred_prob_1a_test = clf_1a.predict_proba(X_test)[:,1]
pred_1a_test = clf_1a.predict(X_test)

### 1 b) SVM
Using sigmoid kernel

In [9]:
clf_1b = svm.SVC(kernel='linear', probability = True, C=1)
clf_1b.fit(X_train, y_train)
pred_prob_1b = clf_1b.predict_proba(X_train)[:,1]
pred_1b = clf_1b.predict(X_train)
e_1b = error_function(pred_prob_1b, y_train)

In [10]:
pred_prob_1b_cv = clf_1b.predict_proba(X_cv)[:,1]
pred_1b_cv = clf_1b.predict(X_cv)
e_1b_cv = error_function(pred_prob_1b_cv, y_cv)
e_1b_cv

0.63971137356905983

In [11]:
pred_prob_1b_test = clf_1b.predict_proba(X_test)[:,1]
pred_1b_test = clf_1b.predict(X_test)

###1 c) SVM
Using polynomial kernel

In [12]:
clf_1c = svm.SVC(kernel='poly', probability = True, C=1)
clf_1c.fit(X_train, y_train)
pred_prob_1c = clf_1c.predict_proba(X_train)[:,1]
pred_1c = clf_1c.predict(X_train)
e_1c = error_function(pred_prob_1c, y_train)

In [13]:
pred_prob_1c_cv = clf_1c.predict_proba(X_cv)[:,1]
pred_1c_cv = clf_1c.predict(X_cv)
e_1c_cv = error_function(pred_prob_1c_cv, y_cv)
e_1c_cv

0.62993112115170669

In [14]:
pred_prob_1c_test = clf_1c.predict_proba(X_test)[:,1]
pred_1c_test = clf_1c.predict(X_test)

###2. Logistic regression

In [15]:
clf_2 = LogisticRegression()
clf_2.fit(X_train, y_train)
pred_prob_2 = clf_2.predict_proba(X_train)[:,1]
pred_2 = clf_2.predict(X_train)
e_2 = error_function(pred_prob_2, y_train)

In [16]:
pred_prob_2_cv = clf_2.predict_proba(X_cv)[:,1]
pred_2_cv = clf_2.predict(X_cv)
e_2_cv = error_function(pred_prob_2_cv, y_cv)
e_2_cv

0.62252653755696608

In [17]:
pred_prob_2_test = clf_2.predict_proba(X_test)[:,1]
pred_2_test = clf_2.predict(X_test)

###3. Decision trees
**3 a) Gini criterion for tree split**

In [18]:
clf_3a = tree.DecisionTreeClassifier(criterion="gini", max_depth=3)
clf_3a.fit(X_train, y_train)
pred_prob_3a = clf_3a.predict_proba(X_train)[:,1]
pred_3a = clf_3a.predict(X_train)
e_3a = error_function(pred_prob_3a, y_train)

In [19]:
pred_prob_3a_cv = clf_3a.predict_proba(X_cv)[:,1]
pred_3a_cv = clf_3a.predict(X_cv)
e_3a_cv = error_function(pred_prob_3a_cv, y_cv)
e_3a_cv

0.64884466569673893

In [20]:
pred_prob_3a_test = clf_3a.predict_proba(X_test)[:,1]
pred_3a_test = clf_3a.predict(X_test)

** 3 b) Entropy criterion for tree split**

In [21]:
clf_3b = tree.DecisionTreeClassifier(criterion="entropy", max_depth=2)
clf_3b.fit(X_train, y_train)
pred_prob_3b = clf_3b.predict_proba(X_train)[:,1]
pred_3b = clf_3b.predict(X_train)
e_3b = error_function(pred_prob_3b, y_train)

In [22]:
pred_prob_3b_cv = clf_3b.predict_proba(X_cv)[:,1]
pred_3b_cv = clf_3b.predict(X_cv)
e_3b_cv = error_function(pred_prob_3b_cv, y_cv)
e_3b_cv

0.65708510890766691

In [23]:
pred_prob_3b_test = clf_3b.predict_proba(X_test)[:,1]
pred_3b_test = clf_3b.predict(X_test)

**3 c) Random tree classifier, Gini criterion**

In [24]:
np.random.seed(96)
clf_3c = tree.ExtraTreeClassifier(criterion="gini", max_depth=2)
clf_3c.fit(X_train, y_train)
pred_prob_3c = clf_3c.predict_proba(X_train)[:,1]
pred_3c = clf_3c.predict(X_train)
e_3c = error_function(pred_prob_3c, y_train)

In [25]:
pred_prob_3c_cv = clf_3c.predict_proba(X_cv)[:,1]
pred_3c_cv = clf_3c.predict(X_cv)
e_3c_cv = error_function(pred_prob_3c_cv, y_cv)
e_3c_cv

0.68191260332709591

In [26]:
pred_prob_3c_test = clf_3c.predict_proba(X_test)[:,1]
pred_3c_test = clf_3c.predict(X_test)

**3 d) Random tree classifier, Entropy criterion**

In [27]:
clf_3d = tree.ExtraTreeClassifier(criterion="entropy", max_depth=2)
clf_3d.fit(X_train, y_train)
pred_prob_3d = clf_3d.predict_proba(X_train)[:,1]
pred_3d = clf_3d.predict(X_train)
e_3d = error_function(pred_prob_3d, y_train)

In [28]:
pred_prob_3d_cv = clf_3d.predict_proba(X_cv)[:,1]
pred_3d_cv = clf_3d.predict(X_cv)
e_3d_cv = error_function(pred_prob_3d_cv, y_cv)
e_3d_cv

0.68710318687022387

In [29]:
pred_prob_3d_test = clf_3d.predict_proba(X_test)[:,1]
pred_3d_test = clf_3d.predict(X_test)

###4 Gradient Boosting

In [30]:
clf_4 = ensemble.GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,
                                 max_depth=1, random_state=0).fit(X_train, y_train)
pred_prob_4 = clf_4.predict_proba(X_train)[:,1]
e_4 = error_function(pred_prob_4, y_train)

In [31]:
pred_prob_4_cv = clf_4.predict_proba(X_cv)[:,1]
e_4_cv = error_function(pred_prob_4_cv, y_cv)
e_4_cv

0.58598735475438057

In [32]:
pred_prob_4_test = clf_4.predict_proba(X_test)[:,1]

###Ensembling!

**'Unweighted' average**

In [33]:
X_ensemble = np.array((pred_prob_1a, pred_prob_1b, pred_prob_1c, pred_prob_2, 
                       pred_prob_3a, pred_prob_3b, pred_prob_3c, pred_prob_3d,
                       pred_prob_4))
X_ensemble_cv = np.array((pred_prob_1a_cv, pred_prob_1b_cv, pred_prob_1c_cv, pred_prob_2_cv, 
                       pred_prob_3a_cv, pred_prob_3b_cv, pred_prob_3c_cv, pred_prob_3d_cv,
                       pred_prob_4_cv))
votes = np.array([np.mean(X_ensemble_cv[:,i]) for i in range(len(X_ensemble_cv.T))])
error_function(votes, y_cv)

0.61757261157246202

**Average weighted by error function**

In [34]:
errors = np.array((e_1a_cv, e_1b_cv, e_1c_cv, e_2_cv, 
                   e_3a_cv, e_3b_cv, e_3c_cv, e_3d_cv, e_4_cv))
errors = 1./errors
weights = errors / sum(errors)
votes = np.dot(weights, X_ensemble_cv)
error_function(votes, y_cv)

0.61619663722006801

**Logistic regression**

In [35]:
clf_ensemble = LogisticRegression()
clf_ensemble.fit(X_ensemble.T, y_train)
pred_clf_ensemble = clf_ensemble.predict_proba(X_ensemble.T)[:,1]

In [36]:
pred_clf_ensemble_cv = clf_ensemble.predict_proba(X_ensemble_cv.T)[:,1]
error_function(pred_clf_ensemble_cv, y_cv)

0.57976222488686113

In [37]:
X_ensemble_test = np.array((pred_prob_1a_test, pred_prob_1b_test, pred_prob_1c_test, pred_prob_2_test, 
                       pred_prob_3a_test, pred_prob_3b_test, pred_prob_3c_test, pred_prob_3d_test,
                         pred_prob_4_test))
pred_clf_ensemble_test = clf_ensemble.predict_proba(X_ensemble_test.T)[:,1]

In [38]:
error_function(pred_clf_ensemble_test, y_test)

0.52206774987716575

###Submission

In [89]:
df_submission = pd.DataFrame()

# Extract the `rid` column (ie. <round>_<tid1>_<tid2>) from the `mid` column of the test set.
df_submission["rid"] = df_test["mid"].str.slice(5)

# Assign the predicted `tid1` win probabilities to the `prob` column.
df_submission["prob"] = pred_clf_ensemble_test

In [92]:
df_submission.shape

(197, 2)

In [310]:
df_submission.to_csv("sportsbet_submission_doupe.csv", index=False)