#Footy tipping! 

Trying to predict a whole season's games!

In [252]:
%matplotlib inline
from IPython.display import Image
import matplotlib.pyplot as plt
import statsmodels.formula.api as sm
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn import svm, tree, ensemble
from sklearn.naive_bayes import GaussianNB
from scipy.optimize import minimize

In [2]:
df_train = pd.read_pickle('afl_train_full_season.pkl')
df_train = df_train[df_train['season']>=2005]
X_train = df_train.drop(['mid', 'win_tid', 'tid1_points', 'tid2_points', 'h_tid', 'h_score', 'a_tid', 'a_score', 'margin', 'tid1_score', 'tid2_score', 'prob'], axis=1)
y_train = df_train['prob']

In [3]:
df_cv = pd.read_pickle('afl_cval_full_season.pkl')
X_cv = df_cv.drop(['mid', 'win_tid', 'tid1_points', 'tid2_points', 'h_tid', 'h_score', 'a_tid', 'a_score', 'margin', 'tid1_score', 'tid2_score', 'prob'], axis=1)
y_cv = df_cv['prob']

In [4]:
df_test = pd.read_pickle('afl_test_full_season.pkl')
X_test = df_test.drop(['mid', 'win_tid', 'tid1_points', 'tid2_points', 'h_tid', 'h_score', 'a_tid', 'a_score', 'margin', 'tid1_score', 'tid2_score', 'prob'], axis=1)
y_test = df_test['prob']

### the error function

$\textrm{Score} = - \frac{1}{n} \sum_{i=1}^n \left[ y_i \log(\hat{y}_i) + (1 - y_i) \log(1 - \hat{y}_i)\right]$

In [267]:
def error_function(predicted, actual):
    """
    Computes the error function given error probabilities (predicted)
    and actual win loss
    """
    a = actual * np.log(predicted) 
    b = (1 - actual) * np.log(1-predicted)
    n = float(len(a))
    return - (sum(a) + sum(b)) / n

# Collecting predictions for stacking model

Plan: Logistic regression stacking of a bunch of models
+ SVM
+ Logistic regression
+ Decision trees
+ Gradient boosting


###1 a) SVM
Using Radial Basis Functions

In [268]:
clf_1a = svm.SVC(kernel='rbf', probability = True)
clf_1a.fit(X_train, y_train)
pred_prob_1a = clf_1a.predict_proba(X_train)[:,1]
pred_1a = clf_1a.predict(X_train)
e_1a = error_function(pred_prob_1a, y_train)
e_1a

0.57528301180941399

In [271]:
pred_prob_1a_cv = clf_1a.predict_proba(X_cv)[:,1]
pred_1a_cv = clf_1a.predict(X_cv)
e_1a_cv = error_function(pred_prob_1a_cv, y_cv)
e_1a_cv

0.55209356059127834

In [272]:
pred_prob_1a_test = clf_1a.predict_proba(X_test)[:,1]
pred_1a_test = clf_1a.predict(X_test)
e_1a_test = error_function(pred_prob_1a_test, y_test)

### 1 b) SVM
Using sigmoid kernel

In [273]:
clf_1b = svm.SVC(kernel='linear', probability = True, C=1)
clf_1b.fit(X_train, y_train)
pred_prob_1b = clf_1b.predict_proba(X_train)[:,1]
pred_1b = clf_1b.predict(X_train)
e_1b = error_function(pred_prob_1b, y_train)
e_1b

0.61115802051649348

In [274]:
pred_prob_1b_cv = clf_1b.predict_proba(X_cv)[:,1]
pred_1b_cv = clf_1b.predict(X_cv)
e_1b_cv = error_function(pred_prob_1b_cv, y_cv)
e_1b_cv

0.59508313314165762

In [275]:
pred_prob_1b_test = clf_1b.predict_proba(X_test)[:,1]
pred_1b_test = clf_1b.predict(X_test)
e_1b_test = error_function(pred_prob_1b_test, y_test)

###1 c) SVM
Using polynomial kernel

In [276]:
clf_1c = svm.SVC(kernel='poly', probability = True, C=1)
clf_1c.fit(X_train, y_train)
pred_prob_1c = clf_1c.predict_proba(X_train)[:,1]
pred_1c = clf_1c.predict(X_train)
e_1c = error_function(pred_prob_1c, y_train)
e_1c

0.60158875954925506

In [277]:
pred_prob_1c_cv = clf_1c.predict_proba(X_cv)[:,1]
pred_1c_cv = clf_1c.predict(X_cv)
e_1c_cv = error_function(pred_prob_1c_cv, y_cv)
e_1c_cv

0.56219470129535276

In [278]:
pred_prob_1c_test = clf_1c.predict_proba(X_test)[:,1]
pred_1c_test = clf_1c.predict(X_test)
e_1c_test = error_function(pred_prob_1c_test, y_test)

###2. Logistic regression

In [279]:
clf_2 = LogisticRegression()
clf_2.fit(X_train, y_train)
pred_prob_2 = clf_2.predict_proba(X_train)[:,1]
pred_2 = clf_2.predict(X_train)
e_2 = error_function(pred_prob_2, y_train)
e_2

0.59388641132894315

In [281]:
pred_prob_2_cv = clf_2.predict_proba(X_cv)[:,1]
pred_2_cv = clf_2.predict(X_cv)
e_2_cv = error_function(pred_prob_2_cv, y_cv)
e_2_cv

0.55965521757691283

In [283]:
pred_prob_2_test = clf_2.predict_proba(X_test)[:,1]
pred_2_test = clf_2.predict(X_test)
e_2_test = error_function(pred_prob_2_test, y_test)

###3. Decision trees
**3 a) Gini criterion for tree split**

In [284]:
clf_3a = tree.DecisionTreeClassifier(criterion="gini", max_depth=3)
clf_3a.fit(X_train, y_train)
pred_prob_3a = clf_3a.predict_proba(X_train)[:,1]
pred_3a = clf_3a.predict(X_train)
e_3a = error_function(pred_prob_3a, y_train)
e_3a

0.61877907174943358

In [285]:
pred_prob_3a_cv = clf_3a.predict_proba(X_cv)[:,1]
pred_3a_cv = clf_3a.predict(X_cv)
e_3a_cv = error_function(pred_prob_3a_cv, y_cv)
e_3a_cv

0.59994216362392883

In [286]:
pred_prob_3a_test = clf_3a.predict_proba(X_test)[:,1]
pred_3a_test = clf_3a.predict(X_test)
e_3a_test = error_function(pred_prob_3a_test, y_test)

** 3 b) Entropy criterion for tree split**

In [287]:
clf_3b = tree.DecisionTreeClassifier(criterion="entropy", max_depth=2)
clf_3b.fit(X_train, y_train)
pred_prob_3b = clf_3b.predict_proba(X_train)[:,1]
pred_3b = clf_3b.predict(X_train)
e_3b = error_function(pred_prob_3b, y_train)
e_3b

0.63822908936327138

In [288]:
pred_prob_3b_cv = clf_3b.predict_proba(X_cv)[:,1]
pred_3b_cv = clf_3b.predict(X_cv)
e_3b_cv = error_function(pred_prob_3b_cv, y_cv)
e_3b_cv

0.61812184838629691

In [289]:
pred_prob_3b_test = clf_3b.predict_proba(X_test)[:,1]
pred_3b_test = clf_3b.predict(X_test)
e_3b_test = error_function(pred_prob_3b_test, y_test)

**3 c) Random tree classifier, Gini criterion**

In [290]:
np.random.seed(96)
clf_3c = tree.ExtraTreeClassifier(criterion="gini", max_depth=2)
clf_3c.fit(X_train, y_train)
pred_prob_3c = clf_3c.predict_proba(X_train)[:,1]
pred_3c = clf_3c.predict(X_train)
e_3c = error_function(pred_prob_3c, y_train)
e_3c

0.67396786310877366

In [291]:
pred_prob_3c_cv = clf_3c.predict_proba(X_cv)[:,1]
pred_3c_cv = clf_3c.predict(X_cv)
e_3c_cv = error_function(pred_prob_3c_cv, y_cv)
e_3c_cv

0.67846329788740345

In [292]:
pred_prob_3c_test = clf_3c.predict_proba(X_test)[:,1]
pred_3c_test = clf_3c.predict(X_test)
e_3c_test = error_function(pred_prob_3c_test, y_test)

**3 d) Random tree classifier, Entropy criterion**

In [293]:
clf_3d = tree.ExtraTreeClassifier(criterion="entropy", max_depth=2)
clf_3d.fit(X_train, y_train)
pred_prob_3d = clf_3d.predict_proba(X_train)[:,1]
pred_3d = clf_3d.predict(X_train)
e_3d = error_function(pred_prob_3d, y_train)
e_3d

0.68237128936823932

In [294]:
pred_prob_3d_cv = clf_3d.predict_proba(X_cv)[:,1]
pred_3d_cv = clf_3d.predict(X_cv)
e_3d_cv = error_function(pred_prob_3d_cv, y_cv)
e_3d_cv

0.6784441434664551

In [295]:
pred_prob_3d_test = clf_3d.predict_proba(X_test)[:,1]
pred_3d_test = clf_3d.predict(X_test)
e_3d_test = error_function(pred_prob_3d_test, y_test)

###4 Gradient Boosting

In [296]:
clf_4 = ensemble.GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,
                                 max_depth=1, random_state=0).fit(X_train, y_train)
pred_prob_4 = clf_4.predict_proba(X_train)[:,1]
e_4 = error_function(pred_prob_4, y_train)
e_4

0.55097543743183608

In [298]:
pred_prob_4_cv = clf_4.predict_proba(X_cv)[:,1]
e_4_cv = error_function(pred_prob_4_cv, y_cv)
e_4_cv

0.52676526590485706

In [299]:
pred_prob_4_test = clf_4.predict_proba(X_test)[:,1]
e_4_test = error_function(pred_prob_4_test, y_test)

###Ensembling!

**'Unweighted' average**

In [300]:
X_ensemble = np.array((pred_prob_1a, pred_prob_1b, pred_prob_1c, pred_prob_2, 
                       pred_prob_3a, pred_prob_3b, pred_prob_3c, pred_prob_3d,
                       pred_prob_4))
votes = np.array([np.mean(X_ensemble[:,i]) for i in range(len(X_ensemble.T))])
error_function(votes, y_train)

0.59543234039186743

**Average weighted by error function**

In [301]:
errors = np.array((e_1a, e_1b, e_1c, e_2, e_3a, e_3b, e_3c, e_3d, e_4))
errors = 1./errors
weights = errors / sum(errors)
votes = np.dot(weights, X_ensemble)
error_function(votes, y_train)

0.59307566902074249

**Logistic regression**

In [302]:
clf_ensemble = LogisticRegression()
clf_ensemble.fit(X_ensemble.T, y_train)
pred_clf_ensemble = clf_ensemble.predict_proba(X_ensemble.T)[:,1]
error_function(pred_clf_ensemble, y_train)

0.5404689821407217

*Cross validation set*

In [304]:
X_ensemble_cv = np.array((pred_prob_1a_cv, pred_prob_1b_cv, pred_prob_1c_cv, pred_prob_2_cv, 
                       pred_prob_3a_cv, pred_prob_3b_cv, pred_prob_3c_cv, pred_prob_3d_cv,
                         pred_prob_4_cv))
pred_clf_ensemble_cv = clf_ensemble.predict_proba(X_ensemble_cv.T)[:,1]
error_function(pred_clf_ensemble_cv, y_cv)

0.51325083239646618

In [306]:
X_ensemble_test = np.array((pred_prob_1a_test, pred_prob_1b_test, pred_prob_1c_test, pred_prob_2_test, 
                       pred_prob_3a_test, pred_prob_3b_test, pred_prob_3c_test, pred_prob_3d_test,
                         pred_prob_4_test))
pred_clf_ensemble_test = clf_ensemble.predict_proba(X_ensemble_test.T)[:,1]

###Submission

In [308]:
df_submission = pd.DataFrame()

# Extract the `rid` column (ie. <round>_<tid1>_<tid2>) from the `mid` column of the test set.
df_submission["rid"] = df_test["mid"].str.slice(5)

# Assign the predicted `tid1` win probabilities to the `prob` column.
df_submission["prob"] = pred_clf_ensemble_test

In [309]:
df_submission

Unnamed: 0,rid,prob
0,R01_103_114,0.270309
1,R01_108_111,0.359224
2,R01_105_116,0.120215
3,R01_102_104,0.287823
4,R01_117_118,0.539268
5,R01_109_115,0.644048
6,R01_101_112,0.518249
7,R01_106_113,0.587261
8,R01_107_110,0.175839
9,R02_103_117,0.137694


In [310]:
df_submission.to_csv("sportsbet_submission_doupe.csv", index=False)