# FOOTY TIPPING FINALS TIME

# \#nmfc

Tipping for the finals. 

In [1]:
%matplotlib inline
from IPython.display import Image
import matplotlib.pyplot as plt
import statsmodels.formula.api as sm
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn import svm, tree, ensemble
from sklearn.naive_bayes import GaussianNB
from scipy.optimize import minimize

In [2]:
# Training split: 'prob' is the target; every other column dropped below is
# excluded from the feature matrix.
# NOTE(review): read_pickle executes arbitrary code on load -- only use trusted files.
df_train = pd.read_pickle('afl_train_finals.pkl')
y_train = df_train['prob']
X_train = df_train.drop(
    ['mid_x', 'season', 'round', 'tid1_loc_h', 'tid1_loc_a', 'win_tid', 'prob', 'week'],
    axis=1,
)

In [3]:
# Cross-validation split, prepared with the same column drops as the training split.
# NOTE(review): read_pickle executes arbitrary code on load -- only use trusted files.
df_cv = pd.read_pickle('afl_cval_finals.pkl')
y_cv = df_cv['prob']
X_cv = df_cv.drop(
    ['mid_x', 'season', 'round', 'tid1_loc_h', 'tid1_loc_a', 'win_tid', 'prob', 'week'],
    axis=1,
)

In [4]:
# Test split: drop the identifier columns, one-hot encode both team-id columns,
# then remove a fixed set of dummy columns.
# NOTE(review): presumably the removed dummies are the ones absent from the
# training features, so that X_test lines up with X_train -- confirm.
df_test = pd.read_pickle('afl_test_finals.pkl')
X_test = (
    pd.get_dummies(df_test.drop(['mid', 'season'], axis=1), columns=["tid1", "tid2"])
    .drop(['tid1_108', 'tid1_109', 'tid2_108', 'tid2_109', 'tid1_114'], axis=1)
)

### the error function

$\textrm{Score} = - \frac{1}{n} \sum_{i=1}^n \left[ y_i \log(\hat{y}_i) + (1 - y_i) \log(1 - \hat{y}_i)\right]$

In [5]:
def error_function(predicted, actual, eps=1e-12):
    """
    Mean binary cross-entropy (log loss) of predicted probabilities.

    Parameters
    ----------
    predicted : array-like of float
        Predicted probabilities of the positive outcome.
    actual : array-like
        Observed outcomes (1 for the positive outcome, 0 otherwise); must
        have the same shape as `predicted`.
    eps : float, optional
        Predictions are clipped to [eps, 1 - eps] before taking logs, so a
        prediction of exactly 0 or 1 gives a large finite penalty instead of
        inf/nan.

    Returns
    -------
    float
        -(1/n) * sum(y * log(p) + (1 - y) * log(1 - p))

    Raises
    ------
    ValueError
        If the inputs are empty or their shapes differ.
    """
    predicted = np.asarray(predicted, dtype=float)
    actual = np.asarray(actual, dtype=float)
    if predicted.shape != actual.shape:
        raise ValueError("predicted and actual must have the same shape")
    if predicted.size == 0:
        raise ValueError("cannot compute the error of an empty prediction set")
    # Without clipping, log(0) is -inf and 0 * -inf is nan.
    clipped = np.clip(predicted, eps, 1.0 - eps)
    log_likelihood = actual * np.log(clipped) + (1.0 - actual) * np.log(1.0 - clipped)
    return float(-np.mean(log_likelihood))

# Collecting predictions for stacking model

Plan: use various models and stack!

### 1 a) SVM
Using Radial Basis Functions

In [6]:
# RBF-kernel SVM. probability=True calibrates probabilities with an internal,
# randomly shuffled cross-validation, so random_state is pinned to make the
# probabilities (and everything stacked on them) reproducible across runs.
clf_1a = svm.SVC(kernel='rbf', probability=True, random_state=0)
clf_1a.fit(X_train, y_train)
# Column 1 of predict_proba is the probability of the second class in clf_1a.classes_.
pred_prob_1a = clf_1a.predict_proba(X_train)[:, 1]
pred_1a = clf_1a.predict(X_train)
# In-sample log loss.
e_1a = error_function(pred_prob_1a, y_train)
e_1a

0.5225713332765326

In [7]:
# Score the RBF SVM on the cross-validation split.
pred_1a_cv = clf_1a.predict(X_cv)
pred_prob_1a_cv = clf_1a.predict_proba(X_cv)[:, 1]
e_1a_cv = error_function(predicted=pred_prob_1a_cv, actual=y_cv)
e_1a_cv

0.5225713332765326

In [8]:
# RBF SVM outputs on the test features: hard labels and second-column probabilities.
pred_1a_test = clf_1a.predict(X_test)
pred_prob_1a_test = clf_1a.predict_proba(X_test)[:, 1]

### 1 b) SVM
Using a linear kernel (the code below sets `kernel='linear'`, `C=10`)

In [9]:
# Linear-kernel SVM with C=10. probability=True calibrates probabilities with
# an internal, randomly shuffled cross-validation, so random_state is pinned to
# make the probabilities reproducible across runs.
clf_1b = svm.SVC(kernel='linear', probability=True, C=10, random_state=0)
clf_1b.fit(X_train, y_train)
# Column 1 of predict_proba is the probability of the second class in clf_1b.classes_.
pred_prob_1b = clf_1b.predict_proba(X_train)[:, 1]
pred_1b = clf_1b.predict(X_train)
# In-sample log loss.
e_1b = error_function(pred_prob_1b, y_train)
e_1b

0.51407769436861728

In [10]:
# Score the linear SVM on the cross-validation split.
pred_1b_cv = clf_1b.predict(X_cv)
pred_prob_1b_cv = clf_1b.predict_proba(X_cv)[:, 1]
e_1b_cv = error_function(predicted=pred_prob_1b_cv, actual=y_cv)
e_1b_cv

0.51407769436861728

In [11]:
# Linear SVM outputs on the test features: hard labels and second-column probabilities.
pred_1b_test = clf_1b.predict(X_test)
pred_prob_1b_test = clf_1b.predict_proba(X_test)[:, 1]

### 1 c) SVM
Using polynomial kernel

In [12]:
# Polynomial-kernel SVM with C=1. probability=True calibrates probabilities
# with an internal, randomly shuffled cross-validation, so random_state is
# pinned to make the probabilities reproducible across runs.
clf_1c = svm.SVC(kernel='poly', probability=True, C=1, random_state=0)
clf_1c.fit(X_train, y_train)
# Column 1 of predict_proba is the probability of the second class in clf_1c.classes_.
pred_prob_1c = clf_1c.predict_proba(X_train)[:, 1]
pred_1c = clf_1c.predict(X_train)
# In-sample log loss.
e_1c = error_function(pred_prob_1c, y_train)
e_1c

0.58425826462208352

In [13]:
# Score the polynomial SVM on the cross-validation split.
pred_1c_cv = clf_1c.predict(X_cv)
pred_prob_1c_cv = clf_1c.predict_proba(X_cv)[:, 1]
e_1c_cv = error_function(predicted=pred_prob_1c_cv, actual=y_cv)
e_1c_cv

0.58425826462208352

In [14]:
# Polynomial SVM outputs on the test features: hard labels and second-column probabilities.
pred_1c_test = clf_1c.predict(X_test)
pred_prob_1c_test = clf_1c.predict_proba(X_test)[:, 1]

### 2. Logistic regression

In [15]:
# Logistic regression with library-default settings, fitted on the training split.
clf_2 = LogisticRegression().fit(X_train, y_train)
pred_2 = clf_2.predict(X_train)
pred_prob_2 = clf_2.predict_proba(X_train)[:, 1]
# In-sample log loss.
e_2 = error_function(predicted=pred_prob_2, actual=y_train)
e_2

0.44919257536801244

In [16]:
# Score the logistic regression on the cross-validation split.
pred_2_cv = clf_2.predict(X_cv)
pred_prob_2_cv = clf_2.predict_proba(X_cv)[:, 1]
e_2_cv = error_function(predicted=pred_prob_2_cv, actual=y_cv)
e_2_cv

0.44919257536801244

In [17]:
# Logistic-regression outputs on the test features.
pred_2_test = clf_2.predict(X_test)
pred_prob_2_test = clf_2.predict_proba(X_test)[:, 1]

### 3. Decision trees
**3 a) Gini criterion for tree split**

In [18]:
# Depth-2 decision tree split on the Gini criterion.
clf_3a = tree.DecisionTreeClassifier(criterion="gini", max_depth=2).fit(X_train, y_train)
pred_3a = clf_3a.predict(X_train)
pred_prob_3a = clf_3a.predict_proba(X_train)[:, 1]
# In-sample log loss.
e_3a = error_function(predicted=pred_prob_3a, actual=y_train)
e_3a

0.55373021734176409

In [19]:
# Score the Gini tree on the cross-validation split.
pred_3a_cv = clf_3a.predict(X_cv)
pred_prob_3a_cv = clf_3a.predict_proba(X_cv)[:, 1]
e_3a_cv = error_function(predicted=pred_prob_3a_cv, actual=y_cv)
e_3a_cv

0.55373021734176409

In [20]:
# Gini-tree outputs on the test features.
pred_3a_test = clf_3a.predict(X_test)
pred_prob_3a_test = clf_3a.predict_proba(X_test)[:, 1]

**3 b) Entropy criterion for tree split**

In [21]:
# Depth-2 decision tree split on the entropy criterion.
clf_3b = tree.DecisionTreeClassifier(criterion="entropy", max_depth=2).fit(X_train, y_train)
pred_3b = clf_3b.predict(X_train)
pred_prob_3b = clf_3b.predict_proba(X_train)[:, 1]
# In-sample log loss.
e_3b = error_function(predicted=pred_prob_3b, actual=y_train)
e_3b

0.55373021734176409

In [22]:
# Score the entropy tree on the cross-validation split.
pred_3b_cv = clf_3b.predict(X_cv)
pred_prob_3b_cv = clf_3b.predict_proba(X_cv)[:, 1]
e_3b_cv = error_function(predicted=pred_prob_3b_cv, actual=y_cv)
e_3b_cv

0.55373021734176409

In [23]:
# Entropy-tree outputs on the test features.
pred_3b_test = clf_3b.predict(X_test)
pred_prob_3b_test = clf_3b.predict_proba(X_test)[:, 1]

**3 c) Random tree classifier, Gini criterion**

In [24]:
# Depth-2 extra-tree classifier (Gini). The estimator gets no random_state of
# its own, so the global NumPy seed set here must stay immediately before the fit.
np.random.seed(0)
clf_3c = tree.ExtraTreeClassifier(criterion="gini", max_depth=2).fit(X_train, y_train)
pred_prob_3c = clf_3c.predict_proba(X_train)[:, 1]
pred_3c = clf_3c.predict(X_train)
# In-sample log loss.
e_3c = error_function(predicted=pred_prob_3c, actual=y_train)
e_3c

0.63785542036103249

In [25]:
# Score the Gini extra-tree on the cross-validation split.
pred_3c_cv = clf_3c.predict(X_cv)
pred_prob_3c_cv = clf_3c.predict_proba(X_cv)[:, 1]
e_3c_cv = error_function(predicted=pred_prob_3c_cv, actual=y_cv)
e_3c_cv

0.63785542036103249

In [26]:
# Gini extra-tree outputs on the test features.
pred_3c_test = clf_3c.predict(X_test)
pred_prob_3c_test = clf_3c.predict_proba(X_test)[:, 1]

**3 d) Random tree classifier, Entropy criterion**

In [27]:
# Depth-2 extra-tree classifier (entropy). The estimator gets no random_state
# of its own, so the global NumPy seed set here must stay immediately before the fit.
np.random.seed(2)
clf_3d = tree.ExtraTreeClassifier(criterion="entropy", max_depth=2).fit(X_train, y_train)
raw_prob_3d = clf_3d.predict_proba(X_train)[:, 1]
# Replace exact zeros so the log in error_function stays finite.
# NOTE(review): only exact 0.0 is replaced; a probability of exactly 1.0 is left as-is.
pred_prob_3d = np.where(raw_prob_3d == 0., 1e-12, raw_prob_3d)
pred_3d = clf_3d.predict(X_train)
# In-sample log loss.
e_3d = error_function(predicted=pred_prob_3d, actual=y_train)
e_3d

0.65265442530169082

In [28]:
# Score the entropy extra-tree on the cross-validation split.
pred_3d_cv = clf_3d.predict(X_cv)
raw_prob_3d_cv = clf_3d.predict_proba(X_cv)[:, 1]
# Replace exact zeros so the log in error_function stays finite.
# NOTE(review): only exact 0.0 is replaced; a probability of exactly 1.0 is left as-is.
pred_prob_3d_cv = np.where(raw_prob_3d_cv == 0., 1e-12, raw_prob_3d_cv)
e_3d_cv = error_function(predicted=pred_prob_3d_cv, actual=y_cv)
e_3d_cv

0.65265442530169082

In [29]:
# Entropy extra-tree outputs on the test features, with exact-zero
# probabilities replaced by 1e-12 as in the train and CV cells.
pred_3d_test = clf_3d.predict(X_test)
raw_prob_3d_test = clf_3d.predict_proba(X_test)[:, 1]
pred_prob_3d_test = np.where(raw_prob_3d_test == 0., 1e-12, raw_prob_3d_test)

### 4. Gradient Boosting

In [30]:
# Gradient boosting: 100 depth-1 trees (stumps), learning rate 1.0, fixed seed.
clf_4 = ensemble.GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=1.0,
    max_depth=1,
    random_state=0,
)
clf_4.fit(X_train, y_train)
pred_prob_4 = clf_4.predict_proba(X_train)[:, 1]
# In-sample log loss.
e_4 = error_function(predicted=pred_prob_4, actual=y_train)
e_4

0.18152995458055599

In [31]:
# Score the gradient-boosting model on the cross-validation split.
pred_prob_4_cv = clf_4.predict_proba(X_cv)[:, 1]
e_4_cv = error_function(predicted=pred_prob_4_cv, actual=y_cv)
e_4_cv

0.18152995458055599

In [32]:
# Gradient-boosting probabilities (second predict_proba column) on the test features.
pred_prob_4_test = clf_4.predict_proba(X_test)[:, 1]

## Ensembling

**'Unweighted' average**

In [33]:
# Stack the in-sample probabilities: one row per base model, one column per sample.
X_ensemble = np.array((
    pred_prob_1a, pred_prob_1b, pred_prob_1c,
    pred_prob_2,
    pred_prob_3a, pred_prob_3b, pred_prob_3c, pred_prob_3d,
    pred_prob_4,
))
# Plain average over the base models for each sample.
votes = np.array([per_sample.mean() for per_sample in X_ensemble.T])
error_function(votes, y_train)

0.48164235725882465

**Average weighted by error function**

In [34]:
# Weight each base model by the inverse of its training log loss, normalised to
# sum to one. After this cell `errors` holds the inverse errors, not the raw ones.
errors = 1. / np.array((e_1a, e_1b, e_1c, e_2, e_3a, e_3b, e_3c, e_3d, e_4))
weights = errors / sum(errors)
# Weighted average of the rows of X_ensemble.
votes = weights.dot(X_ensemble)
error_function(votes, y_train)

0.41235229143956087

**Logistic regression**

In [35]:
# Stacking model: logistic regression (C=0.1) on the base-model probabilities.
# X_ensemble is (models, samples), so it is transposed to (samples, models).
# NOTE(review): the base models were fitted on these same training rows, so
# this in-sample error is likely optimistic.
clf_ensemble = LogisticRegression(C=0.1).fit(X_ensemble.T, y_train)
pred_clf_ensemble = clf_ensemble.predict_proba(X_ensemble.T)[:, 1]
error_function(predicted=pred_clf_ensemble, actual=y_train)

0.46509651719785838

*Cross validation set*

In [36]:
# Base-model probabilities on the CV split, stacked in the same model order
# used to fit clf_ensemble.
X_ensemble_cv = np.array((
    pred_prob_1a_cv, pred_prob_1b_cv, pred_prob_1c_cv,
    pred_prob_2_cv,
    pred_prob_3a_cv, pred_prob_3b_cv, pred_prob_3c_cv, pred_prob_3d_cv,
    pred_prob_4_cv,
))
pred_clf_ensemble_cv = clf_ensemble.predict_proba(X_ensemble_cv.T)[:, 1]
error_function(predicted=pred_clf_ensemble_cv, actual=y_cv)

0.46509651719785838

In [37]:
# Base-model probabilities on the test split, stacked in the same model order
# used to fit clf_ensemble; the stacker's output is what gets submitted.
X_ensemble_test = np.array((
    pred_prob_1a_test, pred_prob_1b_test, pred_prob_1c_test,
    pred_prob_2_test,
    pred_prob_3a_test, pred_prob_3b_test, pred_prob_3c_test, pred_prob_3d_test,
    pred_prob_4_test,
))
pred_clf_ensemble_test = clf_ensemble.predict_proba(X_ensemble_test.T)[:, 1]

### Submission

In [100]:
# Build the submission frame: match id plus the stacked model's probability.
# .copy() detaches the frame from df_test, so adding a column neither raises
# SettingWithCopyWarning nor risks writing through to df_test.
output = df_test[['mid']].copy()
output.columns = ['id']
# Assigned positionally: pred_clf_ensemble_test is in df_test row order.
output['prob'] = pred_clf_ensemble_test
# DataFrame.sort was removed from pandas; sort_values (pandas >= 0.17) replaces it.
output = output.sort_values('id')

In [101]:
# Write the submission file without the DataFrame index column.
output.to_csv(path_or_buf="sportsbet_submission_finals_doupe.csv", index=False)

In [102]:
# Team-id lookup table, displayed so the '<tid>_<tid>' match ids below can be read.
teams = pd.read_csv(filepath_or_buffer='data/teams.csv')
teams

Unnamed: 0,tid,tname
0,101,Adelaide
1,102,Brisbane Lions
2,103,Carlton
3,104,Collingwood
4,105,Essendon
5,106,Fremantle
6,107,Geelong
7,108,Gold Coast
8,109,Greater Western Sydney
9,110,Hawthorn


In [103]:
# print() with a single string behaves the same on Python 2 and Python 3;
# the original print statement is a SyntaxError on Python 3.
# NOTE(review): ids look like '<tid1>_<tid2>'; 'prob' is presumably the
# probability that the first-listed team wins -- confirm.
print("Hawthorn to beat West Coast")
output[output['id'] == '110_117']

Hawthorn to beat West Coast


Unnamed: 0,id,prob
2750,110_117,0.733884


In [104]:
# print() with a single string behaves the same on Python 2 and Python 3;
# the original print statement is a SyntaxError on Python 3.
print("Fremantle to beat Sydney")
output[output['id'] == '106_116']

Fremantle to beat Sydney


Unnamed: 0,id,prob
2806,106_116,0.561514


In [105]:
# print() with a single string behaves the same on Python 2 and Python 3;
# the original print statement is a SyntaxError on Python 3.
print("Adelaide to beat Footscray")
output[output['id'] == '101_118']

Adelaide to beat Footscray


Unnamed: 0,id,prob
2850,101_118,0.512975


In [106]:
# print() with a single string behaves the same on Python 2 and Python 3;
# the original print statement is a SyntaxError on Python 3.
print("Richmond to beat North :(")
output[output['id'] == '112_114']

Richmond to beat North :(


Unnamed: 0,id,prob
2731,112_114,0.406917


Week Two

In [107]:
# print() with a single string behaves the same on Python 2 and Python 3;
# the original print statement is a SyntaxError on Python 3.
print("West Coast to beat Adelaide")
output[output['id'] == '101_117']

West Coast to beat Adelaide


Unnamed: 0,id,prob
2876,101_110,0.324636


In [108]:
# print() with a single string behaves the same on Python 2 and Python 3;
# the original print statement is a SyntaxError on Python 3.
print("Sydney to beat Richmond")
output[output['id'] == '114_116']

Sydney to beat Richmond


Unnamed: 0,id,prob
2813,112_116,0.347445


Week Three

In [109]:
# print() with a single string behaves the same on Python 2 and Python 3;
# the original print statement is a SyntaxError on Python 3.
print("Fremantle to beat West Coast")
output[output['id'] == '106_117']

Fremantle to beat West Coast


Unnamed: 0,id,prob
2877,106_110,0.474647


In [110]:
# print() with a single string behaves the same on Python 2 and Python 3;
# the original print statement is a SyntaxError on Python 3.
# NOTE(review): the message names Sydney, but the queried id is '110_112';
# other cells in this notebook pair Sydney with team id 116 -- confirm which
# of the message or the id is intended.
print("Hawthorn to beat Sydney")
output[output['id'] == '110_112']

Hawthorn to beat Sydney


Unnamed: 0,id,prob
2749,112_117,0.360976


Grand Final

In [52]:
# Grand Final row of the submission (match id '106_110').
output.loc[output['id'] == '106_110']

Unnamed: 0,id,prob
2877,106_110,0.470295


### Hawthorn are premiers. Bullshit :(