# Pipeline

1. Preprocessing
2. Data Extraction
3. Data Exploration
4. **Model**

This file initializes the model and makes predictions too.

# Imports

In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

In [2]:
# Get feature dataframe from pickle file
df = pd.read_pickle('feature_frame.pkl')

In [3]:
df.sample(5)

Unnamed: 0,HomeID,AwayID,FTHG,FTAG,FTR,Date,H_WIN_PCT_home,H_DRAW_PCT_home,A_WIN_PCT_home,A_DRAW_PCT_home,...,REL_PTS_1_away,REL_PTS_2_away,REL_PTS_3_away,REL_PTS_4_away,REL_PTS_5_away,REL_PTS_N-0_away,REL_PTS_N-1_away,REL_PTS_N-2_away,REL_PTS_N-3_away,REL_PTS_N-4_away
11984,107,98,0,0,0,2016-11-19,0.815789,0.078947,0.736842,0.131579,...,1.166667,0.833333,0.666667,0.575758,0.416667,-0.69697,-0.515152,-0.424242,-0.424242,-0.424242
5884,61,56,1,1,0,2018-09-02,0.617647,0.176471,0.411765,0.235294,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
84,14,16,2,1,1,2010-10-23,0.5,0.5,0.25,0.25,...,1.444444,1.125,0.75,0.75,0.666667,-0.25,-0.25,0.0,0.0,0.111111
12357,119,106,3,1,1,2017-11-05,0.447368,0.210526,0.263158,0.157895,...,1.818182,1.454545,1.090909,1.0,0.727273,-0.454545,-0.4,-0.2,0.0,0.0
883,4,14,2,4,2,2012-11-24,0.342857,0.285714,0.236842,0.289474,...,0.333333,0.307692,0.0,0.0,-0.384615,-1.333333,-1.230769,-1.181818,-1.083333,-1.0


# Extracting target variables and features

# _WICHTIG: Das hier muss auf neue columns angepasst werden (z.B. season)_

In [130]:
# If True, the goal difference for a specific game is seen as the target variable.
# (e.g. -3 for a game outcome of 1:4, or 2 for 3:1) .
#
# If False, we just want to predict the winner.
# 1 = Home team wins, 0 = Draw, 2 = Away team wins
predict_goal_difference = False

if predict_goal_difference:
    y = df['FTHG'] - df['FTAG']
else:
    y = df['FTR']
    
# Remove unnecessary columns (IDs etc.) from features
X = df.iloc[:,6:].drop(['season', 'GAME_CNT_AFTER_GAME_home', 'GAME_CNT_AFTER_GAME_away', 'league'], axis=1)    
#X['FTR'] = df['FTR']

In [None]:
df.iloc[:,6:].drop(['season', 'GAME_CNT_AFTER_GAME_home', 'GAME_CNT_AFTER_GAME_away'], axis=1).columns

# Ranked Probability Score (RPS)

In [116]:
def rps(pred, actual_value, r=3):
    '''Returns the ranked probability score for a single given game.
    (see Hubacek paper for formula)
    
    Arguments:
    pred -- predicted results; in vector form (e.g. [0.1, 0.6, 0.3])
    actual_value -- actual result (0, 1 or 2); not in vector form yet
    r -- number of categories (3 for football)
    '''
    value_vec = [0, 0, 0]
    
    # Bring value_vec into 1, 0, 2 order
    if actual_value == 0:
        value_vec[1] = 1
    elif actual_value == 2:
        value_vec[2] = 1
    elif actual_value == 1:
        value_vec[0] = 1
    else:
        print(actual_value)
        raise Exception('Prediction was not in [1, 0, 2].')
    #value_vec = [0, 0, 1]
    
    #print(pred)
    #print("pred:", pred, "vec:", value_vec, "actualval:", actual_value)
    pred[0], pred[1], pred[2] = pred[1], pred[0], pred[2]   # order: loss, draw, win
    #print("pred:", pred, "vec:", value_vec, "actualval:", actual_value)
    
    rps = 0
    
    for i in range(0, r-1):
        inner_sum = 0
        for j in range(0, i+1):
            inner_sum += (pred[j] - value_vec[j])
        rps += np.square(inner_sum)
    
    rps /= (r-1)
    
    return rps


# To be used as eval_metric parameter
def rps_eval_metric(y_true, y_pred):
    return rps(y_pred, y_true)

# Model fit

In [138]:
# Apparently the random states are pretty important. 21 works very well on RPS and test accuracy, 16 only on accuracy.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.35, random_state=543)

## Find best value for *n_estimators*

In [None]:
# We start with n_estimators=50...
xgb_cl = xgb.XGBClassifier(objective='multi:softmax', n_estimators=50, seed=16)
# ...and then validate it to find the lowest loss.
op = xgb_cl.fit(X_train, y_train, early_stopping_rounds=900, eval_metric='mlogloss', eval_set=[(X_test, y_test)])

In [24]:
learning_rate = [0.001, 0.01, 0.1, 0.2, 0.3]
n_estimators = [4, 8, 16, 32]
param_grid = dict(learning_rate=learning_rate, n_estimators=n_estimators)

In [26]:
# I tried to do Grid Search here, but the 

grid = GridSearchCV(estimator=xgb_cl, param_grid=param_grid, n_jobs=8, cv=3)
grid_result = grid.fit(X, y)

print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))



Best: 0.475635 using {'learning_rate': 0.2, 'n_estimators': 8}


## Use best parameters to fit the model

In [132]:
# n_estimators=8 had the lowest loss, so we overwrite the previous model.b
xgb_cl = xgb.XGBClassifier(objective='multi:softprob', n_estimators=8, seed=16, learning_rate=0.3)

In [133]:
xgb_cl.fit(X_train, y_train)





XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.3, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=8, n_jobs=8, num_parallel_tree=1,
              objective='multi:softprob', random_state=16, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=None, seed=16, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

# Evaluation

In [139]:
preds = xgb_cl.predict(X_test)
accuracy = float(np.sum(preds == y_test))/y_test.shape[0]

print(f'accuracy: {accuracy}')

accuracy: 0.6594950603732163


In [140]:
# Looking at train set accuracy to get an intuition of how much the model overfits
preds = xgb_cl.predict(X_train)
accuracy = float(np.sum(preds == y_train))/y_train.shape[0]

print(f'accuracy on train set: {accuracy}')

accuracy on train set: 0.6648929880572307


In [141]:
# Predictions in form of [0.2, 0.5, 0.3] in the order of draw, home win, away win.
# This is because it gets ordered like 0, 1, 2.
proba_preds = xgb_cl.predict_proba(X_test)

# List of RPS scores for every game in the test set.
# Important to use iloc for y_test, otherwise indices would be wrong
rps_list = [rps(pred, y_test.iloc[i]) for i, pred in enumerate(proba_preds)]

#proba_preds

#rps(proba_preds[0], y_test.iloc[0])

# Average ranked probability score.
np.mean(rps_list)
#for i, pred in enumerate(proba_preds):
#    print(y_test.iloc[i], pred)

0.145805292347738

In [142]:
sample_list = list()
for i in range(1000):
    if i < 460:
        sample_list.append(1)
    elif i < 753:
        sample_list.append(2)
    else:
        sample_list.append(0)

In [143]:
# Das hier probiert ungefähr geaveragedte priors und zeigt das rps ergebnis.
r_list = []
for i in range(len(y_test)):    
    r_list.append(rps([0.293, 0.46, 0.247], y_test.iloc[i]))
    
np.mean(r_list)

0.22693501591657522