# Pipeline

1. Preprocessing
2. Data Extraction
3. Data Exploration
4. **Model**

This file initializes the model and makes predictions too.

# Imports

In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

In [2]:
# Get feature dataframe from pickle file
df = pd.read_pickle('feature_frame.pkl')

In [3]:
df.sample(5)

Unnamed: 0,HomeID,AwayID,FTHG,FTAG,FTR,H_WIN_PCT_home,H_DRAW_PCT_home,A_WIN_PCT_home,A_DRAW_PCT_home,H_WIN_PCT_away,...,A_RTG_away,EGD,EPTS_PR_home,EPTS_PR_away,RED_home,RED_away,H_ST_home,H_ST_away,A_ST_home,A_ST_away
4694,47,42,2,1,1,0.606061,0.212121,0.371429,0.228571,0.571429,...,0.0623,-0.156056,1.0,0.25,2,0,16.242424,19.114286,11.885714,14.939394
8867,63,77,1,0,1,0.368421,0.236842,0.184211,0.236842,0.394737,...,-0.279358,0.419571,1.25,1.25,0,0,12.5,12.552632,10.552632,11.157895
10031,102,101,2,2,0,0.52381,0.333333,0.136364,0.181818,0.5,...,-0.713681,0.76614,0.5,2.0,0,0,14.52381,13.363636,11.545455,11.666667
3274,9,29,2,2,0,0.657895,0.263158,0.512821,0.205128,0.351351,...,0.027107,-0.007425,0.25,0.25,0,1,14.868421,11.0,12.076923,9.175
7714,81,65,0,1,2,0.702703,0.216216,0.5,0.263158,0.289474,...,0.118904,-0.073598,1.0,1.0,0,0,17.135135,10.868421,13.552632,9.810811


# Extracting target variables and features

# _WICHTIG: Das hier muss auf neue columns angepasst werden (z.B. season)_

In [3]:
# If True, the goal difference for a specific game is seen as the target variable.
# (e.g. -3 for a game outcome of 1:4, or 2 for 3:1) .
#
# If False, we just want to predict the winner.
# 1 = Home team wins, 0 = Draw, 2 = Away team wins
predict_goal_difference = False

if predict_goal_difference:
    y = df['FTHG'] - df['FTAG']
else:
    y = df['FTR']

X = df.iloc[:,5:]    # Remove unnecessary columns (IDs etc.) from features

# Ranked Probability Score (RPS)

In [4]:
def rps(pred, actual_value, r=3):
    '''Returns the ranked probability score for a single given game.
    (see Hubacek paper for formula)
    
    Arguments:
    pred -- predicted results; in vector form (e.g. [0.1, 0.6, 0.3])
    actual_value -- actual result (0, 1 or 2); not in vector form yet
    r -- number of categories (3 for football)
    '''
    value_vec = [0, 0, 0]
    
    # Bring value_vec into 1, 0, 2 order
    if actual_value == 0:
        value_vec[1] = 1
    elif actual_value == 2:
        value_vec[0] == 1
    elif actual_value == 1:
        value_vec[2] == 1
    else:
        raise Exception('Prediction was not in [1, 0, 2].')
    #value_vec = [0, 0, 1]
    
    #pred[0], pred[1] = pred[1], pred[0]
    
    pred[0], pred[1], pred[2] = pred[2], pred[0], pred[1]   # order: loss, draw, win
    
    #print(pred)
    
    rps = 0
    
    for i in range(0, r-1):    # r-1 becomes r because of the exclusion of range()
        bracket_part = 0
        for j in range(0, i+1):    # same for i and i+1
            bracket_part += pred[j] - value_vec[j]
        
        rps += np.square(bracket_part)
    
    rps *= 1 / (r - 1)
    
    return rps


# To be used as eval_metric parameter
def rps_eval_metric(y_true, y_pred):
    return rps(y_pred, y_true)

# Model fit

In [5]:
# Apparently the random states are pretty important. 21 works very well on RPS and test accuracy, 16 only on accuracy.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.222, random_state=20)

## Find best value for *n_estimators*

In [13]:
# We start with n_estimators=50...
xgb_cl = xgb.XGBClassifier(objective='multi:softmax', n_estimators=50, seed=16)
# ...and then validate it to find the lowest loss.
op = xgb_cl.fit(X_train, y_train, early_stopping_rounds=900, eval_metric='mlogloss', eval_set=[(X_test, y_test)])

[0]	validation_0-mlogloss:1.05159




[1]	validation_0-mlogloss:1.02520
[2]	validation_0-mlogloss:1.00897
[3]	validation_0-mlogloss:0.99943
[4]	validation_0-mlogloss:0.99461
[5]	validation_0-mlogloss:0.98913
[6]	validation_0-mlogloss:0.98577
[7]	validation_0-mlogloss:0.98409
[8]	validation_0-mlogloss:0.98478
[9]	validation_0-mlogloss:0.98430
[10]	validation_0-mlogloss:0.98471
[11]	validation_0-mlogloss:0.98532
[12]	validation_0-mlogloss:0.98546
[13]	validation_0-mlogloss:0.98485
[14]	validation_0-mlogloss:0.98523
[15]	validation_0-mlogloss:0.98631
[16]	validation_0-mlogloss:0.98643
[17]	validation_0-mlogloss:0.98745
[18]	validation_0-mlogloss:0.98898
[19]	validation_0-mlogloss:0.99127
[20]	validation_0-mlogloss:0.99189
[21]	validation_0-mlogloss:0.99234
[22]	validation_0-mlogloss:0.99355
[23]	validation_0-mlogloss:0.99401
[24]	validation_0-mlogloss:0.99533
[25]	validation_0-mlogloss:0.99560
[26]	validation_0-mlogloss:0.99618
[27]	validation_0-mlogloss:0.99740
[28]	validation_0-mlogloss:0.99772
[29]	validation_0-mlogloss:0.

In [24]:
learning_rate = [0.001, 0.01, 0.1, 0.2, 0.3]
n_estimators = [4, 8, 16, 32]
param_grid = dict(learning_rate=learning_rate, n_estimators=n_estimators)

In [26]:
# I tried to do Grid Search here, but the 

grid = GridSearchCV(estimator=xgb_cl, param_grid=param_grid, n_jobs=8, cv=3)
grid_result = grid.fit(X, y)

print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))



Best: 0.475635 using {'learning_rate': 0.2, 'n_estimators': 8}


## Use best parameters to fit the model

In [7]:
# n_estimators=8 had the lowest loss, so we overwrite the previous model.b
xgb_cl = xgb.XGBClassifier(objective='multi:softprob', n_estimators=8, seed=16, learning_rate=0.3)

In [8]:
xgb_cl.fit(X_train, y_train)





XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.3, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=8, n_jobs=8, num_parallel_tree=1,
              objective='multi:softprob', random_state=16, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=None, seed=16, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

# Evaluation

In [9]:
preds = xgb_cl.predict(X_test)
accuracy = float(np.sum(preds == y_test))/y_test.shape[0]

print(f'accuracy: {accuracy}')

accuracy: 0.5309795777085496


In [10]:
# Looking at train set accuracy to get an intuition of how much the model overfits
preds = xgb_cl.predict(X_train)
accuracy = float(np.sum(preds == y_train))/y_train.shape[0]

print(f'accuracy on train set: {accuracy}')

accuracy on train set: 0.6137508643682703


In [11]:
# Predictions in form of [0.2, 0.5, 0.3] in the order of draw, home win, away win.
# This is because it gets ordered like 0, 1, 2.
proba_preds = xgb_cl.predict_proba(X_test)

# List of RPS scores for every game in the test set.
# Important to use iloc for y_test, otherwise indices would be wrong
rps_list = [rps(pred, y_test.iloc[i]) for i, pred in enumerate(proba_preds)]

# Average ranked probability score.
np.mean(rps_list)

0.20160546311142438