# Pipeline

1. Preprocessing
2. Data Extraction
3. Data Exploration
4. **Model**

This file initializes the model and makes predictions too.

# Imports

In [58]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split

In [90]:
# Get feature dataframe from pickle file
df = pd.read_pickle('feature_frame.pkl')

In [3]:
df.sample(5)

Unnamed: 0,HomeID,AwayID,FTHG,FTAG,FTR,H_WIN_PCT_home,H_DRAW_PCT_home,A_WIN_PCT_home,A_DRAW_PCT_home,H_WIN_PCT_away,...,H_WIN_PCT_league,DRAW_PCT_league,TEAM_CNT_league,GD_STD_league,RND_CNT_league,H_RTG_home,A_RTG_home,H_RTG_away,A_RTG_away,EGD
2649,24,9,0,0,0,0.472222,0.222222,0.333333,0.25641,0.513514,...,0.455863,0.251647,20,1.190396,38,0.0,-0.035899,0.0,-0.119665,0.096196
9579,74,65,2,0,1,0.552632,0.263158,0.5,0.289474,0.210526,...,0.437337,0.244125,20,1.198641,38,-0.16114,-0.045249,-0.048342,-0.150829,-0.008921
5023,43,42,0,2,2,0.484848,0.242424,0.285714,0.257143,0.558824,...,0.465686,0.23366,18,1.237795,34,0.0,-0.117725,0.0,-0.392417,0.351468
10065,108,113,3,0,1,0.863636,0.045455,0.666667,0.208333,0.608696,...,0.508658,0.227273,20,1.320991,38,0.355858,-0.001664,0.106757,-0.005547,0.31834
4390,50,40,2,1,1,0.411765,0.294118,0.272727,0.242424,0.235294,...,0.451987,0.253311,18,1.298297,34,0.215888,-0.196777,0.064766,-0.655924,0.834622


# Extracting target variables and features

In [91]:
# If True, the goal difference for a specific game is seen as the target variable.
# (e.g. -3 for a game outcome of 1:4, or 2 for 3:1) .
#
# If False, we just want to predict the winner.
# 1 = Home team wins, 0 = Draw, 2 = Away team wins
predict_goal_difference = False

if predict_goal_difference:
    y = df['FTHG'] - df['FTAG']
else:
    y = df['FTR']

X = df.iloc[:,5:]    # Remove unnecessary columns (IDs etc.) from features

# Ranked Probability Score (RPS)

In [92]:
def rps(pred, actual_value, r=3):
    '''Returns the ranked probability score for a single given game.
    (see Hubacek paper for formula)
    
    Arguments:
    pred -- predicted results; in vector form (e.g. [0.1, 0.6, 0.3])
    actual_value -- actual result (0, 1 or 2); not in vector form yet
    r -- number of categories (3 for football)
    '''
    value_vec = [0, 0, 0]
    
    # Bring value_vec into 1, 0, 2 order
    if actual_value == 0:
        value_vec[1] = 1
    elif actual_value == 2:
        value_vec[0] == 1
    elif actual_value == 1:
        value_vec[2] == 1
    else:
        raise Exception('Prediction was not in [1, 0, 2].')
    #value_vec = [0, 0, 1]
    
    #pred[0], pred[1] = pred[1], pred[0]
    
    pred[0], pred[1], pred[2] = pred[2], pred[0], pred[1]   # order: loss, draw, win
    
    #print(pred)
    
    rps = 0
    
    for i in range(0, r-1):    # r-1 becomes r because of the exclusion of range()
        bracket_part = 0
        for j in range(0, i+1):    # same for i and i+1
            bracket_part += pred[j] - value_vec[j]
        
        rps += np.square(bracket_part)
    
    rps *= 1 / (r - 1)
    
    return rps

# Model fit

In [138]:
# Apparently the random states are pretty important. 21 works very well on RPS and test accuracy, 16 only on accuracy.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.222, random_state=21)

## Find best value for *n_estimators*

In [162]:
# We start with n_estimators=50...
xgb_cl = xgb.XGBClassifier(objective='multi:softmax', n_estimators=50, seed=16)
# ...and then validate it to find the lowest loss.
op = xgb_cl.fit(X_train, y_train, early_stopping_rounds=900, eval_metric='mlogloss', eval_set=[(X_test, y_test)], verbose=True)

[0]	validation_0-mlogloss:1.05159
[1]	validation_0-mlogloss:1.02520
[2]	validation_0-mlogloss:1.00897
[3]	validation_0-mlogloss:0.99943
[4]	validation_0-mlogloss:0.99461
[5]	validation_0-mlogloss:0.98913
[6]	validation_0-mlogloss:0.98577
[7]	validation_0-mlogloss:0.98409
[8]	validation_0-mlogloss:0.98478
[9]	validation_0-mlogloss:0.98430
[10]	validation_0-mlogloss:0.98471
[11]	validation_0-mlogloss:0.98532
[12]	validation_0-mlogloss:0.98546
[13]	validation_0-mlogloss:0.98485
[14]	validation_0-mlogloss:0.98523
[15]	validation_0-mlogloss:0.98631
[16]	validation_0-mlogloss:0.98643
[17]	validation_0-mlogloss:0.98745
[18]	validation_0-mlogloss:0.98898
[19]	validation_0-mlogloss:0.99127
[20]	validation_0-mlogloss:0.99189
[21]	validation_0-mlogloss:0.99234
[22]	validation_0-mlogloss:0.99355
[23]	validation_0-mlogloss:0.99401
[24]	validation_0-mlogloss:0.99533
[25]	validation_0-mlogloss:0.99560
[26]	validation_0-mlogloss:0.99618
[27]	validation_0-mlogloss:0.99740
[28]	validation_0-mlogloss:0.9

## Use best parameters to fit the model

In [216]:
# n_estimators=8 had the lowest loss, so we overwrite the previous model.b
xgb_cl = xgb.XGBClassifier(objective='multi:softprob', n_estimators=8, seed=16, learning_rate=0.28)

In [194]:
# TODO: Grid Search mit LearningRates

In [217]:
xgb_cl.fit(X_train, y_train)





XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.28, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=8, n_jobs=8, num_parallel_tree=1,
              objective='multi:softprob', random_state=16, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=None, seed=16, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

# Evaluation

In [218]:
preds = xgb_cl.predict(X_test)
accuracy = float(np.sum(preds == y_test))/y_test.shape[0]

print(f'accuracy: {accuracy}')

accuracy: 0.5299411561093804


In [219]:
# Looking at train set accuracy to get an intuition of how much the model overfits
preds = xgb_cl.predict(X_train)
accuracy = float(np.sum(preds == y_train))/y_train.shape[0]

print(f'accuracy on train set: {accuracy}')

accuracy on train set: 0.6172083374493728


In [146]:
# Predictions in form of [0.2, 0.5, 0.3] in the order of draw, home win, away win.
# This is because it gets ordered like 0, 1, 2.
proba_preds = xgb_cl.predict_proba(X_test)

# List of RPS scores for every game in the test set.
# Important to use iloc for y_test, otherwise indices would be wrong
rps_list = [rps(pred, y_test.iloc[i]) for i, pred in enumerate(proba_preds)]

# Average ranked probability score.
np.mean(rps_list)

0.20503750632251014