# Pipeline

1. Preprocessing
2. Data Extraction
3. Data Exploration
4. **Model**

This file initializes the model and makes predictions too.

# Imports

In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

In [None]:
# Get feature dataframe from pickle file
# NOTE(review): unpickling can execute arbitrary code — only load
# 'feature_frame.pkl' if it was produced by a trusted source.
df = pd.read_pickle('feature_frame.pkl')

In [12]:
# Display five randomly chosen rows of the feature frame.
df.sample(5)

Unnamed: 0,HomeID,AwayID,FTHG,FTAG,FTR,Date,H_WIN_PCT_home,H_DRAW_PCT_home,A_WIN_PCT_home,A_DRAW_PCT_home,...,REL_PTS_1_away,REL_PTS_2_away,REL_PTS_3_away,REL_PTS_4_away,REL_PTS_5_away,REL_PTS_N-0_away,REL_PTS_N-1_away,REL_PTS_N-2_away,REL_PTS_N-3_away,REL_PTS_N-4_away
1889,10,4,0,0,0,2015-05-20,0.666667,0.25,0.552632,0.157895,...,1.243243,1.027027,0.918919,0.837838,0.648649,-0.216216,-0.108108,-0.054054,0.0,0.0
6121,42,43,2,1,1,2019-04-13,0.685714,0.2,0.424242,0.333333,...,1.147783,1.137931,0.826355,0.719212,0.576355,-0.517241,-0.387931,-0.245074,-0.206897,0.0
11171,104,109,1,1,0,2014-09-28,0.526316,0.289474,0.342105,0.342105,...,0.333333,0.0,0.0,-0.166667,-0.333333,-1.666667,-1.666667,-1.666667,-1.666667,-1.5
12328,122,100,0,0,0,2017-10-15,0.473684,0.184211,0.184211,0.236842,...,1.75,1.25,1.125,1.0,1.0,-0.625,-0.142857,-0.142857,-0.125,0.0
4157,49,51,1,4,2,2012-11-28,0.424242,0.30303,0.205882,0.294118,...,0.714286,0.0,-0.071429,-0.214286,-0.214286,-1.357143,-1.071429,-0.857143,-0.857143,-0.785714


# Extracting target variables and features

# _IMPORTANT: This section must be adapted whenever new columns are added (e.g. season)_

In [17]:
# Target selection switch.
#   True  -> the target is the goal difference of a game (home goals minus
#            away goals), e.g. -3 for a 1:4 result or 2 for a 3:1 result.
#   False -> the target is the match outcome stored in FTR:
#            1 = home team wins, 0 = draw, 2 = away team wins.
predict_goal_difference = False

y = (df['FTHG'] - df['FTAG']) if predict_goal_difference else df['FTR']

# Features: every column from position 6 onwards, minus the bookkeeping
# columns (season, game counters, league) that are not model inputs.
# NOTE(review): the positional slice depends on the column order of df —
# confirm it still holds whenever columns are added.
X = df.iloc[:, 6:].drop(
    columns=['season', 'GAME_CNT_AFTER_GAME_home', 'GAME_CNT_AFTER_GAME_away', 'league'],
)

In [None]:
# Display the names of the columns left after removing the bookkeeping columns.
# NOTE(review): unlike the X assignment, 'league' is not dropped here —
# confirm whether this difference is intended.
df.iloc[:,6:].drop(['season', 'GAME_CNT_AFTER_GAME_home', 'GAME_CNT_AFTER_GAME_away'], axis=1).columns

# Ranked Probability Score (RPS)

In [18]:
def rps(pred, actual_value, r=3):
    '''Return the ranked probability score (RPS) for a single game.

    (See the Hubacek paper for the formula.)

    Arguments:
    pred -- predicted probabilities in label order, i.e. index 0 = draw,
            index 1 = home win, index 2 = away win (e.g. [0.1, 0.6, 0.3]).
            Any indexable sequence works; it is never modified.
    actual_value -- actual result as a label (1 = home win, 0 = draw,
                    2 = away win); not in vector form
    r -- number of categories (3 for football)

    Returns the mean, over the first r-1 categories, of the squared
    difference between the cumulative predicted and the cumulative observed
    probabilities, with the categories ordered home win, draw, away win.

    Raises ValueError if actual_value is not one of 1, 0 or 2.
    '''
    # Position of each outcome label in the ordinal order home win, draw, away win.
    position_of_label = {1: 0, 0: 1, 2: 2}
    if actual_value not in position_of_label:
        raise ValueError(f'Actual value {actual_value!r} was not in [1, 0, 2].')

    # One-hot vector of the observed outcome, in ordinal order.
    observed = [0, 0, 0]
    observed[position_of_label[actual_value]] = 1

    # Reorder into ordinal order on a *copy*: swapping inside `pred` itself
    # would corrupt the caller's data (e.g. a row of a predict_proba array)
    # and make a second call on the same vector return a different score.
    ordered_pred = [pred[1], pred[0], pred[2]]

    score = 0
    cumulative_diff = 0
    for i in range(r - 1):
        # Running sum of (predicted - observed) up to and including category i.
        cumulative_diff += ordered_pred[i] - observed[i]
        score += np.square(cumulative_diff)

    return score / (r - 1)


# Adapter with the (y_true, y_pred) argument order; meant to be passed as
# the eval_metric parameter.
def rps_eval_metric(y_true, y_pred):
    '''Return rps(y_pred, y_true), i.e. the RPS with the arguments swapped.

    NOTE(review): rps is documented as scoring a single game — confirm that
    the caller passes one label and one probability vector, not whole arrays.
    '''
    score = rps(y_pred, y_true)
    return score

# Model fit

In [None]:
# Accumulators for metrics collected over repeated runs:
# test accuracy, train accuracy and mean ranked probability score.
list_test_accu, list_train_accu, list_rs = [], [], []

In [41]:
# Hold out 30% of the games as the test set.
# Apparently the random states are pretty important. 21 works very well on RPS and test accuracy, 16 only on accuracy.
# No random_state is passed below, so the split is not fixed across runs.
# Earlier variant with a fixed split:
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=543)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

## Find best value for *n_estimators*

In [None]:
# We start with n_estimators=50...
xgb_cl = xgb.XGBClassifier(objective='multi:softprob', n_estimators=50, seed=16)
# ...and then validate it to find the lowest loss.
# The held-out test split doubles as the validation set (eval_set) here.
# NOTE(review): early_stopping_rounds=900 is larger than n_estimators=50, so
# early stopping presumably never triggers — confirm the intended value.
# NOTE(review): `op` is not referenced anywhere in the visible part of the file.
op = xgb_cl.fit(X_train, y_train, early_stopping_rounds=900, eval_metric='mlogloss', eval_set=[(X_test, y_test)])

In [None]:
# Candidate values for each hyperparameter explored by the grid search.
n_estimators = [8, 16, 32]
learning_rates = [0.2, 0.3, 0.4]
subsamples = [0.6, 0.8, 0.9, 1]
colsample_bytree = [0.5, 0.75, 0.9, 1]
max_depth = [3, 6, 8, 12]

# Keys are the XGBClassifier keyword names; values are the candidate lists above.
param_grid = {
    'n_estimators': n_estimators,
    'learning_rate': learning_rates,
    'subsample': subsamples,
    'colsample_bytree': colsample_bytree,
    'max_depth': max_depth,
}

In [None]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

In [None]:
# Exhaustive search over param_grid with 3-fold cross-validation.
# The search is fitted on the training split only: fitting it on the full
# X, y would let the held-out test games (used for the evaluation further
# down) influence the hyperparameter choice.
grid = GridSearchCV(estimator=xgb_cl, param_grid=param_grid, n_jobs=8, cv=3)
grid_result = grid.fit(X_train, y_train)

# Best mean cross-validated score and the parameter combination that achieved it.
print(f"Best: {grid_result.best_score_:f} using {grid_result.best_params_}")

## Use best parameters to fit the model

In [42]:
# Overwrite the previous model with the hyperparameters used for the final fit.
# NOTE(review): the earlier comment here said n_estimators=8 had the lowest
# loss, but the values below (n_estimators=64, max_depth=4,
# colsample_bytree=0.6) are not among the grid-search candidates — confirm
# which settings are intended.
# A previous variant of this line additionally passed seed=16.
xgb_cl = xgb.XGBClassifier(objective='multi:softprob', n_estimators=64, learning_rate=0.2, subsample=1, colsample_bytree=0.6, max_depth=4)

In [49]:
# Fit the final model on the training split.
xgb_cl.fit(X_train, y_train)





XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.6, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.2, max_delta_step=0, max_depth=4,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=64, n_jobs=8, num_parallel_tree=1,
              objective='multi:softprob', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=None, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

# Evaluation

In [50]:
# Class predictions for the held-out games, and the share of them that
# match the true labels.
preds = xgb_cl.predict(X_test)
accuracy = float((preds == y_test).sum()) / len(y_test)

print(f'accuracy: {accuracy}')
# Uncomment to collect the score across repeated runs:
#list_test_accu.append(accuracy)

accuracy: 0.6326844262295082


In [45]:
# Accuracy on the training split, as a rough indicator of how much the
# model overfits. Note that `preds` and `accuracy` are overwritten here.
preds = xgb_cl.predict(X_train)
accuracy = float((preds == y_train).sum()) / len(y_train)

print(f'accuracy on train set: {accuracy}')
# Uncomment to collect the score across repeated runs:
#list_train_accu.append(accuracy)

accuracy on train set: 0.7132191480017567


In [32]:
# Class probabilities per test game, e.g. [0.2, 0.5, 0.3]. The columns follow
# the sorted labels 0, 1, 2, i.e. draw, home win, away win.
proba_preds = xgb_cl.predict_proba(X_test)

# One RPS per test game. Iterating y_test yields its values in positional
# order, so every probability row is paired with the label of the same game
# (label-based indexing would pick the wrong rows).
rps_list = [rps(row, outcome) for row, outcome in zip(proba_preds, y_test)]

# Average ranked probability score over the test set.
print(np.mean(rps_list))
# Uncomment to collect the score across repeated runs:
#list_rs.append(np.mean(rps_list))

0.15456079021452532


In [None]:
# Means of the metrics collected over repeated runs, printed in the order
# test accuracy, train accuracy, ranked probability score.
for collected in (list_test_accu, list_train_accu, list_rs):
    print(np.mean(collected))

In [15]:
# league priors -> rps
# Baseline: score every test game with the fixed outcome distribution of its
# league. Each vector is in label order (draw, home win, away win).
# NOTE(review): the original cell was marked "DO NOT RUN AGAIN", presumably
# because X_test no longer contains 'league' — confirm before re-running.
league_priors = {
    'D1': [0.2422, 0.4582, 0.2996],
    'E1': [0.2468, 0.4576, 0.2956],
    'I1': [0.2575, 0.4538, 0.2888],
    'SP1': [0.2371, 0.4781, 0.2848],
}

# 'league' is dropped from X when the features are built, so it is looked up
# in df through the index labels of the test rows instead of in X_test.
# NOTE(review): assumes the index labels of df are unique — confirm.
test_leagues = df.loc[X_test.index, 'league']

r_list = []
for league, outcome in zip(test_leagues, y_test):
    if league not in league_priors:
        raise ValueError(f'No prior probabilities defined for league {league!r}.')
    # Hand over a fresh copy so the lookup table itself is never altered.
    r_list.append(rps(list(league_priors[league]), outcome))

np.mean(r_list)

0.22677309555199798

In [16]:
# global priors -> rps
# Baseline: score every test game with one fixed outcome distribution.
# A fresh list literal is passed for every game.
r_list = [rps([0.2921845, 0.46192225, 0.24589325], outcome) for outcome in y_test]
np.mean(r_list)

0.22746123051232067

## Feature importances

In [56]:
# Print one (feature name, importance) tuple per feature column.
for column_name, importance in zip(X.columns, xgb_cl.feature_importances_):
    print((column_name, importance))

('H_WIN_PCT_home', 0.009758653)
('H_DRAW_PCT_home', 0.005967357)
('A_WIN_PCT_home', 0.01051141)
('A_DRAW_PCT_home', 0.0063104676)
('H_WIN_PCT_away', 0.008923647)
('H_DRAW_PCT_away', 0.0070527554)
('A_WIN_PCT_away', 0.008313777)
('A_DRAW_PCT_away', 0.0072810515)
('H_GS_AVG_home', 0.007245459)
('H_GC_AVG_home', 0.00672335)
('A_GS_AVG_home', 0.006291983)
('A_GC_AVG_home', 0.0053140293)
('H_GS_AVG_away', 0.006189141)
('H_GC_AVG_away', 0.00663039)
('A_GS_AVG_away', 0.005314836)
('A_GC_AVG_away', 0.0070118564)
('H_GS_STD_home', 0.006671467)
('H_GC_STD_home', 0.007369442)
('A_GS_STD_home', 0.007244088)
('A_GC_STD_home', 0.0064879674)
('H_GS_STD_away', 0.006913153)
('H_GC_STD_away', 0.006340397)
('A_GS_STD_away', 0.0058359765)
('A_GC_STD_away', 0.004883841)
('WIN_PCT_home', 0.013500997)
('WIN_PCT_away', 0.013642241)
('DRAW_PCT_home', 0.0074263546)
('DRAW_PCT_away', 0.010306111)
('GS_AVG_home', 0.0057456787)
('GC_AVG_home', 0.007634116)
('GS_AVG_away', 0.006804413)
('GC_AVG_away', 0.007638291)


In [69]:
# Assign every feature, by its column position, to a named feature group.
# Each entry is (exclusive upper index bound, group); the first entry whose
# bound exceeds the index wins, and every index >= 58 falls into
# 'Match importance'.
# NOTE(review): the bounds are hard-coded positions and must stay in sync
# with the column order of X — confirm after changing the feature set.
bucket_upper_bounds = [
    (24, 'Historical strength'),
    (38, 'Current form'),
    (47, 'League'),
    (52, 'Pi-ratings'),
    (54, 'Current form'),
    (58, 'Historical strength'),
]

fbuckets = [
    next((label for upper, label in bucket_upper_bounds if i < upper), 'Match importance')
    for i in range(len(xgb_cl.feature_importances_))
]

In [70]:
# Print one (feature name, importance, feature group) tuple per feature column.
for column_name, importance, bucket in zip(X.columns, xgb_cl.feature_importances_, fbuckets):
    print((column_name, importance, bucket))

('H_WIN_PCT_home', 0.009758653, 'Historical strength')
('H_DRAW_PCT_home', 0.005967357, 'Historical strength')
('A_WIN_PCT_home', 0.01051141, 'Historical strength')
('A_DRAW_PCT_home', 0.0063104676, 'Historical strength')
('H_WIN_PCT_away', 0.008923647, 'Historical strength')
('H_DRAW_PCT_away', 0.0070527554, 'Historical strength')
('A_WIN_PCT_away', 0.008313777, 'Historical strength')
('A_DRAW_PCT_away', 0.0072810515, 'Historical strength')
('H_GS_AVG_home', 0.007245459, 'Historical strength')
('H_GC_AVG_home', 0.00672335, 'Historical strength')
('A_GS_AVG_home', 0.006291983, 'Historical strength')
('A_GC_AVG_home', 0.0053140293, 'Historical strength')
('H_GS_AVG_away', 0.006189141, 'Historical strength')
('H_GC_AVG_away', 0.00663039, 'Historical strength')
('A_GS_AVG_away', 0.005314836, 'Historical strength')
('A_GC_AVG_away', 0.0070118564, 'Historical strength')
('H_GS_STD_home', 0.006671467, 'Historical strength')
('H_GC_STD_home', 0.007369442, 'Historical strength')
('A_GS_STD_hom

In [74]:
from collections import defaultdict

# Sum the feature importances per feature group; a group that has not been
# seen yet starts at 0.
f_impor_perbucket = defaultdict(lambda: 0)
for importance, bucket in zip(xgb_cl.feature_importances_, fbuckets):
    f_impor_perbucket[bucket] += importance

In [75]:
# Display the summed feature importance of every feature group.
f_impor_perbucket

defaultdict(<function __main__.<lambda>()>,
            {'Historical strength': 0.19617299921810627,
             'Current form': 0.12099576368927956,
             'League': 0.04486588528379798,
             'Pi-ratings': 0.03545925905928016,
             'Match importance': 0.6025060405954719})