# Pipeline

1. Preprocessing
2. Data Extraction
3. **Model**

This file initializes the model and makes predictions too.

# Imports

In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split

In [29]:
# Get feature dataframe from pickle file
df = pd.read_pickle('feature_frame.pkl')

In [30]:
df.sample(5)

Unnamed: 0,HomeID,AwayID,FTHG,FTAG,FTR,H_WIN_PCT_home,H_DRAW_PCT_home,A_WIN_PCT_home,A_DRAW_PCT_home,H_WIN_PCT_away,...,H_WIN_PCT_league,DRAW_PCT_league,TEAM_CNT_league,GD_STD_league,RND_CNT_league,H_RTG_home,A_RTG_home,H_RTG_away,A_RTG_away,EGD
11263,98,119,1,0,1,0.459459,0.216216,0.289474,0.236842,0.342105,...,0.466045,0.235686,20,1.313968,38,0.0,0.001663,0.0,0.005543,-0.004263
8793,72,85,1,0,1,0.526316,0.157895,0.210526,0.157895,0.058824,...,0.463863,0.232589,20,1.157507,38,-0.250904,-0.031377,-0.075271,-0.104591,-0.128783
10980,120,97,1,1,0,0.333333,0.366667,0.15625,0.25,0.447368,...,0.492208,0.216883,20,1.318311,38,0.733127,-0.237925,0.219938,-0.793083,1.593465
4767,58,37,1,1,0,0.428571,0.428571,0.166667,0.166667,0.375,...,0.463866,0.230252,18,1.263812,34,0.146993,-0.089933,0.044098,-0.299778,0.378143
4667,40,50,1,1,0,0.382353,0.294118,0.205882,0.264706,0.411765,...,0.445351,0.241436,18,1.258408,34,0.047093,-0.060595,0.014128,-0.201984,0.204497


# Extracting target variables and features

In [31]:
# If True, the goal difference for a specific game is seen as the target variable.
# (e.g. -3 for a game outcome of 1:4, or 2 for 3:1) .
#
# If False, we just want to predict the winner.
# 1 = Home team wins, 0 = Draw, 2 = Away team wins
predict_goal_difference = False

if predict_goal_difference:
    y = df['FTHG'] - df['FTAG']
else:
    y = df['FTR']

X = df.iloc[:,5:]    # Remove unnecessary columns (IDs etc.) from features

# Ranked Probability Score (RPS)

In [9]:
def rps(pred, actual_value, r=3):
    '''Returns the ranked probability score for a single given game.
    (see Hubacek paper for formula)
    
    Arguments:
    pred -- predicted results; in vector form (e.g. [0.1, 0.6, 0.3])
    actual_value -- actual result (0, 1 or 2); not in vector form yet
    r -- number of categories (3 for football)
    '''
    value_vec = [0, 0, 0]
    
    # Bring value_vec into 1, 0, 2 order
    if actual_value == 0:
        value_vec[1] = 1
    elif actual_value == 2:
        value_vec[0] == 1
    elif actual_value == 1:
        value_vec[2] == 1
    else:
        raise Exception('Prediction was not in [1, 0, 2].')
    #value_vec = [0, 0, 1]
    
    #pred[0], pred[1] = pred[1], pred[0]
    
    pred[0], pred[1], pred[2] = pred[2], pred[0], pred[1]   # order: loss, draw, win
    
    #print(pred)
    
    rps = 0
    
    for i in range(0, r-1):    # r-1 becomes r because of the exclusion of range()
        bracket_part = 0
        for j in range(0, i+1):    # same for i and i+1
            bracket_part += pred[j] - value_vec[j]
        
        rps += np.square(bracket_part)
    
    rps *= 1 / (r - 1)
    
    return rps

# Model fit

In [32]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.222, random_state=16)

In [33]:
# n_estimators=5 seems to be the best choice for now
xgb_cl = xgb.XGBClassifier(objective='multi:softprob', n_estimators=5, seed=16)

In [34]:
xgb_cl.fit(X_train, y_train)





XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=5, n_jobs=8, num_parallel_tree=1,
              objective='multi:softprob', random_state=16, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=None, seed=16, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

# Evaluation

In [35]:
preds = xgb_cl.predict(X_test)
accuracy = float(np.sum(preds == y_test))/y_test.shape[0]

print(f'accuracy: {accuracy}')

accuracy: 0.520595361716857


In [37]:
# Predictions in form of [0.2, 0.5, 0.3] in the order of draw, home win, away win.
# This is because it gets ordered like 0, 1, 2.
proba_preds = xgb_cl.predict_proba(X_test)

# List of RPS scores for every game in the test set.
# Important to use iloc for y_test, otherwise indices would be wrong!
rps_list = [rps(pred, y_test.iloc[i]) for i, pred in enumerate(proba_preds)]

# Average ranked probability score.
np.mean(rps_list)

0.20766103885510892

In [36]:
# Looking at train set accuracy to get an intuition of how much the model overfits
preds = xgb_cl.predict(X_train)
accuracy = float(np.sum(preds == y_train))/y_train.shape[0]

print(f'accuracy on train set: {accuracy}')

accuracy on train set: 0.5956732194013632
