In [24]:
import pickle
import pandas as pd

# Load the prepared match table from disk.
# The context manager closes the file handle as soon as unpickling finishes.
# NOTE: unpickling can execute arbitrary code -- only load files you produced yourself.
with open("prepped_match_df.pck", "rb") as match_file:
    match_df = pickle.load(match_file)

In [25]:
from sklearn.preprocessing import *

# Turn the textual result labels into integer class ids. `enc` stays in the
# notebook namespace so the ids can be mapped back to labels later on.
# NOTE: this overwrites the column it was fitted on, so re-running the cell
# would re-fit the encoder on the integer ids instead of the original labels.
enc = LabelEncoder()
enc.fit(match_df["resultClass"])
match_df["resultClass"] = enc.transform(match_df["resultClass"])

In [26]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import *
# Hold out the final 20% of rows as the test set. shuffle=False keeps the row
# order intact (presumably chronological -- confirm how match_df is sorted).
X_train, X_test, y_train, y_test = train_test_split(
    match_df.drop(columns="resultClass"),
    match_df["resultClass"],
    test_size=0.2,
    shuffle=False,
)

In [27]:
from sklearn.metrics import make_scorer

def score(y_true, y_pred, **kwargs):
    """Award prediction-game points for predicted results, scaled to 306 matches.

    Both inputs hold encoded class ids. They are decoded with the notebook-level
    ``enc`` into ``"<home>:<away>"`` strings before being compared.

    Points per match:
      * 5 -- exact result,
      * 3 -- same goal difference (this includes a draw predicted as another draw),
      * 1 -- same winner only,
      * 0 -- anything else.

    Args:
        y_true: Encoded true result classes.
        y_pred: Encoded predicted result classes, aligned with ``y_true``.
        **kwargs: Accepted for call compatibility and ignored.

    Returns:
        int: Total points divided by ``len(y_true) / 306`` and rounded, or 0
        when there are no matches to score.
    """
    # presumably the number of matches in one season -- TODO confirm
    matches_per_season = 306

    true_labels = enc.inverse_transform(y_true)
    pred_labels = enc.inverse_transform(y_pred)

    n_matches = len(true_labels)
    if n_matches == 0:
        # Nothing to score; avoids dividing by zero in the normalisation below.
        return 0

    points = 0
    for true_label, pred_label in zip(true_labels, pred_labels):
        true_parts = true_label.split(":")
        pred_parts = pred_label.split(":")

        if true_parts[0] == pred_parts[0] and true_parts[1] == pred_parts[1]:
            points += 5
            continue

        # Compare goals numerically: as strings, "10" sorts before "2", which
        # would misjudge the winner for double-digit scores.
        true_diff = int(true_parts[0]) - int(true_parts[1])
        pred_diff = int(pred_parts[0]) - int(pred_parts[1])

        if true_diff == pred_diff:
            points += 3
        elif true_diff * pred_diff > 0:
            # Same sign of the goal difference means the same side won.
            points += 1

    return round(points / (n_matches / matches_per_season))

# Wrap `score` as a scikit-learn scorer object; higher point totals are better.
kicktipp_scorer = make_scorer(score_func=score, greater_is_better=True)

In [28]:
from sklearn.model_selection import TimeSeriesSplit

# Cross-validation splitter for ordered data, using five splits.
N_CV_SPLITS = 5
tscv = TimeSeriesSplit(n_splits=N_CV_SPLITS)

# Baseline
Always predict the most frequent result class from the training set.

In [29]:
import numpy as np

# Baseline: predict the most common training class for every test row,
# then display the resulting point total.
most_common_class = y_train.value_counts().idxmax()
y_pred = np.full(shape=len(y_test), fill_value=most_common_class)
score(y_test, y_pred)

299

# XGBoost

In [30]:
from sklearn.model_selection import RandomizedSearchCV
import xgboost as xgb
from scipy.stats import uniform, randint

# Search space for the randomised hyper-parameter search.
# scipy's uniform(loc, scale) draws from [loc, loc + scale];
# randint(low, high) draws integers from low up to high - 1.
param_dist = dict(
    # --- learning task ---
    objective=['multi:softprob'],
    num_class=[len(enc.classes_)],         # one entry per class known to `enc`

    # --- tree booster ---
    max_depth=randint(3, 10),              # tree depth: 3..9
    learning_rate=uniform(0.01, 0.3),      # eta: 0.01..0.31
    n_estimators=randint(100, 500),        # boosting rounds: 100..499

    # --- split / leaf constraints ---
    gamma=uniform(0, 5),                   # min loss reduction for a split: 0..5
    min_child_weight=randint(1, 7),        # min instance-weight sum in a child: 1..6

    # --- row / column sampling ---
    subsample=uniform(0.6, 0.4),           # row fraction per tree: 0.6..1.0
    colsample_bytree=uniform(0.6, 0.4),    # feature fraction per tree: 0.6..1.0

    # --- weight penalties ---
    reg_alpha=uniform(0, 1),               # L1 term: 0..1
    reg_lambda=uniform(1, 5),              # L2 term: 1..6
)

# XGBoost classifier with library-default hyper-parameters, a fixed seed,
# and n_jobs=-1 so that all available cores are used.
xgb_model = xgb.XGBClassifier(n_jobs=-1, random_state=42)

# Randomised search: 100 sampled parameter settings, scored with the custom
# points scorer on the time-series splits.
# NOTE: the search object is only constructed here; this cell never fits it.
random_search = RandomizedSearchCV(
    xgb_model,
    param_dist,
    n_iter=100,
    scoring=kicktipp_scorer,
    cv=tscv,
    verbose=2,
    random_state=42,
)

# Re-encode the training targets with a second encoder before fitting
# (presumably so the classifier sees contiguous class ids -- confirm).
# This fits the default-parameter model, not the randomised search.
enc_2 = LabelEncoder()
y_train_reencoded = enc_2.fit_transform(y_train)
xgb_model.fit(X_train, y_train_reencoded)

# Disabled random-search reporting, kept for when the search is fitted:
#print("Best parameters:", random_search.best_params_)
#print("Best cross-validated score:", random_search.best_score_)
#best_xgb_model = random_search.best_estimator_

# Predict the held-out rows; predictions come back as `enc_2` ids.
y_pred = xgb_model.predict(X_test)

# Map the predictions back to `enc` ids, which is what `score` expects.
y_pred_enc_ids = enc_2.inverse_transform(y_pred)
test_score = score(y_test, y_pred_enc_ids)
print("Test set score:", test_score)

Test set score: 267


In [31]:
# Flag intended to control persisting the trained artefacts; the dump calls
# below are currently commented out, so the flag has no effect in this cell.
saving = False
# NOTE(review): `best_clf` is not defined in the visible cells -- confirm which
# model should be saved before re-enabling the lines below.
#if saving:
    #pickle.dump(best_clf, open("classifier.pck", "wb"))
    #pickle.dump(enc, open("encoder.pck", "wb"))

In [32]:
# Number of distinct result classes the label encoder was fitted on.
n_result_classes = len(enc.classes_)
n_result_classes

57