In [1]:
import sys, os

# Project root. Defaults to the original container path, but can be overridden
# with the PL_PREDICTIONS_ROOT environment variable so the notebook also runs
# on machines where the repository is checked out somewhere else.
ROOT_DIR = os.environ.get('PL_PREDICTIONS_ROOT', '/code/premier-league-match-predictions')

# Make the project's `src` package importable. The membership check keeps the
# cell idempotent: re-running it does not keep appending duplicate entries.
if ROOT_DIR not in sys.path:
    sys.path.append(ROOT_DIR)

# Relative paths used later in the notebook (e.g. "data/raw/...") are resolved
# against the project root.
os.chdir(ROOT_DIR)

import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, HistGradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB

from src.config import Models
from src.config import END_YEAR, NUM_SEASONS, SPORTSBOOK, N_MATCHES
from src.util.DFImputer import DFImputer
from src.data.load_data import load_all_seasons
from src.data.scrape_values import merge_valuations_into_dataframe
from src.data.scrape_possession import merge_possession_into_dataframe
from src.data.build_features import build_rolling_features
from src.data.split import chrono_split
from src.models.hyperparameter_search import search_models, tune_model

In [2]:
# Build the raw match table: load the configured seasons, then merge in
# possession data and the valuations CSV (each helper receives the previous
# stage's result, in the same order as before).
df_raw = merge_valuations_into_dataframe(
    merge_possession_into_dataframe(
        load_all_seasons(
            end_year=END_YEAR,
            num_seasons=NUM_SEASONS,
            sportsbook=SPORTSBOOK,
        )
    ),
    "data/raw/tm_pl_all_columns.csv",
    "2015-07-01",
)

# Derive rolling features over the last N_MATCHES matches from the raw table.
df = build_rolling_features(df=df_raw, n_matches=N_MATCHES)

# Chronological split with train_ratio=0.7; the test portion is not used for tuning below.
X_train, y_train, X_test, y_test = chrono_split(df, train_ratio=0.7)

### Parameter Grid to Search
Here you should define the parameter grid for each of the models, so the program can search for the best hyperparameters for each model. Use this program to determine the optimal hyperparameters to use for a given dataset, and update the src/ directory accordingly.

In [3]:
# Search space for every tunable model.
#
# Each entry maps a Models member to:
#   "model":  the unfitted Pipeline to tune (its final step is always named "clf")
#   "params": the grid to search, keyed with the "clf__<param>" convention so
#             each value is routed to the pipeline's final estimator.
#
# Every estimator that accepts a seed uses random_state=0 so that repeated
# searches are reproducible.
param_grids = {

    # 3 candidates.
    Models.NAIVE_BAYES: {
        "model": Pipeline([
            ("imputer", DFImputer(strategy="median")),
            ("clf", GaussianNB(var_smoothing=1e-9))
        ]),
        "params": {
            "clf__var_smoothing": [1e-9, 1e-8, 1e-7]
        }
    },

    # 6 x 1 x 2 = 12 candidates.
    Models.LOGISTIC_REGRESSION: {
        "model": Pipeline([
            # Impute before scaling, matching the other scaled pipelines
            # (SVM, MLP); LogisticRegression cannot fit on missing values.
            ("imputer", DFImputer(strategy="median")),
            ("scaler", StandardScaler()),
            ("clf", LogisticRegression(
                solver="lbfgs",
                max_iter=5000,
                # The grid below also tries "saga", which shuffles the data;
                # seed it so those candidates are reproducible.
                random_state=0
            ))
        ]),
        "params": {
            "clf__C": [0.001, 0.01, 0.1, 1, 10, 100],
            "clf__penalty": ["l2"],
            "clf__solver": ["lbfgs", "saga"]
        }
    },

    # 3 x 3 x 3 = 27 candidates.
    Models.RANDOM_FOREST: {
        "model": Pipeline([
            ("imputer", DFImputer(strategy="median")),
            ("clf", RandomForestClassifier(
                n_estimators=600,
                max_depth=None,
                min_samples_leaf=2,
                class_weight="balanced",
                random_state=0
            ))
        ]),
        "params": {
            "clf__n_estimators": [300, 600, 900],
            "clf__max_depth": [None, 10, 20],
            "clf__min_samples_leaf": [1, 2, 4]
        }
    },

    # 4 x 2 x 2 = 16 candidates.
    Models.SVM: {
        "model": Pipeline([
            ("imputer", DFImputer(strategy="median")),
            ("scaler", StandardScaler()),
            ("clf", SVC(
                kernel="rbf",
                C=2.0,
                gamma="scale",
                probability=True,
                class_weight="balanced",
                random_state=0
            ))
        ]),
        "params": {
            "clf__C": [0.5, 1.0, 2.0, 4.0],
            "clf__kernel": ["rbf", "linear"],
            "clf__gamma": ["scale", "auto"]
        }
    },

    # 3 x 3 x 2 x 2 = 36 candidates.
    Models.MLPFFNN: {
        "model": Pipeline([
            ("imputer", DFImputer(strategy="median")),
            ("scaler", StandardScaler()),
            ("clf", MLPClassifier(
                hidden_layer_sizes=(64, 32),
                activation="relu",
                solver="adam",
                alpha=1e-3,
                learning_rate_init=1e-2,
                max_iter=5000,
                random_state=0
            ))
        ]),
        "params": {
            "clf__hidden_layer_sizes": [(64,32), (128,64), (128,64,32)],
            "clf__alpha": [1e-4, 1e-3, 1e-2],
            "clf__learning_rate_init": [1e-3, 1e-2],
            "clf__activation": ["relu", "tanh"]
        }
    },

    # 2 x 2 x 1 x 1 x 2 x 2 = 16 candidates.
    Models.XGBOOST: {
        "model": Pipeline([
            ("imputer", DFImputer(strategy="median")),
            ("clf", XGBClassifier(
                objective="multi:softprob",
                num_class=3,
                n_estimators=600,
                max_depth=4,
                learning_rate=0.05,
                subsample=0.9,
                colsample_bytree=0.9,
                reg_lambda=1.0,
                tree_method="hist",
                eval_metric="mlogloss",
                random_state=0
            ))
        ]),
        "params": {
            "clf__n_estimators": [300, 600],
            "clf__max_depth": [3, 4],
            "clf__learning_rate": [0.05],
            "clf__subsample": [0.9],
            "clf__colsample_bytree": [0.8, 0.9],
            "clf__reg_lambda": [0.5, 1.0]
        }
    }
}

# Running Grid Search Over Specific Models
Use this section if you just want to explore the best parameters for one model type, since this is much quicker than comparing all models.

In [4]:
def individual_grid_search(model_type):
    """Run the hyperparameter search for a single model.

    Looks up the pipeline and parameter grid registered under ``model_type``
    in the notebook-level ``param_grids`` and passes them to ``tune_model``
    together with the notebook-level ``X_train`` / ``y_train``.

    Args:
        model_type: Key into ``param_grids`` (a ``Models`` member).

    Raises:
        KeyError: If ``model_type`` has no entry in ``param_grids``.

    Returns:
        None. The value returned by ``tune_model`` is deliberately not
        propagated, so calling this as the last line of a cell displays
        nothing beyond what ``tune_model`` prints.
    """
    name, model, params = (
        model_type,
        param_grids[model_type]['model'],
        param_grids[model_type]['params'],
    )
    tune_model(name=name, model=model, params=params, X_train=X_train, y_train=y_train)

In [5]:
# Logistic Regression: search the pipeline and grid stored under
# Models.LOGISTIC_REGRESSION in param_grids.
individual_grid_search(Models.LOGISTIC_REGRESSION)

Tuning Models.LOGISTIC_REGRESSION...
Fitting 5 folds for each of 12 candidates, totalling 60 fits
Finished tuning Models.LOGISTIC_REGRESSION in 4.40 seconds.
Best score = 0.5853137787434366.
Best params = {'clf__C': 0.1, 'clf__penalty': 'l2', 'clf__solver': 'lbfgs'}




In [6]:
# XGBoost: search the pipeline and grid stored under Models.XGBOOST in param_grids.
individual_grid_search(Models.XGBOOST)

Tuning Models.XGBOOST...
Fitting 5 folds for each of 16 candidates, totalling 80 fits
Finished tuning Models.XGBOOST in 16.85 seconds.
Best score = 0.5677834510229948.
Best params = {'clf__colsample_bytree': 0.8, 'clf__learning_rate': 0.05, 'clf__max_depth': 3, 'clf__n_estimators': 300, 'clf__reg_lambda': 1.0, 'clf__subsample': 0.9}




In [7]:
# Random Forest: search the pipeline and grid stored under
# Models.RANDOM_FOREST in param_grids.
individual_grid_search(Models.RANDOM_FOREST)

Tuning Models.RANDOM_FOREST...
Fitting 5 folds for each of 27 candidates, totalling 135 fits
Finished tuning Models.RANDOM_FOREST in 112.89 seconds.
Best score = 0.5704646025710665.
Best params = {'clf__max_depth': 20, 'clf__min_samples_leaf': 1, 'clf__n_estimators': 600}




In [8]:
# Support Vector Machine: search the pipeline and grid stored under
# Models.SVM in param_grids.
individual_grid_search(Models.SVM)

Tuning Models.SVM...
Fitting 5 folds for each of 16 candidates, totalling 80 fits
Finished tuning Models.SVM in 26.88 seconds.
Best score = 0.5228585913452835.
Best params = {'clf__C': 0.5, 'clf__gamma': 'scale', 'clf__kernel': 'linear'}




In [9]:
# MLP Feedforward Neural Network: search the pipeline and grid stored under
# Models.MLPFFNN in param_grids.
individual_grid_search(Models.MLPFFNN)

Tuning Models.MLPFFNN...
Fitting 5 folds for each of 36 candidates, totalling 180 fits
Finished tuning Models.MLPFFNN in 221.67 seconds.
Best score = 0.502667390910737.
Best params = {'clf__activation': 'tanh', 'clf__alpha': 0.0001, 'clf__hidden_layer_sizes': (128, 64), 'clf__learning_rate_init': 0.01}




In [10]:
# Naive Bayes: search the pipeline and grid stored under
# Models.NAIVE_BAYES in param_grids.
individual_grid_search(Models.NAIVE_BAYES)

Tuning Models.NAIVE_BAYES...
Fitting 5 folds for each of 3 candidates, totalling 15 fits
Finished tuning Models.NAIVE_BAYES in 0.35 seconds.
Best score = 0.4737396342567445.
Best params = {'clf__var_smoothing': 1e-09}




Exception ignored in: <function ResourceTracker.__del__ at 0x778815b8cf40>
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/multiprocessing/resource_tracker.py", line 77, in __del__
  File "/usr/local/lib/python3.12/multiprocessing/resource_tracker.py", line 86, in _stop
  File "/usr/local/lib/python3.12/multiprocessing/resource_tracker.py", line 111, in _stop_locked
ChildProcessError: [Errno 10] No child processes
Exception ignored in: <function ResourceTracker.__del__ at 0x76277a098f40>
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/multiprocessing/resource_tracker.py", line 77, in __del__
  File "/usr/local/lib/python3.12/multiprocessing/resource_tracker.py", line 86, in _stop
  File "/usr/local/lib/python3.12/multiprocessing/resource_tracker.py", line 111, in _stop_locked
ChildProcessError: [Errno 10] No child processes
Exception ignored in: <function ResourceTracker.__del__ at 0x7d0cf8194f40>
Traceback (most recent call last):
  File "/usr

# Running the Entire Grid Search Over All Models
This automates running the grid search for each of the models and sorting by accuracy, but takes a lot longer.

In [None]:
# Tune every model registered in param_grids in one go.
grid_searches = search_models(param_grids, X_train, y_train)

# One row per tuned model, best score first.
# DataFrame.sort_values returns a NEW frame; the result has to be kept
# (previously it was discarded, so the displayed table was unsorted).
summary_df = pd.DataFrame([
    {
        'Model': grid_search['name'],
        'Best Accuracy': grid_search['best_score'],
        'Best Params': grid_search['best_params']
    }
    for grid_search in grid_searches
]).sort_values('Best Accuracy', ascending=False)

summary_df

# Voting
After tuning all the individual models, update the models in the source code and in the `voting_model` pipeline below, then tune the voting classifier weights.

In [13]:
def weight_vectors(n_estimators=3, max_weight=3):
    """Enumerate candidate weight vectors for a voting ensemble.

    Produces every combination of integer weights in ``1..max_weight`` for
    ``n_estimators`` ensemble members, in lexicographic order (the last
    member's weight varies fastest). With the defaults this is the same
    27-element list the notebook has always searched.

    Args:
        n_estimators: Number of ensemble members, i.e. the length of each
            weight vector. Must be >= 1.
        max_weight: Largest integer weight to try. Must be >= 1.

    Returns:
        list[list[int]]: ``max_weight ** n_estimators`` weight vectors.

    Raises:
        ValueError: If ``n_estimators`` or ``max_weight`` is less than 1.
    """
    # Local import keeps this cell self-contained (no change to the imports cell).
    from itertools import product

    if n_estimators < 1 or max_weight < 1:
        raise ValueError(
            f"n_estimators and max_weight must be >= 1, got "
            f"n_estimators={n_estimators}, max_weight={max_weight}"
        )

    return [
        list(weights)
        for weights in product(range(1, max_weight + 1), repeat=n_estimators)
    ]

# Ensemble members. Their hyperparameters are fixed to the best values
# reported by the individual grid searches earlier in this notebook.
_lr_member = Pipeline([
    ("scaler", StandardScaler()),
    ("clf", LogisticRegression(
        solver='lbfgs', penalty='l2', C=0.1, max_iter=5000, random_state=0
    )),
])

_rf_member = Pipeline([
    ("clf", RandomForestClassifier(
        n_estimators=600,
        max_depth=20,
        min_samples_leaf=1,
        class_weight="balanced",
        random_state=0,
    )),
])

_xgb_member = Pipeline([
    ("clf", XGBClassifier(
        objective="multi:softprob",
        num_class=3,
        n_estimators=300,
        max_depth=3,
        learning_rate=0.05,
        subsample=0.9,
        colsample_bytree=0.8,
        reg_lambda=1.0,
        tree_method="hist",
        eval_metric="mlogloss",
        random_state=0,
        n_jobs=-1,
    )),
])

# A single imputer sits in front of the ensemble, so all three members
# receive the same imputed features; only the LR member adds scaling.
voting_model = Pipeline([
    ("imputer", DFImputer(strategy="median")),
    ("clf", VotingClassifier(
        estimators=[
            ("lr", _lr_member),
            ("rf", _rf_member),
            ("xgb", _xgb_member),
        ],
        voting="soft",
        n_jobs=-1,
    )),
])

# Only the per-member voting weights are searched here.
voting_params = {"clf__weights": weight_vectors()}

# Left as the cell's last expression so the returned result is displayed.
tune_model(
    name=Models.VOTING,
    model=voting_model,
    params=voting_params,
    X_train=X_train,
    y_train=y_train,
)

Tuning Models.VOTING...
Fitting 5 folds for each of 27 candidates, totalling 135 fits


Exception ignored in: <function ResourceTracker.__del__ at 0x71e73a888f40>
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/multiprocessing/resource_tracker.py", line 77, in __del__
  File "/usr/local/lib/python3.12/multiprocessing/resource_tracker.py", line 86, in _stop
  File "/usr/local/lib/python3.12/multiprocessing/resource_tracker.py", line 111, in _stop_locked
ChildProcessError: [Errno 10] No child processes
Exception ignored in: <function ResourceTracker.__del__ at 0x7d526af94f40>
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/multiprocessing/resource_tracker.py", line 77, in __del__
  File "/usr/local/lib/python3.12/multiprocessing/resource_tracker.py", line 86, in _stop
  File "/usr/local/lib/python3.12/multiprocessing/resource_tracker.py", line 111, in _stop_locked
ChildProcessError: [Errno 10] No child processes
Exception ignored in: <function ResourceTracker.__del__ at 0x7b5982e98f40>
Traceback (most recent call last):
  File "/usr

Finished tuning Models.VOTING in 138.86 seconds.
Best score = 0.5845468042730401.
Best params = {'clf__weights': [3, 1, 2]}




{'name': <Models.VOTING: 7>,
 'best_params': {'clf__weights': [3, 1, 2]},
 'best_score': np.float64(0.5845468042730401),
 'grid_search_object': GridSearchCV(cv=5,
              estimator=Pipeline(steps=[('imputer',
                                         DFImputer(strategy='median')),
                                        ('clf',
                                         VotingClassifier(estimators=[('lr',
                                                                       Pipeline(steps=[('scaler',
                                                                                        StandardScaler()),
                                                                                       ('clf',
                                                                                        LogisticRegression(C=0.1,
                                                                                                           max_iter=5000,
                                                    

Exception ignored in: <function ResourceTracker.__del__ at 0x718375988f40>
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/multiprocessing/resource_tracker.py", line 77, in __del__
  File "/usr/local/lib/python3.12/multiprocessing/resource_tracker.py", line 86, in _stop
  File "/usr/local/lib/python3.12/multiprocessing/resource_tracker.py", line 111, in _stop_locked
ChildProcessError: [Errno 10] No child processes
Exception ignored in: <function ResourceTracker.__del__ at 0x76ca68388f40>
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/multiprocessing/resource_tracker.py", line 77, in __del__
  File "/usr/local/lib/python3.12/multiprocessing/resource_tracker.py", line 86, in _stop
  File "/usr/local/lib/python3.12/multiprocessing/resource_tracker.py", line 111, in _stop_locked
ChildProcessError: [Errno 10] No child processes
Exception ignored in: <function ResourceTracker.__del__ at 0x7fc809f88f40>
Traceback (most recent call last):
  File "/usr