In [2]:
import os
import sys

# Project root. Defaults to the original container path, but can be overridden
# with the PL_PREDICTIONS_ROOT environment variable so the notebook also runs
# on machines where the repository lives elsewhere.
ROOT_DIR = os.environ.get('PL_PREDICTIONS_ROOT', '/code/premier-league-match-predictions')

# Make the `src` package importable. The membership check keeps the cell
# idempotent: re-running it does not keep appending duplicate entries.
if ROOT_DIR not in sys.path:
    sys.path.append(ROOT_DIR)

# Run from the project root (presumably so relative paths used inside `src`
# resolve against it -- confirm against the loaders).
os.chdir(ROOT_DIR)

import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, HistGradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
import lightgbm as lgb

from src.config import Models
from src.config import END_YEAR, NUM_SEASONS, SPORTSBOOK, N_MATCHES
from src.data.load_data import load_all_seasons
from src.data.build_features import build_rolling_features
from src.data.split import chrono_split
from src.models.hyperparameter_search import search_models

In [3]:
# Load the raw match data using the season range and sportsbook from src.config.
df_raw = load_all_seasons(
    end_year=END_YEAR,
    num_seasons=NUM_SEASONS,
    sportsbook=SPORTSBOOK,
)

# Build the modelling frame from the raw data; N_MATCHES is presumably the
# rolling-window length used for the features (confirm in build_features).
df = build_rolling_features(
    df=df_raw,
    n_matches=N_MATCHES,
)

In [4]:
# Fraction of the data handed to chrono_split as the training portion.
TRAIN_RATIO = 0.7

# Split features/targets into train and test sets. Judging by its name,
# chrono_split splits in time order rather than shuffling -- confirm in src.data.split.
X_train, y_train, X_test, y_test = chrono_split(
    df,
    train_ratio=TRAIN_RATIO,
)

### Parameter Grid to Search
Define the parameter grid for each model here so that the program can search for the best hyperparameters of each one. Use this program to determine the optimal hyperparameters for a given dataset, then update the `src/` directory accordingly.

In [5]:
# Search space handed to `search_models`.
#
# Each entry maps a `Models` member to:
#   "model":  the base sklearn Pipeline to tune (final step is always named "clf")
#   "params": the hyperparameter grid, keyed as "<step>__<param>" so the values
#             are routed to the matching pipeline step.
#
# Every pipeline starts with a median SimpleImputer so that no estimator is
# handed NaN values, regardless of what the feature builder leaves behind.
param_grids = {

    Models.NAIVE_BAYES: {
        "model": Pipeline([
            ("imputer", SimpleImputer(strategy="median")),
            ("clf", GaussianNB(var_smoothing=1e-9))
        ]),
        "params": {
            "clf__var_smoothing": [1e-9, 1e-8, 1e-7]
        }
    },

    Models.LOGISTIC_REGRESSION: {
        "model": Pipeline([
            # Added for consistency with the other pipelines: LogisticRegression
            # raises on NaN input, and StandardScaler passes NaNs through.
            ("imputer", SimpleImputer(strategy="median")),
            ("scaler", StandardScaler()),
            ("clf", LogisticRegression(
                solver="lbfgs",
                max_iter=5000
            ))
        ]),
        "params": {
            "clf__C": [0.01, 0.1, 1, 10],
            "clf__penalty": ["l2"],
            "clf__solver": ["lbfgs"]
        }
    },

    Models.RANDOM_FOREST: {
        "model": Pipeline([
            ("imputer", SimpleImputer(strategy="median")),
            ("clf", RandomForestClassifier(
                n_estimators=600,
                max_depth=None,
                min_samples_leaf=2,
                class_weight="balanced",
                random_state=0
            ))
        ]),
        "params": {
            "clf__n_estimators": [300, 600, 900],
            "clf__max_depth": [None, 10, 20],
            "clf__min_samples_leaf": [1, 2, 4]
        }
    },

    Models.SVM: {
        "model": Pipeline([
            ("imputer", SimpleImputer(strategy="median")),
            ("scaler", StandardScaler()),
            ("clf", SVC(
                kernel="rbf",
                C=2.0,
                gamma="scale",
                probability=True,
                class_weight="balanced",
                random_state=0
            ))
        ]),
        "params": {
            "clf__C": [0.5, 1.0, 2.0, 4.0],
            # NOTE: gamma has no effect when kernel="linear", so the linear
            # candidates are evaluated once per gamma value.
            "clf__kernel": ["rbf", "linear"],
            "clf__gamma": ["scale", "auto"]
        }
    },

    Models.MLPFFNN: {
        "model": Pipeline([
            ("imputer", SimpleImputer(strategy="median")),
            ("scaler", StandardScaler()),
            ("clf", MLPClassifier(
                hidden_layer_sizes=(64, 32),
                activation="relu",
                solver="adam",
                alpha=1e-3,
                learning_rate_init=1e-2,
                max_iter=5000,
                random_state=0
            ))
        ]),
        "params": {
            "clf__hidden_layer_sizes": [(64,32), (128,64), (128,64,32)],
            "clf__alpha": [1e-4, 1e-3, 1e-2],
            "clf__learning_rate_init": [1e-3, 1e-2],
            "clf__activation": ["relu", "tanh"]
        }
    },

    Models.LIGHTGBM: {
        "model": Pipeline([
            ("imputer", SimpleImputer(strategy="median")),
            ("clf", lgb.LGBMClassifier(
                objective="multiclass",
                num_class=3,
                n_estimators=600,
                learning_rate=0.05,
                num_leaves=30,
                subsample=0.9,
                # LightGBM only applies `subsample` (bagging_fraction) when
                # `subsample_freq` (bagging_freq) is > 0. With the default of 0
                # every `clf__subsample` candidate below trains the same model.
                # Mirror this setting in src/ when copying the tuned params over.
                subsample_freq=1,
                colsample_bytree=0.9,
                reg_lambda=1.0,
                random_state=0
            ))
        ]),
        "params": {
            "clf__n_estimators": [300, 600, 900],
            "clf__learning_rate": [0.01, 0.05, 0.1],
            "clf__num_leaves": [20, 30, 40],
            "clf__subsample": [0.8, 0.9, 1.0],
            "clf__colsample_bytree": [0.8, 0.9, 1.0],
            "clf__reg_lambda": [0.5, 1.0, 2.0]
        }
    },

    Models.XGBOOST: {
        "model": Pipeline([
            ("imputer", SimpleImputer(strategy="median")),
            ("clf", XGBClassifier(
                objective="multi:softprob",
                num_class=3,
                n_estimators=600,
                max_depth=4,
                learning_rate=0.05,
                subsample=0.9,
                colsample_bytree=0.9,
                reg_lambda=1.0,
                tree_method="hist",
                eval_metric="mlogloss",
                random_state=0
            ))
        ]),
        "params": {
            "clf__n_estimators": [300, 600, 900],
            "clf__max_depth": [3, 4, 5],
            "clf__learning_rate": [0.05, 0.1],
            "clf__subsample": [0.9, 1.0],
            "clf__colsample_bytree": [0.8, 0.9, 1.0],
            "clf__reg_lambda": [0.5, 1.0, 2.0]
        }
    }
}

### Running the Grid Search

In [None]:
# Tune every model in `param_grids` on the training split. Each returned item
# is read below through its 'name', 'best_score' and 'best_params' keys.
grid_searches = search_models(param_grids, X_train, y_train)

# One summary row per tuned model.
summary_df = pd.DataFrame([
    {
        'Model': grid_search['name'],
        'Best Accuracy': grid_search['best_score'],
        'Best Params': grid_search['best_params']
    }
    for grid_search in grid_searches
])

# sort_values returns a new frame rather than sorting in place, so the result
# must be rebound; otherwise the table below is shown in search order.
summary_df = summary_df.sort_values('Best Accuracy', ascending=False)
summary_df

Tuning Models.NAIVE_BAYES...
Fitting 5 folds for each of 3 candidates, totalling 15 fits
Finished tuning Models.NAIVE_BAYES in 1.44 seconds.
Best score = 0.46613249699261045.
Best params = {'clf__var_smoothing': 1e-09}


Tuning Models.LOGISTIC_REGRESSION...
Fitting 5 folds for each of 4 candidates, totalling 20 fits
Finished tuning Models.LOGISTIC_REGRESSION in 0.24 seconds.
Best score = 0.5512545110843787.
Best params = {'clf__C': 0.01, 'clf__penalty': 'l2', 'clf__solver': 'lbfgs'}


Tuning Models.RANDOM_FOREST...
Fitting 5 folds for each of 27 candidates, totalling 135 fits
