In [None]:
# Absolute path of the project checkout this notebook runs against.
# NOTE(review): machine-specific absolute path — adjust to the local checkout.
ROOT_DIR = '/code/premier-league-match-predictions'

import sys, os
# Add the project root to the import path (presumably so the `src.*` imports
# below resolve — confirm) and make it the current working directory.
sys.path.append(ROOT_DIR)
os.chdir(ROOT_DIR)

import numpy as np
import pandas as pd
from scipy.stats import randint

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, HistGradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier

from src.config import Models
from src.config import END_YEAR, NUM_SEASONS, SPORTSBOOK, N_MATCHES
from src.data.load_data import load_all_seasons
from src.data.build_features import build_rolling_features
from src.data.split import chrono_split
from src.models.hyperparameter_search import search_models, tune_model
from src.util.DFImputer import DFImputer

In [None]:
# Load the raw match data using the season/sportsbook settings from src.config.
df_raw = load_all_seasons(
    end_year=END_YEAR,
    num_seasons=NUM_SEASONS,
    sportsbook=SPORTSBOOK,
)

# Derive the rolling features from the raw frame, with N_MATCHES as the window setting.
df = build_rolling_features(
    df=df_raw,
    n_matches=N_MATCHES,
)

In [None]:
# Split the feature frame into train/test parts with train_ratio=0.7.
# NOTE(review): the unpacking order must match what chrono_split returns — confirm.
(X_train, y_train, X_test, y_test) = chrono_split(
    df,
    train_ratio=0.7,
)

### Parameter Grid to Search
Define the parameter grid for each model here so the program can search for the best hyperparameters of each one. Use the results to determine the optimal hyperparameters for a given dataset, then update the `src/` directory accordingly.

In [None]:
# Search space for every individually tuned model.
#
# Each entry maps a `Models` member to a dict with:
#   "model"  - the Pipeline to tune; its final step is always named "clf"
#   "params" - the hyperparameter grid, keyed with the "clf__" prefix so the
#              values are routed to that final estimator step
param_grids = {

    Models.NAIVE_BAYES: {
        "model": Pipeline([
            ("imputer", DFImputer(strategy="median")),
            ("clf", GaussianNB(var_smoothing=1e-9))
        ]),
        "params": {
            "clf__var_smoothing": [1e-9, 1e-8, 1e-7]
        }
    },

    Models.LOGISTIC_REGRESSION: {
        "model": Pipeline([
            # Impute like every other pipeline in this grid: LogisticRegression
            # cannot be fitted on NaNs and StandardScaler passes them through.
            ("imputer", DFImputer(strategy="median")),
            ("scaler", StandardScaler()),
            ("clf", LogisticRegression(
                solver="lbfgs",
                max_iter=5000,
                # "saga" (searched below) is stochastic; pin the seed so the
                # search is reproducible, as the other estimators do.
                random_state=0
            ))
        ]),
        "params": {
            "clf__C": [0.001, 0.01, 0.1, 1, 10, 100],
            "clf__penalty": ["l2"],
            "clf__solver": ["lbfgs", "saga"]
        }
    },

    Models.RANDOM_FOREST: {
        "model": Pipeline([
            ("imputer", DFImputer(strategy="median")),
            ("clf", RandomForestClassifier(
                n_estimators=600,
                max_depth=None,
                min_samples_leaf=2,
                class_weight="balanced",
                random_state=0
            ))
        ]),
        "params": {
            "clf__n_estimators": [300, 600, 900],
            "clf__max_depth": [None, 10, 20],
            "clf__min_samples_leaf": [1, 2, 4]
        }
    },

    Models.SVM: {
        "model": Pipeline([
            ("imputer", DFImputer(strategy="median")),
            ("scaler", StandardScaler()),
            ("clf", SVC(
                kernel="rbf",
                C=2.0,
                gamma="scale",
                probability=True,
                class_weight="balanced",
                random_state=0
            ))
        ]),
        "params": {
            "clf__C": [0.5, 1.0, 2.0, 4.0],
            "clf__kernel": ["rbf", "linear"],
            # NOTE(review): gamma has no effect for kernel="linear", so the
            # linear candidates are evaluated once per gamma value.
            "clf__gamma": ["scale", "auto"]
        }
    },

    Models.MLPFFNN: {
        "model": Pipeline([
            ("imputer", DFImputer(strategy="median")),
            ("scaler", StandardScaler()),
            ("clf", MLPClassifier(
                hidden_layer_sizes=(64, 32),
                activation="relu",
                solver="adam",
                alpha=1e-3,
                learning_rate_init=1e-2,
                max_iter=5000,
                random_state=0
            ))
        ]),
        "params": {
            "clf__hidden_layer_sizes": [(64,32), (128,64), (128,64,32)],
            "clf__alpha": [1e-4, 1e-3, 1e-2],
            "clf__learning_rate_init": [1e-3, 1e-2],
            "clf__activation": ["relu", "tanh"]
        }
    },

    Models.XGBOOST: {
        "model": Pipeline([
            ("imputer", DFImputer(strategy="median")),
            ("clf", XGBClassifier(
                objective="multi:softprob",
                num_class=3,
                n_estimators=600,
                max_depth=4,
                learning_rate=0.05,
                subsample=0.9,
                colsample_bytree=0.9,
                reg_lambda=1.0,
                tree_method="hist",
                eval_metric="mlogloss",
                random_state=0
            ))
        ]),
        "params": {
            "clf__n_estimators": [300, 600],
            "clf__max_depth": [3, 4],
            "clf__learning_rate": [0.05],
            "clf__subsample": [0.9],
            "clf__colsample_bytree": [0.8, 0.9],
            "clf__reg_lambda": [0.5, 1.0]
        }
    }
}

# Running Grid Search Over Specific Models
Use this section if you just want to explore the best parameters for a single model type, since this is much quicker than comparing all models.

In [None]:
def individual_grid_search(model_type):
    """
    Run the hyperparameter search for a single model type.

    Looks up the pipeline and parameter grid registered under `model_type`
    in the notebook-level `param_grids` and forwards them, together with
    the notebook-level `X_train` / `y_train`, to `tune_model`.

    Parameters
    ----------
    model_type
        Key into `param_grids` (a `Models` member).

    Returns
    -------
    Whatever `tune_model` returns, so the caller can inspect or store the
    outcome instead of it being discarded.

    Raises
    ------
    KeyError
        If `param_grids` has no entry for `model_type`.
    """
    if model_type not in param_grids:
        # Same exception type as the bare lookup would raise, but with a
        # message that says what is missing and what is available.
        raise KeyError(
            f"No parameter grid defined for {model_type!r}; "
            f"available model types: {list(param_grids)}"
        )

    entry = param_grids[model_type]
    return tune_model(
        name=model_type,
        model=entry['model'],
        params=entry['params'],
        X_train=X_train,
        y_train=y_train
    )

In [None]:
# Tune only the logistic-regression pipeline.
individual_grid_search(model_type=Models.LOGISTIC_REGRESSION)

In [None]:
# Tune only the XGBoost pipeline.
individual_grid_search(model_type=Models.XGBOOST)

In [None]:
# Tune only the random-forest pipeline.
individual_grid_search(model_type=Models.RANDOM_FOREST)

In [None]:
# Voting
# `param_grids` has no Models.VOTING entry (the voting grid is kept in
# `voting_params`, defined in the final section), so an unconditional call
# raises KeyError and stops a Restart & Run All at this cell. Only run the
# search when an entry has actually been registered.
if Models.VOTING in param_grids:
    individual_grid_search(Models.VOTING)
else:
    print("Skipping voting search: param_grids has no Models.VOTING entry (see voting_params).")

# Running the Entire Grid Search Over All Models
This automates running the grid search for each of the models and sorting by accuracy, but takes a lot longer.

In [None]:
# Tune every pipeline registered in `param_grids` on the training split.
grid_searches = search_models(param_grids, X_train, y_train)

# One row per tuned model, best score first. `sort_values` returns a new frame
# rather than sorting in place, so its result has to be the one that is kept.
# Naming the columns explicitly keeps the sort key present even when
# `grid_searches` is empty.
summary_df = pd.DataFrame(
    [
        {
            'Model': grid_search['name'],
            'Best Accuracy': grid_search['best_score'],
            'Best Params': grid_search['best_params']
        }
        for grid_search in grid_searches
    ],
    columns=['Model', 'Best Accuracy', 'Best Params'],
).sort_values('Best Accuracy', ascending=False, ignore_index=True)

summary_df

# Voting
After tuning all the individual models, update the models in the source code and in the following `voting_params` object, then tune the voting classifier weights.

In [None]:
def random_weight_vectors(num=20, k=4, seed=0):
    """
    Generate <num> vectors of size <k> to tune weights in the
    voting model.

    Every weight is an integer drawn uniformly from 1..k (inclusive), so
    <k> sets both the vector length and the largest possible weight.

    Parameters
    ----------
    num : int
        Number of weight vectors to generate.
    k : int
        Length of each vector (one weight per voting estimator).
    seed : int or None
        Seed for the random generator. The default makes the candidate
        weights identical on every run; pass None for fresh randomness.

    Returns
    -------
    list[list[int]]
        <num> lists of <k> plain Python ints.
    """
    rng = np.random.default_rng(seed)
    # `integers` excludes the upper bound, hence k + 1 to allow a weight of k.
    weights = rng.integers(1, k + 1, size=(num, k))
    # tolist() converts numpy scalars to built-in ints (clean repr in the
    # reported best params).
    return weights.tolist()


# Search configuration for the soft-voting ensemble, in the same
# {"model": ..., "params": ...} layout used by `param_grids`.
# TODO: update with tuned hyperparameters
voting_params = {
    "model": Pipeline([
        # Imputation happens once here, before the data reaches any voter.
        # NOTE(review): the per-estimator imputers below look redundant given
        # this outer step — confirm before removing them.
        ("imputer", DFImputer(strategy="median")),
        ("clf", VotingClassifier(
            estimators=[
                ("lr", Pipeline([
                    ("scaler", StandardScaler()),
                    ("clf", LogisticRegression(
                        solver="lbfgs",
                        max_iter=5000,
                        C=1.0,
                        random_state=0
                    ))
                ])),
                ("rf", Pipeline([
                    ("imputer", DFImputer(strategy="median")),
                    ("clf", RandomForestClassifier(
                        n_estimators=500,
                        max_depth=None,
                        max_features="sqrt",
                        n_jobs=-1,
                        random_state=0
                    ))
                ])),
                ("hgb", Pipeline([
                    ("imputer", DFImputer(strategy="median")),
                    ("clf", HistGradientBoostingClassifier(
                        loss="log_loss",
                        learning_rate=0.06,
                        max_depth=6,
                        max_iter=600,
                        random_state=0
                    ))
                ])),
                ("xgb", Pipeline([
                    ("imputer", DFImputer(strategy="median")),
                    # The notebook imports the class directly
                    # (`from xgboost import XGBClassifier`); no `xgb` module
                    # alias exists.
                    ("clf", XGBClassifier(
                        objective="multi:softprob",
                        num_class=3,
                        n_estimators=500,
                        learning_rate=0.06,
                        max_depth=6,
                        subsample=0.9,
                        colsample_bytree=0.9,
                        reg_lambda=1.0,
                        tree_method="hist",
                        n_jobs=-1,
                        random_state=0,
                        eval_metric="mlogloss",
                        verbosity=0
                    ))
                ])),
            ],
            voting="soft",
            n_jobs=-1
        ))
    ]),
    "params": {
        # One candidate weight per estimator above (lr, rf, hgb, xgb); the
        # default vector length of random_weight_vectors() matches these four.
        "clf__weights": random_weight_vectors()
    }
}