# Model Tuning
This notebook will be used in order to tune the hyperparameters for each of the models we are evaluating. After we determine the optimal set of hyperparameters for each model, we will update each model's source code accordingly.

In [1]:
import os
import sys
from pathlib import Path

# Get current file path and change the working directory to the project root
NOTEBOOK_DIR = Path().resolve()
PROJECT_ROOT = NOTEBOOK_DIR.parent
os.chdir(PROJECT_ROOT)
sys.path.append(str(PROJECT_ROOT))

import time

from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB

from xgboost import XGBClassifier

from src.config import Models
from src.util.DFImputer import DFImputer
from src.util.notebook_utils import get_feature_matrix, weight_vectors

### Load the Data
Load the dataset, with the configuration of N_MATCHES=5, with the training set including the 7 seasons from 2015/2016 to 2021/2022 and the holdout set including the 3 seasons from 2022/2023 to 2024/2025.

In [2]:
X_train, y_train, X_test, y_test = get_feature_matrix()

### Parameter Grid to Search
Here you should define the parameter grid for each of the models, so the program can search the best hyperparameters for each model. Use this program to determine the optimal hyperparameters to use for a given dataset, and update the src file accordingly.

This section can be hidden by default since it's extremely long.

In [3]:
param_grids = {    
    Models.LOGISTIC_REGRESSION: {
        "model": Pipeline([
            ("scaler", StandardScaler()),
            ("clf", LogisticRegression(
                solver="lbfgs",
                max_iter=5000
            ))
        ]),
        "params": {
            "clf__C": [0.001, 0.01, 0.1, 1, 10, 100],
            "clf__penalty": ["l2"],
            "clf__solver": ["lbfgs", "saga"]
        }
    },

    Models.XGBOOST: {
        "model": Pipeline([
            ("imputer", DFImputer(strategy="median")),
            ("clf", XGBClassifier(
                objective="multi:softprob",
                num_class=3,
                n_estimators=600,
                max_depth=4,
                learning_rate=0.05,
                subsample=0.9,
                colsample_bytree=0.9,
                reg_lambda=1.0,
                tree_method="hist",
                eval_metric="mlogloss",
                random_state=0
            ))
        ]),
        "params": {
            "clf__n_estimators": [300, 600],
            "clf__max_depth": [3, 4],
            "clf__learning_rate": [0.05],
            "clf__subsample": [0.9],
            "clf__colsample_bytree": [0.8, 0.9],
            "clf__reg_lambda": [0.5, 1.0]
        }
    },

    Models.RANDOM_FOREST: {
        "model": Pipeline([
            ("imputer", DFImputer(strategy="median")),
            ("clf", RandomForestClassifier(
                n_estimators=600,
                max_depth=None,
                min_samples_leaf=2,
                class_weight="balanced",
                random_state=0
            ))
        ]),
        "params": {
            "clf__n_estimators": [300, 600, 900],
            "clf__max_depth": [None, 10, 20],
            "clf__min_samples_leaf": [1, 2, 4]
        }
    },

    Models.SVM: {
        "model": Pipeline([
            ("imputer", DFImputer(strategy="median")),
            ("scaler", StandardScaler()),
            ("clf", SVC(
                kernel="rbf",
                C=2.0,
                gamma="scale",
                probability=True,
                class_weight="balanced",
                random_state=0
            ))
        ]),
        "params": {
            "clf__C": [0.5, 1.0, 2.0, 4.0],
            "clf__kernel": ["rbf", "linear"],
            "clf__gamma": ["scale", "auto"]
        }
    },

    Models.MLPFFNN: {
        "model": Pipeline([
            ("imputer", DFImputer(strategy="median")),
            ("scaler", StandardScaler()),
            ("clf", MLPClassifier(
                hidden_layer_sizes=(64, 32),
                activation="relu",
                solver="adam",
                alpha=1e-3,
                learning_rate_init=1e-2,
                max_iter=5000,
                random_state=0
            ))
        ]),
        "params": {
            "clf__hidden_layer_sizes": [(64,32), (128,64), (128,64,32)],
            "clf__alpha": [1e-4, 1e-3, 1e-2],
            "clf__learning_rate_init": [1e-3, 1e-2],
            "clf__activation": ["relu", "tanh"]
        }
    },
    
    Models.NAIVE_BAYES: {
        "model": Pipeline([
            ("imputer", DFImputer(strategy="median")),
            ("clf", GaussianNB(var_smoothing=1e-9))
        ]),
        "params": {
            "clf__var_smoothing": [1e-9, 1e-8, 1e-7]
        }
    }
}

### Define the Model Tuning Function
This function will automate the grid search to tune a hyperparameter for a given model.

In [4]:
def tune_model(model_type: int, cv=5):
    """
    Goes through the parameter grid configuration for a given model and determines
    its best performing hyperparameters.
    
    Args:
        name: The name of the model.
        model: The model object corresponding to the model.
        params: The hyperparameter configuration to search.
        X_train: The features for the training data.
        y_train: The labels for the training data.
        cv: The number of cross-validation folds.
        
    Returns:
        Dictionary containing the model's best hyperparameters.
    """
    print(f'Tuning {model_type}...')
    
    start = time.time()
    grid_search = GridSearchCV(
        estimator=param_grids[model_type]['model'],
        param_grid=param_grids[model_type]['params'],
        cv=cv,
        scoring='accuracy',
        n_jobs=4,
        verbose=1,
        return_train_score=True
    )
    
    grid_search.fit(X_train, y_train)
    
    elapsed = time.time() - start
    print(f'Finished tuning {model_type} in {elapsed:.2f} seconds.')
    print(f'Best score = {grid_search.best_score_:.5f}.')
    print(f'Best params = {grid_search.best_params_}\n\n')

### Running Grid Search Over Specific Models
Each of the below cells will perform the grid search and print the best score and hyperparameters for each model.

In [5]:
# Logistic Regression
tune_model(Models.LOGISTIC_REGRESSION)

Tuning Models.LOGISTIC_REGRESSION...
Fitting 5 folds for each of 12 candidates, totalling 60 fits
Finished tuning Models.LOGISTIC_REGRESSION in 4.12 seconds.
Best score = 0.58681.
Best params = {'clf__C': 0.1, 'clf__penalty': 'l2', 'clf__solver': 'lbfgs'}




In [6]:
# XGBoost
tune_model(Models.XGBOOST)

Tuning Models.XGBOOST...
Fitting 5 folds for each of 16 candidates, totalling 80 fits
Finished tuning Models.XGBOOST in 16.74 seconds.
Best score = 0.57225.
Best params = {'clf__colsample_bytree': 0.8, 'clf__learning_rate': 0.05, 'clf__max_depth': 3, 'clf__n_estimators': 300, 'clf__reg_lambda': 0.5, 'clf__subsample': 0.9}




In [7]:
# Random Forest
tune_model(Models.RANDOM_FOREST)

Tuning Models.RANDOM_FOREST...
Fitting 5 folds for each of 27 candidates, totalling 135 fits
Finished tuning Models.RANDOM_FOREST in 113.29 seconds.
Best score = 0.57072.
Best params = {'clf__max_depth': None, 'clf__min_samples_leaf': 1, 'clf__n_estimators': 900}




In [8]:
# Support Vector Machine
tune_model(Models.SVM)

Tuning Models.SVM...
Fitting 5 folds for each of 16 candidates, totalling 80 fits
Finished tuning Models.SVM in 26.40 seconds.
Best score = 0.52587.
Best params = {'clf__C': 0.5, 'clf__gamma': 'scale', 'clf__kernel': 'linear'}




In [9]:
# MLP Feedforward Neural Network
tune_model(Models.MLPFFNN)

Tuning Models.MLPFFNN...
Fitting 5 folds for each of 36 candidates, totalling 180 fits
Finished tuning Models.MLPFFNN in 228.93 seconds.
Best score = 0.50939.
Best params = {'clf__activation': 'tanh', 'clf__alpha': 0.01, 'clf__hidden_layer_sizes': (128, 64, 32), 'clf__learning_rate_init': 0.01}




In [10]:
# Naive Bayes
tune_model(Models.NAIVE_BAYES)

Tuning Models.NAIVE_BAYES...
Fitting 5 folds for each of 3 candidates, totalling 15 fits
Finished tuning Models.NAIVE_BAYES in 0.13 seconds.
Best score = 0.47145.
Best params = {'clf__var_smoothing': 1e-09}




### Tune the Voting Ensemble Weights
After tuning all the individual models, update the models in the source code and in the Models.VOTING param_grids object, and tune the voting ensemble weights.

In [11]:
param_grids[Models.VOTING] = {
    "model": Pipeline([
        ("imputer", DFImputer(strategy="median")),
        ("clf", VotingClassifier(
            estimators=[
                ("lr", Pipeline([
                        ("scaler", StandardScaler()),
                        ("clf", LogisticRegression(
                            solver='lbfgs',
                            penalty='l2',
                            C=0.1,
                            max_iter=5000,
                            random_state=0
                        ))
                    ])),
                ("xgb", Pipeline([
                        ("clf", XGBClassifier(
                            objective="multi:softprob",
                            num_class=3,
                            n_estimators=300,
                            max_depth=3,
                            learning_rate=0.05,
                            subsample=0.9,
                            colsample_bytree=0.8,
                            reg_lambda=1.0,
                            tree_method="hist",
                            eval_metric="mlogloss",
                            random_state=0,
                            n_jobs=-1
                        ))
                    ])),
                ("rf", Pipeline([
                        ("clf", RandomForestClassifier(
                            n_estimators=900,
                            max_depth=None,
                            min_samples_leaf=1,
                            class_weight="balanced",
                            random_state=0
                        ))
                    ]))
            ],
            voting="soft",
            n_jobs=1
        ))
    ]),
    "params": {"clf__weights": weight_vectors()}
}

In [12]:
tune_model(Models.VOTING)

Tuning Models.VOTING...
Fitting 5 folds for each of 27 candidates, totalling 135 fits
Finished tuning Models.VOTING in 202.57 seconds.
Best score = 0.58605.
Best params = {'clf__weights': [3, 1, 1]}


