In [41]:
%load_ext lab_black

The lab_black extension is already loaded. To reload it, use:
  %reload_ext lab_black


In [42]:
from mlxtend.regressor import StackingCVRegressor
from skopt import BayesSearchCV
from sklearn.model_selection import RepeatedKFold
import xgboost as xgb
import lightgbm as lgb
from sklearn.ensemble import HistGradientBoostingRegressor

from sklearn.metrics import r2_score
from sklearn.metrics import median_absolute_error
import pandas as pd
import os
from pyprojroot import here
import numpy as np
from skimpy import clean_columns
from data_cleaning.fun_hot_encode_limit import fun_hot_encode_limit
from pmdarima.model_selection import train_test_split
import optuna
import pandas as pd
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

from sklearn.model_selection import TimeSeriesSplit, cross_val_score
import joblib
path_data = here("./data")
os.chdir(path_data)
data_iu_analysis = pd.read_parquet("data_iu_analysis.parquet")
data_iu_analysis = data_iu_analysis.drop(columns="season")

In [43]:
train_size = round(data_iu_analysis.shape[0] * 0.8)
train_size

59

In [44]:
train, test = train_test_split(data_iu_analysis, test_size=train_size)

In [45]:
x_train = train.drop(columns="tournament_make")
x_test = test.drop(columns="tournament_make")
y_train = train["tournament_make"]
y_test = test["tournament_make"]

In [48]:
# Define objective function for Optuna optimization
def objective(trial):
    params = {
        "objective": "reg:squarederror",
        "eval_metric": "rmse",
        "booster": "gbtree",
        "verbosity": 0,
        "tree_method": "auto",
        "lambda": trial.suggest_loguniform("lambda", 1e-8, 1.0),
        "alpha": trial.suggest_loguniform("alpha", 1e-8, 1.0),
        "max_depth": trial.suggest_int("max_depth", 3, 9),
        "eta": trial.suggest_loguniform("eta", 1e-8, 1.0),
        "gamma": trial.suggest_loguniform("gamma", 1e-8, 1.0),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 300),
        "subsample": trial.suggest_discrete_uniform("subsample", 0.1, 1, 0.05),
        "colsample_bytree": trial.suggest_discrete_uniform(
            "colsample_bytree", 0.1, 1, 0.05
        ),
    }

    # Set up TimeSeriesSplit cross-validation
    tscv = TimeSeriesSplit(n_splits=5)

    # Train XGBoost model with the given hyperparameters using cross-validation
    model = xgb.XGBRegressor(**params)
    scores = cross_val_score(
        model, x_train, y_train, cv=tscv, scoring="neg_root_mean_squared_error"
    )
    rmse = -1.0 * scores.mean()

    return rmse


# Run hyperparameter optimization using Optuna
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=1000)

[32m[I 2023-04-10 20:06:41,766][0m A new study created in memory with name: no-name-cf0c7033-7581-4ea5-b3ac-a2e0179fb9ad[0m
  "lambda": trial.suggest_loguniform("lambda", 1e-8, 1.0),
  "alpha": trial.suggest_loguniform("alpha", 1e-8, 1.0),
  "eta": trial.suggest_loguniform("eta", 1e-8, 1.0),
  "gamma": trial.suggest_loguniform("gamma", 1e-8, 1.0),
  "subsample": trial.suggest_discrete_uniform("subsample", 0.1, 1, 0.05),
  "colsample_bytree": trial.suggest_discrete_uniform(
[32m[I 2023-04-10 20:06:41,864][0m Trial 0 finished with value: 0.5 and parameters: {'lambda': 0.00011232055138057229, 'alpha': 4.065852044589215e-06, 'max_depth': 6, 'eta': 1.730568859834321e-05, 'gamma': 2.7300869679521182e-08, 'min_child_weight': 288, 'subsample': 0.45000000000000007, 'colsample_bytree': 0.7000000000000001}. Best is trial 0 with value: 0.5.[0m
  "lambda": trial.suggest_loguniform("lambda", 1e-8, 1.0),
  "alpha": trial.suggest_loguniform("alpha", 1e-8, 1.0),
  "eta": trial.suggest_loguniform(

In [50]:
# Train final XGBoost model using the best hyperparameters found by Optuna
best_params = study.best_params
best_model = xgb.XGBRegressor(**best_params)
best_model.fit(x_train, y_train)

# Return the best model
best_model

In [51]:
path_outputs = here("./outputs")
joblib.dump(best_model, "model_iu_bball.jlib")

['model_iu_bball.jlib']

In [53]:
prob = best_model.predict(x_test)

In [56]:
from sklearn.metrics import f1_score


def find_best_threshold(y_true, y_pred_prob):
    """
    Find the best threshold to use for converting predicted probabilities to binary predictions,
    using the F1 score as the metric to optimize for.

    :param y_true: true labels, array-like of shape (n_samples,)
    :param y_pred_prob: predicted probabilities, array-like of shape (n_samples,)
    :return: best threshold value
    """
    thresholds = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
    best_f1 = 0
    best_threshold = 0
    for threshold in thresholds:
        y_pred = (y_pred_prob >= threshold).astype(int)
        f1 = f1_score(y_true, y_pred)
        if f1 > best_f1:
            best_f1 = f1
            best_threshold = threshold
    return best_threshold

In [58]:
threshold = find_best_threshold(y_true=y_test, y_pred_prob=prob)
threshold

0.1

In [60]:
prob_threshold = np.where(prob > threshold, 1, 0)

array([1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [64]:
from sklearn.metrics import balanced_accuracy_score, f1_score

In [65]:
balanced_accuracy = balanced_accuracy_score(y_true=y_test, y_pred=prob_threshold)
balanced_accuracy

0.8823529411764706

In [66]:
f1_accuracy = f1_score(y_true=y_test, y_pred=prob_threshold)
f1_accuracy

0.8666666666666666