In [7]:
%load_ext lab_black

The lab_black extension is already loaded. To reload it, use:
  %reload_ext lab_black


In [8]:
from sklearn.model_selection import RepeatedKFold
from sklearn.metrics import r2_score
from sklearn.metrics import median_absolute_error
import pandas as pd
import os
from sklearn.metrics import balanced_accuracy_score, f1_score
from pyprojroot import here
import numpy as np
from skimpy import clean_columns
from data_cleaning.fun_hot_encode_limit import fun_hot_encode_limit
import optuna
from pmdarima.model_selection import train_test_split
from sklearn.model_selection import TimeSeriesSplit, cross_val_score
import joblib
import sklearn
import lightgbm as lgb

path_data = here("./data")
os.chdir(path_data)
data_iu_analysis = pd.read_parquet("data_iu_analysis.parquet")

In [9]:
print(joblib.__version__)
print(sklearn.__version__)
print(lgb.__version__)

1.2.0
1.0.2
3.2.1


In [10]:
train_size = round(data_iu_analysis.shape[0] * 0.8)
train_size

59

In [11]:
train, test = train_test_split(data_iu_analysis, test_size=train_size)

In [12]:
x_train = train.drop(columns="tournament_make")
x_test = test.drop(columns="tournament_make")
y_train = train["tournament_make"]
y_test = test["tournament_make"]

In [13]:
# Define objective function for Optuna optimization
def objective(trial):
    params = {
        "objective": "regression",
        "metric": "rmse",
        "boosting_type": trial.suggest_categorical(
            "boosting_type", ["gbdt", "dart", "goss"]
        ),
        "verbosity": -1,
        "lambda_l1": trial.suggest_loguniform("lambda_l1", 1e-8, 1.0),
        "lambda_l2": trial.suggest_loguniform("lambda_l2", 1e-8, 1.0),
        "max_depth": trial.suggest_int("max_depth", 3, 9),
        "learning_rate": trial.suggest_loguniform("learning_rate", 1e-8, 1.0),
        "min_gain_to_split": trial.suggest_loguniform("min_gain_to_split", 1e-8, 1.0),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 300),
        "subsample": trial.suggest_discrete_uniform("subsample", 0.1, 1, 0.05),
        "colsample_bytree": trial.suggest_discrete_uniform(
            "colsample_bytree", 0.1, 1, 0.05
        ),
        "feature_fraction": trial.suggest_discrete_uniform(
            "feature_fraction", 0.1, 1, 0.05
        ),
        "bagging_fraction": trial.suggest_discrete_uniform(
            "bagging_fraction", 0.1, 1, 0.05
        ),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 10),
        "n_estimators": trial.suggest_int("n_estimators", 50, 500),
        "num_leaves": trial.suggest_int("num_leaves", 10, 300),
        "max_bin": trial.suggest_int("max_bin", 100, 1000),
        "extra_trees": trial.suggest_categorical("extra_trees", [True, False]),
    }

    # Set up TimeSeriesSplit cross-validation
    tscv = TimeSeriesSplit(n_splits=5)

    # Train LightGBM model with the given hyperparameters using cross-validation
    model = lgb.LGBMRegressor(**params)
    scores = cross_val_score(
        model, x_train, y_train, cv=tscv, scoring="neg_root_mean_squared_error"
    )
    rmse = -1.0 * scores.mean()

    return rmse


# Run hyperparameter optimization using Optuna
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=100)

[32m[I 2023-04-14 15:27:42,624][0m A new study created in memory with name: no-name-3841546b-0c2a-43ba-a8b7-ce553ac50d3a[0m
  "lambda_l1": trial.suggest_loguniform("lambda_l1", 1e-8, 1.0),
  "lambda_l2": trial.suggest_loguniform("lambda_l2", 1e-8, 1.0),
  "learning_rate": trial.suggest_loguniform("learning_rate", 1e-8, 1.0),
  "min_gain_to_split": trial.suggest_loguniform("min_gain_to_split", 1e-8, 1.0),
  "subsample": trial.suggest_discrete_uniform("subsample", 0.1, 1, 0.05),
  "colsample_bytree": trial.suggest_discrete_uniform(
  "feature_fraction": trial.suggest_discrete_uniform(
  "bagging_fraction": trial.suggest_discrete_uniform(
[LightGBM] [Fatal] Check failed: (num_data) > (0) at /__w/1/s/python-package/compile/src/io/dataset.cpp, line 33 .

[LightGBM] [Fatal] Cannot use bagging in GOSS
[LightGBM] [Fatal] Cannot use bagging in GOSS
[LightGBM] [Fatal] Cannot use bagging in GOSS
[LightGBM] [Fatal] Cannot use bagging in GOSS
5 fits failed out of a total of 5.
The score on these



In [14]:
# Train final XGBoost model using the best hyperparameters found by Optuna
best_params = study.best_params
best_model = lgb.LGBMRegressor(**best_params)
best_model.fit(x_train, y_train)

# Return the best model
best_model

LGBMRegressor(bagging_fraction=0.65, bagging_freq=9, boosting_type='dart',
              colsample_bytree=0.6, extra_trees=True, feature_fraction=0.5,
              lambda_l1=0.0011522929173528225, lambda_l2=0.038427594585116506,
              learning_rate=0.11641217126390932, max_bin=162, max_depth=8,
              min_child_weight=14, min_gain_to_split=0.0007630314304458636,
              n_estimators=148, num_leaves=131, subsample=0.9500000000000001)

In [15]:
path_outputs = here("./outputs")
os.chdir(path_outputs)
joblib.dump(best_model, "model_iu_bball_lgb.jlib")

['model_iu_bball_lgb.jlib']

In [16]:
model_iu_bball_lgb = joblib.load("model_iu_bball_lgb.jlib")

In [17]:
model_iu_bball_lgb

LGBMRegressor(bagging_fraction=0.65, bagging_freq=9, boosting_type='dart',
              colsample_bytree=0.6, extra_trees=True, feature_fraction=0.5,
              lambda_l1=0.0011522929173528225, lambda_l2=0.038427594585116506,
              learning_rate=0.11641217126390932, max_bin=162, max_depth=8,
              min_child_weight=14, min_gain_to_split=0.0007630314304458636,
              n_estimators=148, num_leaves=131, subsample=0.9500000000000001)

In [18]:
prob = model_iu_bball_lgb.predict(x_test)

In [19]:
from sklearn.metrics import f1_score


def find_best_threshold(y_true, y_pred_prob):
    """
    Find the best threshold to use for converting predicted probabilities to binary predictions,
    using the F1 score as the metric to optimize for.

    :param y_true: true labels, array-like of shape (n_samples,)
    :param y_pred_prob: predicted probabilities, array-like of shape (n_samples,)
    :return: best threshold value
    """
    thresholds = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
    best_f1 = 0
    best_threshold = 0
    for threshold in thresholds:
        y_pred = (y_pred_prob >= threshold).astype(int)
        f1 = f1_score(y_true, y_pred)
        if f1 > best_f1:
            best_f1 = f1
            best_threshold = threshold
    return best_threshold

In [20]:
threshold = find_best_threshold(y_true=y_test, y_pred_prob=prob)
threshold

0.1

In [21]:
prob_threshold = np.where(prob > threshold, 1, 0)

In [22]:
balanced_accuracy = balanced_accuracy_score(y_true=y_test, y_pred=prob_threshold)
balanced_accuracy

0.5

In [23]:
f1_accuracy = f1_score(y_true=y_test, y_pred=prob_threshold)
f1_accuracy

0.7311827956989247