In [None]:
%load_ext lab_black

In [None]:
%cd /mnt/batch/tasks/shared/LS_root/mounts/clusters/matthewhanauer991/code/Users/matthewhanauer99/iu_basketball

In [None]:
from sklearn.model_selection import RepeatedKFold
from sklearn.metrics import r2_score
from sklearn.metrics import median_absolute_error
import pandas as pd
import os
from sklearn.metrics import balanced_accuracy_score, f1_score
from pyprojroot import here
import numpy as np
from skimpy import clean_columns
from data_cleaning.fun_hot_encode_limit import fun_hot_encode_limit
import optuna
from pmdarima.model_selection import train_test_split
from sklearn.model_selection import TimeSeriesSplit, cross_val_score
import joblib
import sklearn
import lightgbm as lgb

path_data = here("./data")
os.chdir(path_data)
data_iu_analysis = pd.read_parquet("data_iu_analysis.parquet")

In [None]:
print(joblib.__version__)
print(sklearn.__version__)
print(lgb.__version__)

In [None]:
train_size = round(data_iu_analysis.shape[0] * 0.8)
train_size

In [None]:
train, test = train_test_split(data_iu_analysis, test_size=train_size)

In [None]:
x_train = train.drop(columns="tournament_make")
x_test = test.drop(columns="tournament_make")
y_train = train["tournament_make"]
y_test = test["tournament_make"]

In [51]:
# Define objective function for Optuna optimization
def objective(trial):
    params = {
        "objective": "regression",
        "metric": "rmse",
        "boosting_type": trial.suggest_categorical(
            "boosting_type", ["gbdt", "dart", "goss"]
        ),
        "verbosity": -1,
        "lambda_l1": trial.suggest_loguniform("lambda_l1", 1e-8, 1.0),
        "lambda_l2": trial.suggest_loguniform("lambda_l2", 1e-8, 1.0),
        "max_depth": trial.suggest_int("max_depth", 3, 9),
        "learning_rate": trial.suggest_loguniform("learning_rate", 1e-8, 1.0),
        "min_gain_to_split": trial.suggest_loguniform("min_gain_to_split", 1e-8, 1.0),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 300),
        "subsample": trial.suggest_discrete_uniform("subsample", 0.1, 1, 0.05),
        "colsample_bytree": trial.suggest_discrete_uniform(
            "colsample_bytree", 0.1, 1, 0.05
        ),
        "feature_fraction": trial.suggest_discrete_uniform(
            "feature_fraction", 0.1, 1, 0.05
        ),
        "bagging_fraction": trial.suggest_discrete_uniform(
            "bagging_fraction", 0.1, 1, 0.05
        ),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 10),
        "n_estimators": trial.suggest_int("n_estimators", 50, 500),
        "num_leaves": trial.suggest_int("num_leaves", 10, 300),
        "max_bin": trial.suggest_int("max_bin", 100, 1000),
        "extra_trees": trial.suggest_categorical("extra_trees", [True, False]),
    }

    # Set up TimeSeriesSplit cross-validation
    tscv = TimeSeriesSplit(n_splits=5)

    # Train LightGBM model with the given hyperparameters using cross-validation
    model = lgb.LGBMRegressor(**params)
    scores = cross_val_score(
        model, x_train, y_train, cv=tscv, scoring="neg_root_mean_squared_error"
    )
    rmse = -1.0 * scores.mean()

    return rmse


# Run hyperparameter optimization using Optuna
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=100)

[32m[I 2023-04-14 14:26:58,305][0m A new study created in memory with name: no-name-61236700-eb57-44ea-ad7d-37797178a181[0m
  "lambda_l1": trial.suggest_loguniform("lambda_l1", 1e-8, 1.0),
  "lambda_l2": trial.suggest_loguniform("lambda_l2", 1e-8, 1.0),
  "learning_rate": trial.suggest_loguniform("learning_rate", 1e-8, 1.0),
  "min_gain_to_split": trial.suggest_loguniform("min_gain_to_split", 1e-8, 1.0),
  "subsample": trial.suggest_discrete_uniform("subsample", 0.1, 1, 0.05),
  "colsample_bytree": trial.suggest_discrete_uniform(
  "feature_fraction": trial.suggest_discrete_uniform(
  "bagging_fraction": trial.suggest_discrete_uniform(
[32m[I 2023-04-14 14:26:58,351][0m Trial 0 finished with value: 0.5161925984721047 and parameters: {'boosting_type': 'gbdt', 'lambda_l1': 8.309005215505601e-05, 'lambda_l2': 3.86400612512729e-05, 'max_depth': 7, 'learning_rate': 0.6957066932659153, 'min_gain_to_split': 0.3856594878516562, 'min_child_weight': 241, 'subsample': 0.15000000000000002, 'c



[32m[I 2023-04-14 14:26:58,539][0m Trial 4 finished with value: 0.5161925984721047 and parameters: {'boosting_type': 'gbdt', 'lambda_l1': 0.007978471064140935, 'lambda_l2': 0.0002687610153824241, 'max_depth': 9, 'learning_rate': 0.12962207179381247, 'min_gain_to_split': 0.0004898034579376329, 'min_child_weight': 122, 'subsample': 0.15000000000000002, 'colsample_bytree': 0.65, 'feature_fraction': 0.25, 'bagging_fraction': 0.45000000000000007, 'bagging_freq': 1, 'n_estimators': 454, 'num_leaves': 116, 'max_bin': 874, 'extra_trees': False}. Best is trial 0 with value: 0.5161925984721047.[0m
  "lambda_l1": trial.suggest_loguniform("lambda_l1", 1e-8, 1.0),
  "lambda_l2": trial.suggest_loguniform("lambda_l2", 1e-8, 1.0),
  "learning_rate": trial.suggest_loguniform("learning_rate", 1e-8, 1.0),
  "min_gain_to_split": trial.suggest_loguniform("min_gain_to_split", 1e-8, 1.0),
  "subsample": trial.suggest_discrete_uniform("subsample", 0.1, 1, 0.05),
  "colsample_bytree": trial.suggest_discrete



  "lambda_l1": trial.suggest_loguniform("lambda_l1", 1e-8, 1.0),
  "lambda_l2": trial.suggest_loguniform("lambda_l2", 1e-8, 1.0),
  "learning_rate": trial.suggest_loguniform("learning_rate", 1e-8, 1.0),
  "min_gain_to_split": trial.suggest_loguniform("min_gain_to_split", 1e-8, 1.0),
  "subsample": trial.suggest_discrete_uniform("subsample", 0.1, 1, 0.05),
  "colsample_bytree": trial.suggest_discrete_uniform(
  "feature_fraction": trial.suggest_discrete_uniform(
  "bagging_fraction": trial.suggest_discrete_uniform(
[LightGBM] [Fatal] Cannot use bagging in GOSS
[LightGBM] [Fatal] Cannot use bagging in GOSS
[LightGBM] [Fatal] Cannot use bagging in GOSS
[LightGBM] [Fatal] Cannot use bagging in GOSS
[LightGBM] [Fatal] Cannot use bagging in GOSS
5 fits failed out of a total of 5.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failu



  "lambda_l1": trial.suggest_loguniform("lambda_l1", 1e-8, 1.0),
  "lambda_l2": trial.suggest_loguniform("lambda_l2", 1e-8, 1.0),
  "learning_rate": trial.suggest_loguniform("learning_rate", 1e-8, 1.0),
  "min_gain_to_split": trial.suggest_loguniform("min_gain_to_split", 1e-8, 1.0),
  "subsample": trial.suggest_discrete_uniform("subsample", 0.1, 1, 0.05),
  "colsample_bytree": trial.suggest_discrete_uniform(
  "feature_fraction": trial.suggest_discrete_uniform(
  "bagging_fraction": trial.suggest_discrete_uniform(
[LightGBM] [Fatal] Check failed: (num_data) > (0) at /__w/1/s/python-package/compile/src/io/dataset.cpp, line 33 .

[LightGBM] [Fatal] Check failed: (num_data) > (0) at /__w/1/s/python-package/compile/src/io/dataset.cpp, line 33 .

[LightGBM] [Fatal] Check failed: (num_data) > (0) at /__w/1/s/python-package/compile/src/io/dataset.cpp, line 33 .

[LightGBM] [Fatal] Cannot use bagging in GOSS
[LightGBM] [Fatal] Cannot use bagging in GOSS
5 fits failed out of a total of 5.
The s

In [52]:
# Train final XGBoost model using the best hyperparameters found by Optuna
best_params = study.best_params
best_model = lgb.LGBMRegressor(**best_params)
best_model.fit(x_train, y_train)

# Return the best model
best_model

LGBMRegressor(bagging_fraction=0.35, bagging_freq=6, extra_trees=True,
              feature_fraction=1.0, lambda_l1=8.309005215505601e-05,
              lambda_l2=3.86400612512729e-05, learning_rate=0.6957066932659153,
              max_bin=826, max_depth=7, min_child_weight=241,
              min_gain_to_split=0.3856594878516562, n_estimators=234,
              num_leaves=89, subsample=0.15000000000000002)

In [53]:
path_outputs = here("./outputs")
os.chdir(path_outputs)
joblib.dump(best_model, "model_iu_bball.jlib")

['model_iu_bball.jlib']

In [54]:
model_iu_bball = joblib.load("model_iu_bball.jlib")

In [55]:
prob = model_iu_bball.predict(x_test)

In [56]:
from sklearn.metrics import f1_score


def find_best_threshold(y_true, y_pred_prob):
    """
    Find the best threshold to use for converting predicted probabilities to binary predictions,
    using the F1 score as the metric to optimize for.

    :param y_true: true labels, array-like of shape (n_samples,)
    :param y_pred_prob: predicted probabilities, array-like of shape (n_samples,)
    :return: best threshold value
    """
    thresholds = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
    best_f1 = 0
    best_threshold = 0
    for threshold in thresholds:
        y_pred = (y_pred_prob >= threshold).astype(int)
        f1 = f1_score(y_true, y_pred)
        if f1 > best_f1:
            best_f1 = f1
            best_threshold = threshold
    return best_threshold

In [57]:
threshold = find_best_threshold(y_true=y_test, y_pred_prob=prob)
threshold

0.1

In [58]:
prob_threshold = np.where(prob > threshold, 1, 0)

In [59]:
balanced_accuracy = balanced_accuracy_score(y_true=y_test, y_pred=prob_threshold)
balanced_accuracy

0.5

In [60]:
f1_accuracy = f1_score(y_true=y_test, y_pred=prob_threshold)
f1_accuracy

0.7311827956989247