In [177]:
import os
import joblib
import pickle
import numpy as np
import pandas as pd
import optuna
import json
import xgboost as xgb

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

In [83]:
STUDY = os.curdir + '/study'
MODEL = os.curdir + '/model'
TRIALS = os.curdir + '/trials'

In [176]:
X = pd.read_csv(os.curdir + "/data/train/X_train.csv")
y = pd.read_csv(os.curdir + "/data/train/y_train.csv")["target"].to_numpy()

train_X, test_X, train_y, test_y = train_test_split(
    X, y, test_size=0.5, shuffle=True, random_state=42
)
valid_X, test_X, valid_y, test_y = train_test_split(
    test_X, test_y, test_size=0.6, shuffle=True, random_state=42
)

441609     0
351433     0
613687     0
455880     0
993540     0
          ..
259178     0
1414414    0
131932     0
671155     0
121958     0
Name: target, Length: 763329, dtype: int64


In [184]:
def get_auc_score(model, train_X, train_y, valid_X, valid_y, test_X, test_y): 
    y_pred = None
    for data, label in [(train_X, train_y), (valid_X, valid_y), (test_X, test_y)]:
        if isinstance(model, xgb.core.Booster): 
            dataset = xgb.DMatrix(data, label=label)
            y_pred = model.inplace_predict(dataset, iteration_range=(0,600), strict_shape=1)
        print(y_pred)
        print(f'The AUC score on set is: {roc_auc_score(np.reshape(label(-1,)), y_pred)}')
        

## XGBoost

In [185]:
# 100 Iterations
xgboost = joblib.load(MODEL + "/xgboost_model_20240418-222300_100.pkl")
study = joblib.load(STUDY + "/xgboost_study_20240418-222300_100.pkl")
trials = pd.read_csv(TRIALS + "/xgboost_trials_20240418-222300_100.csv")





In [186]:
get_auc_score(xgboost, train_X, train_y, valid_X, valid_y, test_X, test_y)

TypeError: `shape` attribute is required when `validate_features` is True.

In [39]:
study.best_params

{'learning_rate': 0.12173935390067935,
 'lambda': 5.0768340133873275,
 'alpha': 0.0005354999147544236,
 'max_depth': 6,
 'gamma': 1.6805456405924428e-07}

In [32]:
fig = optuna.visualization.plot_contour(study)
fig.show()
fig = optuna.visualization.plot_param_importances(study)
fig.show()
fig = optuna.visualization.plot_optimization_history(study)
fig.show()
fig = optuna.visualization.plot_edf(study)
fig.show()
fig = optuna.visualization.plot_rank(study)
fig.show()
fig = optuna.visualization.plot_intermediate_values(study)
fig.show()


plot_rank is experimental (supported from v3.2.0). The interface can change in the future.



In [33]:
# 500 iterations
xgboost = pickle.load(open(MODEL + "/xgboost_model_20240418-234815_500.pkl", 'rb'))
study = joblib.load(STUDY + "/xgboost_study_20240418-234815_500.pkl")
trials = pd.read_csv(TRIALS + "/xgboost_trials_20240418-234815_500.csv")







In [34]:
fig = optuna.visualization.plot_contour(study)
fig.show()
fig = optuna.visualization.plot_param_importances(study)
fig.show()
fig = optuna.visualization.plot_optimization_history(study)
fig.show()
fig = optuna.visualization.plot_edf(study)
fig.show()
fig = optuna.visualization.plot_rank(study)
fig.show()
fig = optuna.visualization.plot_intermediate_values(study)
fig.show()


plot_rank is experimental (supported from v3.2.0). The interface can change in the future.



## LightGBM

In [None]:
# 100 Iterations
lightgbm = pickle.load(open(MODEL + "/lightgbm_model_20240418-1345825_100.pkl", 'rb'))
study = joblib.load(STUDY + "/lightgbm_study_20240418-1345825_100.pkl")
trials = pd.read_csv(TRIALS + "/lightgbm_trials_20240418-1345825_100.csv")

In [35]:
fig = optuna.visualization.plot_contour(study)
fig.show()
fig = optuna.visualization.plot_param_importances(study)
fig.show()
fig = optuna.visualization.plot_optimization_history(study)
fig.show()
fig = optuna.visualization.plot_edf(study)
fig.show()
fig = optuna.visualization.plot_rank(study)
fig.show()
fig = optuna.visualization.plot_intermediate_values(study)
fig.show()


plot_rank is experimental (supported from v3.2.0). The interface can change in the future.

