In [1]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
import xgboost as xgb

possible_n_vals = [9]
possible_e_vals = [1]


def run_model(n, e):

    X = np.load('Datasets/kryptonite-%s-X.npy'%(n))
    y = np.load('Datasets/kryptonite-%s-y.npy'%(n))

    # Shuffle and split the data
    X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.6, random_state=42)  # 60% training
    X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)  # 20% validation, 20% test


    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled = scaler.transform(X_val)
    x_test_scaled = scaler.transform(X_test)

    # print(np.mean(X_train_scaled, axis=0))
    # print(np.std(X_train_scaled, axis=0))

    # feature_creator = PCA(n_components=min(e, X_train.shape[1]))
    # X_train_features = feature_creator.fit_transform(X_train_scaled)
    # X_val_features = feature_creator.transform(X_val_scaled)
    # X_test_features = feature_creator.transform(x_test_scaled)
    # print(X_train_features.shape)
    
    # print("Created features")




    features = X_train_scaled.shape[-1]

    print(f'n:[{n}], e:[{e}]')

    # Initialize and fit
    grid = {'base_score':0.5, 'colsample_bylevel':1, 'colsample_bytree':1,
       'gamma':0, 'learning_rate':0.1, 'max_delta_step':0, 'max_depth':5,
       'min_child_weight':1, 'missing':0, 'n_estimators':1000, 'nthread':-1,
       'objective':'binary:logistic', 'reg_alpha':0, 'reg_lambda':1,
       'scale_pos_weight':1, 'seed':0, 'silent':True, 'subsample':1}

    grid = {'n_estimators':500, 'n_jobs':-1, 'booster': 'gbtree'}

    classifier = xgb.XGBClassifier(n_estimators=200, max_depth=20, learning_rate=0.1, subsample=0.8, colsample_bytree=0.8, n_jobs=-1)

    print(classifier)
    classifier.fit(X_train_scaled, y_train)
    print("Fit Model")

    # Evaluate on the validation set
    y_val_pred = classifier.predict(X_val_scaled)
    val_accuracy = accuracy_score(y_val, y_val_pred)
    print(f"Validation Accuracy: {val_accuracy:.4f}")

    # Evaluate on the test set
    y_test_pred = classifier.predict(x_test_scaled)
    test_accuracy = accuracy_score(y_test, y_test_pred)
    print(f"Test Accuracy: {test_accuracy:.4f}")
    return test_accuracy, features
        


from tqdm import tqdm

acc_by_n = []
feat_by_n = []
for n in tqdm(possible_n_vals):
    single_n = []
    single_feat = []
    for e in tqdm(possible_e_vals):
        acc, feat = run_model(n, e)
        single_n.append(acc)
        single_feat.append(feat)
    acc_by_n.append(single_n)
    feat_by_n.append(single_feat)

print(acc_by_n)

  0%|          | 0/1 [00:00<?, ?it/s]

n:[9], e:[1]
XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=0.8, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.1, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=20, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=200, n_jobs=-1,
              num_parallel_tree=None, random_state=None, ...)


100%|██████████| 1/1 [00:01<00:00,  1.74s/it]
100%|██████████| 1/1 [00:01<00:00,  1.75s/it]

Fit Model
Validation Accuracy: 0.4996
Test Accuracy: 0.5117
[[0.5116666666666667]]





In [2]:
# Import necessary libraries
from xgboost import XGBClassifier
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Load a sample dataset (Iris dataset in this case)
data = load_iris()
X = data.data
y = data.target

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the XGBClassifier
model = XGBClassifier(use_label_encoder=False, eval_metric="mlogloss")

# Train the model on the training data
model.fit(X_train, y_train)

# Make predictions on the test data
y_pred = model.predict(X_test)

# Evaluate the model's accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")

Accuracy: 100.00%


Parameters: { "use_label_encoder" } are not used.



In [3]:
"""
Optuna example that optimizes a classifier configuration for cancer dataset
using XGBoost.

In this example, we optimize the validation accuracy of cancer detection
using XGBoost. We optimize both the choice of booster model and its
hyperparameters.

"""

import numpy as np
import optuna

import sklearn.datasets
import sklearn.metrics
from sklearn.model_selection import train_test_split
import xgboost as xgb


def objective(trial):
    (data, target) = sklearn.datasets.load_breast_cancer(return_X_y=True)
    train_x, valid_x, train_y, valid_y = train_test_split(data, target, test_size=0.25)
    dtrain = xgb.DMatrix(train_x, label=train_y)
    dvalid = xgb.DMatrix(valid_x, label=valid_y)

    param = {
        "verbosity": 0,
        "objective": "binary:logistic",
        # use exact for small dataset.
        "tree_method": "exact",
        # defines booster, gblinear for linear functions.
        "booster": trial.suggest_categorical("booster", ["gbtree", "gblinear", "dart"]),
        # L2 regularization weight.
        "lambda": trial.suggest_float("lambda", 1e-8, 1.0, log=True),
        # L1 regularization weight.
        "alpha": trial.suggest_float("alpha", 1e-8, 1.0, log=True),
        # sampling ratio for training data.
        "subsample": trial.suggest_float("subsample", 0.2, 1.0),
        # sampling according to each tree.
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.2, 1.0),
    }

    if param["booster"] in ["gbtree", "dart"]:
        # maximum depth of the tree, signifies complexity of the tree.
        param["max_depth"] = trial.suggest_int("max_depth", 3, 9, step=2)
        # minimum child weight, larger the term more conservative the tree.
        param["min_child_weight"] = trial.suggest_int("min_child_weight", 2, 10)
        param["eta"] = trial.suggest_float("eta", 1e-8, 1.0, log=True)
        # defines how selective algorithm is.
        param["gamma"] = trial.suggest_float("gamma", 1e-8, 1.0, log=True)
        param["grow_policy"] = trial.suggest_categorical("grow_policy", ["depthwise", "lossguide"])

    if param["booster"] == "dart":
        param["sample_type"] = trial.suggest_categorical("sample_type", ["uniform", "weighted"])
        param["normalize_type"] = trial.suggest_categorical("normalize_type", ["tree", "forest"])
        param["rate_drop"] = trial.suggest_float("rate_drop", 1e-8, 1.0, log=True)
        param["skip_drop"] = trial.suggest_float("skip_drop", 1e-8, 1.0, log=True)

    bst = xgb.train(param, dtrain)
    preds = bst.predict(dvalid)
    pred_labels = np.rint(preds)
    accuracy = sklearn.metrics.accuracy_score(valid_y, pred_labels)
    return accuracy


if __name__ == "__main__":
    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=100, timeout=600)

    print("Number of finished trials: ", len(study.trials))
    print("Best trial:")
    trial = study.best_trial

    print("  Value: {}".format(trial.value))
    print("  Params: ")
    for key, value in trial.params.items():
        print("    {}: {}".format(key, value))

  from .autonotebook import tqdm as notebook_tqdm
[I 2024-11-07 18:55:11,438] A new study created in memory with name: no-name-67bb2bc1-5394-4aac-9d2a-63283c65df9f
[I 2024-11-07 18:55:11,603] Trial 0 finished with value: 0.6573426573426573 and parameters: {'booster': 'dart', 'lambda': 0.031014380421931033, 'alpha': 1.919456871600701e-08, 'subsample': 0.9778746763396537, 'colsample_bytree': 0.8450870999723714, 'max_depth': 5, 'min_child_weight': 5, 'eta': 3.942512249990728e-07, 'gamma': 0.02043756715058513, 'grow_policy': 'depthwise', 'sample_type': 'weighted', 'normalize_type': 'tree', 'rate_drop': 0.2583352806247043, 'skip_drop': 0.004963739817831114}. Best is trial 0 with value: 0.6573426573426573.
[I 2024-11-07 18:55:11,614] Trial 1 finished with value: 0.6573426573426573 and parameters: {'booster': 'gblinear', 'lambda': 1.6507551583271398e-08, 'alpha': 0.0013362317481975742, 'subsample': 0.46169990872598876, 'colsample_bytree': 0.258015255689735}. Best is trial 0 with value: 0.6573

Number of finished trials:  100
Best trial:
  Value: 0.986013986013986
  Params: 
    booster: gbtree
    lambda: 4.0627169874393963e-07
    alpha: 0.691680412114956
    subsample: 0.49583830792730366
    colsample_bytree: 0.4333525932651625
    max_depth: 9
    min_child_weight: 4
    eta: 0.3631077530061857
    gamma: 4.465268498962337e-05
    grow_policy: depthwise


In [9]:
from sklearn.datasets import make_regression

data = make_regression()

In [11]:
import optuna
import optuna.visualization as vis

import numpy as np

from sklearn.linear_model import ElasticNet
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split


def objective(trial):
    # Load dataset
    
    X, y = data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
    # Suggest hyperparameters
    alpha = trial.suggest_float("alpha", 0.0, 1.0)
    l1_ratio = trial.suggest_float("l1_ratio", 0.0, 1.0)
    # Train and evaluate model
    model = ElasticNet(alpha=alpha, l1_ratio=l1_ratio)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    score = mean_squared_error(y_test, y_pred)
    return score


# Create a study object
pruner = optuna.pruners.MedianPruner()
study = optuna.create_study(direction="minimize", pruner=pruner)

# Optimize the objective function
study.optimize(objective, n_trials=100)

print("Best hyperparameters:", study.best_params)
print("Best value:", study.best_value)

vis.plot_optimization_history(study)
vis.plot_param_importances(study)
vis.plot_slice(study)

[I 2024-11-07 18:59:38,879] A new study created in memory with name: no-name-ac4809da-1f22-4aee-98fd-d161c98d505b
[I 2024-11-07 18:59:38,887] Trial 0 finished with value: 11462.74867595377 and parameters: {'alpha': 0.2548922724375373, 'l1_ratio': 0.7687210280285625}. Best is trial 0 with value: 11462.74867595377.

Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 3.892e+02, tolerance: 3.002e+02

[I 2024-11-07 18:59:38,903] Trial 1 finished with value: 7937.368314204487 and parameters: {'alpha': 0.005258832929089907, 'l1_ratio': 0.8756038210274348}. Best is trial 1 with value: 7937.368314204487.
[I 2024-11-07 18:59:38,910] Trial 2 finished with value: 12337.183617786914 and parameters: {'alpha': 0.4893870766495918, 'l1_ratio': 0.7577739161216243}. Best is trial 1 with value: 7937.368314204487.
[I 2024-11-07 18:59:38,915] Trial 3 finished with value: 4522.293025864972 and pa

Best hyperparameters: {'alpha': 0.014191135183176084, 'l1_ratio': 0.9954243029423632}
Best value: 0.005328238307579821
