In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings, os, sys, shutil
import xgboost as xgb
import lightgbm as lgb
import optuna
from catboost import CatBoost, CatBoostClassifier
from typing import Tuple
from utility import *
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, accuracy_score, log_loss

warnings.filterwarnings("ignore")

In [3]:
train = pd.read_csv('../preprocessed/combined_features_256.csv')
train = shuffle(train, random_state=42)
train.head()

Unnamed: 0,center_0_real,center_0_imag,center_1_real,center_1_imag,center_2_real,center_2_imag,center_3_real,center_3_imag,mean_i,mean_q,std_dev,phase_consistency,entropy,skewness,kurtosis,center_distance_std,cluster_counts_std,label
8756,0.684971,0.684896,-0.678801,-0.678806,-0.706127,0.705716,-0.695909,-0.695835,-0.240262,-0.229244,0.013307,1.503176,8.0,0.750905,1.564859,0.311617,43.330128,2
4660,-0.686052,-0.685979,0.692808,0.692833,-0.70569,0.706469,0.704746,-0.705361,-0.271577,-0.254963,0.013975,1.503887,8.0,0.868453,1.755275,0.344874,70.770757,1
6095,0.684879,0.684834,-0.697076,-0.697706,-0.705148,0.705929,0.70562,-0.706525,0.560786,0.5662,0.014042,0.872788,8.0,-2.866266,9.222158,0.452918,96.68247,1
304,-0.701005,-0.701018,0.706231,0.706214,-0.70245,0.707012,0.707062,-0.702382,-0.470135,-0.470149,0.002911,1.201315,7.960938,1.814242,4.291488,0.491674,86.844689,0
8241,-0.696289,-0.696402,0.68646,0.686404,-0.705922,0.704163,-0.679434,-0.679478,-0.219297,-0.213853,0.013941,1.503746,8.0,0.676113,1.458163,0.290996,40.155946,2


In [4]:
labels = train['label']
features = train.drop('label', axis=1)
features_col = features.columns

In [5]:
# 数据集分割
x_train, x_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=42)


# 正式的KFold交叉验证 + Optuna模型调参

In [3]:
import optuna, tune

from optuna.samplers import TPESampler
from optuna.visualization import plot_contour
from optuna.visualization import plot_edf
from optuna.visualization import plot_intermediate_values
from optuna.visualization import plot_optimization_history
from optuna.visualization import plot_parallel_coordinate
from optuna.visualization import plot_param_importances
from optuna.visualization import plot_slice

from sklearn.model_selection import GridSearchCV, cross_val_score, cross_validate
from sklearn.metrics import confusion_matrix, roc_curve, roc_auc_score, log_loss
from sklearn.metrics import make_scorer, accuracy_score, log_loss
from sklearn.model_selection import ShuffleSplit, StratifiedShuffleSplit

# fold
from sklearn.model_selection import KFold, StratifiedKFold

without boost
Number of finished trials: 500
Best trial:
  Value: 0.9753125
  Params: 
    lambda: 0.5634396819975385
    alpha: 7.214394029936068e-08
    subsample: 0.37357930763967645
    colsample_bytree: 0.2930407868220849
    learning_rate: 0.010006143053735522
    n_estimators: 4509

with boost

In [7]:
def objective(trial):
    # dtrain = xgb.DMatrix(train, label=ytrain['Class'])
    '''

    '''
    param = {
        "verbosity": 0,
        "objective": "binary:logistic",
        "booster": trial.suggest_categorical("booster", ["gbtree", "gblinear", "dart"]),
        "lambda": trial.suggest_float("lambda", 1e-8, 1.0, log=True),
        "alpha": trial.suggest_float("alpha", 1e-8, 1.0, log=True),
        "subsample": trial.suggest_float("subsample", 0.2, 0.8),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.2, 0.8),
        'random_state' : 42,
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
        "n_estimators": trial.suggest_int("n_estimators", 300, 1000),
        # "early_stopping_rounds": trial.suggest_int("early_stopping_rounds", 100, 300),
    }

    if param["booster"] == "gbtree" or param["booster"] == "dart":
        param["max_depth"] = trial.suggest_int("max_depth", 1, 16)
        param["eta"] = trial.suggest_float("eta", 1e-3, 1.0, log=True)
        param["gamma"] = trial.suggest_float("gamma", 1e-8, 1.0, log=True)
        param["grow_policy"] = trial.suggest_categorical("grow_policy", ["depthwise", "lossguide"])
    if param["booster"] == "dart":
        param["sample_type"] = trial.suggest_categorical("sample_type", ["uniform", "weighted"])
        param["normalize_type"] = trial.suggest_categorical("normalize_type", ["tree", "forest"])
        param["rate_drop"] = trial.suggest_float("rate_drop", 1e-8, 1.0, log=True)
        param["skip_drop"] = trial.suggest_float("skip_drop", 1e-8, 1.0, log=True)

    xgb1 = xgb.XGBClassifier(**param)
    xgb1.fit(x_train, y_train)
    y_pred = xgb1.predict(x_test)
    return accuracy_score(y_test, y_pred)

    # # 交叉验证
    # skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
    # log_loss_scores = []

    # # 划分验证集
    # for train_index, val_index in skf.split(x_train, y_train):
    #     x_train_, x_val = x_train.iloc[train_index], x_train.iloc[val_index]
    #     y_train_, y_val = y_train.iloc[train_index], y_train.iloc[val_index]
    #     classifier = xgb.XGBClassifier(**param)
    #     classifier.fit(x_train_, y_train_, eval_set=[(x_val, y_val)], verbose=False, early_stopping_rounds=200)
    #     y_pred = classifier.predict(x_val)
    #     acc = accuracy_score(y_val, y_pred)
    #     # y_pred_proba = classifier.predict_proba(x_val)[:, 1]
    #     log_loss_scores.append(acc)
    # return np.mean(acc)

if __name__ == "__main__":
    sampler = TPESampler(seed=42)
    pruner = optuna.pruners.MedianPruner(n_warmup_steps=10)
    study = optuna.create_study(pruner=pruner, direction="maximize", sampler=sampler)
    study.optimize(objective, n_trials=120)

    print("Number of finished trials: {}".format(len(study.trials)))
    print("Best trial:")
    trial = study.best_trial
    print("  Value: {}".format(trial.value))
    print("  Params: ")
    for key, value in trial.params.items():
        print("    {}: {}".format(key, value))

[I 2024-04-21 09:58:08,577] A new study created in memory with name: no-name-acc3a114-a3cc-4bd1-aa15-0a13895736ca
[I 2024-04-21 09:58:10,808] Trial 0 finished with value: 0.716875 and parameters: {'booster': 'gblinear', 'lambda': 0.0006155564318973012, 'alpha': 1.77071686435378e-07, 'subsample': 0.2935967122017216, 'colsample_bytree': 0.23485016730091968, 'learning_rate': 0.2611910822747312, 'n_estimators': 721}. Best is trial 0 with value: 0.716875.
[I 2024-04-21 10:02:12,755] Trial 1 finished with value: 0.9703125 and parameters: {'booster': 'dart', 'lambda': 0.04566054873446119, 'alpha': 4.997040685255803e-07, 'subsample': 0.3090949803242604, 'colsample_bytree': 0.3100427059120603, 'learning_rate': 0.09823025045826593, 'n_estimators': 667, 'max_depth': 7, 'eta': 0.007476312062252299, 'gamma': 0.000784915956255507, 'grow_policy': 'lossguide', 'sample_type': 'weighted', 'normalize_type': 'tree', 'rate_drop': 0.00012997969313168238, 'skip_drop': 0.0005486767416600901}. Best is trial 1 

In [4]:
plot_optimization_history(study)

NameError: name 'study' is not defined

In [None]:
plot_intermediate_values(study)

In [None]:
plot_parallel_coordinate(study)

In [None]:
plot_contour(study)

In [None]:
plot_slice(study)

In [None]:
plot_param_importances(study)