In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings, os, sys, shutil
import xgboost as xgb
import lightgbm as lgb
import optuna
from catboost import CatBoost, CatBoostClassifier
from typing import Tuple
from utility import *
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, accuracy_score, log_loss

# Suppress every warning from here on (no category or module filter is given),
# so deprecation and convergence warnings raised by later cells are not shown.
warnings.filterwarnings("ignore")

In [2]:
# Read the training CSV and shuffle its rows with a fixed seed; read the test CSV as-is.
train = shuffle(
    pd.read_csv('../preprocessed/combined_features_256.csv'),
    random_state=42,
)
test = pd.read_csv('../preprocessed/combined_features_256_test.csv')

# Split each table into the 'label' column (target) and all remaining columns (features).
x_train, y_train = train.drop(columns='label'), train['label']
X_test, Y_test = test.drop(columns='label'), test['label']

In [3]:
import optuna, tune
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from optuna.samplers import TPESampler

In [4]:
# Objective function for the Optuna search
def objective(trial):
    """Score one sampled SVC configuration by its mean 5-fold CV score.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample ``kernel`` and ``C``, plus ``degree``
        (only for the 'poly' kernel) or ``gamma`` (only for the 'rbf' kernel).

    Returns
    -------
    float
        Mean of the five fold scores returned by ``cross_val_score`` on
        ``x_train`` / ``y_train``.
    """
    # Sample the hyperparameters
    kernel = trial.suggest_categorical('kernel', ['linear', 'poly', 'rbf'])
    # suggest_float(..., log=True) replaces the deprecated suggest_loguniform;
    # it samples from the same log-uniform range.
    C = trial.suggest_float('C', 1e-10, 1e10, log=True)
    if kernel == 'poly':
        degree = trial.suggest_int('degree', 1, 5)
    else:
        degree = 3  # default value; not searched for the other kernels
    if kernel == 'rbf':
        gamma = trial.suggest_categorical('gamma', ['scale', 'auto'])
    else:
        gamma = 'scale'  # default value; not searched for the other kernels
    # Build the SVM model; max_iter caps the solver iterations per fit
    model = SVC(kernel=kernel, C=C, degree=degree, gamma=gamma, random_state=42, max_iter=100000)

    # Evaluate with 5-fold cross-validation and return the mean fold score
    scores = cross_val_score(model, x_train, y_train, cv=5)
    accuracy = np.mean(scores)
    return accuracy

if __name__ == "__main__":
    # Seeded TPE sampler so the sampler's own randomness is fixed across runs.
    sampler = TPESampler(seed=42)
    # NOTE(review): objective() never calls trial.report()/should_prune(), so this
    # pruner appears to have no effect on the search — confirm before relying on it.
    pruner = optuna.pruners.MedianPruner(n_warmup_steps=10)
    study = optuna.create_study(
        direction="maximize",
        sampler=sampler,
        pruner=pruner,
    )
    study.optimize(objective, n_trials=50)

    # Summarize the search: trial count, then the best trial's score and parameters.
    print(f"Number of finished trials: {len(study.trials)}")
    print("Best trial:")
    trial = study.best_trial
    print(f"  Value: {trial.value}")
    print("  Params: ")
    for key, value in trial.params.items():
        print(f"    {key}: {value}")

[I 2024-04-29 13:13:31,536] A new study created in memory with name: no-name-32e6707f-62e8-4b8c-a123-69b5ba2f6fb0
[I 2024-04-29 13:14:24,833] Trial 0 finished with value: 0.3168125 and parameters: {'kernel': 'poly', 'C': 94.00905432049124, 'degree': 1}. Best is trial 0 with value: 0.3168125.
[I 2024-04-29 13:15:53,073] Trial 1 finished with value: 0.3549375 and parameters: {'kernel': 'rbf', 'C': 105.26893636712298, 'gamma': 'scale'}. Best is trial 1 with value: 0.3549375.
[I 2024-04-29 13:16:32,597] Trial 2 finished with value: 0.28581249999999997 and parameters: {'kernel': 'linear', 'C': 4.3301141381717194e-07}. Best is trial 1 with value: 0.3549375.
[I 2024-04-29 13:17:44,193] Trial 3 finished with value: 0.32156250000000003 and parameters: {'kernel': 'rbf', 'C': 0.043541197871659006, 'gamma': 'auto'}. Best is trial 1 with value: 0.3549375.
[I 2024-04-29 13:18:59,665] Trial 4 finished with value: 0.29137500000000005 and parameters: {'kernel': 'rbf', 'C': 0.13225121917959448, 'gamma':

Number of finished trials: 50
Best trial:
  Value: 0.7705
  Params: 
    kernel: rbf
    C: 5545.043434006406
    gamma: auto


In [6]:
# Create a fresh Optuna study and run the search.
# The sampler is seeded so that re-running this cell proposes the same sequence of
# trials; an unseeded study would make the selected parameters (and the test-set
# evaluation that reads `trial`) change from run to run.
study = optuna.create_study(direction='maximize', sampler=TPESampler(seed=42))
study.optimize(objective, n_trials=50)

# Print the best parameters and their mean cross-validation accuracy
print('Best trial:')
trial = study.best_trial
print(f'  Average CV Accuracy: {trial.value:.3f}')
print('  Params: ')
for key, value in trial.params.items():
    print(f'    {key}: {value}')



[I 2024-04-28 15:16:34,812] A new study created in memory with name: no-name-51abfaf9-3ec2-4906-9e24-5fe00a5699fd


In [None]:
# Evaluate the best hyperparameters on the test set.
# max_iter matches the cap used inside objective(), so the model fitted here is
# configured the same way as the models that were scored during the search.
# Parameters absent from trial.params (degree / gamma for kernels that did not
# sample them) fall back to the SVC defaults, which are the values objective() used.
best_model = SVC(**trial.params, random_state=42, max_iter=100000)
best_model.fit(x_train, y_train)
y_pred = best_model.predict(X_test)
test_accuracy = accuracy_score(Y_test, y_pred)
print(f'Accuracy on Test set: {test_accuracy:.3f}')