# Hyperparameter optimization with Optuna

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from src.common import init_logger, setup_seed
from sklearn.naive_bayes import GaussianNB
from lightgbm import LGBMClassifier
from sklearn.metrics import f1_score
import warnings
import json
import itertools
import optuna
import os
from tabpfn import TabPFNClassifier
from model.deep_model import FCN, CNN,train_evaluate
warnings.filterwarnings("ignore")

# ML: scikit-learn-style classifiers

In [None]:
# Shared 5-fold stratified splitter; the fixed seed keeps the fold assignment
# identical across trials and models.
skf = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)
def ml_optimize_model(X, y, model_name, feature_group, n_trials=50):
    """Tune one ``model_dict`` classifier with Optuna, scoring by mean CV F1.

    The search space comes from the module-level ``config[model_name]``: each
    entry maps a parameter name to ``{"type": ..., "args": [...]}`` and is
    sampled as ``getattr(trial, type)(param_name, *args)``. Every trial
    builds a fresh model per fold of the module-level ``skf`` splitter, fits
    it on the training part and scores ``f1_score`` (default arguments) on
    the held-out part.

    Any existing study with the same name in ``storage_name`` is deleted
    first, so every call starts from an empty study.

    Args:
        X: Feature table, indexed positionally with ``.iloc``.
        y: Label vector aligned with ``X``, indexed with ``.iloc``.
        model_name: Key into both ``config`` and ``model_dict``.
        feature_group: Label for the feature set; only used in the study name.
        n_trials: Number of Optuna trials to run.

    Returns:
        Tuple ``(best_params, best_value, study)``.

    Note:
        A trial that raises is reported and pruned rather than aborting the
        study. NOTE(review): if every trial is pruned, reading
        ``study.best_params`` is expected to fail because no trial
        completed; confirm against the Optuna version in use.
    """
    study_name = feature_group + "_" + model_name + "_" + MODE

    def objective(trial):
        try:
            params = {}
            for param_name, param_info in config[model_name].items():
                param_method = getattr(trial, param_info["type"])
                params[param_name] = param_method(param_name, *param_info["args"])

            model_constructor = model_dict[model_name]
            f1_list = []
            for train_index, test_index in skf.split(X, y):
                # A fresh estimator per fold so no state leaks between folds.
                model = model_constructor(**params)
                X_train, X_test = X.iloc[train_index], X.iloc[test_index]
                y_train, y_test = y.iloc[train_index], y.iloc[test_index]
                model.fit(X_train, y_train)
                y_pred = model.predict(X_test)
                f1_list.append(f1_score(y_test, y_pred))
            # Average over all folds (previously only the last fold's score
            # was returned).
            return np.mean(f1_list)

        except Exception as e:
            # Best effort: an invalid parameter combination must not abort
            # the whole study, but the reason should stay visible.
            print(f"Trial {trial.number} of study '{study_name}' failed and was pruned: {e!r}")
            trial.report(float('-inf'), step=0)
            raise optuna.exceptions.TrialPruned() from e

    try:
        optuna.delete_study(study_name=study_name, storage=storage_name)
        print(f"Study '{study_name}' deleted successfully.")
    except KeyError:
        # Optuna signals an unknown study name with KeyError; any other
        # failure (e.g. unreachable storage) is a real error and propagates.
        print(f"Study '{study_name}' does not exist, skipping deletion.")
    study = optuna.create_study(study_name=study_name, storage=storage_name, direction='maximize')
    study.optimize(objective, n_trials=n_trials, n_jobs=-1)
    return study.best_params, study.best_value, study

# DL: PyTorch models (FCNN, CNN)

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
# Same 5-fold stratified split (fixed seed) as used for the ML models.
skf = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)

# Train on the GPU when CUDA is available, otherwise on the CPU.
_device_type = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(_device_type)

def dl_optimize_model(X, y, model_name, feature_group, n_trials=50):
    """Tune an FCNN or CNN with Optuna, scoring by mean cross-validated F1.

    Each trial samples a learning rate, the number of layers, a width or
    filter count per layer (plus a kernel size for the CNN), an activation
    function, an optimizer class from ``torch.optim`` and an epoch count.
    For every fold of the module-level ``skf`` splitter a fresh network with
    a single output is built on ``device`` and handed to ``train_evaluate``
    with ``BCEWithLogitsLoss``; the first element of its return value is
    taken as that fold's F1.

    Any existing study with the same name in ``storage_name`` is deleted
    first, so every call starts from an empty study.

    Args:
        X: Feature table, indexed positionally with ``.iloc``;
            ``X.shape[1]`` is the network input size.
        y: Label vector aligned with ``X``, indexed with ``.iloc``.
        model_name: ``"FCNN"`` or ``"CNN"``.
        feature_group: Label for the feature set; only used in the study name.
        n_trials: Number of Optuna trials to run.

    Returns:
        Tuple ``(best_params, best_value, study)``.

    Raises:
        NotImplementedError: If ``model_name`` is not ``"FCNN"`` or ``"CNN"``.
    """
    if model_name not in ("FCNN", "CNN"):
        # Fail fast, before the stored study is deleted and recreated.
        # Previously the exception class was only evaluated, never raised,
        # so the objective silently returned None.
        raise NotImplementedError(f"Unsupported deep model: {model_name!r}")

    is_fcnn = model_name == "FCNN"
    layer_sizes = [8, 16, 32, 64, 128]

    def objective(trial):
        # Parameter names and suggestion order are kept stable so stored
        # studies keep the same layout.
        lr = trial.suggest_float('lr', 1e-4, 2e-1, log=True)
        n_layers = trial.suggest_int("n_layers", 1, 5)
        if is_fcnn:
            hidden_layers = [
                trial.suggest_categorical(f"n_units_l{i}", layer_sizes)
                for i in range(n_layers)
            ]
        else:
            n_filters = [
                trial.suggest_categorical(f"n_filters_l{i}", layer_sizes)
                for i in range(n_layers)
            ]
            kernel_size = trial.suggest_categorical("kernel_size", [3, 5, 7])
        activation_func = trial.suggest_categorical('activation_func', ['relu', 'tanh', 'sigmoid'])
        optimizer_name = trial.suggest_categorical('optimizer', ['AdamW', 'Adam', 'SGD'])
        epochs = trial.suggest_int('epochs', 50, 500, step=10)

        def build_model():
            """Create an untrained network for the sampled architecture."""
            if is_fcnn:
                net = FCN(input_dim=X.shape[1], output_dim=1, hidden_layers=hidden_layers,
                          activation_func=activation_func)
            else:
                net = CNN(n_features=X.shape[1], output_dim=1, n_layers=n_layers,
                          n_filters=n_filters, kernel_size=kernel_size,
                          activation_func=activation_func)
            return net.to(device)

        f1_list = []
        for train_index, test_index in skf.split(X, y):
            X_train, X_val = X.iloc[train_index], X.iloc[test_index]
            y_train, y_val = y.iloc[train_index], y.iloc[test_index]
            # New weights and optimizer state for every fold.
            model = build_model()
            optimizer = getattr(optim, optimizer_name)(model.parameters(), lr=lr)
            criterion = nn.BCEWithLogitsLoss()

            f1_value, _, _ = train_evaluate(model, criterion, optimizer, X_train, y_train, X_val, y_val, epochs)
            f1_list.append(f1_value)

        return np.mean(f1_list)

    study_name = feature_group + "_" + model_name + "_" + MODE
    try:
        optuna.delete_study(study_name=study_name, storage=storage_name)
        print(f"Study '{study_name}' deleted successfully.")
    except KeyError:
        # Optuna signals an unknown study name with KeyError; any other
        # failure (e.g. unreachable storage) is a real error and propagates.
        print(f"Study '{study_name}' does not exist, skipping deletion.")
    study = optuna.create_study(study_name=study_name, storage=storage_name, direction='maximize')
    study.optimize(objective, n_trials=n_trials, n_jobs=-1)
    return study.best_params, study.best_value, study


# Main

In [None]:
# Load config
def load_config(path):
    """Read the JSON file at ``path`` and return the decoded object.

    The file is opened explicitly as UTF-8 so the result does not depend on
    the platform's default text encoding.

    Args:
        path: Location of the JSON file.

    Returns:
        Whatever ``json.load`` produces for the file's contents.
    """
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)

# Run configuration. MODE is embedded in the study names and in the log file
# name, and selects the label column (f"{MODE}-Label").
MODE = "6M"
CV_RESULT = "../result/01experiment/"
# Derive the log path from CV_RESULT so the directory is defined only once.
LOGFILE = f"{CV_RESULT}01model_optuna_{MODE}.log"

# Make sure the output directory exists, e.g. on a fresh checkout.
os.makedirs(CV_RESULT, exist_ok=True)

# Start every run with an empty log; a missing file is not an error.
try:
    os.remove(LOGFILE)
except FileNotFoundError:
    pass
logger, file_handler = init_logger(LOGFILE)

def my_svm(**params):
    """Build an ``SVC`` with ``probability=True``.

    All keyword arguments are forwarded unchanged to ``SVC``.
    """
    classifier = SVC(probability=True, **params)
    return classifier

def my_lgb(**params):
    """Build an ``LGBMClassifier`` with ``verbose=-1``.

    All keyword arguments are forwarded unchanged to ``LGBMClassifier``.
    """
    classifier = LGBMClassifier(verbose=-1, **params)
    return classifier

def my_tabpfn(**params):
    """Build a ``TabPFNClassifier`` with ``n_jobs=5``.

    The model is placed on the GPU when CUDA is available and on the CPU
    otherwise, instead of always requesting ``'cuda'``. All keyword
    arguments are forwarded unchanged to ``TabPFNClassifier``.
    """
    import torch  # local import keeps this wrapper self-contained

    target_device = 'cuda' if torch.cuda.is_available() else 'cpu'
    return TabPFNClassifier(device=target_device, n_jobs=5, **params)

# Registry of model constructors keyed by the short names used in study names
# and log lines; each value is called as ``constructor(**params)``.
model_dict = dict(
    LR=LogisticRegression,
    SVM=my_svm,
    NB=GaussianNB,
    KNN=KNeighborsClassifier,
    RF=RandomForestClassifier,
    XGBoost=XGBClassifier,
    LightGBM=my_lgb,
    TabPFN=my_tabpfn,
)

# Column names of the three feature groups that are combined into the
# feature sets evaluated below.
clinical_features = [
    'Gender', 'ALT', 'AST', 'Albumin', 'GGT',
    'DBIL', 'IBIL', 'AFP', 'DNA load', 'HBsAg',
]
specific_features = [
    'HBV-T',
    'HBsAg-T(pH>7)', 'HBsAg-T(pH≤7)',
    'HBpol-T(pH>7)', 'HBpol-T(pH≤7)',
    'HBx-T(pH>7)', 'HBx-T(pH≤7)',
    'HBeAg-T(pH>7)', 'HBeAg-T(pH≤7)',
]
treat_features = [
    'ThSched', 'ADV', 'ETV', 'PEG-IFN',
    'TAF', 'TDF', 'TFV', 'TMF',
]

# Seed the run and load the hyper-parameter search spaces.
setup_seed(42)
config = load_config('./config/ml_config.json')

# Optuna storage URL. NOTE(review): the default embeds a database credential
# in source; set OPTUNA_STORAGE in the environment to override it without
# editing this file.
storage_name = os.environ.get(
    "OPTUNA_STORAGE",
    "postgresql://postgres:123...@127.0.0.1/hepatitis",
)

In [None]:
# Load the pre-processed table and pick the label column for the current MODE.
data = pd.read_csv('../result/00pre-processing/05final_data-minmax.csv')
y = data[f'{MODE}-Label']
model_list = []

# Short group name -> list of columns belonging to that group.
feature_dict = {
    "CIF": clinical_features,
    "STCF": specific_features,
    "TPF": treat_features
}

# Every non-empty combination of feature groups: singles, then pairs, then
# the full triple.
feature_names = list(feature_dict)
combinations_1, combinations_2, combinations_3 = (
    list(itertools.combinations(feature_names, size)) for size in (1, 2, 3)
)
all_combinations = combinations_1 + combinations_2 + combinations_3

for combo in all_combinations:
    # Flatten the selected groups into a single column list, in group order.
    combined_features = [column for group in combo for column in feature_dict[group]]
    feature_group = ' + '.join(combo)
    logger.info("#"*50)
    logger.info(feature_group)
    X = data[combined_features]

    # ML models first, then the deep models; both searches share the same
    # call signature and logging.
    searches = (
        (ml_optimize_model, list(model_dict)),
        (dl_optimize_model, ['FCNN', 'CNN']),
    )
    for optimize, names in searches:
        for model_name in names:
            best_params, best_score, study = optimize(X, y, model_name, feature_group, n_trials=50)
            logger.info(f"{model_name} Best parameters: {best_params}")
            logger.info(f"{model_name} Best score: {best_score}")

In [None]:
# Close the handler object returned by init_logger(LOGFILE) now that the run
# is finished.
file_handler.close()