In [1]:
import sys
import os

ROOT_DIR = os.path.abspath(os.path.join(os.getcwd(), '..'))

if ROOT_DIR not in sys.path:
    sys.path.append(ROOT_DIR)

print("Path configurado para importar módulos de 'chemai':")
print(ROOT_DIR)


from proxy import configure_proxy
configure_proxy(ROOT_DIR)


Path configurado para importar módulos de 'chemai':
C:\Users\f0pi\git\viscosidade-ai
Proxy configurado.


In [56]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import r2_score, mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.feature_selection import  SelectKBest, f_regression, VarianceThreshold

from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.decomposition import TruncatedSVD

import os
import joblib
from joblib import dump, load

from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
import optuna

import torch
import random
import onnxmltools

from skl2onnx import convert_sklearn
from skl2onnx.common.data_types import FloatTensorType

from chemai.loader import DipprDatasetLoader
from chemai.chem_featurizer import ChemFeaturizer
from chemai.chemberta_featurizer import ChemBERTaFeaturizer
from chemai.train import train_test_split



In [51]:
GLOBAL_SEED = 13
np.random.seed(GLOBAL_SEED)
random.seed(GLOBAL_SEED)
os.environ["PYTHONHASHSEED"] = str(GLOBAL_SEED)

In [4]:
data_loader = DipprDatasetLoader(data_dir='../data/nist_dippr_data')
data_loader.prepare()

pure = data_loader.get_pure()
mix  = data_loader.get_mix()

pure_train, pure_dev = train_test_split(
    smiles1=pure['train']["MOL"],
    T=pure['train']["T"],
    logV=pure['train']["logV"]
)
pure_test = pure['test']  # conjunto final de teste

mix_train, mix_dev = train_test_split(
    smiles1=mix['train']["MOL_1"],
    smiles2=mix['train']["MOL_2"],
    MolFrac_1=mix['train']["MolFrac_1"],
    T=mix['train']["T"],
    logV=mix['train']["logV"]
)
mix_test = mix['test']  # conjunto final de teste

print(f"Puro: train={len(pure_train['smiles'])}, dev={len(pure_dev['smiles'])}, test={pure_test.shape[0]}")
print(f"Mix: train={len(mix_train['smiles_1'])}, dev={len(mix_dev['smiles_1'])}, test={mix_test.shape[0]}")


Puro: train=5268, dev=1450, test=885
Mix: train=20635, dev=5254, test=5585


In [53]:
chem = ChemFeaturizer()
chemberta = ChemBERTaFeaturizer(device='cuda' if torch.cuda.is_available() else 'cpu')

# PUROS
df_train_pure = pd.DataFrame({'MOL': pure_train['smiles'], 'T': pure_train['T'], 'logV': pure_train['logV']})
df_dev_pure   = pd.DataFrame({'MOL': pure_dev['smiles'],   'T': pure_dev['T'],   'logV': pure_dev['logV']})
df_test_pure  = pure_test.copy()

feat_train_pure_chem = chem.featurize_pure(df_train_pure)
feat_dev_pure_chem   = chem.featurize_pure(df_dev_pure)
feat_test_pure_chem  = chem.featurize_pure(df_test_pure)

feat_train_pure_bert = chemberta.featurize_pure(df_train_pure)
feat_dev_pure_bert   = chemberta.featurize_pure(df_dev_pure)
feat_test_pure_bert  = chemberta.featurize_pure(df_test_pure)

# MISTURAS
df_train_mix = pd.DataFrame({
    'MOL_1': mix_train['smiles_1'], 'MOL_2': mix_train['smiles_2'],
    'MolFrac_1': mix_train['MolFrac_1'], 'T': mix_train['T'], 'logV': mix_train['logV']})
df_dev_mix = pd.DataFrame({
    'MOL_1': mix_dev['smiles_1'], 'MOL_2': mix_dev['smiles_2'],
    'MolFrac_1': mix_dev['MolFrac_1'], 'T': mix_dev['T'], 'logV': mix_dev['logV']})
df_test_mix = mix_test.copy()

feat_train_mix_chem = chem.featurize_mix_parallel(df_train_mix)
feat_dev_mix_chem   = chem.featurize_mix_parallel(df_dev_mix)
feat_test_mix_chem  = chem.featurize_mix_parallel(df_test_mix)

feat_train_mix_bert = chemberta.featurize_mix(df_train_mix)
feat_dev_mix_bert   = chemberta.featurize_mix(df_dev_mix)
feat_test_mix_bert  = chemberta.featurize_mix(df_test_mix)


Some weights of RobertaModel were not initialized from the model checkpoint at DeepChem/ChemBERTa-77M-MTR and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [38]:
def build_pipeline(model, X_sample, embedding_features, k_features=50):
    if not embedding_features:
        cols = list(X_sample.columns)
        ignore = [c for c in cols if 'has_' in c or c == 'frac']
        floats = [c for c in cols if np.issubdtype(X_sample[c].dtype, np.floating) and c not in ignore]
        ints = [c for c in cols if np.issubdtype(X_sample[c].dtype, np.integer) and c not in ignore]


        transformers = []
        if floats: transformers.append(('float_scaler', StandardScaler(), floats))
        if ints: transformers.append(('int_scaler', MinMaxScaler(), ints))
        if ignore: transformers.append(('skip', 'passthrough', ignore))

        col_transform = ColumnTransformer(transformers)

        pipeline = Pipeline([
            ('scaler', col_transform),
            ('imputer', SimpleImputer(strategy='median')),
            ('kbest', SelectKBest(score_func=f_regression, k=k_features)),
            ('model', model)
        ])
        
        return pipeline
    
    pipeline = Pipeline([
        # ('svd', TruncatedSVD(n_components=100, random_state=GLOBAL_SEED)),
        ('selector', VarianceThreshold(0.0)),
        ('model', model)
    ])
    return pipeline 


In [46]:
def optuna_objective(trial, model_name, X_train, y_train, X_dev, y_dev, embedding_features):
    if model_name == 'SVR':
        params = {
            'C': trial.suggest_float('C', 1e-2, 1e2, log=True),
            'kernel': trial.suggest_categorical('kernel', ['rbf', 'linear']),
            'gamma': trial.suggest_categorical('gamma', ['scale', 'auto'])
        }
        model = SVR(**params)

    elif model_name == 'XGB':
        params = {
            'n_estimators': trial.suggest_categorical('n_estimators', [200, 400, 800]),
            'max_depth': trial.suggest_int('max_depth', 3, 8),
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
            'subsample': trial.suggest_float('subsample', 0.7, 1.0),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.7, 1.0),
            'random_state': GLOBAL_SEED
        }
        model = XGBRegressor(**params, n_jobs=-1)

    elif model_name == 'LGB':
        params = {
            'n_estimators': trial.suggest_categorical('n_estimators', [200, 400, 800]),
            'max_depth': trial.suggest_int('max_depth', 3, 8),
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
            'subsample': trial.suggest_float('subsample', 0.7, 1.0),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.7, 1.0),
            'random_state': GLOBAL_SEED
        }
        model = LGBMRegressor(**params, n_jobs=-1, verbose=-1)

    elif model_name == 'RF':
        params = {
            'n_estimators': trial.suggest_categorical('n_estimators', [200, 400, 800]),
            'max_depth': trial.suggest_int('max_depth', 3, 12),
            'min_samples_split': trial.suggest_int('min_samples_split', 2, 10),
            'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 5),
            'random_state': GLOBAL_SEED,
        }
        model = RandomForestRegressor(**params,  n_jobs=-1)

    k_features = trial.suggest_int('k_features', 50, X_train.shape[1])
    pipeline = build_pipeline(model, X_train, embedding_features, k_features)

    pipeline.fit(X_train, y_train)

    y_pred_dev = pipeline.predict(X_dev)
    r2_dev = r2_score(y_dev, y_pred_dev)
    rmse_dev = np.sqrt(mean_squared_error(y_dev, y_pred_dev))

    trial.set_user_attr('rmse_dev', rmse_dev)
    trial.set_user_attr('k_features', k_features)
    trial.set_user_attr('params', params)

    return r2_dev


In [47]:
def export_pipeline_to_onnx(pipeline, columns, feat_name, model_name):
    initial_types = [(col, FloatTensorType([None, 1])) for col in columns]
    
    if model_name not in {'XGB', 'LGB'}:
        onnx_path = f"../models/sklearn/{feat_name}_{model_name}_optuna_pipeline.onnx"

        onnx_model = convert_sklearn(pipeline, initial_types=initial_types)

        with open(onnx_path, "wb") as f:
            f.write(onnx_model.SerializeToString())
    else:
        preprocess = Pipeline(pipeline.steps[:-1])
        model = pipeline.named_steps["model"]


        preprocess_path = f"../models/sklearn/{feat_name}_{model_name}_optuna_preprocess.onnx"

        onnx_pre = convert_sklearn(preprocess, initial_types=initial_types)
        with open(preprocess_path, "wb") as f:
            f.write(onnx_pre.SerializeToString())

        model_path = f"../models/sklearn/{feat_name}_{model_name}_optuna_model.onnx"
        if model_name == 'XGB':
            xgb_model.get_booster().save_model(model_path)
        elif model_name == 'LGB':
            onnx_model = onnxmltools.convert_lightgbm(
            model, initial_types=[('float_input', FloatTensorType([None, len(columns)]))])
            onnxmltools.utils.save_model(onnx_model, model_path)

In [48]:
n_trials = 100
results = []
os.makedirs("../models/sklearn", exist_ok=True)

datasets = {
    'ChemFeaturizer_Puro': (feat_train_pure_chem, feat_dev_pure_chem, feat_test_pure_chem),
    'ChemFeaturizer_Mix':  (feat_train_mix_chem,  feat_dev_mix_chem,  feat_test_mix_chem),
    'ChemBERTa_Puro': (feat_train_pure_bert, feat_dev_pure_bert, feat_test_pure_bert),
    'ChemBERTa_Mix': (feat_train_mix_bert, feat_dev_mix_bert, feat_test_mix_bert)
}

sampler = optuna.samplers.TPESampler(seed=GLOBAL_SEED)

for feat_name, (df_train, df_dev, df_test) in datasets.items():
    X_train, y_train = df_train.drop(columns=['logV']), df_train['logV']
    X_dev, y_dev = df_dev.drop(columns=['logV']), df_dev['logV']
    X_test, y_test = df_test.drop(columns=['logV']), df_test['logV']

    for model_name in  ['RF', 'SVR', 'XGB', 'LGB']:
        if model_name in ['RF', 'SVR'] and feat_name == 'ChemBERTa_Mix':
            continue
        print(f"\n Optuna - {model_name} ({feat_name})")
        study = optuna.create_study(direction='maximize', sampler=sampler)
        study.optimize(lambda trial: optuna_objective(
            trial, model_name, X_train, y_train, X_dev, y_dev, feat_name.startswith('ChemBERTa')),
                       n_trials=n_trials, show_progress_bar=True)

        best_trial = study.best_trial
        best_params = best_trial.user_attrs['params']

        X_full = pd.concat([X_train, X_dev], axis=0).reset_index(drop=True)
        y_full = pd.concat([y_train, y_dev], axis=0).reset_index(drop=True)

        if model_name == 'SVR':
            model_final = SVR(**best_params)
        elif model_name == 'XGB':
            model_final = XGBRegressor(**best_params)
        elif model_name == 'RF':
            model_final = RandomForestRegressor(**best_params)
        elif model_name == 'LGB':
            model_final = LGBMRegressor(**best_params)

        k_features = best_trial.user_attrs['k_features']
        print(f"Melhor número de features selecionadas: {k_features}")

        full_pipeline = build_pipeline(model_final, X_full, feat_name.startswith('ChemBERTa'), k_features)
        full_pipeline.fit(X_full, y_full)
        y_pred_test = full_pipeline.predict(X_test)

        r2 = r2_score(y_test, y_pred_test)
        rmse = np.sqrt(mean_squared_error(y_test, y_pred_test))

        filename = f"../models/sklearn/{feat_name}_{model_name}_optuna_pipeline.joblib"
        dump(full_pipeline, filename)

        export_pipeline_to_onnx(full_pipeline, X_train.columns, feat_name, model_name)

        print(f" Pipeline salva: {filename}")
        print(f" [TEST] R²={r2:.4f}, RMSE={rmse:.4f}")

        results.append({
            'featurizer': feat_name,
            'model': model_name,
            'r2_test': r2,
            'rmse_test': rmse,
            'params': best_params,
            'file': filename
        })


[I 2025-12-16 03:52:36,120] A new study created in memory with name: no-name-859a3a29-3ffc-4191-b9cb-8a2aecaf471d



 Optuna - LGB (ChemBERTa_Mix)


  0%|          | 0/100 [00:00<?, ?it/s]



[I 2025-12-16 03:53:02,414] Trial 0 finished with value: 0.9130334989034612 and parameters: {'n_estimators': 800, 'max_depth': 8, 'learning_rate': 0.2733063849296914, 'subsample': 0.8360347742251937, 'colsample_bytree': 0.8827127388283833, 'k_features': 609}. Best is trial 0 with value: 0.9130334989034612.




[I 2025-12-16 03:53:14,971] Trial 1 finished with value: 0.8355624404428412 and parameters: {'n_estimators': 400, 'max_depth': 4, 'learning_rate': 0.012201972599808023, 'subsample': 0.9571182827761597, 'colsample_bytree': 0.8118562083624422, 'k_features': 540}. Best is trial 0 with value: 0.9130334989034612.




[I 2025-12-16 03:53:27,034] Trial 2 finished with value: 0.9178905256384325 and parameters: {'n_estimators': 400, 'max_depth': 5, 'learning_rate': 0.252305773452444, 'subsample': 0.7653697027395067, 'colsample_bytree': 0.7958174099141093, 'k_features': 711}. Best is trial 2 with value: 0.9178905256384325.




[I 2025-12-16 03:54:09,931] Trial 3 finished with value: 0.910256708259597 and parameters: {'n_estimators': 800, 'max_depth': 8, 'learning_rate': 0.01030087681090652, 'subsample': 0.9239731710981812, 'colsample_bytree': 0.943852351300787, 'k_features': 104}. Best is trial 2 with value: 0.9178905256384325.




[I 2025-12-16 03:54:24,807] Trial 4 finished with value: 0.79262366276898 and parameters: {'n_estimators': 200, 'max_depth': 8, 'learning_rate': 0.010000409293826099, 'subsample': 0.7740936102975748, 'colsample_bytree': 0.913669803373475, 'k_features': 284}. Best is trial 2 with value: 0.9178905256384325.




[I 2025-12-16 03:54:44,158] Trial 5 finished with value: 0.9197795506623874 and parameters: {'n_estimators': 800, 'max_depth': 4, 'learning_rate': 0.047495044310490286, 'subsample': 0.7758978048464519, 'colsample_bytree': 0.8137999874449198, 'k_features': 485}. Best is trial 5 with value: 0.9197795506623874.




[I 2025-12-16 03:54:56,000] Trial 6 finished with value: 0.8522183179364913 and parameters: {'n_estimators': 200, 'max_depth': 6, 'learning_rate': 0.015989030515643893, 'subsample': 0.7296259657614755, 'colsample_bytree': 0.7736677315973967, 'k_features': 159}. Best is trial 5 with value: 0.9197795506623874.




[I 2025-12-16 03:55:09,315] Trial 7 finished with value: 0.9133037904936562 and parameters: {'n_estimators': 200, 'max_depth': 6, 'learning_rate': 0.06640687357907646, 'subsample': 0.7218211632653043, 'colsample_bytree': 0.9519125531248213, 'k_features': 342}. Best is trial 5 with value: 0.9197795506623874.




[I 2025-12-16 03:55:41,722] Trial 8 finished with value: 0.9228783412744377 and parameters: {'n_estimators': 800, 'max_depth': 7, 'learning_rate': 0.2850188152256289, 'subsample': 0.9624359505205133, 'colsample_bytree': 0.8497125051702682, 'k_features': 126}. Best is trial 8 with value: 0.9228783412744377.




[I 2025-12-16 03:56:00,026] Trial 9 finished with value: 0.8544434114893817 and parameters: {'n_estimators': 200, 'max_depth': 8, 'learning_rate': 0.015900757696605462, 'subsample': 0.7709140480458461, 'colsample_bytree': 0.8786197736057137, 'k_features': 456}. Best is trial 8 with value: 0.9228783412744377.




[I 2025-12-16 03:56:28,745] Trial 10 finished with value: 0.9323502828265505 and parameters: {'n_estimators': 800, 'max_depth': 7, 'learning_rate': 0.11320632767996111, 'subsample': 0.9989354915586504, 'colsample_bytree': 0.7327390935773109, 'k_features': 50}. Best is trial 10 with value: 0.9323502828265505.




[I 2025-12-16 03:56:55,392] Trial 11 finished with value: 0.9288828873640355 and parameters: {'n_estimators': 800, 'max_depth': 7, 'learning_rate': 0.1349603198462242, 'subsample': 0.9958252288626217, 'colsample_bytree': 0.7056170510936972, 'k_features': 54}. Best is trial 10 with value: 0.9323502828265505.




[I 2025-12-16 03:57:33,090] Trial 12 finished with value: 0.9332612306403273 and parameters: {'n_estimators': 800, 'max_depth': 7, 'learning_rate': 0.10817635549252136, 'subsample': 0.9050049172335859, 'colsample_bytree': 0.7057716557966478, 'k_features': 228}. Best is trial 12 with value: 0.9332612306403273.




[I 2025-12-16 03:58:07,927] Trial 13 finished with value: 0.9326147125852865 and parameters: {'n_estimators': 800, 'max_depth': 7, 'learning_rate': 0.09874087604483943, 'subsample': 0.8872591427877177, 'colsample_bytree': 0.7021559227744556, 'k_features': 224}. Best is trial 12 with value: 0.9332612306403273.




[I 2025-12-16 03:58:36,545] Trial 14 finished with value: 0.9327793661543311 and parameters: {'n_estimators': 800, 'max_depth': 6, 'learning_rate': 0.04274851766985435, 'subsample': 0.8817731850557153, 'colsample_bytree': 0.7445509210737259, 'k_features': 242}. Best is trial 12 with value: 0.9332612306403273.




[I 2025-12-16 03:59:02,116] Trial 15 finished with value: 0.9260526049512212 and parameters: {'n_estimators': 800, 'max_depth': 5, 'learning_rate': 0.03497021363717278, 'subsample': 0.8545636755136308, 'colsample_bytree': 0.753822139981188, 'k_features': 354}. Best is trial 12 with value: 0.9332612306403273.




[I 2025-12-16 03:59:18,124] Trial 16 finished with value: 0.8264715254787618 and parameters: {'n_estimators': 400, 'max_depth': 3, 'learning_rate': 0.028991811413087487, 'subsample': 0.8880153393716648, 'colsample_bytree': 0.7389850956497135, 'k_features': 223}. Best is trial 12 with value: 0.9332612306403273.




[I 2025-12-16 03:59:47,178] Trial 17 finished with value: 0.9328713067543811 and parameters: {'n_estimators': 800, 'max_depth': 6, 'learning_rate': 0.07456968375086631, 'subsample': 0.8260076663700248, 'colsample_bytree': 0.7718845101245728, 'k_features': 228}. Best is trial 12 with value: 0.9332612306403273.




[I 2025-12-16 04:00:15,454] Trial 18 finished with value: 0.9241706789408983 and parameters: {'n_estimators': 800, 'max_depth': 5, 'learning_rate': 0.17319223777031137, 'subsample': 0.8196182520155689, 'colsample_bytree': 0.9992392398448945, 'k_features': 368}. Best is trial 12 with value: 0.9332612306403273.




[I 2025-12-16 04:00:32,184] Trial 19 finished with value: 0.9308172378123123 and parameters: {'n_estimators': 400, 'max_depth': 6, 'learning_rate': 0.07510984167025225, 'subsample': 0.8121812529011194, 'colsample_bytree': 0.7844641390649831, 'k_features': 296}. Best is trial 12 with value: 0.9332612306403273.




[I 2025-12-16 04:01:00,689] Trial 20 finished with value: 0.9260056555063201 and parameters: {'n_estimators': 800, 'max_depth': 7, 'learning_rate': 0.16698609527916564, 'subsample': 0.9256144503030431, 'colsample_bytree': 0.8446197149288196, 'k_features': 175}. Best is trial 12 with value: 0.9332612306403273.




[I 2025-12-16 04:01:27,536] Trial 21 finished with value: 0.9309043044580799 and parameters: {'n_estimators': 800, 'max_depth': 6, 'learning_rate': 0.044709010279295046, 'subsample': 0.8681317102100831, 'colsample_bytree': 0.727828431521508, 'k_features': 245}. Best is trial 12 with value: 0.9332612306403273.




[I 2025-12-16 04:01:53,825] Trial 22 finished with value: 0.9318315470902939 and parameters: {'n_estimators': 800, 'max_depth': 6, 'learning_rate': 0.07389744683667229, 'subsample': 0.9058984679994629, 'colsample_bytree': 0.7619923879592508, 'k_features': 416}. Best is trial 12 with value: 0.9332612306403273.




[I 2025-12-16 04:02:17,873] Trial 23 finished with value: 0.9224571922955103 and parameters: {'n_estimators': 800, 'max_depth': 5, 'learning_rate': 0.027967882195256748, 'subsample': 0.8697198009307319, 'colsample_bytree': 0.727174632976269, 'k_features': 282}. Best is trial 12 with value: 0.9332612306403273.




[I 2025-12-16 04:02:45,635] Trial 24 finished with value: 0.9320792778087137 and parameters: {'n_estimators': 800, 'max_depth': 7, 'learning_rate': 0.05765255304570781, 'subsample': 0.8363795957700169, 'colsample_bytree': 0.7025557631868766, 'k_features': 175}. Best is trial 12 with value: 0.9332612306403273.




[I 2025-12-16 04:03:10,354] Trial 25 finished with value: 0.9356640411309243 and parameters: {'n_estimators': 800, 'max_depth': 6, 'learning_rate': 0.0908312460455067, 'subsample': 0.7991344715747851, 'colsample_bytree': 0.7533463466834283, 'k_features': 221}. Best is trial 25 with value: 0.9356640411309243.




[I 2025-12-16 04:03:30,486] Trial 26 finished with value: 0.9257940890549493 and parameters: {'n_estimators': 800, 'max_depth': 4, 'learning_rate': 0.09852592149100017, 'subsample': 0.8023366208125616, 'colsample_bytree': 0.8250720638192435, 'k_features': 328}. Best is trial 25 with value: 0.9356640411309243.




[I 2025-12-16 04:03:57,395] Trial 27 finished with value: 0.9280636300757505 and parameters: {'n_estimators': 800, 'max_depth': 7, 'learning_rate': 0.19284633289492933, 'subsample': 0.7451877351381575, 'colsample_bytree': 0.7699021204129521, 'k_features': 103}. Best is trial 25 with value: 0.9356640411309243.




[I 2025-12-16 04:04:13,451] Trial 28 finished with value: 0.92447029258733 and parameters: {'n_estimators': 400, 'max_depth': 6, 'learning_rate': 0.08663539369356914, 'subsample': 0.7961037480389886, 'colsample_bytree': 0.7914790509360605, 'k_features': 196}. Best is trial 25 with value: 0.9356640411309243.




[I 2025-12-16 04:04:24,450] Trial 29 finished with value: 0.9202485620834363 and parameters: {'n_estimators': 200, 'max_depth': 8, 'learning_rate': 0.1247015544902583, 'subsample': 0.8344407721903586, 'colsample_bytree': 0.8739291077514822, 'k_features': 759}. Best is trial 25 with value: 0.9356640411309243.




[I 2025-12-16 04:04:45,886] Trial 30 finished with value: 0.9310770138095702 and parameters: {'n_estimators': 800, 'max_depth': 5, 'learning_rate': 0.05760758440156789, 'subsample': 0.849440429080393, 'colsample_bytree': 0.7184436606395097, 'k_features': 596}. Best is trial 25 with value: 0.9356640411309243.




[I 2025-12-16 04:05:12,712] Trial 31 finished with value: 0.929323234181969 and parameters: {'n_estimators': 800, 'max_depth': 6, 'learning_rate': 0.04158582537107192, 'subsample': 0.9017069644449298, 'colsample_bytree': 0.750330198521092, 'k_features': 259}. Best is trial 25 with value: 0.9356640411309243.




[I 2025-12-16 04:05:40,281] Trial 32 finished with value: 0.9361390537800752 and parameters: {'n_estimators': 800, 'max_depth': 6, 'learning_rate': 0.08414303674388098, 'subsample': 0.9258308391147805, 'colsample_bytree': 0.7433810586160599, 'k_features': 136}. Best is trial 32 with value: 0.9361390537800752.




[I 2025-12-16 04:06:09,336] Trial 33 finished with value: 0.93349487850007 and parameters: {'n_estimators': 800, 'max_depth': 7, 'learning_rate': 0.14343793305418134, 'subsample': 0.9566927504696127, 'colsample_bytree': 0.8017199829398617, 'k_features': 118}. Best is trial 32 with value: 0.9361390537800752.




[I 2025-12-16 04:06:36,555] Trial 34 finished with value: 0.9213386471282011 and parameters: {'n_estimators': 800, 'max_depth': 7, 'learning_rate': 0.21112018714454925, 'subsample': 0.9525798532783012, 'colsample_bytree': 0.8041579537550012, 'k_features': 138}. Best is trial 32 with value: 0.9361390537800752.




[I 2025-12-16 04:06:52,108] Trial 35 finished with value: 0.9258602859372179 and parameters: {'n_estimators': 400, 'max_depth': 8, 'learning_rate': 0.14405252595132187, 'subsample': 0.945660797489887, 'colsample_bytree': 0.7178864604324177, 'k_features': 87}. Best is trial 32 with value: 0.9361390537800752.




[I 2025-12-16 04:07:22,518] Trial 36 finished with value: 0.9328437223236096 and parameters: {'n_estimators': 800, 'max_depth': 7, 'learning_rate': 0.1080973730446373, 'subsample': 0.9752428924609227, 'colsample_bytree': 0.8294028844759987, 'k_features': 126}. Best is trial 32 with value: 0.9361390537800752.




[I 2025-12-16 04:07:50,712] Trial 37 finished with value: 0.929053406885388 and parameters: {'n_estimators': 800, 'max_depth': 8, 'learning_rate': 0.21997474887989166, 'subsample': 0.9303374035612962, 'colsample_bytree': 0.7852052489119448, 'k_features': 184}. Best is trial 32 with value: 0.9361390537800752.




[I 2025-12-16 04:07:59,504] Trial 38 finished with value: 0.9121987834052196 and parameters: {'n_estimators': 200, 'max_depth': 5, 'learning_rate': 0.09003108270380628, 'subsample': 0.940051095481232, 'colsample_bytree': 0.8105444367875215, 'k_features': 75}. Best is trial 32 with value: 0.9361390537800752.




[I 2025-12-16 04:08:26,930] Trial 39 finished with value: 0.9232405004737575 and parameters: {'n_estimators': 800, 'max_depth': 7, 'learning_rate': 0.1532532617871489, 'subsample': 0.9656566573299196, 'colsample_bytree': 0.7579491870000448, 'k_features': 544}. Best is trial 32 with value: 0.9361390537800752.




[I 2025-12-16 04:08:43,927] Trial 40 finished with value: 0.9259364986030492 and parameters: {'n_estimators': 400, 'max_depth': 6, 'learning_rate': 0.13409637218848355, 'subsample': 0.9108889294503101, 'colsample_bytree': 0.9043599004603338, 'k_features': 141}. Best is trial 32 with value: 0.9361390537800752.




[I 2025-12-16 04:09:11,047] Trial 41 finished with value: 0.9351944196067947 and parameters: {'n_estimators': 800, 'max_depth': 6, 'learning_rate': 0.07704457366911968, 'subsample': 0.7011223312002403, 'colsample_bytree': 0.7766161997040604, 'k_features': 202}. Best is trial 32 with value: 0.9361390537800752.




[I 2025-12-16 04:09:38,361] Trial 42 finished with value: 0.9323280792533909 and parameters: {'n_estimators': 800, 'max_depth': 6, 'learning_rate': 0.11460742878605319, 'subsample': 0.7560247255505212, 'colsample_bytree': 0.7989431552306703, 'k_features': 185}. Best is trial 32 with value: 0.9361390537800752.




[I 2025-12-16 04:10:16,301] Trial 43 finished with value: 0.9335424865679842 and parameters: {'n_estimators': 800, 'max_depth': 7, 'learning_rate': 0.08736979046647428, 'subsample': 0.9782671298858296, 'colsample_bytree': 0.7787367709941777, 'k_features': 304}. Best is trial 32 with value: 0.9361390537800752.




[I 2025-12-16 04:10:40,381] Trial 44 finished with value: 0.929805110414926 and parameters: {'n_estimators': 800, 'max_depth': 5, 'learning_rate': 0.05337620573814655, 'subsample': 0.9815227406843811, 'colsample_bytree': 0.7763921567209302, 'k_features': 312}. Best is trial 32 with value: 0.9361390537800752.




[I 2025-12-16 04:10:51,162] Trial 45 finished with value: 0.9135301691669713 and parameters: {'n_estimators': 200, 'max_depth': 6, 'learning_rate': 0.0648933857373875, 'subsample': 0.7047539114511047, 'colsample_bytree': 0.8325558861813515, 'k_features': 398}. Best is trial 32 with value: 0.9361390537800752.




[I 2025-12-16 04:11:08,650] Trial 46 finished with value: 0.9093038567430833 and parameters: {'n_estimators': 800, 'max_depth': 3, 'learning_rate': 0.08523027774611246, 'subsample': 0.7846979952479528, 'colsample_bytree': 0.8626733654295515, 'k_features': 277}. Best is trial 32 with value: 0.9361390537800752.




[I 2025-12-16 04:11:37,405] Trial 47 finished with value: 0.9329174125074955 and parameters: {'n_estimators': 800, 'max_depth': 7, 'learning_rate': 0.06518037028190986, 'subsample': 0.969787321712854, 'colsample_bytree': 0.7647275999759433, 'k_features': 151}. Best is trial 32 with value: 0.9361390537800752.




[I 2025-12-16 04:11:57,313] Trial 48 finished with value: 0.9254294134022879 and parameters: {'n_estimators': 800, 'max_depth': 4, 'learning_rate': 0.08319306052484335, 'subsample': 0.9921193633192087, 'colsample_bytree': 0.8194020282132071, 'k_features': 209}. Best is trial 32 with value: 0.9361390537800752.




[I 2025-12-16 04:12:24,953] Trial 49 finished with value: 0.9128785742956829 and parameters: {'n_estimators': 800, 'max_depth': 6, 'learning_rate': 0.260959543184069, 'subsample': 0.7264484501061533, 'colsample_bytree': 0.7427194924424326, 'k_features': 108}. Best is trial 32 with value: 0.9361390537800752.




[I 2025-12-16 04:12:35,711] Trial 50 finished with value: 0.9227050742957645 and parameters: {'n_estimators': 200, 'max_depth': 8, 'learning_rate': 0.1183517096564894, 'subsample': 0.7079828713367292, 'colsample_bytree': 0.793893391618449, 'k_features': 265}. Best is trial 32 with value: 0.9361390537800752.




[I 2025-12-16 04:13:02,766] Trial 51 finished with value: 0.931777962563829 and parameters: {'n_estimators': 800, 'max_depth': 7, 'learning_rate': 0.10127701228595398, 'subsample': 0.9156506695947995, 'colsample_bytree': 0.7168777223771914, 'k_features': 155}. Best is trial 32 with value: 0.9361390537800752.




[I 2025-12-16 04:13:31,317] Trial 52 finished with value: 0.9358382693399637 and parameters: {'n_estimators': 800, 'max_depth': 7, 'learning_rate': 0.0968834272664124, 'subsample': 0.9371272312612686, 'colsample_bytree': 0.7394630852026831, 'k_features': 207}. Best is trial 32 with value: 0.9361390537800752.




[I 2025-12-16 04:13:58,659] Trial 53 finished with value: 0.9326108317090169 and parameters: {'n_estimators': 800, 'max_depth': 7, 'learning_rate': 0.07307797875556889, 'subsample': 0.958722136073007, 'colsample_bytree': 0.7532597784002895, 'k_features': 205}. Best is trial 32 with value: 0.9361390537800752.




[I 2025-12-16 04:14:25,937] Trial 54 finished with value: 0.9319713128916832 and parameters: {'n_estimators': 800, 'max_depth': 6, 'learning_rate': 0.09257941113921926, 'subsample': 0.9834313704198071, 'colsample_bytree': 0.7790506501933782, 'k_features': 112}. Best is trial 32 with value: 0.9361390537800752.




[I 2025-12-16 04:14:54,826] Trial 55 finished with value: 0.9318083824103985 and parameters: {'n_estimators': 800, 'max_depth': 8, 'learning_rate': 0.05187306841276359, 'subsample': 0.9330903777549343, 'colsample_bytree': 0.7380873306898976, 'k_features': 71}. Best is trial 32 with value: 0.9361390537800752.




[I 2025-12-16 04:15:21,116] Trial 56 finished with value: 0.9282377144110864 and parameters: {'n_estimators': 800, 'max_depth': 7, 'learning_rate': 0.15721681294754075, 'subsample': 0.9492223019296269, 'colsample_bytree': 0.7649126248472697, 'k_features': 164}. Best is trial 32 with value: 0.9361390537800752.




[I 2025-12-16 04:15:47,647] Trial 57 finished with value: 0.9295721441715808 and parameters: {'n_estimators': 800, 'max_depth': 6, 'learning_rate': 0.12811358603395104, 'subsample': 0.9177918937492076, 'colsample_bytree': 0.7324319518573275, 'k_features': 314}. Best is trial 32 with value: 0.9361390537800752.




[I 2025-12-16 04:16:17,711] Trial 58 finished with value: 0.9331052754999016 and parameters: {'n_estimators': 800, 'max_depth': 7, 'learning_rate': 0.07862616159883878, 'subsample': 0.8936536230649822, 'colsample_bytree': 0.7475415457943587, 'k_features': 242}. Best is trial 32 with value: 0.9361390537800752.




[I 2025-12-16 04:16:32,144] Trial 59 finished with value: 0.9272535816608196 and parameters: {'n_estimators': 400, 'max_depth': 5, 'learning_rate': 0.17851110541332235, 'subsample': 0.7460958185328448, 'colsample_bytree': 0.7845174323788493, 'k_features': 356}. Best is trial 32 with value: 0.9361390537800752.




[I 2025-12-16 04:17:01,150] Trial 60 finished with value: 0.9307584163182954 and parameters: {'n_estimators': 800, 'max_depth': 7, 'learning_rate': 0.06400261379733299, 'subsample': 0.9360104353907998, 'colsample_bytree': 0.8092939644809918, 'k_features': 382}. Best is trial 32 with value: 0.9361390537800752.




[I 2025-12-16 04:17:30,019] Trial 61 finished with value: 0.9295814547585238 and parameters: {'n_estimators': 800, 'max_depth': 7, 'learning_rate': 0.10455195177380747, 'subsample': 0.8747440867823751, 'colsample_bytree': 0.7141465270879309, 'k_features': 222}. Best is trial 32 with value: 0.9361390537800752.




[I 2025-12-16 04:17:56,538] Trial 62 finished with value: 0.9341880618479087 and parameters: {'n_estimators': 800, 'max_depth': 7, 'learning_rate': 0.09688910564952813, 'subsample': 0.9603784470703042, 'colsample_bytree': 0.7300035547033361, 'k_features': 454}. Best is trial 32 with value: 0.9361390537800752.




[I 2025-12-16 04:18:22,254] Trial 63 finished with value: 0.9309634125360288 and parameters: {'n_estimators': 800, 'max_depth': 6, 'learning_rate': 0.09208342505184072, 'subsample': 0.9606217141425059, 'colsample_bytree': 0.7308909708370182, 'k_features': 432}. Best is trial 32 with value: 0.9361390537800752.




[I 2025-12-16 04:18:49,155] Trial 64 finished with value: 0.933538070480316 and parameters: {'n_estimators': 800, 'max_depth': 6, 'learning_rate': 0.07145626143995892, 'subsample': 0.9923218632541947, 'colsample_bytree': 0.7561821199316082, 'k_features': 466}. Best is trial 32 with value: 0.9361390537800752.




[I 2025-12-16 04:19:16,253] Trial 65 finished with value: 0.9323989523658389 and parameters: {'n_estimators': 800, 'max_depth': 6, 'learning_rate': 0.07033939394265033, 'subsample': 0.9919600941442387, 'colsample_bytree': 0.7400191878687067, 'k_features': 473}. Best is trial 32 with value: 0.9361390537800752.




[I 2025-12-16 04:19:44,965] Trial 66 finished with value: 0.9314807103070462 and parameters: {'n_estimators': 800, 'max_depth': 6, 'learning_rate': 0.0600606873881025, 'subsample': 0.979180447988717, 'colsample_bytree': 0.7558224509580181, 'k_features': 552}. Best is trial 32 with value: 0.9361390537800752.




[I 2025-12-16 04:20:13,760] Trial 67 finished with value: 0.9330491655534076 and parameters: {'n_estimators': 800, 'max_depth': 6, 'learning_rate': 0.04791320074007703, 'subsample': 0.9993938921181919, 'colsample_bytree': 0.7247005766960312, 'k_features': 513}. Best is trial 32 with value: 0.9361390537800752.




[I 2025-12-16 04:20:39,442] Trial 68 finished with value: 0.9312115140843906 and parameters: {'n_estimators': 800, 'max_depth': 6, 'learning_rate': 0.08066860449595255, 'subsample': 0.9709555092052159, 'colsample_bytree': 0.7078036362506147, 'k_features': 513}. Best is trial 32 with value: 0.9361390537800752.




[I 2025-12-16 04:20:48,122] Trial 69 finished with value: 0.912871360493574 and parameters: {'n_estimators': 200, 'max_depth': 5, 'learning_rate': 0.0976674923173102, 'subsample': 0.9901635208360642, 'colsample_bytree': 0.7702186295830976, 'k_features': 446}. Best is trial 32 with value: 0.9361390537800752.




[I 2025-12-16 04:21:18,691] Trial 70 finished with value: 0.9301212123892303 and parameters: {'n_estimators': 800, 'max_depth': 7, 'learning_rate': 0.03951441023068092, 'subsample': 0.924775610836911, 'colsample_bytree': 0.7469679722826508, 'k_features': 499}. Best is trial 32 with value: 0.9361390537800752.




[I 2025-12-16 04:21:46,999] Trial 71 finished with value: 0.9289646288066488 and parameters: {'n_estimators': 800, 'max_depth': 7, 'learning_rate': 0.11458884296687842, 'subsample': 0.9493199233731265, 'colsample_bytree': 0.777039050585932, 'k_features': 578}. Best is trial 32 with value: 0.9361390537800752.




[I 2025-12-16 04:22:13,394] Trial 72 finished with value: 0.9329543877251211 and parameters: {'n_estimators': 800, 'max_depth': 7, 'learning_rate': 0.13933779069286242, 'subsample': 0.9418925305914938, 'colsample_bytree': 0.7601943746489552, 'k_features': 683}. Best is trial 32 with value: 0.9361390537800752.




[I 2025-12-16 04:22:42,452] Trial 73 finished with value: 0.93367309796233 and parameters: {'n_estimators': 800, 'max_depth': 7, 'learning_rate': 0.0795418872594867, 'subsample': 0.9610209421083303, 'colsample_bytree': 0.7907542767138246, 'k_features': 468}. Best is trial 32 with value: 0.9361390537800752.




[I 2025-12-16 04:23:09,845] Trial 74 finished with value: 0.9339054405532394 and parameters: {'n_estimators': 800, 'max_depth': 6, 'learning_rate': 0.07294240793660392, 'subsample': 0.9837475819213377, 'colsample_bytree': 0.787406042113027, 'k_features': 467}. Best is trial 32 with value: 0.9361390537800752.




[I 2025-12-16 04:23:41,745] Trial 75 finished with value: 0.9317070068575374 and parameters: {'n_estimators': 800, 'max_depth': 8, 'learning_rate': 0.07931137329178306, 'subsample': 0.9666458795039561, 'colsample_bytree': 0.7874675815270924, 'k_features': 428}. Best is trial 32 with value: 0.9361390537800752.




[I 2025-12-16 04:24:03,100] Trial 76 finished with value: 0.8931971708980301 and parameters: {'n_estimators': 400, 'max_depth': 7, 'learning_rate': 0.014905883435639417, 'subsample': 0.8588686703032155, 'colsample_bytree': 0.7699073535732474, 'k_features': 483}. Best is trial 32 with value: 0.9361390537800752.




[I 2025-12-16 04:24:30,237] Trial 77 finished with value: 0.933263405391732 and parameters: {'n_estimators': 800, 'max_depth': 6, 'learning_rate': 0.09053926274607811, 'subsample': 0.9835136283328615, 'colsample_bytree': 0.8377804107781217, 'k_features': 386}. Best is trial 32 with value: 0.9361390537800752.




[I 2025-12-16 04:24:55,721] Trial 78 finished with value: 0.9331666887958661 and parameters: {'n_estimators': 800, 'max_depth': 6, 'learning_rate': 0.06189516108190596, 'subsample': 0.9550971424199195, 'colsample_bytree': 0.7246886793105348, 'k_features': 293}. Best is trial 32 with value: 0.9361390537800752.




[I 2025-12-16 04:25:24,479] Trial 79 finished with value: 0.9320178916855888 and parameters: {'n_estimators': 800, 'max_depth': 8, 'learning_rate': 0.06965039722511597, 'subsample': 0.975104745813264, 'colsample_bytree': 0.7379989953263796, 'k_features': 528}. Best is trial 32 with value: 0.9361390537800752.




[I 2025-12-16 04:25:53,347] Trial 80 finished with value: 0.9341924396498792 and parameters: {'n_estimators': 800, 'max_depth': 7, 'learning_rate': 0.10634359613919678, 'subsample': 0.7801244722262564, 'colsample_bytree': 0.8194026634913816, 'k_features': 418}. Best is trial 32 with value: 0.9361390537800752.




[I 2025-12-16 04:26:23,520] Trial 81 finished with value: 0.9337715022161716 and parameters: {'n_estimators': 800, 'max_depth': 7, 'learning_rate': 0.10474538604086167, 'subsample': 0.805963911992736, 'colsample_bytree': 0.7960766885946914, 'k_features': 446}. Best is trial 32 with value: 0.9361390537800752.




[I 2025-12-16 04:26:52,702] Trial 82 finished with value: 0.9326891126832714 and parameters: {'n_estimators': 800, 'max_depth': 7, 'learning_rate': 0.10663640231569743, 'subsample': 0.8061159710158516, 'colsample_bytree': 0.8203910035650069, 'k_features': 452}. Best is trial 32 with value: 0.9361390537800752.




[I 2025-12-16 04:27:21,768] Trial 83 finished with value: 0.9342640348075341 and parameters: {'n_estimators': 800, 'max_depth': 7, 'learning_rate': 0.09764693139334343, 'subsample': 0.8197000922035714, 'colsample_bytree': 0.797961084691505, 'k_features': 408}. Best is trial 32 with value: 0.9361390537800752.




[I 2025-12-16 04:27:50,633] Trial 84 finished with value: 0.9326055108932682 and parameters: {'n_estimators': 800, 'max_depth': 7, 'learning_rate': 0.12362641356995654, 'subsample': 0.7906168918241542, 'colsample_bytree': 0.8007031114277071, 'k_features': 414}. Best is trial 32 with value: 0.9361390537800752.




[I 2025-12-16 04:28:20,104] Trial 85 finished with value: 0.9341795773221302 and parameters: {'n_estimators': 800, 'max_depth': 7, 'learning_rate': 0.10022778004504136, 'subsample': 0.8339522245602338, 'colsample_bytree': 0.8066895695523045, 'k_features': 339}. Best is trial 32 with value: 0.9361390537800752.




[I 2025-12-16 04:28:47,783] Trial 86 finished with value: 0.9377591282274104 and parameters: {'n_estimators': 800, 'max_depth': 6, 'learning_rate': 0.0934633289534973, 'subsample': 0.8232808376843558, 'colsample_bytree': 0.8083732444118185, 'k_features': 341}. Best is trial 86 with value: 0.9377591282274104.




[I 2025-12-16 04:28:58,463] Trial 87 finished with value: 0.9209964946480738 and parameters: {'n_estimators': 200, 'max_depth': 6, 'learning_rate': 0.09806914651583348, 'subsample': 0.8261395948843488, 'colsample_bytree': 0.8573359797588808, 'k_features': 338}. Best is trial 86 with value: 0.9377591282274104.




[I 2025-12-16 04:29:29,605] Trial 88 finished with value: 0.9314669177887801 and parameters: {'n_estimators': 800, 'max_depth': 8, 'learning_rate': 0.1172419370027313, 'subsample': 0.8438488773106704, 'colsample_bytree': 0.823254823910604, 'k_features': 363}. Best is trial 86 with value: 0.9377591282274104.




[I 2025-12-16 04:29:47,047] Trial 89 finished with value: 0.9318336993435888 and parameters: {'n_estimators': 400, 'max_depth': 7, 'learning_rate': 0.09444885827821155, 'subsample': 0.8142077581538704, 'colsample_bytree': 0.8066054451277359, 'k_features': 391}. Best is trial 86 with value: 0.9377591282274104.




[I 2025-12-16 04:30:13,888] Trial 90 finished with value: 0.9308005461664612 and parameters: {'n_estimators': 800, 'max_depth': 5, 'learning_rate': 0.12892851309060613, 'subsample': 0.7718634347379979, 'colsample_bytree': 0.9596157533183891, 'k_features': 326}. Best is trial 86 with value: 0.9377591282274104.




[I 2025-12-16 04:30:41,350] Trial 91 finished with value: 0.9326615711047069 and parameters: {'n_estimators': 800, 'max_depth': 6, 'learning_rate': 0.0864320204862233, 'subsample': 0.8246325381629519, 'colsample_bytree': 0.8127122397431559, 'k_features': 409}. Best is trial 86 with value: 0.9377591282274104.




[I 2025-12-16 04:31:09,768] Trial 92 finished with value: 0.9337408377352626 and parameters: {'n_estimators': 800, 'max_depth': 6, 'learning_rate': 0.11152954150258305, 'subsample': 0.836835687303279, 'colsample_bytree': 0.8468310552353119, 'k_features': 255}. Best is trial 86 with value: 0.9377591282274104.




[I 2025-12-16 04:31:38,247] Trial 93 finished with value: 0.9337866460491212 and parameters: {'n_estimators': 800, 'max_depth': 6, 'learning_rate': 0.076690174006782, 'subsample': 0.8200199198346406, 'colsample_bytree': 0.8302157951420283, 'k_features': 431}. Best is trial 86 with value: 0.9377591282274104.




[I 2025-12-16 04:32:06,075] Trial 94 finished with value: 0.934418024323 and parameters: {'n_estimators': 800, 'max_depth': 7, 'learning_rate': 0.08551293784129656, 'subsample': 0.7180621482736635, 'colsample_bytree': 0.7827912196774628, 'k_features': 374}. Best is trial 86 with value: 0.9377591282274104.




[I 2025-12-16 04:32:36,263] Trial 95 finished with value: 0.9369857204424931 and parameters: {'n_estimators': 800, 'max_depth': 7, 'learning_rate': 0.09881730480646338, 'subsample': 0.7115097507598505, 'colsample_bytree': 0.816415147877328, 'k_features': 353}. Best is trial 86 with value: 0.9377591282274104.




[I 2025-12-16 04:33:07,861] Trial 96 finished with value: 0.9332915435182977 and parameters: {'n_estimators': 800, 'max_depth': 7, 'learning_rate': 0.08498828789332141, 'subsample': 0.7142481543430309, 'colsample_bytree': 0.8385411693975607, 'k_features': 382}. Best is trial 86 with value: 0.9377591282274104.




[I 2025-12-16 04:33:39,160] Trial 97 finished with value: 0.9329630247887628 and parameters: {'n_estimators': 800, 'max_depth': 7, 'learning_rate': 0.09222365000029835, 'subsample': 0.7391523537481436, 'colsample_bytree': 0.8174087550506348, 'k_features': 367}. Best is trial 86 with value: 0.9377591282274104.




[I 2025-12-16 04:34:08,015] Trial 98 finished with value: 0.9307027321851657 and parameters: {'n_estimators': 800, 'max_depth': 7, 'learning_rate': 0.12141309208225555, 'subsample': 0.761212665237573, 'colsample_bytree': 0.763997048861721, 'k_features': 167}. Best is trial 86 with value: 0.9377591282274104.




[I 2025-12-16 04:34:36,458] Trial 99 finished with value: 0.9276864001126544 and parameters: {'n_estimators': 800, 'max_depth': 7, 'learning_rate': 0.1493828511862724, 'subsample': 0.7128211365883007, 'colsample_bytree': 0.7971091142624482, 'k_features': 274}. Best is trial 86 with value: 0.9377591282274104.
Melhor número de features selecionadas: 341




 Pipeline salva: ../models/sklearn/ChemBERTa_Mix_LGB_optuna_pipeline.joblib
 [TEST] R²=0.8400, RMSE=0.1436


In [79]:
def load_models_and_metrics(folder="../models/sklearn"):
    models = [f for f in os.listdir(folder) if f.endswith(".joblib")]
    results = []
    for file_name in models:
        model_name = file_name.replace("_optuna_pipeline.joblib", "")
        model = load(os.path.join(folder, file_name))
        if "ChemBERTa_Mix" in model_name:
            df_test = feat_test_mix_bert
            featurizer = "ChemBERTa"
            data_type = "Mix"
        elif "ChemFeaturizer_Mix" in model_name:
            df_test = feat_test_mix_chem
            featurizer = "ChemFeaturizer"
            data_type = "Mix"
        elif "ChemBERTa_Puro" in model_name:
            df_test = feat_test_pure_bert
            featurizer = "ChemBERTa"
            data_type = "Puro"
        elif "ChemFeaturizer_Puro" in model_name:
            df_test = feat_test_pure_chem
            featurizer = "ChemFeaturizer"
            data_type = "Puro"
        else:
            continue

        if "XGB" in model_name:
            model_type = "XGB"
        elif "LGB" in model_name:
            model_type = "LGB"
        elif "RF" in model_name:
            model_type = "RF"
        elif "SVR" in model_name:
            model_type = "SVR"
        else:
            model_type = "Unknown"

        X_test = df_test.drop(columns=["logV"])
        y_test = df_test["logV"]
        y_pred = model.predict(X_test)

        if data_type == 'Mix':
            mol1 = [col for col in X_test.columns if col.startswith('mol1')]
            mol2 = [col for col in X_test.columns if col.startswith('mol2')]
            inv_mols = mol2 + mol1 + ['frac'] + ['T']
            X_test_inv = X_test[inv_mols].copy()
            X_test_inv.columns = X_test.columns
            y_pred_inv = model.predict(X_test_inv)
            y_pred = 0.5 * y_pred + 0.5 * y_pred_inv

        r2 = r2_score(y_test, y_pred)
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))

        results.append({
            "Featurizer": featurizer,
            "Type": data_type,
            "Model": model_type,
            "R2": r2,
            "RMSE": rmse
        })

    df_results = pd.DataFrame(results)
    df_results = df_results.sort_values(by=["Type", "Featurizer", "Model"]).reset_index(drop=True)
    return df_results

metrics_df = load_models_and_metrics()
metrics_df




Unnamed: 0,Featurizer,Type,Model,R2,RMSE
0,ChemBERTa,Mix,LGB,0.67878,0.203529
1,ChemBERTa,Mix,XGB,0.695918,0.198025
2,ChemFeaturizer,Mix,XGB,0.719597,0.190159
3,ChemBERTa,Puro,RF,0.890496,0.141315
4,ChemBERTa,Puro,SVR,0.944337,0.100753
5,ChemBERTa,Puro,XGB,0.938045,0.106295
6,ChemFeaturizer,Puro,RF,0.972323,0.071045
7,ChemFeaturizer,Puro,SVR,0.959024,0.086445
8,ChemFeaturizer,Puro,XGB,0.953514,0.092074
