In [1]:
import sys
import os

# The repository root is taken to be the parent of the current working
# directory; it is appended to sys.path once so project modules can be imported.
ROOT_DIR = os.path.abspath(os.path.join(os.getcwd(), os.pardir))

if ROOT_DIR not in sys.path:
    sys.path.append(ROOT_DIR)

print("Path configurado para importar módulos de 'chemai':", ROOT_DIR, sep="\n")

# Kept below the sys.path update on purpose: `proxy` is presumably resolved
# through ROOT_DIR (TODO confirm where the module lives).
from proxy import configure_proxy

configure_proxy(ROOT_DIR)


Path configurado para importar módulos de 'chemai':
c:\Users\f0pi\git\viscosidade-ai
Proxy configurado.


In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import r2_score, mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.feature_selection import  SelectKBest, f_regression, VarianceThreshold

from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor

from xgboost import XGBRegressor
import optuna
from joblib import dump

import torch
import random
from skl2onnx import convert_sklearn

from skl2onnx.common.data_types import FloatTensorType

from chemai.loader import DipprDatasetLoader
from chemai.chem_featurizer import ChemFeaturizer
from chemai.chemberta_featurizer import ChemBERTaFeaturizer
from chemai.train import train_test_split



In [3]:
# Single seed shared by every random number generator used in this notebook.
GLOBAL_SEED = 13
np.random.seed(GLOBAL_SEED)
random.seed(GLOBAL_SEED)
# The ChemBERTa featurizer runs on torch, and its load warning reports weights
# that are "newly initialized"; seeding torch keeps that step repeatable too.
torch.manual_seed(GLOBAL_SEED)
# NOTE: hash randomization is fixed when the interpreter starts, so this only
# affects child processes spawned later, not the running kernel.
os.environ["PYTHONHASHSEED"] = str(GLOBAL_SEED)

In [4]:
# Load the dataset from disk and run the loader's preparation step before
# asking it for the pure-compound and mixture partitions.
data_loader = DipprDatasetLoader(data_dir='../data/nist_dippr_data')
data_loader.prepare()

pure, mix = data_loader.get_pure(), data_loader.get_mix()

# Pure compounds: split the 'train' partition into train/dev.
pure_fit = pure['train']
pure_train, pure_dev = train_test_split(
    smiles1=pure_fit["MOL"],
    T=pure_fit["T"],
    logV=pure_fit["logV"],
)
pure_test = pure['test']  # final held-out test set

# Mixtures: same procedure, with the second component and the mole fraction.
mix_fit = mix['train']
mix_train, mix_dev = train_test_split(
    smiles1=mix_fit["MOL_1"],
    smiles2=mix_fit["MOL_2"],
    MolFrac_1=mix_fit["MolFrac_1"],
    T=mix_fit["T"],
    logV=mix_fit["logV"],
)
mix_test = mix['test']  # final held-out test set

# Report the size of every partition.
print(f"Puro: train={len(pure_train['smiles'])}, dev={len(pure_dev['smiles'])}, test={pure_test.shape[0]}")
print(f"Mix: train={len(mix_train['smiles_1'])}, dev={len(mix_dev['smiles_1'])}, test={mix_test.shape[0]}")


Puro: train=5268, dev=1450, test=885
Mix: train=20635, dev=5254, test=5585


In [5]:
chem = ChemFeaturizer()
chemberta = ChemBERTaFeaturizer(device='cuda' if torch.cuda.is_available() else 'cpu')


def _pure_frame(split):
    """Assemble the pure-compound columns of a split into a DataFrame."""
    return pd.DataFrame({'MOL': split['smiles'], 'T': split['T'], 'logV': split['logV']})


def _mix_frame(split):
    """Assemble the binary-mixture columns of a split into a DataFrame."""
    return pd.DataFrame({
        'MOL_1': split['smiles_1'],
        'MOL_2': split['smiles_2'],
        'MolFrac_1': split['MolFrac_1'],
        'T': split['T'],
        'logV': split['logV'],
    })


# PURE COMPOUNDS
df_train_pure = _pure_frame(pure_train)
df_dev_pure = _pure_frame(pure_dev)
df_test_pure = pure_test.copy()
pure_frames = (df_train_pure, df_dev_pure, df_test_pure)

# Each featurizer is applied to train, dev and test, in that order.
feat_train_pure_chem, feat_dev_pure_chem, feat_test_pure_chem = [
    chem.featurize_pure(frame) for frame in pure_frames
]
feat_train_pure_bert, feat_dev_pure_bert, feat_test_pure_bert = [
    chemberta.featurize_pure(frame) for frame in pure_frames
]

# MIXTURES
df_train_mix = _mix_frame(mix_train)
df_dev_mix = _mix_frame(mix_dev)
df_test_mix = mix_test.copy()
mix_frames = (df_train_mix, df_dev_mix, df_test_mix)

feat_train_mix_chem, feat_dev_mix_chem, feat_test_mix_chem = [
    chem.featurize_mix_parallel(frame) for frame in mix_frames
]
feat_train_mix_bert, feat_dev_mix_bert, feat_test_mix_bert = [
    chemberta.featurize_mix(frame) for frame in mix_frames
]


Some weights of RobertaModel were not initialized from the model checkpoint at DeepChem/ChemBERTa-77M-MTR and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
def build_pipeline(model, X_sample, embedding_features, k_features=50):
    """Wrap ``model`` in the preprocessing pipeline for the given feature kind.

    Parameters
    ----------
    model : estimator
        Regressor placed as the final ``'model'`` step.
    X_sample : pandas.DataFrame
        Frame whose column names and dtypes decide how each column is scaled.
        Only consulted when ``embedding_features`` is falsy.
    embedding_features : bool
        When truthy, the pipeline is just a zero-variance filter followed by
        the model, and ``k_features`` is not used.
    k_features : int, default 50
        Number of features kept by ``SelectKBest``. If it is not smaller than
        the number of columns produced by the column transformer, every
        column is kept (``k='all'``).

    Returns
    -------
    sklearn.pipeline.Pipeline
        Unfitted pipeline.
    """
    if embedding_features:
        return Pipeline([
            ('selector', VarianceThreshold(threshold=0.0)),
            ('model', model)
        ])

    cols = list(X_sample.columns)
    # Flag-like columns ('has_*') and the 'frac' column bypass scaling.
    ignore = [c for c in cols if 'has_' in c or c == 'frac']
    floats = [c for c in cols if np.issubdtype(X_sample[c].dtype, np.floating) and c not in ignore]
    ints = [c for c in cols if np.issubdtype(X_sample[c].dtype, np.integer) and c not in ignore]

    transformers = []
    if floats:
        transformers.append(('float_scaler', StandardScaler(), floats))
    if ints:
        transformers.append(('int_scaler', MinMaxScaler(), ints))
    if ignore:
        transformers.append(('skip', 'passthrough', ignore))

    # Columns that are neither float, integer nor ignored are dropped by the
    # ColumnTransformer, so the selector may see fewer columns than X_sample
    # has. Asking SelectKBest for more features than exist is an error (or a
    # warning, depending on the scikit-learn version); fall back to 'all'.
    # NOTE(review): the imputer may drop all-NaN columns as well, which this
    # static count cannot anticipate.
    n_available = len(floats) + len(ints) + len(ignore)
    k = k_features if k_features < n_available else 'all'

    return Pipeline([
        ('scaler', ColumnTransformer(transformers)),
        ('imputer', SimpleImputer(strategy='median')),
        ('kbest', SelectKBest(score_func=f_regression, k=k)),
        ('model', model)
    ])


In [7]:
def optuna_objective(trial, model_name, X_train, y_train, X_dev, y_dev, embedding_features):
    """Optuna objective: fit one candidate pipeline and score it on the dev set.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample hyperparameters and to store user attributes.
    model_name : {'SVR', 'XGB', 'RF'}
        Which regressor family to tune.
    X_train, y_train : training features / target.
    X_dev, y_dev : development features / target used for scoring.
    embedding_features : bool
        Forwarded to ``build_pipeline`` to choose the preprocessing branch.

    Returns
    -------
    float
        R² on the dev set (the value the study optimizes).

    Raises
    ------
    ValueError
        If ``model_name`` is not one of the supported names.

    Notes
    -----
    Stores ``rmse_dev``, ``k_features`` and ``params`` as trial user attributes.
    """
    if model_name == 'SVR':
        params = {
            'C': trial.suggest_float('C', 1e-2, 1e2, log=True),
            'kernel': trial.suggest_categorical('kernel', ['rbf', 'linear']),
            'gamma': trial.suggest_categorical('gamma', ['scale', 'auto'])
        }
        model = SVR(**params)

    elif model_name == 'XGB':
        params = {
            'n_estimators': trial.suggest_categorical('n_estimators', [200, 400, 800]),
            'max_depth': trial.suggest_int('max_depth', 3, 8),
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
            'subsample': trial.suggest_float('subsample', 0.7, 1.0),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.7, 1.0),
            'random_state': GLOBAL_SEED
        }
        model = XGBRegressor(**params, n_jobs=-1)

    elif model_name == 'RF':
        params = {
            'n_estimators': trial.suggest_categorical('n_estimators', [200, 400, 800]),
            'max_depth': trial.suggest_int('max_depth', 3, 12),
            'min_samples_split': trial.suggest_int('min_samples_split', 2, 10),
            'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 5),
            'random_state': GLOBAL_SEED,
        }
        model = RandomForestRegressor(**params, n_jobs=-1)

    else:
        # Fail early with a clear message instead of hitting an unbound
        # `model` / `params` further down.
        raise ValueError(
            f"Unknown model_name {model_name!r}; expected 'SVR', 'XGB' or 'RF'."
        )

    # Clamp the lower bound so the range stays valid for frames with fewer
    # than 50 columns.
    # NOTE(review): build_pipeline does not use k_features when
    # embedding_features is truthy, so this parameter has no effect on the
    # score in that case.
    n_features = X_train.shape[1]
    k_features = trial.suggest_int('k_features', min(50, n_features), n_features)
    pipeline = build_pipeline(model, X_train, embedding_features, k_features)

    pipeline.fit(X_train, y_train)

    y_pred_dev = pipeline.predict(X_dev)
    r2_dev = r2_score(y_dev, y_pred_dev)
    rmse_dev = np.sqrt(mean_squared_error(y_dev, y_pred_dev))

    # Stored so the caller can rebuild the best model without re-deriving the
    # constructor arguments from trial.params.
    trial.set_user_attr('rmse_dev', rmse_dev)
    trial.set_user_attr('k_features', k_features)
    trial.set_user_attr('params', params)

    return r2_dev


In [None]:
def export_pipeline_to_onnx(pipeline, columns, feat_name, model_name,
                            output_dir="../models/sklearn"):
    """Export a fitted pipeline to disk for inference outside scikit-learn.

    Parameters
    ----------
    pipeline : sklearn.pipeline.Pipeline
        Fitted pipeline whose last step is named ``'model'``.
    columns : iterable
        Input column names; each becomes a separate float input of shape
        ``[None, 1]`` in the ONNX graph.
    feat_name, model_name : str
        Used to build the output file names.
    output_dir : str, default "../models/sklearn"
        Directory the files are written to; created if it does not exist.

    Notes
    -----
    For every model except ``'XGB'`` the whole pipeline is converted and
    written to ``<feat>_<model>_optuna_pipeline.onnx``. For ``'XGB'`` only the
    preprocessing steps are converted (``..._optuna_preprocess.onnx``) and the
    booster is saved separately (``..._optuna_model.onnx``).
    """
    # Do not rely on the caller having created the target directory.
    os.makedirs(output_dir, exist_ok=True)

    initial_types = [(col, FloatTensorType([None, 1])) for col in columns]

    if model_name != 'XGB':
        onnx_path = f"{output_dir}/{feat_name}_{model_name}_optuna_pipeline.onnx"

        onnx_model = convert_sklearn(pipeline, initial_types=initial_types)

        with open(onnx_path, "wb") as f:
            f.write(onnx_model.SerializeToString())
    else:
        # Everything before the final estimator is exported on its own; the
        # XGBoost model is handled separately below.
        preprocess = Pipeline(pipeline.steps[:-1])
        xgb_model = pipeline.named_steps["model"]

        preprocess_path = f"{output_dir}/{feat_name}_{model_name}_optuna_preprocess.onnx"

        onnx_pre = convert_sklearn(preprocess, initial_types=initial_types)
        with open(preprocess_path, "wb") as f:
            f.write(onnx_pre.SerializeToString())

        # NOTE(review): Booster.save_model writes XGBoost's own serialization
        # format, not an ONNX graph, so the '.onnx' suffix here is misleading.
        # The name is kept so existing consumers still find the file; confirm
        # they load it with xgboost rather than an ONNX runtime.
        model_path = f"{output_dir}/{feat_name}_{model_name}_optuna_model.onnx"
        xgb_model.get_booster().save_model(model_path)

: 

In [None]:
# Hyperparameter search + final refit.
# For every (featurizer, model) pair: tune on train/dev with Optuna, refit the
# best configuration on train+dev, evaluate once on the test set, and persist
# the fitted pipeline (joblib + ONNX export).
n_trials = 100
results = []
os.makedirs("../models/sklearn", exist_ok=True)

datasets = {
    # 'ChemFeaturizer_Puro': (feat_train_pure_chem, feat_dev_pure_chem, feat_test_pure_chem),
    # 'ChemFeaturizer_Mix':  (feat_train_mix_chem,  feat_dev_mix_chem,  feat_test_mix_chem),
    # 'ChemBERTa_Puro': (feat_train_pure_bert, feat_dev_pure_bert, feat_test_pure_bert),
    'ChemBERTa_Mix': (feat_train_mix_bert, feat_dev_mix_bert, feat_test_mix_bert)
}

# Constructors for the final refit. RF and XGB get the same n_jobs=-1 that the
# tuning objective uses; it is not part of the stored 'params' attribute.
final_model_builders = {
    'SVR': lambda params: SVR(**params),
    'XGB': lambda params: XGBRegressor(**params, n_jobs=-1),
    'RF': lambda params: RandomForestRegressor(**params, n_jobs=-1),
}

for feat_name, (df_train, df_dev, df_test) in datasets.items():
    X_train, y_train = df_train.drop(columns=['logV']), df_train['logV']
    X_dev, y_dev = df_dev.drop(columns=['logV']), df_dev['logV']
    X_test, y_test = df_test.drop(columns=['logV']), df_test['logV']

    # Selects the preprocessing branch in build_pipeline.
    is_embedding = feat_name.startswith('ChemBERTa')

    # ['RF', 'SVR', 'XGB']
    for model_name in ['XGB']:
        if model_name not in final_model_builders:
            raise ValueError(f"Unsupported model_name {model_name!r}")

        print(f"\n Optuna - {model_name} ({feat_name})")

        # A fresh, identically seeded sampler per study: the trials of one
        # study must not depend on how many studies ran before it.
        sampler = optuna.samplers.TPESampler(seed=GLOBAL_SEED)
        study = optuna.create_study(direction='maximize', sampler=sampler)
        study.optimize(lambda trial: optuna_objective(
            trial, model_name, X_train, y_train, X_dev, y_dev, is_embedding),
                       n_trials=n_trials, show_progress_bar=True)

        best_trial = study.best_trial
        best_params = best_trial.user_attrs['params']

        # Refit on train + dev so the final model sees all non-test data.
        X_full = pd.concat([X_train, X_dev], axis=0).reset_index(drop=True)
        y_full = pd.concat([y_train, y_dev], axis=0).reset_index(drop=True)

        model_final = final_model_builders[model_name](best_params)

        k_features = best_trial.user_attrs['k_features']
        print(f"Melhor número de features selecionadas: {k_features}")

        full_pipeline = build_pipeline(model_final, X_full, is_embedding, k_features)
        full_pipeline.fit(X_full, y_full)
        y_pred_test = full_pipeline.predict(X_test)

        r2 = r2_score(y_test, y_pred_test)
        rmse = np.sqrt(mean_squared_error(y_test, y_pred_test))

        filename = f"../models/sklearn/{feat_name}_{model_name}_optuna_pipeline.joblib"
        dump(full_pipeline, filename)

        export_pipeline_to_onnx(full_pipeline, X_train.columns, feat_name, model_name)

        print(f" Pipeline salva: {filename}")
        print(f" [TEST] R²={r2:.4f}, RMSE={rmse:.4f}")

        results.append({
            'featurizer': feat_name,
            'model': model_name,
            'r2_test': r2,
            'rmse_test': rmse,
            'params': best_params,
            'file': filename
        })


[I 2025-12-15 19:48:33,242] A new study created in memory with name: no-name-205ab521-3ed0-4704-b485-6b89542da295



 Optuna - XGB (ChemBERTa_Mix)


  0%|          | 0/100 [00:00<?, ?it/s]

[I 2025-12-15 19:49:59,376] Trial 0 finished with value: 0.8989100181296052 and parameters: {'n_estimators': 800, 'max_depth': 8, 'learning_rate': 0.2733063849296914, 'subsample': 0.8360347742251937, 'colsample_bytree': 0.8827127388283833, 'k_features': 609}. Best is trial 0 with value: 0.8989100181296052.
[I 2025-12-15 19:50:18,434] Trial 1 finished with value: 0.8362310521502271 and parameters: {'n_estimators': 400, 'max_depth': 4, 'learning_rate': 0.012201972599808023, 'subsample': 0.9571182827761597, 'colsample_bytree': 0.8118562083624422, 'k_features': 540}. Best is trial 0 with value: 0.8989100181296052.
[I 2025-12-15 19:50:36,982] Trial 2 finished with value: 0.9158206472807555 and parameters: {'n_estimators': 400, 'max_depth': 5, 'learning_rate': 0.252305773452444, 'subsample': 0.7653697027395067, 'colsample_bytree': 0.7958174099141093, 'k_features': 711}. Best is trial 2 with value: 0.9158206472807555.
[I 2025-12-15 21:27:20,406] Trial 3 finished with value: 0.9240130726893467