In [3]:
import sys
import os

# Directory one level above the current working directory; it is appended to
# sys.path so that project packages can be imported from the notebook.
ROOT_DIR = os.path.abspath(os.path.join(os.getcwd(), os.pardir))

if ROOT_DIR not in sys.path:
    sys.path.append(ROOT_DIR)

print("Path configurado para importar módulos de 'chemai':", ROOT_DIR, sep="\n")

# Imported only after sys.path has been extended above
# (presumably `proxy` lives under ROOT_DIR — confirm).
from proxy import configure_proxy

configure_proxy(ROOT_DIR)

Path configurado para importar módulos de 'chemai':
c:\Users\f0pi\git\viscosidade-ai
Proxy configurado.


In [4]:
import os
import torch
import pytorch_lightning as pl

import numpy as np
import random

from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint
from transformers import AutoTokenizer, AutoModel
from peft import LoraConfig, get_peft_model

from chemai.loader import DipprDatasetLoader
from chemai.train import train_test_split
from chemai.model import ChemBERTModel
from chemai.datamodule import ChemBERTDataModule
from chemai.callbacks import BestModelExporter

from sklearn.preprocessing import StandardScaler

# Select the 'medium' float32 matmul precision setting for PyTorch
# (see torch.set_float32_matmul_precision for the speed/accuracy trade-off).
torch.set_float32_matmul_precision('medium')

In [5]:
# Hugging Face identifier of the pretrained checkpoint used as the encoder.
MODEL_NAME = "DeepChem/ChemBERTa-77M-MTR"
# Both calls reach out to huggingface.co, so network/proxy access must be
# working. `base_model` is the shared encoder later used by train_variant.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
base_model = AutoModel.from_pretrained(MODEL_NAME)

'(MaxRetryError('HTTPSConnectionPool(host=\'huggingface.co\', port=443): Max retries exceeded with url: /DeepChem/ChemBERTa-77M-MTR/resolve/main/tokenizer_config.json (Caused by ProxyError(\'Unable to connect to proxy\', NameResolutionError("HTTPSConnection(host=\'inet-sys.petrobras.com.br\', port=804): Failed to resolve \'inet-sys.petrobras.com.br\' ([Errno 11001] getaddrinfo failed)")))'), '(Request ID: 81b93c28-204e-463e-970f-3eac968c91c1)')' thrown while requesting HEAD https://huggingface.co/DeepChem/ChemBERTa-77M-MTR/resolve/main/tokenizer_config.json
Retrying in 1s [Retry 1/5].
'(MaxRetryError('HTTPSConnectionPool(host=\'huggingface.co\', port=443): Max retries exceeded with url: /DeepChem/ChemBERTa-77M-MTR/resolve/main/tokenizer_config.json (Caused by ProxyError(\'Unable to connect to proxy\', NameResolutionError("HTTPSConnection(host=\'inet-sys.petrobras.com.br\', port=804): Failed to resolve \'inet-sys.petrobras.com.br\' ([Errno 11001] getaddrinfo failed)")))'), '(Request ID:

In [6]:
# Single seed shared by every random number generator seeded in this cell.
GLOBAL_SEED = 13

# Python: hash seed environment variable and the stdlib RNG.
os.environ["PYTHONHASHSEED"] = str(GLOBAL_SEED)
random.seed(GLOBAL_SEED)

# NumPy global RNG.
np.random.seed(GLOBAL_SEED)

# PyTorch generators (CPU, then every CUDA device).
torch.manual_seed(GLOBAL_SEED)
torch.cuda.manual_seed_all(GLOBAL_SEED)

# Lightning's own seeding helper, with workers=True forwarded.
pl.seed_everything(GLOBAL_SEED, workers=True)

# Deterministic algorithms are explicitly left disabled.
torch.use_deterministic_algorithms(False)

Seed set to 13


In [7]:
# Prepare the dataset and fetch the pure-compound and mixture collections.
data_loader = DipprDatasetLoader(data_dir='../data/nist_dippr_data')
data_loader.prepare()
pure, mix = data_loader.get_pure(), data_loader.get_mix()

# --- Pure compounds -------------------------------------------------------
# The provided 'train' part is split again into train/dev; 'test' is kept as is.
pure_fit, pure_holdout = pure['train'], pure['test']
pure_train, pure_dev = train_test_split(
    smiles1=pure_fit["MOL"].to_list(),
    temperatures=pure_fit["T"].values,
    y=pure_fit["logV"].values,
)
pure_test = dict(
    smiles=pure_holdout["MOL"].to_list(),
    temperatures=pure_holdout["T"].values,
    y=pure_holdout["logV"].values,
)

# --- Binary mixtures ------------------------------------------------------
# Same procedure, with a second molecule column and the fraction of molecule 1.
mix_fit, mix_holdout = mix['train'], mix['test']
mix_train, mix_dev = train_test_split(
    smiles1=mix_fit["MOL_1"].to_list(),
    smiles2=mix_fit["MOL_2"].to_list(),
    frac=mix_fit["MolFrac_1"].values,
    temperatures=mix_fit["T"].values,
    y=mix_fit["logV"].values,
)
mix_test = dict(
    smiles_1=mix_holdout["MOL_1"].to_list(),
    smiles_2=mix_holdout["MOL_2"].to_list(),
    frac=mix_holdout["MolFrac_1"].values,
    temperatures=mix_holdout["T"].values,
    y=mix_holdout["logV"].values,
)

# Report the size of every split.
print(f"Puro: train={len(pure_train['temperatures'])}, dev={len(pure_dev['temperatures'])}, test={len(pure_test['temperatures'])}")
print(f"Mix: train={len(mix_train['temperatures'])}, dev={len(mix_dev['temperatures'])}, test={len(mix_test['temperatures'])}")


Puro: train=5268, dev=1450, test=885
Mix: train=20635, dev=5254, test=5585


In [8]:
def _fit_temperature_scaler(train_split, *other_splits):
    """Standardize the 'temperatures' entry of each split and return the scaler.

    The scaler is fitted on ``train_split`` only; the remaining splits are
    transformed with the statistics learned from it. Every split is updated
    in place under the same 'temperatures' key.
    """
    fitted = StandardScaler()
    column = train_split['temperatures'].reshape(-1, 1)
    train_split['temperatures'] = fitted.fit_transform(column).reshape(-1)
    for split in other_splits:
        column = split['temperatures'].reshape(-1, 1)
        split['temperatures'] = fitted.transform(column).reshape(-1)
    return fitted


# NOTE(review): the splits are overwritten in place, so re-running this cell
# without re-running the data-loading cell scales already-scaled values.
scaler_pure = _fit_temperature_scaler(pure_train, pure_dev, pure_test)
scaler_mix = _fit_temperature_scaler(mix_train, mix_dev, mix_test)

In [9]:
# One data module per task, both sharing the tokenizer and a batch size of 64.
# NOTE(review): max_length differs between the two (35 for pure, 25 for mix);
# presumably the tokenization length per SMILES — confirm against ChemBERTDataModule.
pure_dm = ChemBERTDataModule(tokenizer, pure_train, pure_dev, pure_test, batch_size=64, max_length=35)
mix_dm = ChemBERTDataModule(tokenizer, mix_train, mix_dev, mix_test, batch_size=64, max_length=25)


In [10]:
def train_variant(mode, use_lora, datamodule, export_dir='../models/torch'):
    """Train one model variant with early stopping and export the best model.

    Args:
        mode: Variant family, forwarded to ``ChemBERTModel``. ``'pure'`` pairs
            the export with ``scaler_pure``; any other value uses ``scaler_mix``.
        use_lora: When true, the encoder is wrapped with LoRA adapters
            (r=8, lora_alpha=16, lora_dropout=0.1) before training.
        datamodule: Data module handed to ``Trainer.fit``.
        export_dir: Parent directory of the export; the variant is written to
            ``{export_dir}/{mode}_lora`` or ``{export_dir}/{mode}_base``.

    Returns:
        None.
    """
    import copy  # local import so this cell does not depend on the import cell

    # Every variant trains on its own copy of the pretrained encoder.
    # get_peft_model modifies the module it receives in place, so wrapping the
    # shared global `base_model` directly would leak LoRA layers into every
    # variant trained afterwards (and stack adapters when two LoRA variants
    # are trained in the same session).
    model_base = copy.deepcopy(base_model)
    if use_lora:
        lora_cfg = LoraConfig(r=8, lora_alpha=16, lora_dropout=0.1, task_type="FEATURE_EXTRACTION")
        model_base = get_peft_model(model_base, lora_cfg)

    model = ChemBERTModel(
        base_model=model_base,
        mode=mode,
        hidden_dim=128,
        lr_head=1e-3,
        lr_lora=2e-4,
        weight_decay=1e-4
    )

    # Keep only the checkpoint with the highest validation R2 and stop once it
    # has not improved for 5 consecutive validation checks.
    ckpt = ModelCheckpoint(monitor="val_r2", save_top_k=1, mode="max")
    early = EarlyStopping(monitor="val_r2", patience=5, mode="max")

    # The exporter receives the temperature scaler fitted for this dataset.
    scaler = scaler_pure if mode == 'pure' else scaler_mix
    variant_name = f"{mode}_{'lora' if use_lora else 'base'}"
    exporter = BestModelExporter(
        export_dir=f"{export_dir}/{variant_name}",
        scaler=scaler,
    )

    trainer = pl.Trainer(
        max_epochs=100,
        accelerator="gpu" if torch.cuda.is_available() else "cpu",
        callbacks=[ckpt, early, exporter],
    )

    trainer.fit(model, datamodule)

In [11]:
# Only the pure-compound variant without LoRA is trained in this run; the
# other three variants are left commented out.
# NOTE(review): evaluate_all_models() reads exported directories for all four
# variants (pure/mix x base/lora) — confirm they exist before evaluating.
train_variant('pure', False, pure_dm)
# train_variant('pure', True, pure_dm)
# train_variant('mix', False, mix_dm)
# train_variant('mix', True, mix_dm)

Base model congelado (train apenas MLP).


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
c:\Users\f0pi\AppData\Local\miniforge3\envs\chemai\Lib\site-packages\pytorch_lightning\trainer\connectors\logger_connector\logger_connector.py:76: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `pytorch_lightning` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, unless the `tensorboard` or `tensorboardX` packages are found. Please `pip install lightning[extra]` or one of them to enable TensorBoard support by default
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Output()

In [46]:
import torch
from transformers import AutoTokenizer, AutoModel
from torch.utils.data import DataLoader
from sklearn.metrics import r2_score, mean_squared_error
import numpy as np
import pandas as pd

def evaluate_torch_pipeline(mode, model_dir, test_loader, max_length=128):
    """Score an exported encoder + MLP head on a test loader.

    Args:
        mode: ``'pure'`` feeds the head with [embedding, temperature]; any
            other value is treated as a mixture and feeds it with both
            embeddings, the temperature and the fraction.
        model_dir: Directory containing ``mlp.pt`` and the encoder files
            readable by ``AutoModel.from_pretrained``.
        test_loader: Iterable of batches (dict-like). Keys read:
            ``input_ids_1``, ``attention_mask_1``, ``temperatures``, ``y`` and,
            for mixtures, ``input_ids_2``, ``attention_mask_2``, ``frac``.
        max_length: Unused; kept so existing callers keep working. The batches
            arrive already tokenized, which is also why no tokenizer is loaded.

    Returns:
        dict with ``"Model_Dir"``, ``"R2"`` and ``"RMSE"``.

    Raises:
        ValueError: If ``test_loader`` yields no batches.
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"

    # SECURITY NOTE: weights_only=False unpickles arbitrary Python objects.
    # Only point model_dir at files produced by a trusted export.
    mlp = torch.load(f"{model_dir}/mlp.pt", map_location=device, weights_only=False)
    mlp = mlp.to(device).eval()

    encoder = AutoModel.from_pretrained(model_dir).to(device).eval()

    def _first_token_embedding(batch, slot):
        """Encode molecule `slot` ('1' or '2') and keep the first-token state."""
        out = encoder(
            input_ids=batch[f'input_ids_{slot}'].to(device),
            attention_mask=batch[f'attention_mask_{slot}'].to(device),
        )
        return out.last_hidden_state[:, 0, :]

    reals, preds = [], []
    with torch.no_grad():
        for batch in test_loader:
            cls1 = _first_token_embedding(batch, '1')
            t = batch['temperatures'].unsqueeze(1).to(device).float()

            if mode == 'pure':
                y_hat = mlp(torch.cat([cls1, t], dim=1))
            else:
                cls2 = _first_token_embedding(batch, '2')
                f = batch['frac'].unsqueeze(1).to(device).float()

                # Average the prediction over both component orderings so the
                # result does not depend on which molecule is listed first.
                x1 = torch.cat([cls1, cls2, t, f], dim=1)
                x2 = torch.cat([cls2, cls1, t, 1 - f], dim=1)
                y_hat = 0.5 * (mlp(x1) + mlp(x2))

            # Flatten both sides to 1-D on the CPU so the two modes hand the
            # same shapes to the metric functions.
            reals.append(batch["y"].detach().cpu().reshape(-1))
            preds.append(y_hat.detach().cpu().reshape(-1))

    if not preds:
        raise ValueError("test_loader yielded no batches; nothing to evaluate")

    reals = torch.cat(reals).numpy()
    preds = torch.cat(preds).numpy()

    r2 = r2_score(reals, preds)
    rmse = np.sqrt(mean_squared_error(reals, preds))

    return {"Model_Dir": model_dir, "R2": r2, "RMSE": rmse}

In [47]:
def evaluate_all_models(base_dir="../models/torch", max_length=35):
    """Evaluate the four exported variants and tabulate R2 / RMSE.

    Each variant ``<dataset>_<architecture>`` is read from
    ``{base_dir}/<dataset>_<architecture>`` and scored on the test loader of
    the matching data module (``pure_dm`` or ``mix_dm``).

    Args:
        base_dir: Directory holding one sub-directory per variant.
        max_length: Forwarded unchanged to ``evaluate_torch_pipeline``.

    Returns:
        DataFrame with columns ``Dataset``, ``Arquitetura``, ``R2``, ``RMSE``,
        sorted by dataset then architecture, with a fresh integer index.
    """
    for dm in (pure_dm, mix_dm):
        dm.setup('test')

    # One test loader per variant, created up front in evaluation order.
    loaders = {
        "pure_base": pure_dm.test_dataloader(),
        "pure_lora": pure_dm.test_dataloader(),
        "mix_base": mix_dm.test_dataloader(),
        "mix_lora": mix_dm.test_dataloader(),
    }

    rows = []
    for variant, loader in loaders.items():
        dataset, architecture = variant.split('_')
        metrics = evaluate_torch_pipeline(dataset, f"{base_dir}/{variant}", loader, max_length)
        rows.append({
            "Dataset": dataset,
            "Arquitetura": architecture,
            "R2": metrics["R2"],
            "RMSE": metrics["RMSE"],
        })

    table = pd.DataFrame(rows)
    table = table.sort_values(by=["Dataset", "Arquitetura"])
    return table.reset_index(drop=True)


# Score every exported variant on its test split and print the summary table.
metrics_all_df = evaluate_all_models()
print(metrics_all_df)


  Dataset Arquitetura        R2      RMSE
0     mix        base  0.937714  0.089623
1     mix        lora  0.932687  0.093170
2    pure        base  0.934206  0.109539
3    pure        lora  0.947874  0.097499


In [53]:
# Name the index so it is labelled 'id' when the table is displayed.
metrics_all_df.index.name = 'id'

In [54]:
# Display the metrics table.
metrics_all_df

Unnamed: 0_level_0,Dataset,Arquitetura,R2,RMSE
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,mix,base,0.937714,0.089623
1,mix,lora,0.932687,0.09317
2,pure,base,0.934206,0.109539
3,pure,lora,0.947874,0.097499


In [8]:
# Fresh StandardScaler; it is not fitted in this cell, so it carries no
# learned statistics yet.
scaler = StandardScaler()

In [10]:
# Snapshot the scaler's statistics as plain Python values.
# An attribute can be missing (unfitted scaler) or present but None
# (e.g. StandardScaler(with_mean=False) leaves mean_ as None); both cases map
# to None instead of failing on `.tolist()`.
# Anything exposing `.tolist()` — NumPy arrays, including feature_names_in_ —
# is converted, so every entry ends up as a built-in type.
scaler_dict = {}
for _attr in ('mean_', 'scale_', 'var_', 'n_features_in_', 'feature_names_in_'):
    _value = getattr(scaler, _attr, None)
    scaler_dict[_attr] = _value.tolist() if hasattr(_value, 'tolist') else _value

In [11]:
# Inspect the collected scaler attributes.
scaler_dict

{'mean_': None,
 'scale_': None,
 'var_': None,
 'n_features_in_': None,
 'feature_names_in_': None}

In [15]:
# Scratch check: absolute path two levels above the current working directory.
os.path.abspath(os.path.join(os.getcwd(), '..', '..'))

'c:\\Users\\f0pi\\git'

In [14]:
# Scratch check: current working directory of the kernel.
os.getcwd()

'c:\\Users\\f0pi\\git\\viscosidade-ai\\notebooks'