In [1]:
import os, sys, json, pickle, random
from collections import OrderedDict
from copy import deepcopy

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
from tqdm import tqdm

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import matthews_corrcoef, accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score

from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem, MACCSkeys, rdMolDescriptors, Descriptors
from rdkit.ML.Descriptors import MoleculeDescriptors
from rdkit.Chem.Scaffolds import MurckoScaffold

import optuna
from optuna.trial import Trial

import matplotlib.pyplot as plt
import seaborn as sns

# -------------------- [0. g-mlp 모듈 경로] --------------------
GMLP_DIR = "/home/minji/g-mlp"
if GMLP_DIR not in sys.path:
    sys.path.append(GMLP_DIR)
from g_mlp import gMLP


# -------------------- [1. 공통 유틸/환경] --------------------
def set_seed(seed=700):
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    # 추가: GPU 연산의 무작위성 제거
    if torch.cuda.is_available():
        torch.use_deterministic_algorithms(True)
        os.environ['CUBLAS_WORKSPACE_CONFIG'] = ':4096:8'
    else:
        # CPU 환경에서 무작위성 제거
        torch.use_deterministic_algorithms(True, warn_only=True)
        

set_seed(700)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [2]:
# -------------------- [2. 피처 생성 유틸] --------------------
def to_numpy_bitvect(bitvect, n_bits=None, drop_first=False):
    if n_bits is None:
        n_bits = bitvect.GetNumBits()
    arr = np.zeros((n_bits,), dtype=np.int8)
    DataStructs.ConvertToNumpyArray(bitvect, arr)
    if drop_first:
        arr = arr[1:]
    return arr.astype(np.float32)

def get_ecfp(mol, radius=2, nbits=1024):
    return to_numpy_bitvect(AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=nbits), n_bits=nbits)

def get_maccs(mol):
    bv = MACCSkeys.GenMACCSKeys(mol)
    return to_numpy_bitvect(bv, n_bits=bv.GetNumBits(), drop_first=True)

def get_avalon(mol, nbits=512):
    from rdkit.Avalon import pyAvalonTools
    return to_numpy_bitvect(pyAvalonTools.GetAvalonFP(mol, nbits), n_bits=nbits)

def get_topological_torsion(mol, nbits=1024):
    bv = rdMolDescriptors.GetHashedTopologicalTorsionFingerprintAsBitVect(mol, nBits=nbits)
    return to_numpy_bitvect(bv, n_bits=nbits)

def get_rdkit_desc(mol):
    calc = MoleculeDescriptors.MolecularDescriptorCalculator([d[0] for d in Descriptors._descList])
    try:
        descs = calc.CalcDescriptors(mol)
        descs = np.array(descs, dtype=np.float32)
        descs = np.nan_to_num(descs, nan=0.0, posinf=0.0, neginf=0.0)
    except Exception:
        descs = np.zeros(len(Descriptors._descList), dtype=np.float32)
    return descs

def get_rdkit_descriptor_length():
    return len(Descriptors._descList)

In [3]:
# -------------------- [3. SCAGE 임베딩 로드 & 차원 감지] --------------------
def load_scage_embeddings(scage_paths: dict):
    scage_embed_dicts, scage_dims = {}, {}
    for name, path in scage_paths.items():
        try:
            df = pd.read_csv(path)
        except Exception:
            scage_embed_dicts[name] = {}
            scage_dims[name] = 0
            continue
        
        def canon(s):
            m = Chem.MolFromSmiles(s)
            return Chem.MolToSmiles(m, canonical=True) if m else None

        df['smiles'] = df['smiles'].apply(canon)
        df = df.dropna(subset=['smiles']).reset_index(drop=True)

        embed_cols = [c for c in df.columns if c != 'smiles']
        dim = len(embed_cols)
        scage_dims[name] = dim
        scage_embed_dicts[name] = {
            row['smiles']: row[embed_cols].to_numpy(dtype=np.float32, copy=False)
            for _, row in df.iterrows()
        }
    return scage_embed_dicts, scage_dims

In [4]:
# -------------------- [4. 기대 차원 계산 + 안전 결합] --------------------
def compute_expected_dims(fp_types, scage_dims: dict):
    expected = OrderedDict()
    for t in fp_types:
        if t == 'ecfp':
            expected[t] = 1024
        elif t == 'avalon':
            expected[t] = 512
        elif t == 'maccs':
            expected[t] = 166
        elif t == 'tt':
            expected[t] = 1024
        elif t == 'rdkit':
            expected[t] = get_rdkit_descriptor_length()
        elif 'scage' in t:
            expected[t] = scage_dims.get(t, 512) if scage_dims.get(t, 0) > 0 else 512
        else:
            raise ValueError(f"Unknown fp_type: {t}")
    return expected

def safe_fit_to_dim(vec: np.ndarray, target_dim: int) -> np.ndarray:
    if vec is None:
        return np.zeros(target_dim, dtype=np.float32)
    vec = vec.astype(np.float32, copy=False)
    if np.any(np.isnan(vec)) or np.any(np.isinf(vec)):
        vec = np.nan_to_num(vec, nan=0.0, posinf=0.0, neginf=0.0)
    cur = vec.shape[0]
    if cur == target_dim:
        return vec
    elif cur < target_dim:
        pad = np.zeros(target_dim - cur, dtype=np.float32)
        return np.concatenate([vec, pad], axis=0)
    else:
        return vec[:target_dim]

def make_feature_vector(mol, smiles, fp_types, expected_dims, scage_embed_dicts):
    chunks = []
    for t in fp_types:
        dim = expected_dims[t]
        try:
            if t == 'ecfp':
                vec = get_ecfp(mol, radius=2, nbits=dim)
            elif t == 'avalon':
                vec = get_avalon(mol, nbits=dim)
            elif t == 'maccs':
                vec = get_maccs(mol)
            elif t == 'tt':
                vec = get_topological_torsion(mol, nbits=dim)
            elif t == 'rdkit':
                vec = get_rdkit_desc(mol)
            elif 'scage' in t:
                vec = scage_embed_dicts.get(t, {}).get(smiles, None)
            else:
                vec = None
        except Exception:
            vec = None
        chunks.append(safe_fit_to_dim(vec, dim))
    feat = np.concatenate(chunks, axis=0)
    feat = np.nan_to_num(feat, nan=0.0, posinf=0.0, neginf=0.0).astype(np.float32)
    return feat

In [5]:
# -------------------- [5. Dataset] --------------------
class ScageConcatDataset(data.Dataset):
    def __init__(self, label_path, scage_paths: dict, fp_types, expected_dims=None):
        df = pd.read_csv(label_path)
        df = df[['smiles', 'p_np']].rename(columns={'smiles': 'smiles', 'p_np': 'label'})
        df['label'] = df['label'].replace({'BBB-': 0, 'BBB+': 1})
        df = df.drop_duplicates(subset='smiles').reset_index(drop=True)

        self.scage_embed_dicts, scage_dims = load_scage_embeddings(scage_paths)

        if expected_dims is None:
            expected_dims = compute_expected_dims(fp_types, scage_dims)
        self.expected_dims = expected_dims
        self.fp_types = list(fp_types)

        def canon(s):
            m = Chem.MolFromSmiles(s)
            return Chem.MolToSmiles(m, canonical=True) if m else None

        df['smiles'] = df['smiles'].apply(canon)
        df = df.dropna(subset=['smiles']).reset_index(drop=True)

        features, labels, failed = [], [], []
        for _, row in tqdm(df.iterrows(), total=len(df), desc="Generating Features"):
            smi = row['smiles']
            mol = Chem.MolFromSmiles(smi)
            if mol is None:
                failed.append(smi)
                continue
            feat = make_feature_vector(mol, smi, self.fp_types, self.expected_dims, self.scage_embed_dicts)
            if feat is None or feat.ndim != 1:
                failed.append(smi)
                continue
            features.append(feat)
            labels.append(row['label'])

        self.features = torch.tensor(np.stack(features, axis=0), dtype=torch.float32)
        self.labels = torch.tensor(labels, dtype=torch.float32)
        self.df = df[~df['smiles'].isin(failed)].reset_index(drop=True)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return self.features[idx], self.labels[idx]

In [143]:
# -------------------- [6. 스플릿 + RDKit 정규화 (토글)] --------------------
def split_then_normalize(
    dataset: ScageConcatDataset,
    split_mode: str = "scaffold",
    train_ratio: float = 0.8,
    val_ratio: float = 0.1,
    seed: int = 700
):
    set_seed(seed)
    df = dataset.df.copy()

    def get_scaffold(smi):
        m = Chem.MolFromSmiles(smi)
        return Chem.MolToSmiles(MurckoScaffold.GetScaffoldForMol(m)) if m else None

    df['scaffold'] = df['smiles'].apply(get_scaffold)
    groups = list(df.groupby('scaffold').groups.values())

    if split_mode == "scaffold":
        groups = sorted(groups, key=lambda g: len(g), reverse=True)
    elif split_mode == "random_scaffold":
        rnd = random.Random(seed)
        rnd.shuffle(groups)
    else:
        raise ValueError("split_mode must be 'scaffold' or 'random_scaffold'")

    n = len(df)
    train_cap = int(round(train_ratio * n))
    val_cap   = int(round(val_ratio   * n))

    train_idx, val_idx, test_idx = [], [], []
    for g in groups:
        g = list(g)
        if len(train_idx) + len(g) <= train_cap:
            train_idx += g
        elif len(val_idx) + len(g) <= val_cap:
            val_idx += g
        else:
            test_idx += g

    def pick(idxs):
        return dataset.features[idxs], dataset.labels[idxs]

    X_train, y_train = pick(train_idx)
    X_val,   y_val   = pick(val_idx)
    X_test,  y_test  = pick(test_idx)

    rd_start, rd_end = None, None
    offset = 0
    for t in dataset.fp_types:
        dim = dataset.expected_dims[t]
        if t == 'rdkit':
            rd_start, rd_end = offset, offset + dim
            break
        offset += dim

    scaler = None
    if rd_start is not None:
        scaler = StandardScaler().fit(X_train[:, rd_start:rd_end])
        X_train[:, rd_start:rd_end] = torch.tensor(scaler.transform(X_train[:, rd_start:rd_end]), dtype=torch.float32)
        X_val[:, rd_start:rd_end]   = torch.tensor(scaler.transform(X_val[:, rd_start:rd_end]),   dtype=torch.float32)
        X_test[:, rd_start:rd_end]  = torch.tensor(scaler.transform(X_test[:, rd_start:rd_end]),  dtype=torch.float32)

    return (
        data.TensorDataset(X_train, y_train),
        data.TensorDataset(X_val,   y_val),
        data.TensorDataset(X_test,  y_test),
        scaler, (rd_start, rd_end),
        (train_idx, val_idx, test_idx)
    )

In [144]:
# -------------------- [7. 모델] --------------------
class MultiModalGMLPFromFlat(nn.Module):
    def __init__(self, mod_dims: OrderedDict, d_model=512, d_ffn=1024, depth=4, dropout=0.2, use_gated_pool=True):
        super().__init__()
        self.mod_names = list(mod_dims.keys())
        self.mod_dims = [mod_dims[n] for n in self.mod_names]
        self.in_features = sum(self.mod_dims)
        self.seq_len = len(self.mod_names)
        self.use_gated_pool = use_gated_pool

        self.proj = nn.ModuleDict({name: nn.Linear(in_dim, d_model) for name, in_dim in zip(self.mod_names, self.mod_dims)})
        self.backbone = gMLP(seq_len=self.seq_len, d_model=d_model, d_ffn=d_ffn, num_layers=depth)
        self.norm = nn.LayerNorm(d_model)
        if use_gated_pool:
            self.alpha = nn.Parameter(torch.zeros(self.seq_len))
        self.head = nn.Linear(d_model, 1)
        self.drop = nn.Dropout(dropout)

    def forward(self, x):
        chunks = torch.split(x, self.mod_dims, dim=1)
        tokens = [self.proj[name](chunk) for name, chunk in zip(self.mod_names, chunks)]
        X = torch.stack(tokens, dim=1)
        X = self.backbone(X)
        if self.use_gated_pool:
            w = torch.softmax(self.alpha, dim=0)
            Xp = (X * w.view(1, -1, 1)).sum(dim=1)
        else:
            Xp = X.mean(dim=1)
        Xp = self.drop(self.norm(Xp))
        logits = self.head(Xp).squeeze(-1)
        return logits
    
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)


In [145]:
# -------------------- [8. 학습/평가 루틴] --------------------
def train_model(model, optimizer, train_loader, val_loader, loss_fn, num_epochs=50, patience=10):
    best_val = float('inf'); best_state = None; bad = 0
    for epoch in range(num_epochs):
        model.train()
        for x, y in train_loader:
            x, y = x.to(device), y.to(device)
            optimizer.zero_grad()
            logits = model(x)
            loss = loss_fn(logits, y)
            loss.backward()
            optimizer.step()
        
        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for x, y in val_loader:
                x, y = x.to(device), y.to(device)
                val_loss += loss_fn(model(x), y).item()
        val_loss /= len(val_loader)
        
        if val_loss < best_val:
            best_val = val_loss
            best_state = deepcopy(model.state_dict())
            bad = 0
        else:
            bad += 1
            if bad >= patience:
                break
    if best_state:
        model.load_state_dict(best_state)
    return model

def eval_model(model, loader):
    model.eval()
    y_true, y_prob, y_pred = [], [], []
    with torch.no_grad():
        for x, y in loader:
            x = x.to(device)
            logits = model(x)
            probs = torch.sigmoid(logits).cpu().numpy()
            pred = (probs > 0.5).astype(int)
            y_prob.extend(probs)
            y_pred.extend(pred)
            y_true.extend(y.numpy())
    cm = confusion_matrix(y_true, y_pred)
    if cm.size == 4:
        tn, fp, fn, tp = cm.ravel()
        specificity = tn / (tn + fp) if (tn + fp) > 0 else 0.0
        sensitivity = recall_score(y_true, y_pred, zero_division=0)
    else:
        specificity, sensitivity = 0.0, 0.0
    return {
        'accuracy': round(accuracy_score(y_true, y_pred), 3),
        'precision': round(precision_score(y_true, y_pred, zero_division=0), 3),
        'recall': round(sensitivity, 3),
        'f1': round(f1_score(y_true, y_pred, zero_division=0), 3),
        'roc_auc': round(roc_auc_score(y_true, y_prob) if len(set(y_true)) > 1 else 0.0, 3),
        'mcc': round(matthews_corrcoef(y_true, y_pred), 3),
        'sensitivity': sensitivity,
        'specificity': specificity,
    }

In [146]:
# -------------------- [9. 스플릿 인덱스 저장/로드 (SCAGE 공정 비교용)] --------------------
def save_split_indices(out_dir, tag, split_indices):
    os.makedirs(out_dir, exist_ok=True)
    train_idx, val_idx, test_idx = split_indices
    np.save(os.path.join(out_dir, f"train_idx_{tag}.npy"), np.array(train_idx, dtype=np.int64))
    np.save(os.path.join(out_dir, f"val_idx_{tag}.npy"),   np.array(val_idx,   dtype=np.int64))
    np.save(os.path.join(out_dir, f"test_idx_{tag}.npy"),  np.array(test_idx,  dtype=np.int64))

def load_split_indices(path, tag):
    train_idx = np.load(os.path.join(path, f"train_idx_{tag}.npy"))
    val_idx = np.load(os.path.join(path, f"val_idx_{tag}.npy"))
    test_idx = np.load(os.path.join(path, f"test_idx_{tag}.npy"))
    return train_idx, val_idx, test_idx
# -------------------- [10. 구성/스케일러 저장·로드] --------------------
def save_config(cfg_path, config: dict):
    with open(cfg_path, 'w') as f:
        json.dump(config, f, indent=2)

def load_config(cfg_path):
    with open(cfg_path, 'r') as f:
        return json.load(f)

def save_scaler(path, scaler):
    with open(path, 'wb') as f:
        pickle.dump(scaler, f)

def load_scaler(path):
    with open(path, 'rb') as f:
        return pickle.load(f)

In [87]:
if __name__ == "__main__":
    # (A) 실험 스위치
    split_mode = "scaffold" # "scaffold" | "random_scaffold"
    
    # (A-1) 반복 실험을 위한 시드 리스트
    seeds = [42, 100, 200, 300, 400, 500, 600, 700, 800, 900]
    
    # (A-2) 결과를 저장할 리스트
    results = []

    # (L) 반복문 시작
    for seed in seeds:
        print("\n" + "="*80)
        print(f"--- [STARTING NEW RUN] Split Mode: {split_mode}, Seed: {seed} ---")
        print("="*80)
        
        # (B) 경로/설정 — BBBP(2039)만 사용
        label_path = '/home/minji/Downloads/bbbp.csv'
        scage_paths = {
            'scage1': '/home/minji/scage/BBB/bench_embed.csv',
            'scage2' : '/home/minji/scage/BBB/bench_atom_embed.csv'
        }
        fp_types = ['rdkit', 'scage1', 'mole']

        # (C) Dataset & dims
        dataset = ScageConcatDataset(label_path, scage_paths, fp_types=fp_types)
        expected_dims = dataset.expected_dims
        mod_dims = OrderedDict((t, expected_dims[t]) for t in fp_types)

        # (D) Split + RDKit normalize (8:1:1)
        train_ds, val_ds, test_ds, scaler, (rd_start, rd_end), split_indices = split_then_normalize(
            dataset, split_mode=split_mode, train_ratio=0.8, val_ratio=0.1, seed=seed
        )

        # (E) 스플릿 저장(→ SCAGE도 같은 인덱스로 학습/평가 가능)
        split_dir = "./splits"
        tag = f"{split_mode}_seed{seed}"
        save_split_indices(split_dir, tag, split_indices)
        print(f"[Save] split indices -> {split_dir} (tag={tag})")

        # (F) DataLoader
        train_loader = data.DataLoader(train_ds, batch_size=128, shuffle=True)
        val_loader   = data.DataLoader(val_ds,   batch_size=128, shuffle=False)
        test_loader  = data.DataLoader(test_ds,  batch_size=128, shuffle=False)
        
        # --- 클래스 분포 분석 및 시각화 ---
        def plot_class_distribution(train_labels, val_labels, test_labels, seed):
            labels_map = {0: 'Negative', 1: 'Positive'}
            train_counts = pd.Series(train_labels).map(labels_map).value_counts(normalize=True).sort_index()
            val_counts = pd.Series(val_labels).map(labels_map).value_counts(normalize=True).sort_index()
            test_counts = pd.Series(test_labels).map(labels_map).value_counts(normalize=True).sort_index()

            counts_df = pd.DataFrame({
                'Train': train_counts,
                'Validation': val_counts,
                'Test': test_counts
            }).fillna(0)

            fig, ax = plt.subplots(figsize=(8, 6))
            counts_df.T.plot(kind='bar', stacked=False, ax=ax, rot=0)
            ax.set_title(f'Class Distribution by Split (Seed: {seed})')
            ax.set_ylabel('Proportion')
            ax.set_xlabel('Dataset Split')
            ax.legend(title='Class')
            plt.tight_layout()
            
            os.makedirs("./artifacts", exist_ok=True)
            plt.savefig(f'./artifacts/class_distribution_{split_mode}_seed{seed}.png')
            plt.close()

        y_train = train_ds.tensors[1].cpu().numpy()
        y_val = val_ds.tensors[1].cpu().numpy()
        y_test = test_ds.tensors[1].cpu().numpy()

        print(f"\n--- [Info] Class distribution for seed {seed} ---")
        print(f"Train: Positive={np.mean(y_train):.2f}, Negative={1-np.mean(y_train):.2f}")
        print(f"Validation: Positive={np.mean(y_val):.2f}, Negative={1-np.mean(y_val):.2f}")
        print(f"Test: Positive={np.mean(y_test):.2f}, Negative={1-np.mean(y_test):.2f}")
        
        plot_class_distribution(y_train, y_val, y_test, seed)

        # (G) Model
        set_seed(seed)
        model = MultiModalGMLPFromFlat(
            mod_dims=mod_dims, d_model=512, d_ffn=1048, depth=4, dropout=0.2, use_gated_pool=True
        ).to(device)

        # (H) pos_weight (옵션)
        pos_weight = None
        try:
            n_pos = (y_train == 1).sum()
            n_neg = (y_train == 0).sum()
            if n_pos > 0:
                pos_weight = torch.tensor([max(n_neg / n_pos, 1.0)], dtype=torch.float32, device=device)
                print(f"[Info] Using pos_weight={pos_weight.item():.4f} (neg/pos={n_neg}/{n_pos})")
        except Exception as e:
            print(f"[Warn] pos_weight auto-calc skipped: {e}")

        optimizer = optim.Adam(model.parameters(), lr=1e-4, weight_decay=1e-5)
        loss_fn = nn.BCEWithLogitsLoss(pos_weight=pos_weight) if pos_weight is not None else nn.BCEWithLogitsLoss()

        # (I) Train
        print(f"--- [STEP 1] Training gMLP on BBBP (split_mode={split_mode}) ---")
        model = train_model(model, optimizer, train_loader, val_loader, loss_fn, num_epochs=50, patience=10)
        print("\n--- [STEP 1.5] Evaluating on BBBP Validation Split ---")
        baseline_val_metrics = eval_model(model, val_loader)
        
        # (J) Test eval (== 공식 벤치마크 성능 지표)
        print("\n--- [STEP 2] Evaluating on BBBP Test Split ---")
        metrics = eval_model(model, test_loader)
        
        # (J-1) 각 시드별 결과 저장
        metrics['seed'] = seed
        metrics['train_pos_ratio'] = np.mean(y_train)
        metrics['val_pos_ratio'] = np.mean(y_val)
        metrics['test_pos_ratio'] = np.mean(y_test)
        results.append(metrics)
        
        # (K) 아티팩트 저장 (모드 태그로 구분)
        model_path  = f"./artifacts/gmlp_best_model_{tag}.pth"
        cfg_path    = f"./artifacts/feature_config_{tag}.json"
        scaler_path = f"./artifacts/rdkit_scaler_{tag}.pkl"
        torch.save(model.state_dict(), model_path)
        cfg = {
            "fp_types": fp_types,
            "mod_dims": {k: int(v) for k, v in mod_dims.items()},
            "rd_slice": [rd_start, rd_end] if rd_start is not None else None,
            "split_mode": split_mode,
            "seed": seed,
            "model_hparams": {"d_model": 512, "d_ffn": 1048, "depth": 4, "dropout": 0.2, "use_gated_pool": True},
            "train_params": {"lr": 1e-4, "weight_decay": 1e-5, "num_epochs": 50, "patience": 10},
            "class_balance": {"n_pos": int(n_pos) if 'n_pos' in locals() else None,
                               "n_neg": int(n_neg) if 'n_neg' in locals() else None,
                               "pos_weight": float(pos_weight.item()) if pos_weight is not None else None}
        }
        save_config(cfg_path, cfg)
        if scaler is not None:
            save_scaler(scaler_path, scaler)
    
    # (M) 최종 결과 요약 및 출력
    print("\n" + "="*80)
    print("--- [FINAL SUMMARY] Average Performance Across 10 Seeds ---")
    print("="*80)
    
    # pandas DataFrame으로 변환하여 평균 계산
    results_df = pd.DataFrame(results).set_index('seed')
    summary = results_df.mean()
    std_dev = results_df.std()
    
    for metric in summary.index:
        print(f"{metric:<18}| Mean: {summary[metric]:.4f} | Std Dev: {std_dev[metric]:.4f}")
    
    # DataFrame을 CSV 파일로 저장 (선택 사항)
    results_df.to_csv(f"./artifacts/multi_seed_results_{split_mode}.csv")
    print(f"\n[Save] Detailed results saved to ./artifacts/multi_seed_results_{split_mode}.csv")
    print("="*80)


--- [STARTING NEW RUN] Split Mode: scaffold, Seed: 42 ---


  descs = np.array(descs, dtype=np.float32)
Generating Features: 100%|███████████████████████████████████████████████| 2039/2039 [00:13<00:00, 150.50it/s]


[Save] split indices -> ./splits (tag=scaffold_seed42)

--- [Info] Class distribution for seed 42 ---
Train: Positive=0.75, Negative=0.25
Validation: Positive=0.73, Negative=0.27
Test: Positive=0.89, Negative=0.11
[Info] Using pos_weight=1.0000 (neg/pos=402/1229)
--- [STEP 1] Training gMLP on BBBP (split_mode=scaffold) ---

--- [STEP 1.5] Evaluating on BBBP Validation Split ---

--- [STEP 2] Evaluating on BBBP Test Split ---

--- [STARTING NEW RUN] Split Mode: scaffold, Seed: 100 ---


  descs = np.array(descs, dtype=np.float32)
Generating Features: 100%|███████████████████████████████████████████████| 2039/2039 [00:14<00:00, 141.47it/s]


[Save] split indices -> ./splits (tag=scaffold_seed100)

--- [Info] Class distribution for seed 100 ---
Train: Positive=0.75, Negative=0.25
Validation: Positive=0.73, Negative=0.27
Test: Positive=0.89, Negative=0.11
[Info] Using pos_weight=1.0000 (neg/pos=402/1229)
--- [STEP 1] Training gMLP on BBBP (split_mode=scaffold) ---

--- [STEP 1.5] Evaluating on BBBP Validation Split ---

--- [STEP 2] Evaluating on BBBP Test Split ---

--- [STARTING NEW RUN] Split Mode: scaffold, Seed: 200 ---


  descs = np.array(descs, dtype=np.float32)
Generating Features: 100%|███████████████████████████████████████████████| 2039/2039 [00:14<00:00, 136.87it/s]


[Save] split indices -> ./splits (tag=scaffold_seed200)

--- [Info] Class distribution for seed 200 ---
Train: Positive=0.75, Negative=0.25
Validation: Positive=0.73, Negative=0.27
Test: Positive=0.89, Negative=0.11
[Info] Using pos_weight=1.0000 (neg/pos=402/1229)
--- [STEP 1] Training gMLP on BBBP (split_mode=scaffold) ---

--- [STEP 1.5] Evaluating on BBBP Validation Split ---

--- [STEP 2] Evaluating on BBBP Test Split ---

--- [STARTING NEW RUN] Split Mode: scaffold, Seed: 300 ---


  descs = np.array(descs, dtype=np.float32)
Generating Features: 100%|███████████████████████████████████████████████| 2039/2039 [00:13<00:00, 145.99it/s]


[Save] split indices -> ./splits (tag=scaffold_seed300)

--- [Info] Class distribution for seed 300 ---
Train: Positive=0.75, Negative=0.25
Validation: Positive=0.73, Negative=0.27
Test: Positive=0.89, Negative=0.11
[Info] Using pos_weight=1.0000 (neg/pos=402/1229)
--- [STEP 1] Training gMLP on BBBP (split_mode=scaffold) ---

--- [STEP 1.5] Evaluating on BBBP Validation Split ---

--- [STEP 2] Evaluating on BBBP Test Split ---

--- [STARTING NEW RUN] Split Mode: scaffold, Seed: 400 ---


  descs = np.array(descs, dtype=np.float32)
Generating Features: 100%|███████████████████████████████████████████████| 2039/2039 [00:14<00:00, 145.39it/s]


[Save] split indices -> ./splits (tag=scaffold_seed400)

--- [Info] Class distribution for seed 400 ---
Train: Positive=0.75, Negative=0.25
Validation: Positive=0.73, Negative=0.27
Test: Positive=0.89, Negative=0.11
[Info] Using pos_weight=1.0000 (neg/pos=402/1229)
--- [STEP 1] Training gMLP on BBBP (split_mode=scaffold) ---

--- [STEP 1.5] Evaluating on BBBP Validation Split ---

--- [STEP 2] Evaluating on BBBP Test Split ---

--- [STARTING NEW RUN] Split Mode: scaffold, Seed: 500 ---


  descs = np.array(descs, dtype=np.float32)
Generating Features: 100%|███████████████████████████████████████████████| 2039/2039 [00:14<00:00, 144.25it/s]


[Save] split indices -> ./splits (tag=scaffold_seed500)

--- [Info] Class distribution for seed 500 ---
Train: Positive=0.75, Negative=0.25
Validation: Positive=0.73, Negative=0.27
Test: Positive=0.89, Negative=0.11
[Info] Using pos_weight=1.0000 (neg/pos=402/1229)
--- [STEP 1] Training gMLP on BBBP (split_mode=scaffold) ---

--- [STEP 1.5] Evaluating on BBBP Validation Split ---

--- [STEP 2] Evaluating on BBBP Test Split ---

--- [STARTING NEW RUN] Split Mode: scaffold, Seed: 600 ---


  descs = np.array(descs, dtype=np.float32)
Generating Features: 100%|███████████████████████████████████████████████| 2039/2039 [00:14<00:00, 140.65it/s]


[Save] split indices -> ./splits (tag=scaffold_seed600)

--- [Info] Class distribution for seed 600 ---
Train: Positive=0.75, Negative=0.25
Validation: Positive=0.73, Negative=0.27
Test: Positive=0.89, Negative=0.11
[Info] Using pos_weight=1.0000 (neg/pos=402/1229)
--- [STEP 1] Training gMLP on BBBP (split_mode=scaffold) ---

--- [STEP 1.5] Evaluating on BBBP Validation Split ---

--- [STEP 2] Evaluating on BBBP Test Split ---

--- [STARTING NEW RUN] Split Mode: scaffold, Seed: 700 ---


  descs = np.array(descs, dtype=np.float32)
Generating Features: 100%|███████████████████████████████████████████████| 2039/2039 [00:13<00:00, 153.26it/s]


[Save] split indices -> ./splits (tag=scaffold_seed700)

--- [Info] Class distribution for seed 700 ---
Train: Positive=0.75, Negative=0.25
Validation: Positive=0.73, Negative=0.27
Test: Positive=0.89, Negative=0.11
[Info] Using pos_weight=1.0000 (neg/pos=402/1229)
--- [STEP 1] Training gMLP on BBBP (split_mode=scaffold) ---

--- [STEP 1.5] Evaluating on BBBP Validation Split ---

--- [STEP 2] Evaluating on BBBP Test Split ---

--- [STARTING NEW RUN] Split Mode: scaffold, Seed: 800 ---


  descs = np.array(descs, dtype=np.float32)
Generating Features: 100%|███████████████████████████████████████████████| 2039/2039 [00:14<00:00, 140.23it/s]


[Save] split indices -> ./splits (tag=scaffold_seed800)

--- [Info] Class distribution for seed 800 ---
Train: Positive=0.75, Negative=0.25
Validation: Positive=0.73, Negative=0.27
Test: Positive=0.89, Negative=0.11
[Info] Using pos_weight=1.0000 (neg/pos=402/1229)
--- [STEP 1] Training gMLP on BBBP (split_mode=scaffold) ---

--- [STEP 1.5] Evaluating on BBBP Validation Split ---

--- [STEP 2] Evaluating on BBBP Test Split ---

--- [STARTING NEW RUN] Split Mode: scaffold, Seed: 900 ---


  descs = np.array(descs, dtype=np.float32)
Generating Features: 100%|███████████████████████████████████████████████| 2039/2039 [00:14<00:00, 138.33it/s]


[Save] split indices -> ./splits (tag=scaffold_seed900)

--- [Info] Class distribution for seed 900 ---
Train: Positive=0.75, Negative=0.25
Validation: Positive=0.73, Negative=0.27
Test: Positive=0.89, Negative=0.11
[Info] Using pos_weight=1.0000 (neg/pos=402/1229)
--- [STEP 1] Training gMLP on BBBP (split_mode=scaffold) ---

--- [STEP 1.5] Evaluating on BBBP Validation Split ---

--- [STEP 2] Evaluating on BBBP Test Split ---

--- [FINAL SUMMARY] Average Performance Across 10 Seeds ---
roc_auc           | Mean: 0.7938 | Std Dev: 0.0176
f1                | Mean: 0.9426 | Std Dev: 0.0022
mcc               | Mean: 0.2342 | Std Dev: 0.0285
accuracy          | Mean: 0.8931 | Std Dev: 0.0039
precision         | Mean: 0.9050 | Std Dev: 0.0018
recall            | Mean: 0.9835 | Std Dev: 0.0045
sensitivity       | Mean: 0.9835 | Std Dev: 0.0045
specificity       | Mean: 0.1455 | Std Dev: 0.0192
train_pos_ratio   | Mean: 0.7535 | Std Dev: 0.0000
val_pos_ratio     | Mean: 0.7304 | Std Dev: 0.000

In [147]:
# -------------------- [11. 메인: BBBP만으로 학습/평가] --------------------
if __name__ == "__main__":
    # (A) 실험 스위치
    split_mode = "random_scaffold"  # "scaffold" | "random_scaffold"
    seed = 42

    # (B) 경로/설정 — BBBP(2039)만 사용
    label_path = '/home/minji/Downloads/bbbp.csv'
    scage_paths = {
        'scage1': '/home/minji/scage/BBB/bench_embed.csv',
        'scage2' : '/home/minji/scage/BBB/bench_atom_embed.csv'
    }
    fp_types = ['ecfp', 'avalon', 'rdkit', 'maccs', 'tt', 'scage1', 'scage2']

    # (C) Dataset & dims
    dataset = ScageConcatDataset(label_path, scage_paths, fp_types=fp_types)
    expected_dims = dataset.expected_dims
    mod_dims = OrderedDict((t, expected_dims[t]) for t in fp_types)

    # (D) Split + RDKit normalize (8:1:1)
    train_ds, val_ds, test_ds, scaler, (rd_start, rd_end), split_indices = split_then_normalize(
        dataset, split_mode=split_mode, train_ratio=0.8, val_ratio=0.1, seed=seed
    )

    # (E) 스플릿 저장(→ SCAGE도 같은 인덱스로 학습/평가 가능)
    split_dir = "./splits"
    tag = f"{split_mode}_seed{seed}"
    save_split_indices(split_dir, tag, split_indices)
    print(f"[Save] split indices -> {split_dir} (tag={tag})")

    # (F) DataLoader
    train_loader = data.DataLoader(train_ds, batch_size=128, shuffle=True)
    val_loader   = data.DataLoader(val_ds,   batch_size=128, shuffle=False)
    test_loader  = data.DataLoader(test_ds,  batch_size=128, shuffle=False)
    # (G) Model
    set_seed(seed)
    model = MultiModalGMLPFromFlat(
        mod_dims=mod_dims, d_model=512, d_ffn=1048, depth=4, dropout=0.2, use_gated_pool=True
    ).to(device)

    # (H) pos_weight (옵션)
    pos_weight = None
    try:
        y_train_np = train_ds.tensors[1].cpu().numpy()
        n_pos = (y_train_np == 1).sum()
        n_neg = (y_train_np == 0).sum()
        if n_pos > 0:
            pos_weight = torch.tensor([max(n_neg / n_pos, 1.0)], dtype=torch.float32, device=device)
            print(f"[Info] Using pos_weight={pos_weight.item():.4f} (neg/pos={n_neg}/{n_pos})")
    except Exception as e:
        print(f"[Warn] pos_weight auto-calc skipped: {e}")

    optimizer = optim.Adam(model.parameters(), lr=1e-4, weight_decay=1e-5)
    loss_fn = nn.BCEWithLogitsLoss(pos_weight=pos_weight) if pos_weight is not None else nn.BCEWithLogitsLoss()

    # (I) Train
    print(f"--- [STEP 1] Training gMLP on BBBP (split_mode={split_mode}) ---")
    model = train_model(model, optimizer, train_loader, val_loader, loss_fn, num_epochs=50, patience=10)
    print("\n--- [STEP 1.5] Evaluating on BBBP Validation Split ---")
    baseline_val_metrics = eval_model(model, val_loader)
    print("\n" + "="*50)
    print(f"BASELINE PERFORMANCE on Validation Set (split={split_mode}, seed={seed})")
    print("="*50)
    for k, v in baseline_val_metrics.items():
        print(f"{k:<12}| {v:.4f}")
    print("="*50)
    
    # (J) Test eval (== 공식 벤치마크 성능 지표)
    print("\n--- [STEP 2] Evaluating on BBBP Test Split ---")
    metrics = eval_model(model, test_loader)
    print("\n" + "="*50)
    print(f"FINAL PERFORMANCE on BBBP (split={split_mode}, seed={seed})")
    print("="*50)
    for k, v in metrics.items():
        print(f"{k:<12}| {v:.4f}")
    print("="*50)

    # (K) 아티팩트 저장 (모드 태그로 구분)
    os.makedirs("./artifacts", exist_ok=True)
    model_path  = f"./artifacts/gmlp_best_model_{tag}.pth"
    cfg_path    = f"./artifacts/feature_config_{tag}.json"
    scaler_path = f"./artifacts/rdkit_scaler_{tag}.pkl"

    torch.save(model.state_dict(), model_path)
    print(f"\n[Save] model -> {model_path}")

    cfg = {
        "fp_types": fp_types,
        "mod_dims": {k: int(v) for k, v in mod_dims.items()},
        "rd_slice": [rd_start, rd_end] if rd_start is not None else None,
        "split_mode": split_mode,
        "seed": seed,
        "model_hparams": {"d_model": 512, "d_ffn": 1048, "depth": 4, "dropout": 0.2, "use_gated_pool": True},
        "train_params": {"lr": 1e-4, "weight_decay": 1e-5, "num_epochs": 50, "patience": 10},
        "class_balance": {"n_pos": int(n_pos) if 'n_pos' in locals() else None,
                          "n_neg": int(n_neg) if 'n_neg' in locals() else None,
                          "pos_weight": float(pos_weight.item()) if pos_weight is not None else None}
    }
    save_config(cfg_path, cfg)
    print(f"[Save] config -> {cfg_path}")

    if scaler is not None:
        save_scaler(scaler_path, scaler)
        print(f"[Save] RDKit scaler -> {scaler_path}")
    else:
        print("[Info] RDKit slice not found. Skipping scaler save.")
    # (L) 학습된 alpha 파라미터 확인
    print("\n--- [STEP 3] Final Learned Alpha Parameters ---")
    if model.use_gated_pool:
        # 모델의 alpha 파라미터 값을 직접 가져와서 출력
        learned_alpha = model.alpha.detach().cpu().numpy()
        normalized_weights = torch.softmax(torch.tensor(learned_alpha), dim=0).numpy()
        
        # 모달리티 이름과 함께 출력
        for i, name in enumerate(model.mod_names):
            print(f"  - '{name}': alpha = {learned_alpha[i]:.4f}, normalized weight = {normalized_weights[i]:.4f}")
        
    else:
        print("Gated pooling is not enabled (use_gated_pool=False).")

  descs = np.array(descs, dtype=np.float32)
Generating Features: 100%|███████████████████████████████████████████████| 2039/2039 [00:14<00:00, 140.40it/s]


[Save] split indices -> ./splits (tag=random_scaffold_seed700)
[Info] Using pos_weight=1.0000 (neg/pos=373/1258)
--- [STEP 1] Training gMLP on BBBP (split_mode=random_scaffold) ---

--- [STEP 1.5] Evaluating on BBBP Validation Split ---

BASELINE PERFORMANCE on Validation Set (split=random_scaffold, seed=700)
accuracy    | 0.8870
precision   | 0.9100
recall      | 0.9500
f1          | 0.9290
roc_auc     | 0.8850
mcc         | 0.6560
sensitivity | 0.9497
specificity | 0.6667

--- [STEP 2] Evaluating on BBBP Test Split ---

FINAL PERFORMANCE on BBBP (split=random_scaffold, seed=700)
accuracy    | 0.7400
precision   | 0.7740
recall      | 0.8880
f1          | 0.8270
roc_auc     | 0.7500
mcc         | 0.3250
sensitivity | 0.8881
specificity | 0.3934

[Save] model -> ./artifacts/gmlp_best_model_random_scaffold_seed700.pth
[Save] config -> ./artifacts/feature_config_random_scaffold_seed700.json
[Save] RDKit scaler -> ./artifacts/rdkit_scaler_random_scaffold_seed700.pkl

--- [STEP 3] Final Le

In [115]:
import optuna
from optuna.trial import Trial
from copy import deepcopy
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
from sklearn.metrics import matthews_corrcoef

def train_model_mcc_optuna(model, optimizer, train_loader, val_loader, loss_fn, num_epochs=50, patience=10):
    best_val_mcc = -1.0
    best_state = None
    bad_epochs = 0
    
    for epoch in range(num_epochs):
        model.train()
        for x, y in train_loader:
            x, y = x.to(device), y.to(device)
            optimizer.zero_grad()
            logits = model(x)
            loss = loss_fn(logits, y)
            loss.backward()
            optimizer.step()
        
        model.eval()
        y_true, y_pred = [], []
        with torch.no_grad():
            for x, y in val_loader:
                x, y = x.to(device), y.to(device)
                logits = model(x)
                probs = torch.sigmoid(logits).cpu().numpy()
                pred = (probs > 0.5).astype(int)
                y_true.extend(y.cpu().numpy())
                y_pred.extend(pred)

        val_mcc = matthews_corrcoef(y_true, y_pred) if len(set(y_true)) > 1 else 0.0
        
        if val_mcc > best_val_mcc:
            best_val_mcc = val_mcc
            best_state = deepcopy(model.state_dict()) # 최적 모델의 state_dict 저장
            bad_epochs = 0
        else:
            bad_epochs += 1
            if bad_epochs >= patience:
                break
    
    return best_val_mcc, best_state # 최적 MCC와 state_dict 반환

In [116]:
def objective(trial: Trial, train_ds, val_ds, mod_dims, pos_weight):
    set_seed(900)

    # 하이퍼파라미터 후보 범위 (기존과 동일)
    d_model = trial.suggest_categorical('d_model', [256, 512, 1024])
    d_ffn = trial.suggest_int('d_ffn', d_model * 2, d_model * 4) 
    depth = trial.suggest_int('depth', 2, 6)
    dropout = trial.suggest_float('dropout', 0.1, 0.4)
    lr = trial.suggest_float('lr', 1e-5, 1e-3, log=True)
    batch_size = trial.suggest_categorical('batch_size', [64, 128, 256])

    model = MultiModalGMLPFromFlat(
        mod_dims=mod_dims, d_model=d_model, d_ffn=d_ffn, depth=depth, dropout=dropout, use_gated_pool=True
    ).to(device)

    optimizer = optim.Adam(model.parameters(), lr=lr)
    loss_fn = nn.BCEWithLogitsLoss(pos_weight=pos_weight) if pos_weight is not None else nn.BCEWithLogitsLoss()

    g = torch.Generator()
    g.manual_seed(900)
    train_loader = data.DataLoader(train_ds, batch_size=batch_size, shuffle=True, pin_memory=True, generator=g)
    val_loader = data.DataLoader(val_ds, batch_size=batch_size, shuffle=False, pin_memory=True)
    
    # 모델 학습 및 최적 MCC, state_dict 반환
    val_mcc, best_state = train_model_mcc_optuna(model, optimizer, train_loader, val_loader, loss_fn)
    
    # Optuna 트라이얼에 최적의 state_dict를 사용자 속성으로 저장
    trial.set_user_attr('best_model_state', best_state)
    
    return val_mcc

In [117]:
# -------------------- [Optuna 실행 및 최종 평가] --------------------
print("--- Starting Optuna Hyperparameter Optimization (Objective: Maximize MCC) ---")

study = optuna.create_study(direction='maximize')
study.optimize(lambda trial: objective(trial, train_ds, val_ds, mod_dims, pos_weight), n_trials=100, catch=(torch.cuda.OutOfMemoryError,))

print("\n--- Best Hyperparameters Found ---")
best_params = study.best_trial.params
print(best_params)

print("\n--- Final Evaluation with Best Parameters ---")

# 최적의 하이퍼파라미터로 모델 인스턴스만 생성 (학습은 하지 않음)
model_best = MultiModalGMLPFromFlat(
    mod_dims=mod_dims,
    d_model=best_params['d_model'],
    d_ffn=best_params['d_ffn'],
    depth=best_params['depth'],
    dropout=best_params['dropout'],
    use_gated_pool=True
).to(device)

# Optuna가 저장한 최적의 state_dict를 불러옴
best_state = study.best_trial.user_attrs['best_model_state']
model_best.load_state_dict(best_state)

# 검증 세트 평가를 위한 DataLoader 생성 (최적의 배치 크기 사용)
val_loader_optuna = data.DataLoader(val_ds, batch_size=best_params['batch_size'], shuffle=False, pin_memory=True)
# 검증 세트로 성능 평가
optuna_val_metrics = eval_model(model_best, val_loader_optuna)

print("\n" + "="*50)
print("BEST OPTUNA MODEL PERFORMANCE on Validation Set")
print("="*50)
for k, v in optuna_val_metrics.items():
    print(f"{k:<12}| {v:.4f}")
print("="*50)
# [새로 추가된 부분 끝]

# 최종 평가를 위한 DataLoader 생성
test_loader = data.DataLoader(test_ds, batch_size=best_params['batch_size'], shuffle=False, pin_memory=True)
metrics = eval_model(model_best, test_loader)

# 최종 결과 출력 (기존과 동일)
print("\n" + "="*50)
print("FINAL PERFORMANCE on BBBP with Best HPs (Objective: Maximize MCC)")
print("="*50)
for k, v in metrics.items():
    print(f"{k:<12}| {v:.4f}")
print("="*50)

# 최적화된 모델의 alpha 값 확인 (기존과 동일)
print("\n--- Final Learned Alpha Parameters with Best HPs ---")
if model_best.use_gated_pool:
    learned_alpha = model_best.alpha.detach().cpu().numpy()
    normalized_weights = torch.softmax(torch.tensor(learned_alpha), dim=0).numpy()
    
    for i, name in enumerate(model_best.mod_names):
        print(f"  - '{name}': alpha = {learned_alpha[i]:.4f}, normalized weight = {normalized_weights[i]:.4f}")
else:
    print("Gated pooling is not enabled.")

[I 2025-09-25 14:05:20,136] A new study created in memory with name: no-name-25ea5185-0638-4a52-b14f-4a11d92b7391


--- Starting Optuna Hyperparameter Optimization (Objective: Maximize MCC) ---


[I 2025-09-25 14:05:27,388] Trial 0 finished with value: 0.7582620144832126 and parameters: {'d_model': 512, 'd_ffn': 2024, 'depth': 4, 'dropout': 0.13893510083926988, 'lr': 1.2474284146438976e-05, 'batch_size': 64}. Best is trial 0 with value: 0.7582620144832126.
[I 2025-09-25 14:05:30,223] Trial 1 finished with value: 0.7187831727079755 and parameters: {'d_model': 512, 'd_ffn': 1786, 'depth': 4, 'dropout': 0.3329315528817993, 'lr': 0.00018042112129334085, 'batch_size': 256}. Best is trial 0 with value: 0.7582620144832126.
[I 2025-09-25 14:05:40,714] Trial 2 finished with value: 0.7069498703111543 and parameters: {'d_model': 1024, 'd_ffn': 4040, 'depth': 3, 'dropout': 0.39175425085937476, 'lr': 4.835279946459043e-05, 'batch_size': 128}. Best is trial 0 with value: 0.7582620144832126.
[I 2025-09-25 14:05:43,417] Trial 3 finished with value: 0.6882697943419441 and parameters: {'d_model': 256, 'd_ffn': 533, 'depth': 5, 'dropout': 0.3073008844666876, 'lr': 5.5355180252296156e-05, 'batch_s


--- Best Hyperparameters Found ---
{'d_model': 256, 'd_ffn': 964, 'depth': 4, 'dropout': 0.311008758334534, 'lr': 3.135188168159014e-05, 'batch_size': 256}

--- Final Evaluation with Best Parameters ---

BEST OPTUNA MODEL PERFORMANCE on Validation Set
accuracy    | 0.9170
precision   | 0.9370
recall      | 0.9550
f1          | 0.9460
roc_auc     | 0.9160
mcc         | 0.7670
sensitivity | 0.9548
specificity | 0.7959

FINAL PERFORMANCE on BBBP with Best HPs (Objective: Maximize MCC)
accuracy    | 0.8970
precision   | 0.9290
recall      | 0.9460
f1          | 0.9380
roc_auc     | 0.9140
mcc         | 0.6430
sensitivity | 0.9461
specificity | 0.6757

--- Final Learned Alpha Parameters with Best HPs ---
  - 'ecfp': alpha = 0.0037, normalized weight = 0.1434
  - 'avalon': alpha = -0.0008, normalized weight = 0.1428
  - 'rdkit': alpha = -0.0026, normalized weight = 0.1425
  - 'maccs': alpha = -0.0008, normalized weight = 0.1428
  - 'tt': alpha = 0.0044, normalized weight = 0.1435
  - 'scage

In [None]:
print("\n--- Saving the Best Model and Hyperparameters to Disk ---")

# 최적 트라이얼에서 모델 상태와 하이퍼파라미터 추출
best_model_state = study.best_trial.user_attrs['best_model_state']
best_hparams = study.best_trial.params

# 저장 경로 설정
out_dir = './best_model_artifacts'
os.makedirs(out_dir, exist_ok=True)
model_path = os.path.join(out_dir, 'best_model.pth')
hparams_path = os.path.join(out_dir, 'best_hparams.json')

# 1) 모델 가중치 저장
torch.save(best_model_state, model_path)
print(f"✅ Best model state saved to: {model_path}")

# 2) 하이퍼파라미터 저장
with open(hparams_path, 'w') as f:
    json.dump(best_hparams, f, indent=4)
print(f"✅ Best hyperparameters saved to: {hparams_path}")

In [None]:
# new_evaluation_script.py
import os
import json
import torch
import torch.utils.data as data
from collections import OrderedDict

# 여기서 기존에 정의한 모든 함수(MultiModalGMLPFromFlat, eval_model, load_scaler 등)를
# 동일하게 가져오거나 임포트해야 합니다.

# 1. 저장된 아티팩트 경로
artifact_dir = './best_model_artifacts'
hparams_path = os.path.join(artifact_dir, 'best_hparams.json')
model_path = os.path.join(artifact_dir, 'best_model.pth')

# 2. 하이퍼파라미터와 데이터셋 관련 정보 로드
with open(hparams_path, 'r', encoding='utf-8') as f:
    hparams = json.load(f)

# 데이터 분할 인덱스와 스케일러를 로드하여 일관성 유지
split_dir = "./splits" # 기존에 저장한 분할 인덱스 경로
tag = "random_scaffold_seed700" # 저장 시 사용한 태그
_, _, test_idx = load_split_indices(split_dir, tag)
scaler = load_scaler(f"./artifacts/rdkit_scaler_{tag}.pkl")

# 기존 데이터셋을 생성하고 테스트 데이터셋만 준비
dataset = ScageConcatDataset(label_path, scage_paths, fp_types=fp_types)
test_ds = data.TensorDataset(dataset.features[test_idx], dataset.labels[test_idx])
mod_dims = dataset.expected_dims

# RDKit 피처 인덱스
rd_start, rd_end = # ... (기존과 동일하게 RDKit 피처의 시작/끝 인덱스를 구하세요)

# 3. 저장된 스케일러를 테스트 데이터셋에 적용
test_ds.tensors[0][:, rd_start:rd_end] = torch.tensor(
    scaler.transform(test_ds.tensors[0][:, rd_start:rd_end]), 
    dtype=torch.float32
)

# 4. 모델 인스턴스를 생성하고 가중치 로드
model_loaded = MultiModalGMLPFromFlat(
    mod_dims=mod_dims,
    d_model=hparams['d_model'],
    d_ffn=hparams['d_ffn'],
    depth=hparams['depth'],
    dropout=hparams['dropout'],
    use_gated_pool=True
).to(device)
model_loaded.load_state_dict(torch.load(model_path))

# 5. 복원된 모델로 성능 평가
test_loader = data.DataLoader(test_ds, batch_size=hparams['batch_size'], shuffle=False)
metrics = eval_model(model_loaded, test_loader)

print("\n" + "="*50)
print("불러온 모델의 최종 성능")
print("="*50)
for k, v in metrics.items():
    print(f"{k:<12}| {v:.4f}")
print("="*50)

In [11]:
import psutil

# 현재 프로세스(파이썬)의 메모리 정보
process = psutil.Process(os.getpid())
mem_info = process.memory_info()

# 메모리 사용량 출력
print(f"메모리 사용량: {mem_info.rss / (1024 * 1024):.2f} MB")

메모리 사용량: 868.89 MB


In [14]:
!nvidia-smi

Tue Sep 23 17:45:34 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.144.03             Driver Version: 550.144.03     CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 4060 Ti     Off |   00000000:01:00.0  On |                  N/A |
|  0%   43C    P2             31W /  165W |    1601MiB /  16380MiB |      1%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [None]:
!kill 2303472


In [1]:
!nvidia-smi

Tue Sep 23 17:44:31 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.144.03             Driver Version: 550.144.03     CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 4060 Ti     Off |   00000000:01:00.0  On |                  N/A |
|  0%   41C    P8             15W /  165W |    1058MiB /  16380MiB |      5%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [77]:
pwd

'/home/minji/scage/BBB'