# TDC ADMET, Caco-2_Wang Submission

In [1]:
from typing import Tuple

import numpy as np
import pandas as pd

# cheminformatics
import rdkit.Chem
from rdkit.Chem import Descriptors
from rdkit import RDLogger
RDLogger.DisableLog('rdApp.*')

# logging
import tqdm

# data preprocessing
import sklearn.impute
import sklearn.preprocessing

# modeling
import sklearn.ensemble
from sklearn.model_selection import ParameterGrid

# metrics
import sklearn.metrics

from tdc.single_pred import ADME


In [2]:
data = ADME(name = 'Caco2_Wang')
split = data.get_split()

Found local copy...
Loading...
Done!


In [5]:
from rdkit import Chem
from rdkit.Chem import AllChem
import pandas as pd
import numpy as np
import tqdm
import sklearn

def add_descriptor_columns(data: pd.DataFrame, radius: int = 2, n_bits: int = 2048) -> pd.DataFrame:
    """
    Calculate Morgan fingerprints for each molecule in the dataframe.

    Parameters
    ----------
    data : pd.DataFrame
        Must contain a 'Drug' column with SMILES strings and 'Y' column with target values.
    radius : int
        Radius of Morgan fingerprint (default=2).
    n_bits : int
        Length of fingerprint vector (default=2048).

    Returns
    -------
    df : pd.DataFrame
        DataFrame with Morgan fingerprint bits as columns + Drug + Y.
    """
    
    assert 'Drug' in data.columns, "'Drug' must be a column in the input DataFrame."
    assert 'Y' in data.columns, "'Y' must be a column in the input DataFrame."
    
    drugs = data['Drug']
    y = data['Y']

    fps = []
    print("Calculating Morgan fingerprints...")
    for smi, target in tqdm.tqdm(zip(drugs, y), total=len(drugs)):
        mol = Chem.MolFromSmiles(smi)
        if mol is None:
            fp = np.zeros(n_bits, dtype=int)
        else:
            fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius=radius, nBits=n_bits)
            fp = np.array(fp)  # конвертация в numpy array
        fps.append(np.concatenate([fp, [target]]))

    # имена колонок: fp_0 ... fp_(n_bits-1), Y
    fp_columns = [f'fp_{i}' for i in range(n_bits)] + ['Y']
    df = pd.DataFrame(fps, columns=fp_columns)
    df['Drug'] = drugs.values

    return df


def preprocess_data(
    data: pd.DataFrame, 
    imputer=sklearn.impute.SimpleImputer(missing_values=np.nan, strategy='mean'),
    fit_imputer=True,
    scaler_X=sklearn.preprocessing.RobustScaler(),
    scaler_y=sklearn.preprocessing.RobustScaler(),
    fit_scaler=True
):
    """
    Imputes missing values.
    Scales feature data.

    Returns a tuple X, y of scaled feature data and target data.
    """

    col_array = np.array(data.columns)

    # extract just the feature data (все кроме Drug, Y)
    X = data[col_array[~np.isin(col_array, ['Drug_ID', 'Drug', 'Y'])]].to_numpy()
    
    # extract the target data
    y = np.array(data['Y']).reshape(-1,1)
    
    # impute missing data
    if imputer is not None:
        if fit_imputer:
            X = imputer.fit_transform(X)
        else:
            X = imputer.transform(X)

    # scale the feature data
    if scaler_X is not None:
        if fit_scaler:
            X = scaler_X.fit_transform(X)
            y = scaler_y.fit_transform(y)
        else:
            X = scaler_X.transform(X)
            y = scaler_y.transform(y)

    return X, y, imputer, scaler_X, scaler_y


In [6]:
X_train, y_train, imputer, scaler_X, scaler_y = preprocess_data(
    add_descriptor_columns(split['train'])
)
X_val, y_val, _, _, _ = preprocess_data(
    add_descriptor_columns(split['valid']),
    imputer=imputer, fit_imputer=False,
    scaler_X=scaler_X, scaler_y=scaler_y,
    fit_scaler=False)


Calculating Morgan fingerprints...


100%|██████████| 637/637 [00:00<00:00, 765.55it/s]


Calculating Morgan fingerprints...


100%|██████████| 91/91 [00:00<00:00, 791.87it/s]


In [7]:
from tqdm import tqdm
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import ParameterGrid, RandomizedSearchCV
import numpy as np
import os
import json
import joblib

# Папка для сохранения модели и параметров
model_dir = "model/Cacao2"
os.makedirs(model_dir, exist_ok=True)

# ------------------ Stage 1: грубый поиск ------------------ #
params_grid_stage1 = params_grid_stage1 = {
    "C": [10 ** i for i in range(-1, 6)],         # [0.1, 1, 10, 100, 1000, 10000, 100000]
    "gamma": [10 ** i for i in range(-7, 1)],     # [1e-7, 1e-6, ..., 1e0]
    "epsilon": [0.001, 0.01, 0.05, 0.1, 0.2, 0.5]  # более широкий диапазон
}


best_score = float('inf')
best_set = {}
best_model = None

for param_set in tqdm(ParameterGrid(params_grid_stage1), desc="Grid Search Stage 1"):
    model = SVR(kernel="rbf", **param_set)
    model.fit(X_train, y_train.ravel())

    y_val_pred_tmp = model.predict(X_val)
    score_MAE = mean_absolute_error(y_val, y_val_pred_tmp)

    if score_MAE < best_score:
        best_score = score_MAE
        best_set = param_set
        best_model = model

print("Лучшие параметры (Stage 1):", best_set)
print("MAE (Stage 1):", best_score)

# ------------------ Stage 2: уточнение ------------------ #
C_best = best_set["C"]
gamma_best = best_set["gamma"]
eps_best = best_set["epsilon"]

params_grid_stage2 = {
    "C": [C_best * f for f in [0.5, 0.8, 1.0, 1.2, 2.0]],
    "gamma": [gamma_best * f for f in [0.5, 0.8, 1.0, 1.2, 2.0]],
    "epsilon": [max(1e-4, eps_best * f) for f in [0.5, 0.8, 1.0, 1.2, 2.0]]
}

for param_set in tqdm(ParameterGrid(params_grid_stage2), desc="Grid Search Stage 2"):
    model = SVR(kernel="rbf", **param_set)
    model.fit(X_train, y_train.ravel())

    y_val_pred_tmp = model.predict(X_val)
    score_MAE = mean_absolute_error(y_val, y_val_pred_tmp)

    if score_MAE < best_score:
        best_score = score_MAE
        best_set = param_set
        best_model = model

print("Лучшие параметры (Stage 2):", best_set)
print("MAE (Stage 2):", best_score)

# ------------------ Stage 3: RandomizedSearchCV ------------------ #
param_distributions = {
    "C": np.logspace(np.log10(best_set["C"] * 0.5), np.log10(best_set["C"] * 2), 100),
    "gamma": np.logspace(np.log10(best_set["gamma"] * 0.5), np.log10(best_set["gamma"] * 2), 100),
    "epsilon": np.linspace(max(1e-4, best_set["epsilon"] * 0.5), best_set["epsilon"] * 2, 100)
}

random_search = RandomizedSearchCV(
    SVR(kernel="rbf"),
    param_distributions=param_distributions,
    n_iter=20,              # 20 случайных комбинаций
    scoring="neg_mean_absolute_error",
    cv=[(np.arange(len(X_train)), np.arange(len(X_val)))],  # имитация train/val split
    random_state=42,
    verbose=1,
    n_jobs=-1
)

random_search.fit(
    np.vstack((X_train, X_val)), 
    np.hstack((y_train.ravel(), y_val.ravel())))


best_model = random_search.best_estimator_
best_set = random_search.best_params_
best_score = -random_search.best_score_

print("Лучшие параметры (Stage 3 RandomizedSearchCV):", best_set)
print("MAE (Stage 3):", best_score)

# ------------------ Сохраняем модель и параметры ------------------ #
model_path = os.path.join(model_dir, "best_model_svm.pkl")
joblib.dump(best_model, model_path)

params_path = os.path.join(model_dir, "best_params.json")
with open(params_path, "w") as f:
    json.dump({
        "best_score": best_score,
        "best_params": best_set
    }, f, indent=4)

print("✅ Лучшая модель сохранена в:", model_path)
print("✅ Параметры сохранены в:", params_path)


Grid Search Stage 1: 100%|██████████| 336/336 [00:53<00:00,  6.27it/s]


Лучшие параметры (Stage 1): {'C': 10, 'epsilon': 0.01, 'gamma': 0.01}
MAE (Stage 1): 0.29707819757098697


Grid Search Stage 2: 100%|██████████| 125/125 [00:23<00:00,  5.36it/s]


Лучшие параметры (Stage 2): {'C': 5.0, 'epsilon': 0.012, 'gamma': 0.005}
MAE (Stage 2): 0.2928915001054697
Fitting 1 folds for each of 20 candidates, totalling 20 fits
Лучшие параметры (Stage 3 RandomizedSearchCV): {'gamma': 0.009860946122593675, 'epsilon': 0.014363636363636365, 'C': 9.455278903023638}
MAE (Stage 3): 0.05019882998063193
✅ Лучшая модель сохранена в: model/Cacao2\best_model_svm.pkl
✅ Параметры сохранены в: model/Cacao2\best_params.json


In [10]:
from tdc.benchmark_group import admet_group
from sklearn.svm import SVR
import numpy as np
import tqdm

group = admet_group(path='data/')
predictions_list = []

for seed in [1, 2, 3, 4, 5]:
    benchmark = group.get('Caco2_Wang') 
    predictions = {}
    name = benchmark['name']

    # используем весь train_val для обучения
    train_val, test = benchmark['train_val'], benchmark['test']

    print(f"Seed {seed}:")

    # ---------------- Предобработка ---------------- #
    X_train, y_train, imputer, scaler_X, scaler_y = preprocess_data(
        add_descriptor_columns(train_val)
    )
    X_test, y_test, _, _, _ = preprocess_data(
        add_descriptor_columns(test),
        imputer=imputer, fit_imputer=False,
        scaler_X=scaler_X, fit_scaler=False,
        scaler_y=scaler_y
    )

    # ---------------- SVM (SVR) ---------------- #
    # Пример параметров — при необходимости замените или подберите через CV
    svr_model = SVR(**best_set)
    # Входные y в preprocess_data, как в вашем XGBoost-окружении, вероятно, уже масштабированы
    svr_model.fit(X_train, y_train.ravel())

    # ---------------- Предсказания ---------------- #
    y_pred_test_scaled = svr_model.predict(X_test)  # 1D array
    # inverse_transform ожидает 2D, поэтому reshape
    y_pred_test = scaler_y.inverse_transform(
        y_pred_test_scaled.reshape(-1, 1)
    ).reshape(-1)

    # сохраняем по TDC-шаблону
    predictions[name] = y_pred_test
    predictions_list.append(predictions)

# ---------------- Оценка ---------------- #
results = group.evaluate_many(predictions_list)
print(results)


Found local copy...


Seed 1:
Calculating Morgan fingerprints...


100%|██████████| 728/728 [00:00<00:00, 791.55it/s]


Calculating Morgan fingerprints...


100%|██████████| 182/182 [00:00<00:00, 740.56it/s]


Seed 2:
Calculating Morgan fingerprints...


100%|██████████| 728/728 [00:00<00:00, 775.24it/s]


Calculating Morgan fingerprints...


100%|██████████| 182/182 [00:00<00:00, 776.45it/s]


Seed 3:
Calculating Morgan fingerprints...


100%|██████████| 728/728 [00:00<00:00, 752.46it/s]


Calculating Morgan fingerprints...


100%|██████████| 182/182 [00:00<00:00, 758.32it/s]


Seed 4:
Calculating Morgan fingerprints...


100%|██████████| 728/728 [00:00<00:00, 790.88it/s]


Calculating Morgan fingerprints...


100%|██████████| 182/182 [00:00<00:00, 762.29it/s]


Seed 5:
Calculating Morgan fingerprints...


100%|██████████| 728/728 [00:00<00:00, 771.03it/s]


Calculating Morgan fingerprints...


100%|██████████| 182/182 [00:00<00:00, 723.88it/s]


{'caco2_wang': [0.371, 0.0]}
