# TDC ADMET, Caco-2_Wang Submission

In [8]:
from typing import Tuple

import numpy as np
import pandas as pd

# cheminformatics
import rdkit.Chem
from rdkit.Chem import Descriptors
from rdkit import RDLogger
RDLogger.DisableLog('rdApp.*')

# logging
import tqdm

# data preprocessing
import sklearn.impute
import sklearn.preprocessing

# modeling
import sklearn.ensemble
from sklearn.model_selection import ParameterGrid

# metrics
import sklearn.metrics

from tdc.single_pred import ADME


In [9]:
data = ADME(name = 'Caco2_Wang')
split = data.get_split()

Found local copy...
Loading...
Done!


In [10]:
import pandas as pd
import numpy as np
import tqdm
from rdkit import Chem
from rdkit.Chem import AllChem, Descriptors, MACCSkeys
from sklearn.preprocessing import RobustScaler
from sklearn.impute import SimpleImputer

def add_descriptor_columns(
    data: pd.DataFrame,
    include: list = ["rdkit", "morgan", "maccs"],
    radius: int = 2,
    n_bits: int = 2048
) -> pd.DataFrame:
    """
    Calculate molecular descriptors for each molecule in the `data` DataFrame.
    
    Parameters
    ----------
    data : pd.DataFrame
        Must contain columns ["Drug", "Y"].
    include : list of str, default=["rdkit","morgan","maccs"]
        Which descriptors to include. Options: "rdkit", "morgan", "maccs".
    radius : int, default=2
        Morgan FP radius.
    n_bits : int, default=2048
        Morgan FP length.
    
    Returns
    -------
    pd.DataFrame with selected descriptor features + Drug + Y.
    """
    assert 'Drug' in data.columns, "'Drug' must be a column in the input DataFrame."
    drugs = data['Drug']
    y = data['Y']

    print(f"Calculating descriptors: {include} ...")
    descriptors = []

    for drug, target in tqdm.tqdm(zip(drugs, y), total=len(drugs)):
        mol = Chem.MolFromSmiles(drug)

        row = {}

        if mol is None:
            if "rdkit" in include:
                row.update({name: np.nan for name, _ in Descriptors.descList})
            if "morgan" in include:
                row.update({f"morgan_{i}": 0 for i in range(n_bits)})
            if "maccs" in include:
                row.update({f"maccs_{i}": 0 for i in range(167)})
        else:
            if "rdkit" in include:
                row.update(Descriptors.CalcMolDescriptors(mol))
            if "morgan" in include:
                fp_morgan = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=n_bits)
                row.update(dict(zip([f"morgan_{i}" for i in range(n_bits)], np.array(fp_morgan))))
            if "maccs" in include:
                fp_maccs = MACCSkeys.GenMACCSKeys(mol)
                row.update(dict(zip([f"maccs_{i}" for i in range(len(fp_maccs))], np.array(fp_maccs))))

        # служебные колонки
        row['Drug'] = drug
        row['Y'] = target
        descriptors.append(row)

    return pd.DataFrame(descriptors)


def preprocess_data(
    data: pd.DataFrame, 
    imputer=SimpleImputer(missing_values=np.nan, strategy='mean'),
    fit_imputer=True,
    scaler_X=RobustScaler(),
    scaler_y=RobustScaler(),
    fit_scaler=True
):
    """
    Imputes missing values.
    Scales feature data.

    Returns a tuple X, y, imputer, scaler_X, scaler_y
    """

    col_array = np.array(data.columns)

    # extract features
    X = data[col_array[~np.isin(col_array, ['Drug_ID', 'Drug', 'Y'])]].to_numpy()
    
    # extract target
    y = np.array(data['Y']).reshape(-1,1)
    
    # impute missing data
    if imputer is not None:
        if fit_imputer:
            X = imputer.fit_transform(X)
        else:
            X = imputer.transform(X)

    # scale features and target
    if scaler_X is not None:
        if fit_scaler:
            X = scaler_X.fit_transform(X)
            y = scaler_y.fit_transform(y)
        else:
            X = scaler_X.transform(X)
            y = scaler_y.transform(y)

    return X, y, imputer, scaler_X, scaler_y


In [11]:
import xgboost as xgb

# Предобработка данных
X_train, y_train, imputer, scaler_X, scaler_y = preprocess_data(
    add_descriptor_columns(split['train'])
)
X_val, y_val, _, _, _ = preprocess_data(
    add_descriptor_columns(split['valid']),
    imputer=imputer, fit_imputer=False,
    scaler_X=scaler_X, scaler_y=scaler_y,
    fit_scaler=False)


Calculating descriptors: ['rdkit', 'morgan', 'maccs'] ...


100%|██████████| 637/637 [00:10<00:00, 61.09it/s]


Calculating descriptors: ['rdkit', 'morgan', 'maccs'] ...


100%|██████████| 91/91 [00:01<00:00, 65.16it/s]


In [12]:
from tqdm import tqdm
from sklearn.model_selection import ParameterGrid
from sklearn.metrics import mean_absolute_error
import xgboost as xgb
import numpy as np
import os
import json

# Создаем папку для модели, если нет
model_dir = "model/Cacao2"
os.makedirs(model_dir, exist_ok=True)

# Преобразуем данные в DMatrix
dtrain = xgb.DMatrix(X_train, label=y_train.ravel())
dval = xgb.DMatrix(X_val, label=y_val.ravel())

# Сетка параметров
params_grid = {'subsample': [0.7, 0.8],
    'scale_pos_weight': [2, 3],   
    'reg_lambda': [1.0, 1.2],
    'reg_alpha': [0.3, 0.5],
    'n_estimators': [200, 300],  
    'min_child_weight': [1, 2],
    'max_depth': [7, 9],         
    'max_delta_step': [2, 3],
    'learning_rate': [0.03, 0.05],
    'gamma': [0.1, 0.2],
    'colsample_bytree': [0.7, 0.8]}

best_score = float('inf')
best_set = {}
best_num_boost_round = 0
best_model = None

for param_set in tqdm(ParameterGrid(params_grid), desc="Grid Search"):
    params = {
        'objective': 'reg:squarederror',
        'seed': 42,
        'eval_metric': 'mae',
        'tree_method': 'gpu_hist',  # Используем GPU
        'gpu_id': 0,
        'n_jobs': -1
    }
    params.update(param_set)

    evals = [(dtrain, 'train'), (dval, 'eval')]
    model = xgb.train(
        params,
        dtrain,
        num_boost_round=1000,
        evals=evals,
        early_stopping_rounds=50,
        verbose_eval=False
    )

    y_val_pred_tmp = model.predict(dval)
    score_MAE = mean_absolute_error(y_val, y_val_pred_tmp)

    if score_MAE < best_score:
        best_score = score_MAE
        best_set = param_set
        best_num_boost_round = model.best_iteration
        best_model = model

# Сохраняем лучшую модель
model_path = os.path.join(model_dir, "best_model.xgb")
best_model.save_model(model_path)

# Сохраняем параметры лучшей модели
params_path = os.path.join(model_dir, "best_params.json")
with open(params_path, "w") as f:
    json.dump({
        "best_score": best_score,
        "best_params": best_set,
        "best_num_boost_round": best_num_boost_round
    }, f, indent=4)

print("Best MAE:", best_score)
print("Best params:", best_set)
print("Best num_boost_round:", best_num_boost_round)
print(f"Модель и параметры сохранены в {model_dir}")


Grid Search: 100%|██████████| 2048/2048 [1:01:01<00:00,  1.79s/it]

Best MAE: 0.2753240927697525
Best params: {'colsample_bytree': 0.7, 'gamma': 0.1, 'learning_rate': 0.03, 'max_delta_step': 2, 'max_depth': 7, 'min_child_weight': 1, 'n_estimators': 200, 'reg_alpha': 0.3, 'reg_lambda': 1.0, 'scale_pos_weight': 2, 'subsample': 0.7}
Best num_boost_round: 171
Модель и параметры сохранены в model/Cacao2





In [14]:
from tdc.benchmark_group import admet_group
import tqdm

group = admet_group(path='data/')
predictions_list = []

for seed in [1, 2, 3, 4, 5]:
    benchmark = group.get('Caco2_Wang') 
    predictions = {}
    name = benchmark['name']

    # используем весь train_val для обучения
    train_val, test = benchmark['train_val'], benchmark['test']

    print(f"Seed {seed}:")

    # ---------------- Предобработка ---------------- #
    X_train, y_train, imputer, scaler_X, scaler_y = preprocess_data(
        add_descriptor_columns(train_val)
    )
    X_test, y_test, _, _, _ = preprocess_data(
        add_descriptor_columns(test),
        imputer=imputer, fit_imputer=False,
        scaler_X=scaler_X, fit_scaler=False,
        scaler_y=scaler_y
    )

    # ---------------- XGBoost ---------------- #
    dtrain = xgb.DMatrix(X_train, label=y_train.ravel())
    dtest = xgb.DMatrix(X_test)

    params = best_set.copy()
    params.update({
        'objective': 'reg:squarederror',
        'seed': seed,
        'eval_metric': 'mae'
    })

    model = xgb.train(
        params,
        dtrain,
        num_boost_round=best_num_boost_round
    )

    # ---------------- Предсказания ---------------- #
    y_pred_test_scaled = model.predict(dtest)
    y_pred_test = scaler_y.inverse_transform(
        y_pred_test_scaled.reshape(-1, 1)
    ).reshape(-1)

    # сохраняем по TDC-шаблону
    predictions[name] = y_pred_test
    predictions_list.append(predictions)

# ---------------- Оценка ---------------- #
results = group.evaluate_many(predictions_list)
print(results)
# {'caco2_wang': [MAE, R2]}


Found local copy...


Seed 1:
Calculating descriptors: ['rdkit', 'morgan', 'maccs'] ...


100%|██████████| 728/728 [00:12<00:00, 59.29it/s]


Calculating descriptors: ['rdkit', 'morgan', 'maccs'] ...


100%|██████████| 182/182 [00:03<00:00, 56.48it/s]


Seed 2:
Calculating descriptors: ['rdkit', 'morgan', 'maccs'] ...


100%|██████████| 728/728 [00:12<00:00, 60.23it/s]


Calculating descriptors: ['rdkit', 'morgan', 'maccs'] ...


100%|██████████| 182/182 [00:02<00:00, 62.93it/s]


Seed 3:
Calculating descriptors: ['rdkit', 'morgan', 'maccs'] ...


100%|██████████| 728/728 [00:11<00:00, 61.89it/s]


Calculating descriptors: ['rdkit', 'morgan', 'maccs'] ...


100%|██████████| 182/182 [00:02<00:00, 62.57it/s]


Seed 4:
Calculating descriptors: ['rdkit', 'morgan', 'maccs'] ...


100%|██████████| 728/728 [00:11<00:00, 60.79it/s]


Calculating descriptors: ['rdkit', 'morgan', 'maccs'] ...


100%|██████████| 182/182 [00:03<00:00, 60.22it/s]


Seed 5:
Calculating descriptors: ['rdkit', 'morgan', 'maccs'] ...


100%|██████████| 728/728 [00:11<00:00, 62.39it/s]


Calculating descriptors: ['rdkit', 'morgan', 'maccs'] ...


100%|██████████| 182/182 [00:03<00:00, 60.63it/s]


{'caco2_wang': [0.281, 0.007]}
