# TDC ADMET, Caco-2_Wang Submission

In [2]:
from typing import Tuple

import numpy as np
import pandas as pd

# cheminformatics
import rdkit.Chem
from rdkit.Chem import Descriptors
from rdkit import RDLogger
RDLogger.DisableLog('rdApp.*')

# logging
import tqdm

# data preprocessing
import sklearn.impute
import sklearn.preprocessing

# modeling
import sklearn.ensemble
from sklearn.model_selection import ParameterGrid

# metrics
import sklearn.metrics

from tdc.single_pred import ADME


In [3]:
data = ADME(name = 'Caco2_Wang')
split = data.get_split()

Downloading...
100%|██████████| 82.5k/82.5k [00:00<00:00, 305kiB/s]
Loading...
Done!


In [4]:
def add_descriptor_columns(data: pd.DataFrame) -> pd.DataFrame:
    """
    Use rdkit to get descriptors of each drug in the `data` df.
    Return a Pandas DataFrame with the descriptors as columns in the df and .
    """
    
    # Extract the Drug column
    assert 'Drug' in data.columns, "'Drug' must be a column in the input DataFrame."
    drugs = data['Drug']
    y = data['Y']
    
    # Get the descriptors for each drug
    print("Calculating descriptors...")
    descriptors = []
    for drug, target in tqdm.tqdm(zip(drugs, y)):
        descriptor = Descriptors.CalcMolDescriptors(
            rdkit.Chem.MolFromSmiles(drug)
        )
        descriptor['Drug'] = drug
        descriptor['Y'] = target
        descriptors.append(descriptor)

    # Make a dataframe for the descriptors
    df = pd.DataFrame(descriptors)

    return df

def preprocess_data(
    data: pd.DataFrame, 
    imputer=sklearn.impute.SimpleImputer(missing_values=np.nan, strategy='mean'),
    fit_imputer=True,
    scaler_X=sklearn.preprocessing.RobustScaler(),
    scaler_y=sklearn.preprocessing.RobustScaler(),
    fit_scaler=True
):
    """
    Imputes missing values.
    Scales feature data.

    Returns a tuple X, y of scaled feature data and target data.
    """

    col_array = np.array(data.columns)

    # extract just the feature data
    X = data[col_array[~np.isin(col_array, ['Drug_ID', 'Drug', 'Y'])]].to_numpy()
    
    # extract the target data
    y = np.array(data['Y']).reshape(-1,1)
    
    # impute missing data
    if imputer is not None:
        if fit_imputer:
            X = imputer.fit_transform(X)
        else:
            X = imputer.transform(X)

    # scale the feature data
    if scaler_X is not None:
        if fit_scaler:
            X = scaler_X.fit_transform(X)
            y = scaler_y.fit_transform(y)
        else:
            X = scaler_X.transform(X)
            y = scaler_y.transform(y)



    return X, y, imputer, scaler_X, scaler_y

In [5]:
import xgboost as xgb

# Предобработка данных
X_train, y_train, imputer, scaler_X, scaler_y = preprocess_data(
    add_descriptor_columns(split['train'])
)
X_val, y_val, _, _, _ = preprocess_data(
    add_descriptor_columns(split['valid']),
    imputer=imputer, fit_imputer=False,
    scaler_X=scaler_X, scaler_y=scaler_y,
    fit_scaler=False)


Calculating descriptors...


637it [00:07, 86.55it/s] 


Calculating descriptors...


91it [00:00, 97.14it/s] 


In [6]:
from tqdm import tqdm
from sklearn.model_selection import ParameterGrid
from sklearn.metrics import mean_absolute_error
import xgboost as xgb
import numpy as np
import os
import json

# Создаем папку для модели, если нет
model_dir = "model/Cacao2"
os.makedirs(model_dir, exist_ok=True)

# Преобразуем данные в DMatrix
dtrain = xgb.DMatrix(X_train, label=y_train.ravel())
dval = xgb.DMatrix(X_val, label=y_val.ravel())

# Сетка параметров
params_grid = {'subsample': [0.7, 0.8],
    'scale_pos_weight': [2, 3],   
    'reg_lambda': [1.0, 1.2],
    'reg_alpha': [0.3, 0.5],
    'n_estimators': [200, 300],  
    'min_child_weight': [1, 2],
    'max_depth': [7, 9],         
    'max_delta_step': [2, 3],
    'learning_rate': [0.03, 0.05],
    'gamma': [0.1, 0.2],
    'colsample_bytree': [0.7, 0.8]}

best_score = float('inf')
best_set = {}
best_num_boost_round = 0
best_model = None

for param_set in tqdm(ParameterGrid(params_grid), desc="Grid Search"):
    params = {
        'objective': 'reg:squarederror',
        'seed': 42,
        'eval_metric': 'mae',
        'tree_method': 'gpu_hist',  # Используем GPU
        'gpu_id': 0,
        'n_jobs': -1
    }
    params.update(param_set)

    evals = [(dtrain, 'train'), (dval, 'eval')]
    model = xgb.train(
        params,
        dtrain,
        num_boost_round=1000,
        evals=evals,
        early_stopping_rounds=50,
        verbose_eval=False
    )

    y_val_pred_tmp = model.predict(dval)
    score_MAE = mean_absolute_error(y_val, y_val_pred_tmp)

    if score_MAE < best_score:
        best_score = score_MAE
        best_set = param_set
        best_num_boost_round = model.best_iteration
        best_model = model

# Сохраняем лучшую модель
model_path = os.path.join(model_dir, "best_model.xgb")
best_model.save_model(model_path)

# Сохраняем параметры лучшей модели
params_path = os.path.join(model_dir, "best_params.json")
with open(params_path, "w") as f:
    json.dump({
        "best_score": best_score,
        "best_params": best_set,
        "best_num_boost_round": best_num_boost_round
    }, f, indent=4)

print("Best MAE:", best_score)
print("Best params:", best_set)
print("Best num_boost_round:", best_num_boost_round)
print(f"Модель и параметры сохранены в {model_dir}")


Grid Search: 100%|██████████| 2048/2048 [34:18<00:00,  1.01s/it]

Best MAE: 0.27487999368926663
Best params: {'colsample_bytree': 0.8, 'gamma': 0.1, 'learning_rate': 0.03, 'max_delta_step': 2, 'max_depth': 7, 'min_child_weight': 1, 'n_estimators': 200, 'reg_alpha': 0.3, 'reg_lambda': 1.2, 'scale_pos_weight': 2, 'subsample': 0.7}
Best num_boost_round: 208
Модель и параметры сохранены в model/Cacao2





In [8]:
from tdc.benchmark_group import admet_group
import tqdm

group = admet_group(path='data/')
predictions_list = []

for seed in [1, 2, 3, 4, 5]:
    benchmark = group.get('Caco2_Wang') 
    predictions = {}
    name = benchmark['name']

    # используем весь train_val для обучения
    train_val, test = benchmark['train_val'], benchmark['test']

    print(f"Seed {seed}:")

    # ---------------- Предобработка ---------------- #
    X_train, y_train, imputer, scaler_X, scaler_y = preprocess_data(
        add_descriptor_columns(train_val)
    )
    X_test, y_test, _, _, _ = preprocess_data(
        add_descriptor_columns(test),
        imputer=imputer, fit_imputer=False,
        scaler_X=scaler_X, fit_scaler=False,
        scaler_y=scaler_y
    )

    # ---------------- XGBoost ---------------- #
    dtrain = xgb.DMatrix(X_train, label=y_train.ravel())
    dtest = xgb.DMatrix(X_test)

    params = best_set.copy()
    params.update({
        'objective': 'reg:squarederror',
        'seed': seed,
        'eval_metric': 'mae'
    })

    model = xgb.train(
        params,
        dtrain,
        num_boost_round=best_num_boost_round
    )

    # ---------------- Предсказания ---------------- #
    y_pred_test_scaled = model.predict(dtest)
    y_pred_test = scaler_y.inverse_transform(
        y_pred_test_scaled.reshape(-1, 1)
    ).reshape(-1)

    # сохраняем по TDC-шаблону
    predictions[name] = y_pred_test
    predictions_list.append(predictions)

# ---------------- Оценка ---------------- #
results = group.evaluate_many(predictions_list)
print(results)


Found local copy...


Seed 1:
Calculating descriptors...


728it [00:08, 90.56it/s] 


Calculating descriptors...


182it [00:02, 84.81it/s]


Seed 2:
Calculating descriptors...


728it [00:08, 88.43it/s] 


Calculating descriptors...


182it [00:02, 87.22it/s]


Seed 3:
Calculating descriptors...


728it [00:08, 87.77it/s] 


Calculating descriptors...


182it [00:02, 84.67it/s]


Seed 4:
Calculating descriptors...


728it [00:08, 84.76it/s] 


Calculating descriptors...


182it [00:02, 80.99it/s]


Seed 5:
Calculating descriptors...


728it [00:08, 84.46it/s] 


Calculating descriptors...


182it [00:02, 79.02it/s]


{'caco2_wang': [0.272, 0.006]}
