In [1]:
import os
import tqdm
import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, StratifiedKFold

In [2]:
from MMFE import MMFE, normalize, denormalize, get_metrics

In [3]:
from sklearn.linear_model import ElasticNet, Ridge, Lasso
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
import xgboost as xgb

In [4]:
# Single seed shared by every stochastic component in this notebook.
random_state = 2025

# Seed NumPy's legacy global RNG and PyTorch's default generator.
np.random.seed(random_state)
torch.manual_seed(random_state)

if torch.cuda.is_available():
    # Seed the current CUDA device, then all CUDA devices.
    for seed_cuda in (torch.cuda.manual_seed, torch.cuda.manual_seed_all):
        seed_cuda(random_state)
    # Ask cuDNN for deterministic kernels and disable its autotuner.
    torch.backends.cudnn.deterministic, torch.backends.cudnn.benchmark = True, False

In [5]:
# Apply seaborn's 'whitegrid' theme to the plots drawn after this cell.
sns.set_theme(style='whitegrid')

# 1. Initialization

## Task

In [6]:
# Task identifier; also selects the target column `S(<TASKNAME>)` further down.
TASKNAME = '3uM'

# All artefacts of this notebook are written below results/<TASKNAME>/MMFE.
# os.makedirs creates any missing intermediate directories, and exist_ok=True
# makes the cell safe to re-run without a separate (racy) existence check.
output_dir = os.path.join('results', TASKNAME, 'MMFE')
os.makedirs(output_dir, exist_ok=True)

## Torch Device

In [7]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

## Data Load

In [8]:
# Load the selectivity table from the local data/ directory.
selectivity_csv = os.path.join('data', 'selectivity.csv')
df_raw = pd.read_csv(selectivity_csv)

In [9]:
# Show the loaded table (the rendered output elides the middle rows).
df_raw

Unnamed: 0,Compound,Drug name,PubChem CID,Binding Mode (based on ABL1-phos. vs. -nonphos affinity),S(300nM),S(3uM),SMILES
0,A-674563,,11314340,undetermined,0.1166,0.2772,CC1=C2C=C(C=CC2=NN1)C3=CC(=CN=C3)OC[C@H](CC4=C...
1,AB-1010,Masitinib,10074640,Type II,0.0337,0.0622,CC1=C(C=C(C=C1)NC(=O)C2=CC=C(C=C2)CN3CCN(CC3)C...
2,ABT-869,Linifanib,11485656,undetermined,0.0648,0.1839,CC1=CC(=C(C=C1)F)NC(=O)NC2=CC=C(C=C2)C3=C4C(=C...
3,AC220,Quizartinib,24889392,Type II,0.0285,0.0751,CC(C)(C)C1=CC(=NO1)NC(=O)NC2=CC=C(C=C2)C3=CN4C...
4,AG-013736,Axitinib,6450551,Type I,0.0570,0.1969,CNC(=O)C1=CC=CC=C1SC2=CC3=C(C=C2)C(=NN3)/C=C/C...
...,...,...,...,...,...,...,...
67,TG-100-115,,10427712,Type I,0.0337,0.1321,C1=CC(=CC(=C1)O)C2=NC3=C(N=C(N=C3N=C2C4=CC(=CC...
68,TG-101348,,16722836,Type I,0.1788,0.5389,CC1=CN=C(N=C1NC2=CC(=CC=C2)S(=O)(=O)NC(C)(C)C)...
69,Vandetanib,,3081361,Type I,0.0933,0.2358,CN1CCC(CC1)COC2=C(C=C3C(=C2)N=CN=C3NC4=C(C=C(C...
70,VX-680/MK-0457,Tozasertib,5494449,Type I,0.1321,0.3472,CC1=CC(=NN1)NC2=CC(=NC(=N2)SC3=CC=C(C=C3)NC(=O...


In [10]:
# Inputs are the SMILES strings; the target is the selectivity column that
# matches the configured task, e.g. 'S(3uM)'.
target_column = f'S({TASKNAME})'

X_raw = df_raw['SMILES'].values
y_raw = df_raw[target_column].values.astype(np.float32)

# 2. 5-Fold Cross-Validation

In [11]:
# 5-fold cross-validation setup.
# The continuous target is binned into quantiles so that StratifiedKFold can
# stratify the folds on it.
y_binned = pd.qcut(y_raw, q=5, labels=False, duplicates='drop')
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=random_state)

# One record (dict) per held-out sample, accumulated over all folds.
ys = []

for fold, (train_idx, test_idx) in enumerate(
    tqdm.tqdm(skf.split(X_raw, y_binned), total=skf.get_n_splits(), desc="5-Fold CV")
):
    ###################################################################
    ## Data Splits
    ###################################################################
    # Held-out test set for this fold
    X_te = X_raw[test_idx]
    y_te = y_raw[test_idx]

    # Remaining samples (further split into train and validation)
    X_train_full = X_raw[train_idx]
    y_train_full = y_raw[train_idx]

    # 90/10 train/validation split, driven by the notebook-wide seed instead
    # of a second hard-coded copy of it.
    idx_tr, idx_va = train_test_split(
        range(len(X_train_full)), test_size=0.1, random_state=random_state
    )

    X_tr, y_tr = X_train_full[idx_tr], y_train_full[idx_tr]
    X_va, y_va = X_train_full[idx_va], y_train_full[idx_va]

    ###################################################################
    ## Model Initialization
    ###################################################################
    model = MMFE(output_dir, device, fold)

    ###################################################################
    ## Model Training
    ###################################################################
    # Targets are passed as column vectors.
    _ = model.fit(X_tr, X_va, y_tr.reshape(-1, 1), y_va.reshape(-1, 1), temperature=1.0)

    ###################################################################
    ## Embeddings
    ###################################################################
    # Train and validation samples are pooled to fit the downstream regressors.
    H_tv = model.predict(np.hstack([X_tr, X_va]))
    z_tv = normalize(np.hstack([y_tr, y_va]))
    H_te = model.predict(X_te)

    ###################################################################
    ## Downstream regressors (fitted on the embeddings)
    ###################################################################
    clfs = {
        'ElasticNet': ElasticNet(alpha=0.01, random_state=random_state),
        'Ridge': Ridge(random_state=random_state),
        'Lasso': Lasso(alpha=0.01, random_state=random_state),
        'SVR': SVR(),
        'KNN': KNeighborsRegressor(),
        'DecisionTree': DecisionTreeRegressor(random_state=random_state),
        'RandomForest': RandomForestRegressor(random_state=random_state),
        'AdaBoost': AdaBoostRegressor(random_state=random_state),
        'XGBoost': xgb.XGBRegressor(random_state=random_state)
    }

    # Fit every regressor exactly once per fold and predict the whole test
    # set in one call. The fit data (H_tv, z_tv) does not depend on the test
    # sample, so refitting inside the per-sample loop only repeated
    # identical work.
    fold_preds = {}
    for clf_name, clf in clfs.items():
        clf.fit(H_tv, z_tv)
        fold_preds[clf_name] = clf.predict(H_te)

    # Assemble one record per test sample for this fold.
    fold_results = []
    for i, gt in enumerate(y_te):
        p_te = {'GroundTruth': gt, 'Fold': fold}
        for clf_name in clfs:
            # denormalize is called on a scalar prediction, one value at a time.
            p_te[clf_name] = denormalize(fold_preds[clf_name][i])
        fold_results.append(p_te)
    ys.extend(fold_results)

    ###################################################################
    ## Save individual fold results
    ###################################################################
    # Create fold directory (directory names are 1-based: Fold1..FoldN)
    fold_dir = os.path.join(output_dir, f'Fold{fold+1}')
    os.makedirs(fold_dir, exist_ok=True)

    # Save fold predictions
    df_fold = pd.DataFrame(fold_results)
    df_fold.to_csv(os.path.join(fold_dir, 'predictions.csv'), index=False)

    # Calculate and save fold metrics (one row per downstream regressor)
    fold_metrics = []
    model_names = [col for col in df_fold.columns if col not in ['GroundTruth', 'Fold']]

    for model_name in model_names:
        fold_res = get_metrics(df_fold['GroundTruth'], df_fold[model_name])
        fold_res['model_name'] = model_name
        fold_metrics.append(fold_res)

    df_fold_metrics = pd.DataFrame(fold_metrics)
    df_fold_metrics.to_csv(os.path.join(fold_dir, 'metrics.csv'), index=False)

5-Fold CV:   0%|          | 0/5 [00:00<?, ?it/s]

  return _VF.meshgrid(tensors, **kwargs)  # type: ignore[attr-defined]
5-Fold CV: 100%|██████████| 5/5 [08:12<00:00, 98.52s/it] 


In [12]:
# Collect the per-sample prediction records from all folds into one table.
df_ys = pd.DataFrame(ys)

In [13]:
# Show the pooled predictions (the rendered output elides the middle rows).
df_ys

Unnamed: 0,GroundTruth,Fold,ElasticNet,Ridge,Lasso,SVR,KNN,DecisionTree,RandomForest,AdaBoost,XGBoost
0,0.0622,0,0.048132,0.047612,0.055737,0.053898,0.038701,0.0570,0.071727,0.086488,0.072099
1,0.1969,0,0.119453,0.116539,0.117372,0.133447,0.178126,0.2668,0.184765,0.146159,0.198524
2,0.4922,0,0.160034,0.155554,0.157381,0.170614,0.135725,0.1632,0.151517,0.125821,0.176353
3,0.1140,0,0.199696,0.200220,0.173411,0.239779,0.148058,0.1813,0.194274,0.130262,0.210028
4,0.1606,0,0.106999,0.102924,0.109395,0.113261,0.116177,0.1813,0.125638,0.098357,0.139576
...,...,...,...,...,...,...,...,...,...,...,...
67,0.0311,4,0.177329,0.187128,0.158380,0.239722,0.193282,0.2772,0.308613,0.364370,0.331224
68,0.6088,4,0.239286,0.250120,0.224132,0.292378,0.221017,0.2565,0.339195,0.327231,0.323684
69,0.0933,4,0.197799,0.212986,0.175098,0.270536,0.178586,0.1969,0.272308,0.141404,0.404131
70,0.5959,4,0.466697,0.479375,0.478472,0.477613,0.429440,0.4301,0.448371,0.447480,0.489280


In [14]:
# Summarise each downstream regressor as mean and std of its per-fold metrics.
meta_columns = ['GroundTruth', 'Fold']
metric_keys = ('rmse', 'r2', 'pcc')

results = []
model_names = [col for col in df_ys.columns if col not in meta_columns]

for model_name in model_names:
    # One metrics dict per fold that is actually present in df_ys; groupby
    # yields only non-empty groups, in ascending fold order, so the number of
    # folds no longer has to be hard-coded here.
    fold_metrics = [
        get_metrics(fold_data['GroundTruth'], fold_data[model_name])
        for _, fold_data in df_ys.groupby('Fold', sort=True)
    ]
    if not fold_metrics:
        continue

    # Column order: model_name, then <metric>, <metric>_std for each metric.
    res = {'model_name': model_name}
    for key in metric_keys:
        values = [m[key] for m in fold_metrics]
        res[key] = round(np.mean(values), 3)
        # np.std uses ddof=0 by default, i.e. the population standard deviation.
        res[f'{key}_std'] = round(np.std(values), 3)
    results.append(res)

df_res = pd.DataFrame(results)

In [15]:
# Persist the cross-fold summary next to the per-fold result directories.
summary_path = os.path.join(output_dir, 'metrics.csv')
df_res.to_csv(summary_path, index=False)

In [16]:
# Show the cross-fold summary: one row per downstream regressor.
df_res

Unnamed: 0,model_name,rmse,rmse_std,r2,r2_std,pcc,pcc_std
0,ElasticNet,0.171,0.024,0.132,0.363,0.465,0.269
1,Ridge,0.171,0.024,0.122,0.378,0.461,0.274
2,Lasso,0.173,0.025,0.105,0.384,0.447,0.288
3,SVR,0.173,0.017,0.11,0.361,0.452,0.277
4,KNN,0.183,0.027,0.025,0.382,0.426,0.283
5,DecisionTree,0.202,0.038,-0.429,0.967,0.36,0.353
6,RandomForest,0.188,0.017,-0.072,0.487,0.377,0.313
7,AdaBoost,0.187,0.025,-0.046,0.509,0.392,0.281
8,XGBoost,0.187,0.019,-0.088,0.546,0.395,0.304
