# Prediction of drug release type using a Random Forest with the Synthetic Minority Over-sampling Technique (RF-SMOTE)
## Initialization of environment

In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
import optuna
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from optuna.samplers import TPESampler
import warnings
optuna.logging.set_verbosity(optuna.logging.ERROR)
warnings.filterwarnings("ignore")

## Data loading and preparation
Definition of variables, data loading, normalization of each formulation's time axis to [0, 1] and interpolation of its drug release profile onto a common time grid, calculation of the area under the release profile (AUC), and definition of the drug release type (burst: AUC > 0.5, delayed: AUC <= 0.5).

In [4]:
# ----------------------------------------------------------------------------------------
# Settings
# ----------------------------------------------------------------------------------------
num_interp_pts = 11   # number of points on the normalized time grid
n_outer_folds = 10    # splits of the outer cross-validation loop
n_inner_folds = 2     # splits of the inner (tuning) cross-validation loop
n_trials = 50         # Optuna trials per outer fold

# ----------------------------------------------------------------------------------------
# Read the formulation table and the time/release table
# ----------------------------------------------------------------------------------------
file_path_form = 'mp_dataset_processed_no_dupes.xlsx'
file_path_time = 'mp_dataset_processed_time_release_only.xlsx'
formulation_df, release_df = (
    pd.read_excel(path, engine='openpyxl')
    for path in (file_path_form, file_path_time)
)

# ----------------------------------------------------------------------------------------
# Integer-encode the formulation method (codes follow order of first appearance)
# ----------------------------------------------------------------------------------------
unique_values_emulsion = formulation_df['Formulation Method'].unique()
mapping = dict(zip(unique_values_emulsion, range(len(unique_values_emulsion))))
formulation_df['Formulation Method Encoded'] = formulation_df['Formulation Method'].map(mapping)
# The raw method label and the SMILES string are not used as model inputs.
formulation_df = formulation_df.drop(columns=['Formulation Method', 'Drug SMILES'])


# ----------------------------------------------------------------------------------------
# Interpolation
# ----------------------------------------------------------------------------------------
# Rescale every formulation's time axis to [0, 1]. This is done once, vectorized,
# and the resulting column is reused by the interpolation loop below.
group = release_df.groupby('Formulation Index')['Time']
min_time = group.transform('min')
max_time = group.transform('max')
time_span = max_time - min_time

# A formulation whose measurements all share one time value has a zero span:
# the division below would yield NaN coordinates and a meaningless profile.
# Fail loudly instead of letting such rows receive an arbitrary label later on.
degenerate = release_df.loc[time_span == 0, 'Formulation Index'].unique()
if len(degenerate) > 0:
    raise ValueError(
        "Cannot normalize time for formulations whose release data cover a single "
        f"time value (Formulation Index: {degenerate.tolist()})"
    )
release_df['Normalized Time'] = (release_df['Time'] - min_time) / time_span

# Resample each release profile onto a shared grid of num_interp_pts points.
normalized_times = np.linspace(0, 1, num_interp_pts)
interpolated_dfs = []
for formulation, g in release_df.groupby('Formulation Index'):
    g = g.sort_values('Time')  # np.interp expects increasing x-coordinates
    interp_release = np.interp(normalized_times, g['Normalized Time'], g['Release'])
    interpolated_dfs.append(pd.DataFrame({
        'Formulation Index': formulation,
        'Normalized Time': normalized_times,
        'Interpolated Release': interp_release
    }))
# Long-format table: num_interp_pts rows per formulation.
interp_df = pd.concat(interpolated_dfs, ignore_index=True)

# Feature matrix: every remaining formulation column except the identifier.
X = formulation_df.drop(columns=['Formulation Index']).to_numpy()
# NOTE(review): `groups` is not used by any cell visible here; kept in case a later cell reads it.
groups = interp_df.groupby('Formulation Index')['Interpolated Release']

# ----------------------------------------------------------------------------------------
# AUC and drug release type definition
# ----------------------------------------------------------------------------------------
# np.trapz is deprecated in NumPy 2.x in favour of np.trapezoid; use whichever exists.
_trapezoid = np.trapezoid if hasattr(np, "trapezoid") else np.trapz

# Area under each interpolated release profile over normalized time.
auc = (
    interp_df.groupby("Formulation Index")[["Interpolated Release", "Normalized Time"]]
      .apply(lambda g: _trapezoid(g["Interpolated Release"], g["Normalized Time"]))
      .reset_index(name="AUC")
)

# burst = 1 when AUC > 0.5, otherwise 0 (delayed).
# NOTE(review): the 0.5 cut-off presumes 'Release' is a fraction in [0, 1] - confirm.
auc['burst'] = (auc['AUC'] > 0.5).astype(int)

# `auc` is ordered by sorted Formulation Index (groupby), while X follows the row
# order of formulation_df. Look the labels up by identifier so that row i of X and
# y[i] always describe the same formulation.
labels_by_formulation = auc.set_index('Formulation Index')['burst']
y_aligned = labels_by_formulation.reindex(formulation_df['Formulation Index'])
if y_aligned.isna().any():
    missing = formulation_df.loc[y_aligned.isna().to_numpy(), 'Formulation Index'].tolist()
    raise ValueError(f"No release profile found for Formulation Index values: {missing}")
y = y_aligned.to_numpy(dtype=int)  # shape (n_samples,)


## Model Definition and Training
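Nested cross-validation with Optuna hyperparameter optimization: the inner loop tunes the Random Forest, the outer loop estimates performance on held-out folds. SMOTE is applied to the training data only, never to validation or test folds.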

In [5]:
# ----------------------------------------------------------------------------------------
# Model wrapper for Random Forest Classifier
# ----------------------------------------------------------------------------------------
class RandomForestModel:
    """Small convenience wrapper around ``RandomForestClassifier``.

    Keyword arguments are forwarded to the classifier. ``random_state=42`` and
    ``n_jobs=-1`` are applied as defaults and can be overridden by the caller.
    """

    def __init__(self, **params):
        # Merge defaults with caller parameters; passing random_state/n_jobs
        # explicitly overrides the default instead of raising a duplicate-keyword
        # TypeError.
        settings = {"random_state": 42, "n_jobs": -1, **params}
        self.model = RandomForestClassifier(**settings)

    def fit(self, X, y):
        """Fit the wrapped classifier on (X, y) and return this wrapper."""
        self.model.fit(X, y)
        return self

    def predict(self, X):
        """Return the predicted class label for every row of X."""
        return self.model.predict(X)

    def predict_proba(self, X):
        """Return the second probability column of the wrapped classifier.

        With binary 0/1 labels where both classes were seen during fitting,
        this is the probability of class 1.
        """
        return self.model.predict_proba(X)[:, 1]

    def evaluate(self, X, y_true):
        """Return the accuracy of the predictions for X against y_true."""
        y_pred = self.predict(X)
        return accuracy_score(y_true, y_pred)

    def feature_importances(self):
        """Return the fitted classifier's ``feature_importances_`` array."""
        return self.model.feature_importances_

# ----------------------------------------------------------------------------------------
# Storage
# ----------------------------------------------------------------------------------------
# One entry is appended to each of these lists per outer fold.
stored_best_models, stored_best_preds, stored_best_proba = [], [], []
stored_test_targets, stored_metrics = [], []

# ----------------------------------------------------------------------------------------
# Nested CV splitters: both levels are stratified, shuffled and seeded with 42
# ----------------------------------------------------------------------------------------
outer_kf, inner_kf = (
    StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    for n_splits in (n_outer_folds, n_inner_folds)
)

# ----------------------------------------------------------------------------------------
# Outer CV loop
# ----------------------------------------------------------------------------------------
for outer_fold, (train_val_idx, test_idx) in enumerate(outer_kf.split(X, y)):
    print(f"\nOuter Fold {outer_fold + 1}")

    # Held-out test fold is only touched after tuning has finished.
    X_train_val, y_train_val = X[train_val_idx], y[train_val_idx]
    X_test, y_test = X[test_idx], y[test_idx]
    stored_test_targets.append(y_test)

    # ----------------------------------------------------------------------------------------
    # Optuna objective for inner CV
    # ----------------------------------------------------------------------------------------
    def objective(trial):
        """Return the mean inner-CV error (1 - accuracy) for one hyperparameter trial."""
        params = {
            "n_estimators": trial.suggest_int("n_estimators", 50, 300),
            "max_depth": trial.suggest_int("max_depth", 3, 20),
            "min_samples_split": trial.suggest_int("min_samples_split", 2, 20),
            "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 10),
            "max_features": trial.suggest_categorical("max_features", ["sqrt", "log2", None]),
            "bootstrap": trial.suggest_categorical("bootstrap", [True, False])
        }

        val_accs = []

        for inner_train_idx, inner_val_idx in inner_kf.split(X_train_val, y_train_val):
            X_tr, X_val = X_train_val[inner_train_idx], X_train_val[inner_val_idx]
            y_tr, y_val = y_train_val[inner_train_idx], y_train_val[inner_val_idx]

            # Oversample the inner training fold only; the validation fold keeps
            # its original class distribution.
            smote = SMOTE(random_state=42)
            X_tr_res, y_tr_res = smote.fit_resample(X_tr, y_tr)

            model = RandomForestModel(**params)
            model.fit(X_tr_res, y_tr_res)
            # NOTE(review): the tuning target is plain accuracy on the un-resampled
            # validation fold; consider a balance-aware score if imbalance matters.
            val_accs.append(model.evaluate(X_val, y_val))

        return 1 - np.mean(val_accs)

    # ----------------------------------------------------------------------------------------
    # Run Optuna study
    # ----------------------------------------------------------------------------------------
    tpe_sampler = TPESampler(seed=42)
    study = optuna.create_study(direction="minimize", sampler=tpe_sampler)
    study.optimize(objective, n_trials=n_trials)
    best_params = study.best_params
    print("Best params:", best_params)

    # ----------------------------------------------------------------------------------------
    # Train best model on full training data with SMOTE
    # ----------------------------------------------------------------------------------------
    smote = SMOTE(random_state=42)
    X_train_val_res, y_train_val_res = smote.fit_resample(X_train_val, y_train_val)

    best_model = RandomForestModel(**best_params)
    best_model.fit(X_train_val_res, y_train_val_res)
    preds_best = best_model.predict(X_test)
    preds_proba_best = best_model.predict_proba(X_test)

    # ----------------------------------------------------------------------------------------
    # Compute metrics
    # ----------------------------------------------------------------------------------------
    acc_best = accuracy_score(y_test, preds_best)
    precision_best = precision_score(y_test, preds_best, zero_division=0)
    recall_best = recall_score(y_test, preds_best, zero_division=0)
    f1_best = f1_score(y_test, preds_best, zero_division=0)
    # ROC AUC is undefined when the test fold contains a single class.
    if np.unique(y_test).size > 1:
        auc_best = roc_auc_score(y_test, preds_proba_best)
    else:
        auc_best = np.nan

    # labels=[0, 1] forces a 2x2 matrix even if a class is absent from both the
    # targets and the predictions, so the four-way unpacking always succeeds.
    tn, fp, fn, tp = confusion_matrix(y_test, preds_best, labels=[0, 1]).ravel()
    # Specificity is undefined without negatives in the test fold; report NaN
    # (skipped by pandas mean/std) rather than a misleading 0.0.
    specificity_best = tn / (tn + fp) if (tn + fp) > 0 else np.nan

    stored_best_models.append(best_model)
    stored_best_preds.append(preds_best)
    stored_best_proba.append(preds_proba_best)
    stored_metrics.append({
        "fold": outer_fold + 1,
        "accuracy": acc_best,
        "precision": precision_best,
        "recall_sensitivity": recall_best,
        "specificity": specificity_best,
        "f1": f1_best,
        "auc": auc_best
    })

    # Fixed two-decimal formatting for every metric.
    print(f"Fold {outer_fold+1} - ACC: {acc_best:.2f}, AUC: {auc_best:.2f}, "
          f"Prec: {precision_best:.2f}, Rec: {recall_best:.2f}, F1: {f1_best:.2f}")



Outer Fold 1
Best params: {'n_estimators': 53, 'max_depth': 17, 'min_samples_split': 5, 'min_samples_leaf': 3, 'max_features': 'log2', 'bootstrap': True}
Fold 1 - ACC: 0.73, AUC: 0.67, Prec: 0.80, Rec: 0.83, F1: 0.82

Outer Fold 2
Best params: {'n_estimators': 223, 'max_depth': 19, 'min_samples_split': 8, 'min_samples_leaf': 1, 'max_features': 'log2', 'bootstrap': True}
Fold 2 - ACC: 0.72, AUC: 0.81, Prec: 0.77, Rec: 0.87, F1: 0.82

Outer Fold 3
Best params: {'n_estimators': 264, 'max_depth': 11, 'min_samples_split': 7, 'min_samples_leaf': 1, 'max_features': 'log2', 'bootstrap': False}
Fold 3 - ACC: 0.72, AUC: 0.74, Prec: 0.85, Rec: 0.74, F1: 0.79

Outer Fold 4
Best params: {'n_estimators': 258, 'max_depth': 15, 'min_samples_split': 17, 'min_samples_leaf': 5, 'max_features': 'log2', 'bootstrap': True}
Fold 4 - ACC: 0.78, AUC: 0.84, Prec: 0.90, Rec: 0.78, F1: 0.84

Outer Fold 5
Best params: {'n_estimators': 273, 'max_depth': 16, 'min_samples_split': 3, 'min_samples_leaf': 2, 'max_featu

## Performance metrics
Mean ± standard deviation across the outer folds of accuracy, ROC AUC, precision, recall (sensitivity), and F1.

In [7]:
# ----------------------
# Save metrics
# ----------------------
# One row per outer fold, one column per metric.
metrics_df = pd.DataFrame(stored_metrics)
# To persist the per-fold table: metrics_df.to_csv("RF_class_metrics.csv", index=False)

# ----------------------
# Final summary: mean and standard deviation over the outer folds
# ----------------------
print("\nFinal Nested CV Results:")
summary = metrics_df[["accuracy", "auc", "precision", "recall_sensitivity", "f1"]].agg(["mean", "std"])
for metric, stats in summary.items():
    print(f"{metric.upper()}: {stats['mean']:.2f} ± {stats['std']:.2f}")


Final Nested CV Results:
ACCURACY: 0.76 ± 0.06
AUC: 0.79 ± 0.09
PRECISION: 0.85 ± 0.05
RECALL_SENSITIVITY: 0.83 ± 0.06
F1: 0.84 ± 0.04
