## Prediction of drug release type with Logistic Regression with minority oversampling technique (LR-SMOTE)
Initialization of environment

In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
import optuna
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import (
    accuracy_score, roc_auc_score, precision_score,
    recall_score, f1_score, confusion_matrix
)
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from functools import partial
from sklearn.preprocessing import StandardScaler
from optuna.samplers import TPESampler
import warnings
optuna.logging.set_verbosity(optuna.logging.ERROR)
warnings.filterwarnings("ignore")

## Data loading and preparation
Definition of variables, data loading, normalization and interpolation of the drug release profile, calculation of drug release profile AUC, and definition of drug release type (burst: AUC > 0.5, delayed: AUC <= 0.5)

In [4]:
num_interp_pts = 11
n_outer_folds = 10
n_inner_folds = 2
n_trials = 50
# ----------------------------------------------------------------------------------------
# Load data
# ----------------------------------------------------------------------------------------
file_path_form = 'mp_dataset_processed_no_dupes.xlsx'
file_path_time = 'mp_dataset_processed_time_release_only.xlsx'
formulation_df = pd.read_excel(file_path_form, engine='openpyxl')
release_df = pd.read_excel(file_path_time, engine='openpyxl')

# ----------------------------------------------------------------------------------------
# Encode categorical
# ----------------------------------------------------------------------------------------
unique_values_emulsion = formulation_df['Formulation Method'].unique()
mapping = {v: i for i, v in enumerate(unique_values_emulsion)}
formulation_df['Formulation Method Encoded'] = formulation_df['Formulation Method'].map(mapping)
formulation_df.drop(columns=['Formulation Method', 'Drug SMILES'], inplace=True)

# ----------------------------------------------------------------------------------------
# Interpolation
# ----------------------------------------------------------------------------------------
group = release_df.groupby('Formulation Index')['Time']
min_time = group.transform('min')
max_time = group.transform('max')
release_df['Normalized Time'] = (release_df['Time'] - min_time) / (max_time - min_time)
normalized_times = np.linspace(0, 1, num_interp_pts)
interpolated_dfs = []
for formulation, g in release_df.groupby('Formulation Index'):
    g = g.sort_values('Time')
    time_min, time_max = g['Time'].min(), g['Time'].max()
    g['Normalized Time'] = (g['Time'] - time_min) / (time_max - time_min)
    interp_release = np.interp(normalized_times, g['Normalized Time'], g['Release'])
    interpolated_dfs.append(pd.DataFrame({
        'Formulation Index': formulation,
        'Normalized Time': normalized_times,
        'Interpolated Release': interp_release
    }))
interp_df = pd.concat(interpolated_dfs, ignore_index=True)

X = formulation_df.drop(columns=['Formulation Index']).to_numpy()
#X = formulation_df.to_numpy()  # [321, 11]
groups = interp_df.groupby('Formulation Index')['Interpolated Release']

# ----------------------------------------------------------------------------------------
# AUC and drug release type definition
# ----------------------------------------------------------------------------------------
auc = (
    interp_df.groupby("Formulation Index")
      .apply(lambda g: np.trapz(g["Interpolated Release"], g["Normalized Time"]))
      .reset_index(name="AUC")
)


auc['burst'] = (auc['AUC'] > 0.5).astype(int)
y = auc['burst'].values  # shape (n_samples,)


## Model Definition and Training
Nested cross-validation with optuna hyperparameter optimization

In [11]:
# ----------------------------------------------------------------------------------------
# Model wrapper for LR-SMOTE classifier
# ----------------------------------------------------------------------------------------
class LogisticModel:
    def __init__(self, **params):
        self.model = LogisticRegression(**params, solver='liblinear', random_state=42)

    def fit(self, X, y):
        self.model.fit(X, y)

    def predict(self, X):
        return self.model.predict(X)

    def predict_proba(self, X):
        proba_output = self.model.predict_proba(X)
        if proba_output.ndim == 2 and proba_output.shape[1] == 2:
            return proba_output[:, 1]
        
        return proba_output.squeeze()


# ----------------------------------------------------------------------------------------
# Storage
# ----------------------------------------------------------------------------------------
stored_test_targets = []
stored_best_models = []
stored_best_preds = []
stored_best_proba_all = []
stored_metrics = []

# ----------------------------------------------------------------------------------------
# Nested CV setup
# ----------------------------------------------------------------------------------------
outer_kf = StratifiedKFold(n_splits=n_outer_folds, shuffle=True, random_state=42)
inner_kf = StratifiedKFold(n_splits=n_inner_folds, shuffle=True, random_state=42)

# ----------------------------------------------------------------------------------------
# Optuna objective for inner CV
# ----------------------------------------------------------------------------------------
def objective(trial, X_train_val, y_train_val, inner_kf):
    params = {
            "C": trial.suggest_float("C", 1e-4, 1e4, log=True),
            "penalty": trial.suggest_categorical("penalty", ["l1", "l2"]),
            "max_iter": trial.suggest_int("max_iter", 200, 1000),
            "tol": trial.suggest_float("tol", 1e-6, 1e-3, log=True),
            "class_weight": trial.suggest_categorical("class_weight", [None, 'balanced']),
        }

    
    val_accs = []
    for inner_train_idx, inner_val_idx in inner_kf.split(X_train_val, y_train_val):
        X_tr, X_val = X_train_val[inner_train_idx], X_train_val[inner_val_idx]
        y_tr, y_val = y_train_val[inner_train_idx], y_train_val[inner_val_idx]

        pipeline = Pipeline([
            ('smote', SMOTE(random_state=42)),
            ('scaler', StandardScaler()),
            ('model', LogisticRegression(**params))
        ])

        try:
            pipeline.fit(X_tr, y_tr)
            preds_val = pipeline.predict(X_val)
            acc_val = accuracy_score(y_val, preds_val)
        except ValueError:
            acc_val = 0.0

        val_accs.append(acc_val)
        
    return 1 - np.mean(val_accs)

# ----------------------------------------------------------------------------------------
# Outer CV loop
# ----------------------------------------------------------------------------------------
for outer_fold, (train_val_idx, test_idx) in enumerate(outer_kf.split(X, y)):
    print(f"\nOuter Fold {outer_fold + 1}")

    X_train_val, y_train_val = X[train_val_idx], y[train_val_idx]
    X_test, y_test = X[test_idx], y[test_idx]
    stored_test_targets.append(y_test)

    objective_with_data = partial(objective, X_train_val=X_train_val, y_train_val=y_train_val, inner_kf=inner_kf)
    # ----------------------------------------------------------------------------------------
    # Run Optuna study
    # ----------------------------------------------------------------------------------------
    tpe_sampler = TPESampler(seed=42) 
    study = optuna.create_study(direction="minimize", sampler=tpe_sampler)
    study.optimize(objective_with_data, n_trials=n_trials, n_jobs=-1)
    best_params = study.best_params
    print("Best params:", best_params)



    best_pipeline = Pipeline([
        ('smote', SMOTE(random_state=42)),
        ('scaler', StandardScaler()), 
        ('model', LogisticModel(**best_params))
    ])

    best_pipeline.fit(X_train_val, y_train_val)

    preds_best = best_pipeline.predict(X_test)
    preds_best_proba = best_pipeline.predict_proba(X_test)

    # --- Compute metrics ---
    acc_best = accuracy_score(y_test, preds_best)
    auc_best = roc_auc_score(y_test, preds_best_proba)
    precision_best = precision_score(y_test, preds_best, zero_division=0)
    recall_best = recall_score(y_test, preds_best, zero_division=0)
    f1_best = f1_score(y_test, preds_best, zero_division=0)

    try:
        tn, fp, fn, tp = confusion_matrix(y_test, preds_best).ravel()
        specificity_best = tn / (tn + fp) if (tn + fp) > 0 else 0.0
    except ValueError:
        specificity_best = 0.0

    stored_best_models.append(best_pipeline)
    stored_best_preds.append(preds_best)
    stored_best_proba_all.append(preds_best_proba)
    stored_metrics.append({
        "fold": outer_fold + 1,
        "accuracy": acc_best,
        "auc": auc_best,
        "precision": precision_best,
        "recall_sensitivity": recall_best,
        "specificity": specificity_best,
        "f1": f1_best
    })

    print(f"Fold {outer_fold+1} - ACC: {acc_best:.2}, AUC: {auc_best:.2}, "
          f"Prec: {precision_best:.2f}, Rec: {recall_best:.2}, F1: {f1_best:.2}")
    


Outer Fold 1
Best params: {'C': 6.9376994718731995, 'penalty': 'l2', 'max_iter': 297, 'tol': 0.00028791072842977933, 'class_weight': None}
Fold 1 - ACC: 0.73, AUC: 0.63, Prec: 0.83, Rec: 0.79, F1: 0.81

Outer Fold 2
Best params: {'C': 3.1785064804200625, 'penalty': 'l2', 'max_iter': 665, 'tol': 1.120930273013763e-06, 'class_weight': None}
Fold 2 - ACC: 0.62, AUC: 0.58, Prec: 0.79, Rec: 0.65, F1: 0.71

Outer Fold 3
Best params: {'C': 27.29224298971202, 'penalty': 'l2', 'max_iter': 308, 'tol': 0.0009432910085869173, 'class_weight': None}
Fold 3 - ACC: 0.72, AUC: 0.71, Prec: 0.89, Rec: 0.7, F1: 0.78

Outer Fold 4
Best params: {'C': 0.5289721289149582, 'penalty': 'l2', 'max_iter': 797, 'tol': 4.805167840332365e-05, 'class_weight': None}
Fold 4 - ACC: 0.66, AUC: 0.73, Prec: 0.93, Rec: 0.57, F1: 0.7

Outer Fold 5
Best params: {'C': 0.00021924110564178025, 'penalty': 'l2', 'max_iter': 530, 'tol': 7.819087183146375e-05, 'class_weight': 'balanced'}
Fold 5 - ACC: 0.53, AUC: 0.65, Prec: 0.74, Re

## Performance metrics
Accuracy, AUC, precision, recall sensitivity, F1

In [12]:
# ----------------------
# Save metrics
# ----------------------
metrics_df = pd.DataFrame(stored_metrics)
#metrics_df.to_csv("RF_class_metrics.csv", index=False)
#print("\nRF_classn_metrics.csv")

# ----------------------
# Final summary
# ----------------------
print("\nFinal Nested CV Results:")
for metric in ["accuracy", "auc", "precision", "recall_sensitivity", "f1"]:
    print(f"{metric.upper()}: {metrics_df[metric].mean():.2f} ± {metrics_df[metric].std():.2f}")


Final Nested CV Results:
ACCURACY: 0.65 ± 0.10
AUC: 0.68 ± 0.09
PRECISION: 0.82 ± 0.07
RECALL_SENSITIVITY: 0.67 ± 0.11
F1: 0.74 ± 0.08
