# DD-Hybrid Sampler: Quick Demo

In [2]:

from dd_hybrid_sampler import DDHybridSampler
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, balanced_accuracy_score
import numpy as np

# Generate an imbalanced dataset
X, y = make_classification(n_samples=1500, n_features=20, n_informative=8,
                            n_redundant=4, n_clusters_per_class=1,
                            weights=[0.90, 0.10], 
                            # n_classes=4,
                            # weights=[0.6, 0.25, 0.1, 0.05],
                            flip_y=0.01, random_state=42)

def ir_of(y):
    classes, counts = np.unique(y, return_counts=True)
    maj = classes[np.argmax(counts)]
    mino = classes[np.argmin(counts)]
    return counts[np.argmax(counts)] / counts[np.argmin(counts)]

print(f"IR before: {ir_of(y):.2f}")
sampler = DDHybridSampler(target_ir=1.5, k=5, random_state=42)
Xr, yr = sampler.fit_resample(X, y)
print(f"IR after : {ir_of(yr):.2f}  (Δn = {len(yr)-len(y)})")


IR before: 8.49
IR after : 1.50  (Δn = 737)


## Train/Test Performance (Before vs After Resampling)

In [3]:

# Train/test split (same split used for both experiments for fairness)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=0)
Xr_train, Xr_test, yr_train, yr_test = train_test_split(Xr, yr, test_size=0.3, stratify=yr, random_state=0)

scaler = StandardScaler()
clf = LogisticRegression(max_iter=200, random_state=0)

def fit_eval(Xtr, ytr, Xte, yte, label):
    Xtr_s = scaler.fit_transform(Xtr)
    Xte_s = scaler.transform(Xte)
    clf.fit(Xtr_s, ytr)
    ypred = clf.predict(Xte_s)
    bal_acc = balanced_accuracy_score(yte, ypred)
    print(f"[{label}] Balanced Acc: {bal_acc:.3f}")
    print(classification_report(yte, ypred, digits=3))

print("Original data:")
fit_eval(X_train, y_train, X_test, y_test, "Original")

print("\nResampled data:")
fit_eval(Xr_train, yr_train, Xr_test, yr_test, "DD-Hybrid")


Original data:
[Original] Balanced Acc: 0.859
              precision    recall  f1-score   support

           0      0.969     0.995     0.982       403
           1      0.944     0.723     0.819        47

    accuracy                          0.967       450
   macro avg      0.957     0.859     0.900       450
weighted avg      0.966     0.967     0.965       450


Resampled data:
[DD-Hybrid] Balanced Acc: 0.926
              precision    recall  f1-score   support

           0      0.936     0.948     0.942       403
           1      0.920     0.903     0.912       269

    accuracy                          0.930       672
   macro avg      0.928     0.926     0.927       672
weighted avg      0.930     0.930     0.930       672

