- **GPU Acceleration**

In [1]:
# import cuml.accel
# cuml.accel.install()

# Imports

In [2]:
import pathlib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline as SkPipeline
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             f1_score, matthews_corrcoef)
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.under_sampling import CondensedNearestNeighbour
from imblearn.under_sampling import NeighbourhoodCleaningRule
from imblearn.ensemble import BalancedBaggingClassifier
from dd_hybrid_sampler import DDHybridSampler
from cost_sensitive import OverBoostClassifier

from pathlib import Path

# Dataset Listing and Loading

In [3]:
def read_data(path: Path):
    if path.suffix == ".csv":
        return pd.read_csv(path)
    else:                       # Excel
        return pd.read_excel(path)

- Dataset path:

In [4]:
DATA_DIR   = Path("Imbalanced_datasets") / "binary"         # CSV files
CSV_FILES  = sorted(DATA_DIR.glob("*.csv"))
assert CSV_FILES, "No CSV found"


- Result save path

In [5]:
OUT_DIR = pathlib.Path("output") / "binary"  
OUT_DIR.mkdir(exist_ok=True, parents=True)

- All avalable datasets:

In [6]:
for i, f in enumerate(CSV_FILES):
    print(i ,f.name)

0 BigML_Dataset.csv
1 MBA.csv
2 Satimage.csv
3 abalone-17_vs_7-8-9-10.csv
4 abalone-19_vs_10-11-12-13.csv
5 abalone-20_vs_8-9-10.csv
6 abalone.csv
7 abalone19.csv
8 abalone_20.csv
9 brain_stroke.csv
10 breast_cancer_wisconsin.csv
11 car_good.csv
12 cervical.csv
13 cirrhosis.csv
14 cleveland-0_vs_4.csv
15 compile_level_01.csv
16 df_train1.csv
17 diabetes.csv
18 ecoli-0-1-4-7_vs_2-3-5-6.csv
19 ecoli-0-1-4-7_vs_5-6.csv
20 ecoli-0-2-3-4_vs_5.csv
21 ecoli-0-2-6-7_vs_3-5.csv
22 ecoli-0-3-4-6_vs_5.csv
23 ecoli-0-3-4-7_vs_5-6.csv
24 ecoli-0-3-4_vs_5.csv
25 ecoli-0-4-6_vs_5.csv
26 ecoli-0-6-7_vs_3-5.csv
27 ecoli-0-6-7_vs_5.csv
28 ecoli-0_vs_1.csv
29 ecoli2.csv
30 ecoli3.csv
31 ecoli4.csv
32 ecoli_013vs26.csv
33 flaref.csv
34 glass-0-1-4-6_vs_2.csv
35 glass-0-1-5_vs_2.csv
36 glass-0-1-6_vs_2.csv
37 glass-0-1-6_vs_5.csv
38 glass-0-4_vs_5.csv
39 glass-0-6_vs_5.csv
40 glass0.csv
41 glass1.csv
42 glass2.csv
43 glass4.csv
44 glass6.csv
45 hcv_binary_mice.csv
46 hcv_data_binary.csv
47 hf.csv
48 ionosp

# Machine Learning Pipeline

## ML Model

In [7]:

CV         = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
CLASSIFIER = RandomForestClassifier(n_estimators=300,
                                    n_jobs=-1,
                                    random_state=42)


## Sampling Algorithms

In [8]:

SAMPLERS   = {
    "None"          : None,
    "SMOTE"         : SMOTE(random_state=42),
    "OverSample"    : RandomOverSampler(random_state=42),
    "DownSample"    : RandomUnderSampler(random_state=42),
    "SMOTE_Bagging" : BalancedBaggingClassifier(random_state=84, sampler = SMOTE(random_state=100, k_neighbors=2)),
    "CNN"           : CondensedNearestNeighbour(random_state=10, n_jobs=-1),
    "NC"            : NeighbourhoodCleaningRule(),
    "ICost"         : OverBoostClassifier(random_state=84),
    "DD_Hybrid"     : DDHybridSampler(target_ir=1.5, k=5, random_state=42)
}

SAMPLE_MODEL = ["ICost", "SMOTE_Bagging"]

## Model Metrics

In [9]:

METRICS    = ["accuracy", "macro_precision", "macro_recall", "macro_f1",
              "weighted_precision", "weighted_recall", "weighted_f1", "mcc"]

## Model Pipeline

In [10]:
def build_pipeline(sampler, classifier=CLASSIFIER) -> ImbPipeline:
    steps = [
        ("impute", SimpleImputer(strategy="median")),
        ("scale" , StandardScaler()),          # optional but cheap
    ]
    if sampler is not None:
        steps.append(("sample", sampler))
    steps.append(("clf", classifier))
    return ImbPipeline(steps)

## Scoring function

In [11]:
def score_fold(y_true, y_pred) -> dict:
    return {
        "accuracy"          : accuracy_score(y_true, y_pred),
        "macro_precision"   : precision_score(y_true, y_pred,
                                              average="macro", zero_division=0),
        "macro_recall"      : recall_score(y_true, y_pred,
                                           average="macro", zero_division=0),
        "macro_f1"          : f1_score(y_true, y_pred,
                                       average="macro", zero_division=0),
        "weighted_precision": precision_score(y_true, y_pred,
                                              average="weighted", zero_division=0),
        "weighted_recall"   : recall_score(y_true, y_pred,
                                           average="weighted", zero_division=0),
        "weighted_f1"       : f1_score(y_true, y_pred,
                                       average="weighted", zero_division=0),
        "mcc"               : matthews_corrcoef(y_true, y_pred),
    }

# Master loop – run pipelines

For all Datasets
- Loading Dataset
- Label Encoding
- Loop for all Sampling Algorithms
- Build Pipeline
- Train-Test model with StratifiedkFold
- Save the scores

In [12]:
def imbalance_ratio(y):
    """
    y : 1-D numpy array of class labels
    returns IR (float) = majority_count / minority_count
    """
    counts = np.bincount(y)          # works for integer-encoded labels
    return counts.max() / counts.min()

In [13]:
scores = {m: {s: {} for s in SAMPLERS} for m in METRICS}   # metric -> sampler -> dataset -> score

for dataset_path in CSV_FILES:
    ds_name = dataset_path.stem
    print(f"\nProcessing {ds_name}" , end="\t\t")
    
    # load + basic cleaning
    df = read_data(dataset_path)
    df = df.dropna(subset=[df.columns[-1]])
    if df.shape[0] < 20:
        print("Too few samples (<20). Skipping...")
        continue
    print("Target Size = ", df.shape, "\t", "No. of classes = ", df.iloc[:, -1].nunique())

    X = df.iloc[:, :-1].copy()
    y = LabelEncoder().fit_transform(df.iloc[:, -1])

    
    cat_cols = X.select_dtypes(exclude=np.number).columns
    if len(cat_cols):
        X = pd.get_dummies(X, columns=cat_cols, drop_first=True)
    
    

    for samp_name, sampler in SAMPLERS.items():
        print("\t- Performing Cross-Validation on : ", samp_name)
        if samp_name in SAMPLE_MODEL:
            pipe = build_pipeline(None, classifier=sampler)
        else:
            pipe = build_pipeline(sampler)
        fold_scores = {m: [] for m in METRICS}
        
        for train_idx, val_idx in CV.split(X, y):
            X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
            y_train, y_val = y[train_idx], y[val_idx]
            
            pipe.fit(X_train, y_train)          # ALL steps fitted only on train
            y_pred = pipe.predict(X_val)
            
            for k, v in score_fold(y_val, y_pred).items():
                fold_scores[k].append(v)
        
        # mean across folds
        for m in METRICS:
            scores[m]["IR"][ds_name] = imbalance_ratio(y)
            scores[m][samp_name][ds_name] = np.mean(fold_scores[m])


Processing BigML_Dataset		Target Size =  (3333, 21) 	 No. of classes =  2
	- Performing Cross-Validation on :  None


KeyError: 'IR'

# Results

- Scores

In [None]:
df_per_metric = {}
for metric in METRICS:
    df = pd.DataFrame(scores[metric])#.T   # samplers × datasets
    # df = df.T                             # datasets × samplers
    # df.name = metric
    # df_per_metric.append(df)
    df_per_metric[metric] = df

- Display Scores

In [None]:
# for df in df_per_metric:
#     print("="*80)
#     print(df.name)
#     display(df)
#     print("")

In [None]:
df_per_metric["mcc"]

# 7.  Bar-plots: % change vs SMOTE

- scores of SMOTE is set as baseline

In [None]:
baseline = "SMOTE"

# for df in df_per_metric:
for name, df in df_per_metric.items():
#     metric = df.name
    rel = ((df.subtract(df[baseline], axis=0)
            .div(df[baseline], axis=0))*100).round(2)
    rel = rel.drop(columns=[baseline], errors="ignore")
    
    fig, ax = plt.subplots(figsize=(10, 6))
    rel.plot(kind="bar", ax=ax, width=0.8)
    ax.axhline(0, color="black", lw=0.8)
    ax.set_title(f"{metric} : Change (%) vs {baseline}")
    ax.set_ylabel("Change (%)")
    ax.set_xlabel("datasets")
    ax.legend(title="sampler", bbox_to_anchor=(1.02, 1), loc="upper left")
    plt.tight_layout()
    fig.savefig(OUT_DIR/f"rel_vs_{baseline}_{metric}.png", dpi=300)
    plt.show()
    plt.close(fig)

# 8.  Export CSVs

In [None]:
# for df in df_per_metric:
for name, df in df_per_metric.items():
    df.to_csv(OUT_DIR/f"{name}.csv")

✅ **Completed**