##  SMOTE (Synthetic Minority Over-sampling Technique) 

## **Introduction**
SMOTE is a resampling technique used to handle class imbalance by generating synthetic samples for the minority class instead of simply duplicating existing ones. It works by interpolating between real minority class instances.

## **Algorithm Steps**
1. **Select a minority class sample** $x_i$ from the dataset.
2. **Find its k-nearest neighbors** in the minority class using Euclidean distance.
3. **Randomly select one of these neighbors** $x_{nn}$.
4. **Generate a synthetic sample** along the line segment joining $x_i$ and $x_{nn}$ using interpolation:

   $$
   x_{\text{new}} = x_i + \lambda \cdot (x_{nn} - x_i)
   $$

   where:

   $$
   \lambda \sim U(0,1)
   $$

   is a random number between 0 and 1.

## **Mathematical Formulation**
For a given minority class instance $x_i$, let $x_{nn}$ be one of its k-nearest neighbors. The synthetic sample is created as:

$$
x_{\text{new}} = x_i + \lambda (x_{nn} - x_i)
$$

where:
- $x_i$ is a real minority class instance.
- $x_{nn}$ is one of its k-nearest neighbors.
- $\lambda$ is a random number sampled from a uniform distribution:

  $$
  \lambda \sim U(0,1)
  $$

This process is repeated until the desired number of synthetic samples is generated.

## **Advantages of SMOTE**
- Reduces class imbalance by adding synthetic samples.
- Prevents overfitting caused by simple duplication of minority class instances.
- Preserves the relationships between data points.

## **Limitations of SMOTE**
- Can generate noisy samples if the minority class has a complex distribution.
- Does not consider the majority class, which may lead to overlapping regions and potential misclassification.



In [162]:
import pandas as pd
import numpy as np
from imblearn.combine import SMOTEENN


from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    roc_auc_score, average_precision_score, f1_score, recall_score, accuracy_score,
    confusion_matrix
)
from tqdm import tqdm
from imblearn.over_sampling import SMOTE
import xgboost as xgb
import warnings

warnings.simplefilter("ignore")  # Ignore all warnings

def calculate_metrics(y_test, y_pred_proba):
    threshold = np.mean(y_pred_proba)  # Dynamic threshold based on mean
    y_pred_binary = [1 if p > 0.5 else 0 for p in y_pred_proba]  # Convert probabilities to binary

    # Compute metrics
    accuracy = accuracy_score(y_test, y_pred_binary)
    roc_auc = roc_auc_score(y_test, y_pred_proba)
    pr_auc = average_precision_score(y_test, y_pred_proba)
    recall = recall_score(y_test, y_pred_binary)
    f1 = f1_score(y_test, y_pred_binary)
    
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred_binary).ravel()
    
    specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
    fp_rate = fp / (fp + tn) if (fp + tn) > 0 else 0
    g_mean = np.sqrt(recall * specificity)

    return accuracy, roc_auc, pr_auc, recall, f1, specificity, fp_rate, g_mean

def model(X,y):
    # Train-test split (stratified)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
    
    # Standardize numerical features
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    
    ### Train XGBoost WITHOUT SMOTE
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dtest = xgb.DMatrix(X_test, label=y_test)
    
    params = {
        "objective": "binary:logistic",
        "eval_metric": "auc",
        "tree_method": "gpu_hist",  # Use GPU
        "predictor": "gpu_predictor",
        "learning_rate": 0.05,
        "max_depth": 6,
        "subsample": 0.8,
        "colsample_bytree": 0.8,
        "random_state": 42
    }
    
    # Train XGBoost without SMOTE
    model_no_smote = xgb.train(params, dtrain, num_boost_round=200)
    
    # Predict on test set
    y_pred_proba_no_smote = model_no_smote.predict(dtest)
    from sklearn.metrics import precision_recall_curve, f1_score
    
    precision, recall, thresholds = precision_recall_curve(y_test, y_pred_proba_no_smote)
    f1_scores = (2 * precision * recall) / (precision + recall + 1e-6)  # Avoid division by zero
    optimal_idx = f1_scores.argmax()
    optimal_threshold = thresholds[optimal_idx]

    y_pred_binary_no_smote = [1 if p > optimal_threshold else 0 for p in y_pred_proba_no_smote]  # Convert probabilities to binary predictions

    metrics_no_smote = calculate_metrics(y_test, y_pred_proba_no_smote)

    
        ### Apply SMOTE
    smote = SMOTEENN(sampling_strategy=1
                     , random_state=42)  # Fully balance classes
    X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

    #X_train_resampled,y_train_resampled=smoteenn_with_weighted_enn(pd.DataFrame(X_train,columns=X.columns),y_train['Y'])

    #X_train_resampled, y_train_resampled=manual_smoteenn(X_train,y_train)
    
    # Train XGBoost WITH SMOTE
    dtrain_resampled = xgb.DMatrix(X_train_resampled, label=y_train_resampled)
    model_with_smote = xgb.train(params, dtrain_resampled, num_boost_round=200)
    
    # Predict on test set
    y_pred_proba_with_smote = model_with_smote.predict(dtest)
    precision, recall, thresholds = precision_recall_curve(y_test, y_pred_proba_with_smote)
    f1_scores = (2 * precision * recall) / (precision + recall + 1e-6)  # Avoid division by zero
    optimal_idx = f1_scores.argmax()
    optimal_threshold = thresholds[optimal_idx]
    y_pred_binary_with_smote = [1 if p > optimal_threshold else 0 for p in y_pred_proba_with_smote]  # Convert probabilities to binary predictions

    metrics_with_smote = calculate_metrics(y_test, y_pred_proba_with_smote)

    # Print comparison
    metric_names = ["Accuracy", "ROC-AUC", "PR-AUC", "Recall (Sensitivity)", "F1", "Specificity", "FP-Rate", "G-Mean"]
    print("\n--- Model Performance ---")
    print("{:<20} {:<10} {:<10}".format("Metric", "No SMOTE", "With Regular SMOTE"))
    for name, no_smote, with_smote in zip(metric_names, metrics_no_smote, metrics_with_smote):
        print(f"{name:<20} {no_smote:.4f}   {with_smote:.4f}")

    return [metrics_no_smote, metrics_with_smote]


In [163]:
output=model(X,y)



--- Model Performance ---
Metric               No SMOTE   With Regular SMOTE
Accuracy             0.8830   0.8334
ROC-AUC              0.9068   0.9037
PR-AUC               0.6103   0.6001
Recall (Sensitivity) 0.4261   0.8142
F1                   0.5138   0.5864
Specificity          0.9605   0.8367
FP-Rate              0.0395   0.1633
G-Mean               0.6398   0.8254


# **Custom SMOTE with Cubic Interpolation (My Development)**

## **Introduction**
This is a modified version of the **Synthetic Minority Over-sampling Technique (SMOTE)**, where instead of linear interpolation, a **third-degree polynomial interpolation** is used to generate synthetic samples. This method helps preserve complex feature relationships and avoids overly simplistic synthetic samples.

## **Algorithm Steps**
1. **Identify the minority class** in the dataset.
2. **Find its k-nearest neighbors** using Euclidean distance.
3. **Randomly select one of these neighbors** $x_{nn}$ for interpolation.
4. **Use cubic interpolation** between the selected sample $x_i$ and its neighbor $x_{nn}$:
   - Define reference points between $x_i$ and $x_{nn}$.
   - Fit a third-degree polynomial for each feature.
   - Sample a new synthetic point using the polynomial.

## **Mathematical Formulation**
For a given minority class instance $x_i$, let $x_{nn}$ be one of its k-nearest neighbors. We define four reference points:

$$
x_0 = x_i, \quad x_1 = \frac{2x_i + x_{nn}}{3}, \quad x_2 = \frac{x_i + 2x_{nn}}{3}, \quad x_3 = x_{nn}
$$

These points correspond to $t$-values:

$$
t_0 = 0, \quad t_1 = 0.33, \quad t_2 = 0.66, \quad t_3 = 1
$$

A third-degree polynomial is fitted for each feature using these values:

$$
P(t) = a_0 + a_1 t + a_2 t^2 + a_3 t^3
$$

where the coefficients $(a_0, a_1, a_2, a_3)$ are determined by the reference points. A synthetic sample is generated by evaluating the polynomial at a randomly chosen $t_{\text{rand}} \sim U(0,1)$:

$$
x_{\text{new}} = P(t_{\text{rand}})
$$

This process is repeated until the desired number of synthetic samples is generated.

## **Advantages of Custom SMOTE with Cubic Interpolation**
- **More realistic synthetic samples**: Cubic interpolation provides a **smoother transition** between real data points.
- **Better feature relationships**: Unlike linear SMOTE, this method **captures non-linear patterns** in the data.
- **Less risk of generating outliers**: Intermediate points help **constrain synthetic samples** within a reasonable range.

## **Limitations**
- **Computationally expensive**: Fitting a polynomial for each feature requires more computation than linear interpolation.
- **Risk of overfitting**: If the minority class has a complex distribution, the interpolation might introduce synthetic samples that do not generalize well.
- **Sensitive to noisy data**: If the minority class contains outliers, the interpolation may exaggerate these variations.


In [109]:
import numpy as np
import pandas as pd
from sklearn.neighbors import NearestNeighbors
from numpy.polynomial.polynomial import Polynomial

def custom_smote_with_cubic_interpolation(X: pd.DataFrame, y: pd.Series, target_class=1, k_neighbors=5, random_state=42):
    """
    Custom SMOTE using 3rd-degree polynomial interpolation.

    Parameters:
        X (pd.DataFrame): Feature matrix.
        y (pd.Series): Target labels.
        target_class (int): The minority class to oversample.
        k_neighbors (int): Number of nearest neighbors to consider.
        sampling_ratio (float): Ratio of synthetic samples to generate relative to minority class.
        random_state (int): Random seed for reproducibility.

    Returns:
        X_resampled (pd.DataFrame): New feature matrix with synthetic samples.
        y_resampled (pd.Series): Updated target labels.
    """
    np.random.seed(random_state)
    
    # Ensure `y` is a 1D array
    y = y.reset_index(drop=True)  # Ensure proper indexing
    
    # Separate minority class
    X_minority = X[y == target_class]
    sampling_ratio=np.floor((X.shape[0]-X_minority.shape[0])/X_minority.shape[0])
    
    # Fit KNN on minority class
    knn = NearestNeighbors(n_neighbors=min(k_neighbors, len(X_minority)))
    knn.fit(X_minority)
    
    # Determine number of synthetic samples to generate
    n_samples = int(len(X_minority) * sampling_ratio)

    print(n_samples)
    
    synthetic_samples = []
    
    for _ in tqdm(range(n_samples)):
        # Randomly select a minority sample 
        idx = np.random.randint(0, len(X_minority))
        x_selected = X_minority.iloc[idx].values  # Convert to NumPy array
        
        # Find k-nearest neighbors
        neighbors = knn.kneighbors([x_selected], return_distance=False)[0]
        
        # Select a random neighbor
        neighbor_idx = np.random.choice(neighbors[1:])  # Exclude itself
        x_neighbor = X_minority.iloc[neighbor_idx].values  # Convert to NumPy array
        
        # Fit a 3rd-degree polynomial between x_selected and x_neighbor
        t_values = np.array([0, 0.33, 0.66, 1])  # 4 reference points in [0,1]
        x_values = np.vstack([x_selected, 
                              (2*x_selected + x_neighbor)/3, 
                              (x_selected + 2*x_neighbor)/3, 
                              x_neighbor])  # Intermediate points
        
        # Generate polynomial coefficients for each feature
        x_synthetic = np.zeros_like(x_selected)
        t_random = np.random.rand()  # Random t in [0,1]
        
        for feature_idx in range(X.shape[1]):  # Iterate over all features
            poly = Polynomial.fit(t_values, x_values[:, feature_idx], 3)  # Fit cubic polynomial
            x_synthetic[feature_idx] = poly(t_random)  # Sample new point
        
        synthetic_samples.append(x_synthetic)
    
    # Convert synthetic samples to DataFrame
    synthetic_samples_df = pd.DataFrame(synthetic_samples, columns=X.columns)
    
    # Create new dataset (append synthetic data)
    X_resampled = pd.concat([X, synthetic_samples_df], axis=0, ignore_index=True)
    y_resampled = pd.concat([y, pd.Series(target_class, index=synthetic_samples_df.index)], axis=0, ignore_index=True)
    
    return X_resampled, y_resampled


In [149]:
def model_smote_poly(X,y):
    # Train-test split (stratified)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
    
    # Standardize numerical features 
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    
    ### Train XGBoost WITHOUT SMOTE
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dtest = xgb.DMatrix(X_test, label=y_test)
    
    params = {
        "objective": "binary:logistic",
        "eval_metric": "auc",
        "tree_method": "gpu_hist",  # Use GPU
        "predictor": "gpu_predictor",
        "learning_rate": 0.05,
        "max_depth": 6,
        "subsample": 0.8,
        "colsample_bytree": 0.8,
        "random_state": 42
    }
    
    # Train XGBoost without SMOTE
    model_no_smote = xgb.train(params, dtrain, num_boost_round=200)
    
    # Predict on test set
    y_pred_proba_no_smote = model_no_smote.predict(dtest)
    from sklearn.metrics import precision_recall_curve, f1_score
    
    precision, recall, thresholds = precision_recall_curve(y_test, y_pred_proba_no_smote)
    f1_scores = (2 * precision * recall) / (precision + recall + 1e-6)  # Avoid division by zero
    optimal_idx = f1_scores.argmax()
    optimal_threshold = thresholds[optimal_idx]

    y_pred_binary_no_smote = [1 if p > optimal_threshold else 0 for p in y_pred_proba_no_smote]  # Convert probabilities to binary predictions

    metrics_no_smote = calculate_metrics(y_test, y_pred_proba_no_smote)


    X_train_resampled,y_train_resampled=smoteenn_with_weighted_enn(pd.DataFrame(X_train,columns=X.columns),y_train['Y'])

    
    # Train XGBoost WITH SMOTE
    dtrain_resampled = xgb.DMatrix(np.array(X_train_resampled), label=y_train_resampled)
    model_with_smote = xgb.train(params, dtrain_resampled, num_boost_round=200)
    
    # Predict on test set
    y_pred_proba_with_smote = model_with_smote.predict(dtest)
    precision, recall, thresholds = precision_recall_curve(y_test, y_pred_proba_with_smote)
    f1_scores = (2 * precision * recall) / (precision + recall + 1e-6)  # Avoid division by zero
    optimal_idx = f1_scores.argmax()
    optimal_threshold = thresholds[optimal_idx]
    y_pred_binary_with_smote = [1 if p > optimal_threshold else 0 for p in y_pred_proba_with_smote]  # Convert probabilities to binary predictions

    metrics_with_smote = calculate_metrics(y_test, y_pred_proba_with_smote)

    # Print comparison
    metric_names = ["Accuracy", "ROC-AUC", "PR-AUC", "Recall (Sensitivity)", "F1", "Specificity", "FP-Rate", "G-Mean"]
    print("\n--- Model Performance ---")
    print("{:<20} {:<10} {:<10}".format("Metric", "No SMOTE", "With Cubic Polynomial SMOTE"))
    for name, no_smote, with_smote in zip(metric_names, metrics_no_smote, metrics_with_smote):
        print(f"{name:<20} {no_smote:.4f}   {with_smote:.4f}")

    return [metrics_no_smote, metrics_with_smote]



---

# **Model Performance Metrics for Credit Risk Default Prediction**

In credit risk modeling, correctly classifying **defaulting customers** is crucial, as misclassifications can lead to **financial losses** (false negatives) or **lost opportunities** (false positives). The following metrics help assess model performance:

## **1. Accuracy**
Accuracy measures the proportion of correctly classified instances over the total dataset:

$$
\text{Accuracy} = \frac{TP + TN}{TP + TN + FP + FN}
$$

where:
- $TP$ = True Positives (correctly predicted defaults)
- $TN$ = True Negatives (correctly predicted non-defaults)
- $FP$ = False Positives (incorrectly predicted defaults)
- $FN$ = False Negatives (incorrectly predicted non-defaults)

### **Importance in Credit Risk:**
- Accuracy gives an overall measure of correctness but can be **misleading in imbalanced datasets** (e.g., if defaults are rare, a model predicting all customers as non-defaults can still have high accuracy).

---

## **2. ROC-AUC (Receiver Operating Characteristic - Area Under Curve)**
The **ROC-AUC** measures a model’s ability to distinguish between the positive (default) and negative (non-default) classes. The **ROC curve** plots the **True Positive Rate (Recall)** against the **False Positive Rate (FPR)** at different classification thresholds.  

### **Mathematical Formulation:**
The **AUC (Area Under Curve)** is computed as:

$$
\text{AUC} = \int_0^1 \text{TPR} \, d(\text{FPR})
$$

where:
- **True Positive Rate (TPR) / Recall:**
  $$
  \text{TPR} = \frac{TP}{TP + FN}
  $$
- **False Positive Rate (FPR):**
  $$
  \text{FPR} = \frac{FP}{FP + TN}
  $$

### **Importance in Credit Risk:**
- **Higher AUC** means the model **better separates defaults from non-defaults**.
- **AUC close to 0.5** suggests the model is **random** (not useful).

---

## **3. PR-AUC (Precision-Recall Area Under Curve)**
PR-AUC measures the area under the **Precision-Recall (PR) curve**, focusing on **positive (default) predictions**.

### **Mathematical Formulation:**
The **AUC for Precision-Recall** is:

$$
\text{PR-AUC} = \int_0^1 \text{Precision} \, d(\text{Recall})
$$

where:
- **Precision (Positive Predictive Value, PPV):**
  $$
  \text{Precision} = \frac{TP}{TP + FP}
  $$

- **Recall (Sensitivity / TPR) (as defined above)**

### **Importance in Credit Risk:**
- **More useful than ROC-AUC** for **imbalanced data** since it focuses on **true defaults**.
- **Higher PR-AUC** indicates a better balance between **precision and recall**.

---

## **4. Recall (Sensitivity)**
Recall, also called **Sensitivity or True Positive Rate (TPR)**, measures the ability to detect **actual defaults**:

$$
\text{Recall} = \frac{TP}{TP + FN}
$$

### **Importance in Credit Risk:**
- **High recall** ensures **most actual defaults are detected**, minimizing **false negatives**.
- **Low recall** means many **defaulting customers** are **missed**, leading to **financial losses**.

---

## **5. F1-Score**
F1-Score is the harmonic mean of **Precision** and **Recall**, balancing both:

$$
F1 = 2 \times \frac{\text{Precision} \times \text{Recall}}{\text{Precision} + \text{Recall}}
$$

### **Importance in Credit Risk:**
- **Best when both False Positives & False Negatives are costly**.
- **Useful in imbalanced datasets**, where a high precision or recall alone isn't enough.

---

## **6. Specificity (True Negative Rate)**
Specificity measures how well the model identifies **non-defaulting customers**:

$$
\text{Specificity} = \frac{TN}{TN + FP}
$$

### **Importance in Credit Risk:**
- **Higher specificity** reduces **false alarms (FP)**.
- **Too high specificity may mean recall is low**, missing many defaults.

---

## **7. False Positive Rate (FPR)**
FPR is the proportion of **non-defaulting customers incorrectly classified as defaults**:

$$
\text{FPR} = \frac{FP}{FP + TN} = 1 - \text{Specificity}
$$

### **Importance in Credit Risk:**
- **Low FPR** ensures fewer **non-defaulters are wrongly flagged**, reducing unnecessary **loan rejections**.
- **High FPR** can **hurt customer experience**, causing **unnecessary loan rejections**.

---

## **8. G-Mean (Geometric Mean)**
The **G-Mean** is a performance metric balancing **recall** and **specificity**:

$$
G\text{-Mean} = \sqrt{\text{Recall} \times \text{Specificity}}
$$

### **Importance in Credit Risk:**
- **Higher G-Mean** ensures the model performs well on **both default and non-default classes**.
- Useful for **handling class imbalance**, where one metric alone (like accuracy) can be misleading.

---

# **Summary Table of Metrics**
| **Metric**         | **Interpretation** |
|--------------------|------------------|
| **Accuracy**      | Overall correctness, but misleading in imbalanced data |
| **ROC-AUC**       | Ability to distinguish defaults vs. non-defaults |
| **PR-AUC**        | Performance on the default class, useful for imbalance |
| **Recall**        | Ability to detect defaults (avoid false negatives) |
| **F1-Score**      | Balance between Precision & Recall |
| **Specificity**   | Correctly identifying non-defaulters |
| **FPR**           | Incorrectly flagging non-defaulters as defaults |
| **G-Mean**        | Balance between Recall & Specificity (useful for imbalance) |

---

# **Final Thoughts**
For **credit risk prediction**, metrics should be **carefully chosen** based on **business priorities**:

- **If missing defaults is costly** → **High Recall (Sensitivity)**.
- **If wrongly flagging non-defaulters is a concern** → **Low False Positive Rate (FPR)**.
- **For overall balance** → **High G-Mean & F1-Score**.

# Data

## Data Fetch in YKB Computer - Taiwan Credit Data

In [151]:
df=pd.read_excel('default of credit card clients.xls',index_col=0).iloc[1:,:]

X=df.iloc[:,:-1]
y=pd.DataFrame(df.iloc[:,-1],columns=['Y'])
X=X.astype(float)
y=y.astype(int)
X.head()

Unnamed: 0,X1,X2,X3,X4,X5,X6,X7,X8,X9,X10,...,X14,X15,X16,X17,X18,X19,X20,X21,X22,X23
1,20000.0,2.0,2.0,1.0,24.0,2.0,2.0,-1.0,-1.0,-2.0,...,689.0,0.0,0.0,0.0,0.0,689.0,0.0,0.0,0.0,0.0
2,120000.0,2.0,2.0,2.0,26.0,-1.0,2.0,0.0,0.0,0.0,...,2682.0,3272.0,3455.0,3261.0,0.0,1000.0,1000.0,1000.0,0.0,2000.0
3,90000.0,2.0,2.0,2.0,34.0,0.0,0.0,0.0,0.0,0.0,...,13559.0,14331.0,14948.0,15549.0,1518.0,1500.0,1000.0,1000.0,1000.0,5000.0
4,50000.0,2.0,2.0,1.0,37.0,0.0,0.0,0.0,0.0,0.0,...,49291.0,28314.0,28959.0,29547.0,2000.0,2019.0,1200.0,1100.0,1069.0,1000.0
5,50000.0,1.0,2.0,1.0,57.0,-1.0,0.0,-1.0,0.0,0.0,...,35835.0,20940.0,19146.0,19131.0,2000.0,36681.0,10000.0,9000.0,689.0,679.0


In [138]:

from imblearn.under_sampling import TomekLinks

def remove_tomek_links(X, y):
    """
    Removes Tomek Links from the dataset.
    
    Parameters:
    X (pandas.DataFrame): Feature matrix of shape (n_samples, n_features)
    y (pandas.Series): Target vector of shape (n_samples,)
    
    Returns:
    X_resampled, y_resampled: The feature matrix and target vector after removing Tomek Links
                              returned as pandas DataFrame and Series respectively.
    """
    
    tl = TomekLinks()
    X_resampled, y_resampled = tl.fit_resample(X.values, y.values)
    
    # Convert back to DataFrame and Series
    X_resampled = pd.DataFrame(X_resampled, columns=X.columns)
    y_resampled = pd.Series(y_resampled)
    
    return X_resampled, y_resampled

# Example usage:
# Assuming you have your data in variables `X` as a DataFrame and `y` as a Series
# X_resampled, y_resampled = remove_tomek_links(X, y)
from imblearn.under_sampling import NearMiss

def apply_near_miss(X, y):
    """
    Applies the NearMiss-1 technique to under-sample the majority class.
    
    Parameters:
    X (pandas.DataFrame): Feature matrix of shape (n_samples, n_features)
    y (pandas.Series): Target vector of shape (n_samples,)
    
    Returns:
    X_resampled, y_resampled: The feature matrix and target vector after applying NearMiss,
                              returned as pandas DataFrame and Series respectively.
    """
    
    nm = NearMiss(version=1)  # Version 1 for NearMiss-1
    X_resampled, y_resampled = nm.fit_resample(X.values, y.values)
    
    # Convert back to DataFrame and Series
    X_resampled = pd.DataFrame(X_resampled, columns=X.columns)
    y_resampled = pd.Series(y_resampled)
    
    return X_resampled, y_resampled

from imblearn.under_sampling import EditedNearestNeighbours
import pandas as pd

def apply_enn(X, y):
    """
    Applies the Edited Nearest Neighbors (ENN) technique to clean the dataset.
    
    Parameters:
    X (pandas.DataFrame): Feature matrix of shape (n_samples, n_features)
    y (pandas.Series): Target vector of shape (n_samples,)
    
    Returns:
    X_resampled, y_resampled: The feature matrix and target vector after applying ENN,
                              returned as pandas DataFrame and Series respectively.
    """
    
    enn = EditedNearestNeighbours()
    X_resampled, y_resampled = enn.fit_resample(X.values, y.values)
    
    # Convert back to DataFrame and Series
    X_resampled = pd.DataFrame(X_resampled, columns=X.columns)
    y_resampled = pd.Series(y_resampled)
    
    return X_resampled, y_resampled
def manual_smoteenn(X, y, k_neighbors=5):
    """
    Replicate the behavior of SMOTEENN by applying SMOTE and then ENN only to the majority class.

    Parameters:
        X (pandas.DataFrame): Feature matrix of shape (n_samples, n_features).
        y (pandas.Series): Target labels of shape (n_samples,).
        k_neighbors (int): Number of neighbors to use for SMOTE and ENN.

    Returns:
        X_resampled (pandas.DataFrame): Resampled and cleaned feature matrix.
        y_resampled (pandas.Series): Resampled and cleaned target labels.
    """
    # Convert X and y to numpy arrays for compatibility with imblearn
    X = X.values
    y = y.values

    # Step 1: Apply SMOTE to oversample the minority class
    smote = SMOTE(k_neighbors=k_neighbors)
    X_resampled, y_resampled = smote.fit_resample(X, y)

    # Step 2: Apply ENN only to the majority class
    # Identify the majority class
    majority_class = 0

    # Separate majority and minority classes
    X_majority = X_resampled[y_resampled == majority_class]
    y_majority = y_resampled[y_resampled == majority_class]
    X_minority = X_resampled[y_resampled != majority_class]
    y_minority = y_resampled[y_resampled != majority_class]

    # Apply ENN only to the majority class
    enn = EditedNearestNeighbours(n_neighbors=3)
    X_majority_resampled, y_majority_resampled = enn.fit_resample(X_majority, y_majority)

    # Combine the resampled majority class with the minority class
    X_resampled = np.vstack([X_minority, X_majority_resampled])
    y_resampled = np.hstack([y_minority, y_majority_resampled])

    # Convert back to pandas DataFrame and Series
    X_resampled = pd.DataFrame(X_resampled, columns=X.columns)
    y_resampled = pd.Series(y_resampled)

    return X_resampled, y_resampled




import numpy as np
from sklearn.neighbors import NearestNeighbors
from collections import Counter

# -------------------------------------
# SMOTE - Synthetic Minority Oversampling
# -------------------------------------

def synthetic_sample(point, neighbor, random_state=None):
    """ Generate a synthetic sample between a point and its neighbor. """
    if random_state:
        np.random.seed(random_state)
    return point + np.random.rand() * (neighbor - point)

def smote(X, y, minority_class, n_neighbors=5, random_state=None):
    """ Implements SMOTE: Generates synthetic samples for the minority class. """
    minority_samples = X[y == minority_class]
    n_samples_to_generate = len(X[y != minority_class]) - len(minority_samples)

    if n_samples_to_generate <= 0:
        return X, y

    nn = NearestNeighbors(n_neighbors=n_neighbors).fit(minority_samples)
    synthetic_samples = []

    for _ in range(n_samples_to_generate):
        idx = np.random.randint(0, len(minority_samples))
        point = minority_samples[idx]
        neighbors = nn.kneighbors([point], return_distance=False).flatten()
        neighbor = minority_samples[np.random.choice(neighbors[1:])]
        synthetic_samples.append(synthetic_sample(point, neighbor, random_state))

    synthetic_samples = np.array(synthetic_samples)
    X_resampled = np.vstack((X, synthetic_samples))
    y_resampled = np.hstack((y, np.array([minority_class] * len(synthetic_samples))))

    return X_resampled, y_resampled

# -------------------------------------
# ENN - Distance Weighted Edited Nearest Neighbors
# -------------------------------------

def weighted_majority_voting(neighbors_labels, neighbors_distances):
    """
    Performs weighted majority voting based on inverse distance.
    Closer neighbors have more influence.
    """
    weights = 1 / (neighbors_distances + 1e-5)  # Avoid division by zero
    label_weights = {}

    for label, weight in zip(neighbors_labels, weights):
        label_weights[label] = label_weights.get(label, 0) + weight

    return max(label_weights, key=label_weights.get)

def enn_with_distance_weighting(X, y, n_neighbors=3):
    """
    ENN with distance weighting. Keeps samples whose label agrees 
    with the weighted majority of their neighbors.
    """

    
    nn = NearestNeighbors(n_neighbors=n_neighbors+1).fit(X)
    keep_indices = []

    for idx, (point, label) in enumerate(zip(X, y)):
        neighbors_distances, neighbors = nn.kneighbors([point])
        neighbors_distances = neighbors_distances.flatten()[1:]  # Exclude self
        neighbors_labels = y[neighbors.flatten()[1:]]

        # Get weighted majority class among neighbors
        majority_label = weighted_majority_voting(neighbors_labels, neighbors_distances)

        # Keep the point if the weighted majority agrees with the label
        if label == majority_label:
            keep_indices.append(idx)

    return X[keep_indices], y[keep_indices]

# -------------------------------------
# SMOTEENN - Full Pipeline
# -------------------------------------

def smoteenn_with_weighted_enn(X, y, n_neighbors_smote=5, n_neighbors_enn=3, random_state=None):
    """ Combines SMOTE oversampling and distance-weighted ENN undersampling. """
    X=X.values
    y=y.values
    
    # Identify the minority class
    class_counts = Counter(y)
    minority_class = min(class_counts, key=class_counts.get)

    # Apply SMOTE
    X_resampled, y_resampled = smote(X, y, minority_class, n_neighbors=n_neighbors_smote, random_state=random_state)

    print(type(X_resampled))

    # Apply distance-weighted ENN
    X_final, y_final = enn_with_distance_weighting(X_resampled, y_resampled, n_neighbors=n_neighbors_enn)

    print("Original class distribution:", class_counts)
    print("After SMOTE:", Counter(y_resampled))
    print("After distance-weighted ENN:", Counter(y_final))

    return X_final, y_final

## Data Fetch in Personal Computer -- Taiwan Credit Data

In [32]:
from ucimlrepo import fetch_ucirepo 
  
# fetch dataset 
default_of_credit_card_clients = fetch_ucirepo(id=350) 
  
# data (as pandas dataframes) 
X = default_of_credit_card_clients.data.features 
y = default_of_credit_card_clients.data.targets 

X.head()

ModuleNotFoundError: No module named 'ucimlrepo'

In [26]:
print(f"Percentage of Positive targets : {((y.sum()/y.count())*100).values[0]}%")

Percentage of Positive targets : 22.12%


## Model Training And Results

In [154]:


print("Taiwan Credit dataset: \n")
output=model(X,y)
output_2=model_smote_poly(X,y)
output.append(output_2[1])

Taiwan Credit dataset: 


--- Model Performance ---
Metric               No SMOTE   With Regular SMOTE
Accuracy             0.8190   0.7567
ROC-AUC              0.7771   0.7688
PR-AUC               0.5392   0.5191
Recall (Sensitivity) 0.3566   0.6087
F1                   0.4618   0.5214
Specificity          0.9477   0.7979
FP-Rate              0.0523   0.2021
G-Mean               0.5814   0.6969
<class 'numpy.ndarray'>
Original class distribution: Counter({np.int64(0): 16324, np.int64(1): 4676})
After SMOTE: Counter({np.int64(1): 16324, np.int64(0): 16324})
After distance-weighted ENN: Counter({np.int64(1): 15671, np.int64(0): 11663})

--- Model Performance ---
Metric               No SMOTE   With Cubic Polynomial SMOTE
Accuracy             0.8190   0.7683
ROC-AUC              0.7771   0.7660
PR-AUC               0.5392   0.5190
Recall (Sensitivity) 0.3566   0.5755
F1                   0.4618   0.5197
Specificity          0.9477   0.8220
FP-Rate              0.0523   0.1780
G-Mean     

In [64]:
metric_names = ["Accuracy", "ROC-AUC", "PR-AUC", "Recall (Sensitivity)", 
                "F1", "Specificity", "FP-Rate", "G-Mean"]

print("\n--- Model Performance ---")
print("{:<30} {:<10} {:<20} {:<20}".format("Metric", "No SMOTE",
                                            "With Regular SMOTE",
                                            "With Cubic Polynomial Interpolation SMOTE"))

for i in range(len(metric_names)):
    print("{:<30} {:.3f}              {:.3f}              {:.3f}".format(
        metric_names[i], metrics[0][i], metrics[1][i], metrics[2][i]))


--- Model Performance ---
Metric                         No SMOTE   With Regular SMOTE   With Cubic Polynomial Interpolation SMOTE


NameError: name 'metrics' is not defined

# **ADASYN: Adaptive Synthetic Sampling Approach with Cubic Polynomial Interpolation**

## **1. Introduction**
In classification tasks with imbalanced datasets, where the number of instances in the minority class is significantly lower than in the majority class, machine learning models tend to be biased towards the majority class. To address this, various oversampling techniques have been developed, including the **Adaptive Synthetic Sampling (ADASYN) algorithm**.  

ADASYN improves upon traditional oversampling techniques, such as **SMOTE**, by adaptively generating synthetic samples according to the **local distribution** of the minority class. Specifically, it focuses on generating more synthetic samples in **harder-to-learn regions**, where the local class imbalance is more pronounced.  

In this work, we **replace the standard linear interpolation method** used in ADASYN with **cubic polynomial interpolation**, which provides a smoother and more diverse distribution of synthetic samples in high-dimensional feature spaces.

---

## **2. Algorithm Description**

Let $X \in \mathbb{R}^{n \times m}$ represent the dataset, where $n$ is the number of samples, and $m$ is the number of features. Let the class labels be $y \in \{C_1, C_2\}$, where $C_1$ is the minority class and $C_2$ is the majority class.

### **Step 1: Define the Minority and Majority Classes**
The number of instances in each class is computed as:

$$
n_{\text{min}} = |X_{\text{min}}|, \quad n_{\text{maj}} = |X_{\text{maj}}|
$$

where $X_{\text{min}}$ and $X_{\text{maj}}$ represent the subsets of $X$ belonging to the minority and majority classes, respectively.

The class imbalance ratio is then given by:

$$
d = \frac{n_{\text{min}}}{n_{\text{maj}}}
$$

ADASYN aims to **balance the dataset** by generating synthetic samples until $d \approx 1$.

---

### **Step 2: Compute the Number of Synthetic Samples**
The total number of synthetic samples to be generated is:

$$
G = n_{\text{maj}} - n_{\text{min}}\cdot\tilde{β} 
$$

Each minority sample $x_i$ is assigned a weight based on its difficulty of classification.Where β∈ [0, 1] is a parameter used to specify the desired
balance level after generation of the synthetic data. β = 1
means a fully balanced data set is created after the generalization process.

For each $x_i \in X_{\text{min}}$, we compute the number of its $k$-nearest neighbors belonging to the majority class $X_{\text{maj}}$. Let $k_i^{\text{maj}}$ denote this count. The local distribution ratio $r_i$ is computed as:

$$
r_i = \frac{k_i^{\text{maj}}}{k}
$$

where $k$ is the total number of nearest neighbors considered.

The normalized weight for each $x_i$ is then:

$$
\tilde{r}_i = \frac{r_i}{\sum_{j=1}^{n_{\text{min}}} r_j}
$$

The number of synthetic samples required for each $x_i$ is:

$$
G_i = G \cdot \tilde{r}_i
$$

where $G_i$ is an integer value indicating the number of new samples to generate for $x_i$.

---

### **Step 3: Generate Synthetic Samples Using Cubic Polynomial Interpolation**
For each minority sample $x_i$ requiring $G_i$ synthetic samples, a random neighbor $x_j \in X_{\text{min}}$ from its $k$-nearest neighbors is selected.

#### **Cubic Polynomial Interpolation**
Instead of using linear interpolation, we apply cubic polynomial interpolation for smoother synthetic data generation.

For each feature $f \in \{1, 2, \dots, m\}$, we define **four control points**:
- $P_0 = x_{i,f}$ (original sample)
- $P_1 = 0.5 (x_{i,f} + x_{j,f})$ (midpoint control)
- $P_2 = 0.5 (x_{i,f} + x_{j,f})$ (another midpoint control)
- $P_3 = x_{j,f}$ (selected neighbor)

The corresponding interpolation domain values are:

$$
X_{\text{points}} = [0, 0.33, 0.67, 1]
$$

The values at these points are:

$$
Y_{\text{points}} = [P_0, P_1, P_2, P_3]
$$

Using **CubicSpline interpolation**, we generate a synthetic sample by selecting a random interpolation coefficient $g \sim U(0,1)$ and computing:

$$
\tilde{x}_{f} = \text{CubicSpline}(g)
$$

This process is repeated for all $m$ features, resulting in a synthetic sample $\tilde{x}$.

---

### **Step 4: Update the Dataset**
The newly generated synthetic samples $\tilde{X}$ are added to the original dataset:

$$
X' = X \cup \tilde{X}, \quad y' = y \cup \tilde{y}
$$

where $\tilde{y}$ contains the label of the minority class.

---

## **3. Conclusion**
The proposed ADASYN implementation with cubic polynomial interpolation provides several advantages over standard linear interpolation methods:
- **Enhanced diversity of synthetic samples**: The cubic interpolation technique generates smoother and more naturally distributed synthetic points in the feature space.
- **Better generalization**: By adapting sample generation based on difficulty, ADASYN reduces the risk of overfitting caused by naive oversampling.
- **Improved robustness in high-dimensional spaces**: Unlike linear interpolation, cubic interpolation mitigates abrupt transitions in feature values, making the synthetic data more realistic.

This approach is particularly beneficial for imbalanced datasets where minority class samples exhibit complex distributions.

In [31]:
from imblearn.over_sampling import ADASYN
def model_adasyn(X,y):
    # Train-test split (stratified)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
    
    # Standardize numerical features
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    
    ### Train XGBoost WITHOUT ADASYN
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dtest = xgb.DMatrix(X_test, label=y_test)
    
    params = {
        "objective": "binary:logistic",
        "eval_metric": "auc",
        "tree_method": "gpu_hist",  # Use GPU
        "predictor": "gpu_predictor",
        "learning_rate": 0.05,
        "max_depth": 6,
        "subsample": 0.8,
        "colsample_bytree": 0.8,
        "random_state": 42
    }
    
    # Train XGBoost without ADASYN
    model_no_smote = xgb.train(params, dtrain, num_boost_round=200)
    
    # Predict on test set
    y_pred_proba_no_smote = model_no_smote.predict(dtest)
    from sklearn.metrics import precision_recall_curve, f1_score
    
    precision, recall, thresholds = precision_recall_curve(y_test, y_pred_proba_no_smote)
    f1_scores = (2 * precision * recall) / (precision + recall + 1e-6)  # Avoid division by zero
    optimal_idx = f1_scores.argmax()
    optimal_threshold = thresholds[optimal_idx]

    y_pred_binary_no_smote = [1 if p > optimal_threshold else 0 for p in y_pred_proba_no_smote]  # Convert probabilities to binary predictions

    metrics_no_smote = calculate_metrics(y_test, y_pred_proba_no_smote)

    
    ### Apply ADASYN
    adasyn = ADASYN(sampling_strategy=1, random_state=42)  # Fully balance classes
    X_train_resampled, y_train_resampled = adasyn.fit_resample(X_train, y_train)
    
    # Train XGBoost WITH ADASYN
    dtrain_resampled = xgb.DMatrix(X_train_resampled, label=y_train_resampled)
    model_with_smote = xgb.train(params, dtrain_resampled, num_boost_round=200)
    
    # Predict on test set
    y_pred_proba_with_smote = model_with_smote.predict(dtest)
    precision, recall, thresholds = precision_recall_curve(y_test, y_pred_proba_with_smote)
    f1_scores = (2 * precision * recall) / (precision + recall + 1e-6)  # Avoid division by zero
    optimal_idx = f1_scores.argmax()
    optimal_threshold = thresholds[optimal_idx]
    y_pred_binary_with_smote = [1 if p > optimal_threshold else 0 for p in y_pred_proba_with_smote]  # Convert probabilities to binary predictions

    metrics_with_smote = calculate_metrics(y_test, y_pred_proba_with_smote)

    # Print comparison
    metric_names = ["Accuracy", "ROC-AUC", "PR-AUC", "Recall (Sensitivity)", "F1", "Specificity", "FP-Rate", "G-Mean"]
    print("\n--- Model Performance ---")
    print("{:<20} {:<10} {:<10}".format("Metric", "No ADASYN", "With Regular ADASYN"))
    for name, no_smote, with_smote in zip(metric_names, metrics_no_smote, metrics_with_smote):
        print(f"{name:<20} {no_smote:.4f}   {with_smote:.4f}")

    return [metrics_no_smote, metrics_with_smote]


In [38]:
import numpy as np
import pandas as pd
from sklearn.neighbors import NearestNeighbors
from numpy.polynomial.polynomial import Polynomial
from tqdm import tqdm
from scipy.interpolate import CubicSpline

def custom_adasyn_with_cubic_interpolation(X, y, k_neighbors, beta, random_state=42):
    """
    Implements the ADASYN algorithm with cubic polynomial interpolation for high-dimensional data.

    Parameters:
    - X: ndarray, shape (n_samples, n_features)
        Feature matrix.
    - y: ndarray, shape (n_samples,)
        Target labels.

    Returns:
    - X_resampled: ndarray
        Resampled feature matrix with synthetic samples.
    - y_resampled: ndarray
        Resampled target labels.
    """

    # Ensure `y` is a 1D array
    y = y.reset_index(drop=True)  # Ensure proper indexing
    
    # Separate minority class

    # Identify minority and majority class
    classes, class_counts = np.unique(y, return_counts=True)
    minority_class = classes[np.argmin(class_counts)]
    majority_class = classes[np.argmax(class_counts)]
    
    X_minority = X[y == minority_class]
    n_minority = len(X_minority)
    n_majority = len(X[y == majority_class])
    n_features = X.shape[1]

    # Step 1: Compute the number of synthetic samples to generate
    d = n_majority - n_minority  # Imbalance factor
    G = d*beta  # Total synthetic samples needed

    # Step 2: Find k-nearest neighbors for each minority sample
    k = k_neighbors
    knn = NearestNeighbors(n_neighbors=k+1).fit(X)
    neighbors = knn.kneighbors(X_minority, return_distance=False)[:, 1:]

    # Step 3: Compute the imbalance degree ri for each minority sample
    ri = np.array([sum(y[neighbors[i]] != minority_class) / k for i in range(n_minority)])
    if ri.sum() == 0:
        return X, y  # No synthetic samples needed
    ri = ri / ri.sum()  # Normalize ri to sum to 1

    # Step 4: Generate synthetic samples using cubic polynomial interpolation
    X_synthetic = []
    for i in tqdm(range(n_minority)):
        Gi = int(G * ri[i])  # Number of samples to generate for instance i
        for _ in range(Gi):

            neighbor_idx = np.random.choice(neighbors[i])  # Select a random neighbor
            x_selected = X_minority.iloc[i]  # Minority instance
            x_neighbor = X.iloc[neighbor_idx]  # Chosen neighbor

            # Create synthetic sample feature-wise
            
            idx = np.random.choice(neighbors[i])  # Select a random neighbor
            t_values = np.array([0, 0.33, 0.66, 1])  # 4 reference points in [0,1]
            x_values = np.vstack([x_selected, 
                                  (2*x_selected + x_neighbor)/3, 
                                  (x_selected + 2*x_neighbor)/3, 
                                  x_neighbor])  # Intermediate points
            # Generate polynomial coefficients for each feature
            x_synthetic = np.zeros_like(x_selected)
            t_random = np.random.rand()  # Random t in [0,1]
            
            for feature_idx in range(X.shape[1]):  # Iterate over all features
                poly = Polynomial.fit(t_values, x_values[:, feature_idx], 3)  # Fit cubic polynomial
                x_synthetic[feature_idx] = poly(t_random)  # Sample new point
            

            X_synthetic.append(x_synthetic)

    X_synthetic = np.array(X_synthetic)
    y_synthetic = np.full(len(X_synthetic), minority_class)

    # Step 5: Return the augmented dataset
    X_resampled = np.vstack((X, X_synthetic))
    y_resampled = np.hstack((y, y_synthetic))

    print(X_resampled.shape)

    return X_resampled, y_resampled


In [33]:
from sklearn.model_selection import train_test_split

In [64]:
def model_adasyn_poly(X,y):
    # Train-test split (stratified)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
    
    # Standardize numerical features 
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    
    ### Train XGBoost WITHOUT ADASYN
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dtest = xgb.DMatrix(X_test, label=y_test)
    
    params = {
        "objective": "binary:logistic",
        "eval_metric": "auc",
        "tree_method": "gpu_hist",  # Use GPU
        "predictor": "gpu_predictor",
        "learning_rate": 0.05,
        "max_depth": 6,
        "subsample": 0.8,
        "colsample_bytree": 0.8,
        "random_state": 42
    }
    
    # Train XGBoost without ADASYN
    model_no_smote = xgb.train(params, dtrain, num_boost_round=200)
    
    # Predict on test set
    y_pred_proba_no_smote = model_no_smote.predict(dtest)
    from sklearn.metrics import precision_recall_curve, f1_score
    
    precision, recall, thresholds = precision_recall_curve(y_test, y_pred_proba_no_smote)
    f1_scores = (2 * precision * recall) / (precision + recall + 1e-6)  # Avoid division by zero
    optimal_idx = f1_scores.argmax()
    optimal_threshold = thresholds[optimal_idx]

    y_pred_binary_no_smote = [1 if p > optimal_threshold else 0 for p in y_pred_proba_no_smote]  # Convert probabilities to binary predictions

    metrics_no_smote = calculate_metrics(y_test, y_pred_proba_no_smote)

    pd.DataFrame(X_train,columns=X.columns)

    X_train_resampled, y_train_resampled = custom_adasyn_with_cubic_interpolation(pd.DataFrame(X_train,columns=X.columns), y_train,5,1)
    
    # Train XGBoost WITH ADASYN
    dtrain_resampled = xgb.DMatrix(np.array(X_train_resampled), label=y_train_resampled)
    model_with_smote = xgb.train(params, dtrain_resampled, num_boost_round=200)
    
    # Predict on test set
    y_pred_proba_with_smote = model_with_smote.predict(dtest)
    precision, recall, thresholds = precision_recall_curve(y_test, y_pred_proba_with_smote)
    f1_scores = (2 * precision * recall) / (precision + recall + 1e-6)  # Avoid division by zero
    optimal_idx = f1_scores.argmax()
    optimal_threshold = thresholds[optimal_idx]
    y_pred_binary_with_smote = [1 if p > optimal_threshold else 0 for p in y_pred_proba_with_smote]  # Convert probabilities to binary predictions

    metrics_with_smote = calculate_metrics(y_test, y_pred_proba_with_smote)

    # Print comparison
    metric_names = ["Accuracy", "ROC-AUC", "PR-AUC", "Recall (Sensitivity)", "F1", "Specificity", "FP-Rate", "G-Mean"]
    print("\n--- Model Performance ---")
    print("{:<20} {:<10} {:<10}".format("Metric", "No ADASYN", "With Cubic Polynomial ADASYN"))
    for name, no_smote, with_smote in zip(metric_names, metrics_no_smote, metrics_with_smote):
        print(f"{name:<20} {no_smote:.4f}   {with_smote:.4f}")

    return [metrics_no_smote, metrics_with_smote]


In [65]:
print("Taiwan Credit dataset: \n")
output=model_adasyn(X,y)
output_2=model_adasyn_poly(X,y['Y'])
output.append(output_2[1])

Taiwan Credit dataset: 


--- Model Performance ---
Metric               No ADASYN  With Regular ADASYN
Accuracy             0.8190   0.8018
ROC-AUC              0.7771   0.7582
PR-AUC               0.5392   0.5191
Recall (Sensitivity) 0.3566   0.4306
F1                   0.4618   0.4862
Specificity          0.9477   0.9051
FP-Rate              0.0523   0.0949
G-Mean               0.5814   0.6243


100%|██████████| 4676/4676 [00:16<00:00, 278.28it/s]


(30908, 23)

--- Model Performance ---
Metric               No ADASYN  With Cubic Polynomial ADASYN
Accuracy             0.8190   0.8036
ROC-AUC              0.7771   0.7596
PR-AUC               0.5392   0.5172
Recall (Sensitivity) 0.3566   0.4388
F1                   0.4618   0.4931
Specificity          0.9477   0.9051
FP-Rate              0.0523   0.0949
G-Mean               0.5814   0.6302


In [66]:
metrics=output

In [67]:
metric_names = ["Accuracy", "ROC-AUC", "PR-AUC", "Recall (Sensitivity)", 
                "F1", "Specificity", "FP-Rate", "G-Mean"]

print("\n--- Model Performance ---")
print("{:<30} {:<10} {:<20} {:<20}".format("Metric", "No ADASYN",
                                            "With Regular ADASYN",
                                            "With Cubic Polynomial Interpolation ADASYN"))

for i in range(len(metric_names)):
    print("{:<30} {:.3f}              {:.3f}              {:.3f}".format(
        metric_names[i], metrics[0][i], metrics[1][i], metrics[2][i]))


--- Model Performance ---
Metric                         No ADASYN  With Regular ADASYN  With Cubic Polynomial Interpolation ADASYN
Accuracy                       0.819              0.802              0.804
ROC-AUC                        0.777              0.758              0.760
PR-AUC                         0.539              0.519              0.517
Recall (Sensitivity)           0.357              0.431              0.439
F1                             0.462              0.486              0.493
Specificity                    0.948              0.905              0.905
FP-Rate                        0.052              0.095              0.095
G-Mean                         0.581              0.624              0.630


In [None]:


print("Taiwan Credit dataset: \n")
output=model(X,y)
output_2=model_smote_poly(X,y['Y'])
output.append(output_2[1])

In [155]:
df_lending=pd.read_csv('filtered_loans.csv')
X=df_lending.drop('loan_status',axis=1)
y=df_lending[['loan_status']]
y = y.replace({0: 1, 1: 0})
y.columns=['Y']

In [156]:


print("Lending Club Credit dataset: \n")
output=model(X,y)
output_2=model_smote_poly(X,y)
#output.append(output_2[1])

Lending Club Credit dataset: 


--- Model Performance ---
Metric               No SMOTE   With Regular SMOTE
Accuracy             0.8830   0.8334
ROC-AUC              0.9068   0.9037
PR-AUC               0.6103   0.6001
Recall (Sensitivity) 0.4261   0.8142
F1                   0.5138   0.5864
Specificity          0.9605   0.8367
FP-Rate              0.0395   0.1633
G-Mean               0.6398   0.8254
<class 'numpy.ndarray'>
Original class distribution: Counter({np.int64(0): 23497, np.int64(1): 3944})
After SMOTE: Counter({np.int64(0): 23497, np.int64(1): 23497})
After distance-weighted ENN: Counter({np.int64(1): 23467, np.int64(0): 18529})

--- Model Performance ---
Metric               No SMOTE   With Cubic Polynomial SMOTE
Accuracy             0.8830   0.8479
ROC-AUC              0.9068   0.9040
PR-AUC               0.6103   0.6009
Recall (Sensitivity) 0.4261   0.7579
F1                   0.5138   0.5911
Specificity          0.9605   0.8632
FP-Rate              0.0395   0.1368
G-Mea

In [15]:
print(f"Percentage of Positive targets : {((y.sum()/y.count())*100).values[0]}%")

Percentage of Positive targets : 14.412529972960563%


In [18]:
import numpy as np
import pandas as pd
from sklearn.neighbors import NearestNeighbors
from numpy.polynomial.polynomial import Polynomial
from tqdm import tqdm

def custom_smote_with_density(X: pd.DataFrame, y: pd.Series, target_class=1, k_neighbors=5, random_state=42):
    """
    Custom SMOTE using cubic interpolation with density-aware sampling.

    Parameters:
        X (pd.DataFrame): Feature matrix.
        y (pd.Series): Target labels.
        target_class (int): The minority class to oversample.
        k_neighbors (int): Number of nearest neighbors to consider.
        random_state (int): Random seed for reproducibility.

    Returns:
        X_resampled (pd.DataFrame): New feature matrix with synthetic samples.
        y_resampled (pd.Series): Updated target labels.
    """
    np.random.seed(random_state)
    
    y = y.reset_index(drop=True)  
    X_minority = X[y == target_class]
    X_majority = X[y != target_class]

    # Fit KNN on minority class to get density estimate
    knn = NearestNeighbors(n_neighbors=min(k_neighbors, len(X_minority)))
    knn.fit(X_minority)

    # Calculate minority density: Number of neighbors within distance threshold
    distances, _ = knn.kneighbors(X_minority)
    density_scores = np.mean(distances, axis=1)  # Higher value → lower density

    # Normalize density scores to get sampling weights (inverse relationship)
    sampling_weights = (density_scores - density_scores.min()) / (density_scores.max() - density_scores.min() + 1e-6)
    sampling_weights = sampling_weights / sampling_weights.sum()  # Normalize to sum=1

    # Compute the number of samples to generate based on imbalance and density
    target_samples = len(X_majority) - len(X_minority)
    num_samples_per_point = (sampling_weights * target_samples).astype(int)

    synthetic_samples = []

    for idx, count in tqdm(enumerate(num_samples_per_point), total=len(num_samples_per_point)):
        x_selected = X_minority.iloc[idx].values
        
        for _ in range(count):
            # Find k-nearest neighbors
            neighbors = knn.kneighbors([x_selected], return_distance=False)[0]

            # Select a random neighbor
            neighbor_idx = np.random.choice(neighbors[1:])  # Exclude itself
            x_neighbor = X_minority.iloc[neighbor_idx].values

            # Fit a 3rd-degree polynomial between x_selected and x_neighbor
            t_values = np.array([0, 0.33, 0.66, 1])
            x_values = np.vstack([x_selected, 
                                  (2*x_selected + x_neighbor)/3,
                                  (x_selected + 2*x_neighbor)/3,
                                  x_neighbor])
            
            # Generate synthetic sample
            x_synthetic = np.zeros_like(x_selected)
            t_random = np.random.rand()

            for feature_idx in range(X.shape[1]):
                poly = Polynomial.fit(t_values, x_values[:, feature_idx], 3)
                x_synthetic[feature_idx] = poly(t_random)

            synthetic_samples.append(x_synthetic)

    # Convert to DataFrame
    synthetic_samples_df = pd.DataFrame(synthetic_samples, columns=X.columns)

    # Append synthetic samples
    X_resampled = pd.concat([X, synthetic_samples_df], axis=0, ignore_index=True)
    y_resampled = pd.concat([y, pd.Series(target_class, index=synthetic_samples_df.index)], axis=0, ignore_index=True)

    return X_resampled, y_resampled

In [20]:
X

Unnamed: 0,loan_amnt,int_rate,installment,emp_length,annual_inc,zip_code,dti,delinq_2yrs,fico_range_high,inq_last_6mths,...,purpose_major_purchase,purpose_medical,purpose_moving,purpose_other,purpose_renewable_energy,purpose_small_business,purpose_vacation,purpose_wedding,term_ 36 months,term_ 60 months
0,5000.0,10.65,162.87,10,24000.0,860,27.65,0.0,739.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,2500.0,15.27,59.83,0,30000.0,309,1.00,0.0,744.0,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,2400.0,15.96,84.33,10,12252.0,606,8.72,0.0,739.0,2.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
3,10000.0,13.49,339.31,10,49200.0,917,20.00,0.0,694.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
4,5000.0,7.90,156.46,3,36000.0,852,11.20,0.0,734.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39197,2500.0,8.07,78.42,4,110000.0,802,11.33,0.0,764.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
39198,8500.0,10.28,275.38,3,18000.0,274,6.40,1.0,694.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
39199,5000.0,8.07,156.84,0,100000.0,170,2.30,0.0,744.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
39200,5000.0,7.43,155.38,0,200000.0,208,3.72,0.0,814.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


In [12]:
import numpy as np
import pandas as pd
from sklearn.neighbors import NearestNeighbors
from scipy.stats import multivariate_normal

def gaussian_based_smote_class_independent(X, y, k=5, sampling_ratio=1.0, regularization=1e-6):
    """
    Gaussian-Based SMOTE with Class-Independent Sampling.

    Parameters:
        X (pandas.DataFrame): Feature matrix of shape (n_samples, n_features).
        y (pandas.Series): Target labels of shape (n_samples,).
        k (int): Number of nearest neighbors to consider for each minority sample.
        sampling_ratio (float): Ratio of synthetic samples to generate relative to the imbalance.
        regularization (float): Small value to add to the diagonal of the covariance matrix for regularization.

    Returns:
        X_balanced (pandas.DataFrame): Balanced feature matrix.
        y_balanced (pandas.Series): Balanced target labels.
    """
    cols=X.columns
    # Convert X and y to numpy arrays for easier manipulation
    X = X.values
    y = y.values

    # Identify minority and majority classes
    minority_class = 1
    X_minority = X[y == minority_class]
    X_majority = X[y != minority_class]
    n_minority = X_minority.shape[0]
    n_majority = X_majority.shape[0]
    n_synthetic = int(sampling_ratio * (n_majority - n_minority))

    # Find k-nearest neighbors for each minority sample (including itself)
    nbrs = NearestNeighbors(n_neighbors=k + 1).fit(X)  # +1 to include itself
    _, indices = nbrs.kneighbors(X_minority)

    synthetic_samples = []
    for i in range(n_minority):
        # Get the neighborhood of x_i (excluding itself)
        neighborhood_indices = indices[i][1:]  # Exclude the first index (itself)
        neighborhood = X[neighborhood_indices]

        # Calculate the number of synthetic samples needed to balance the minority class
        n_samples_needed = int(n_synthetic / n_minority)

        # Fit a Gaussian distribution to the neighborhood (class-independent)
        if len(neighborhood) > 1:  # Ensure there are enough samples to compute covariance
            try:
                # Compute mean and regularized covariance matrix
                mu = np.mean(neighborhood, axis=0)
                sigma = np.cov(neighborhood, rowvar=False) + regularization * np.eye(neighborhood.shape[1])

                # Generate synthetic samples from the Gaussian distribution
                samples = multivariate_normal.rvs(mean=mu, cov=sigma, size=n_samples_needed)
                synthetic_samples.extend(samples)
            except:
                # Fallback to linear interpolation if Gaussian sampling fails
                for _ in range(n_samples_needed):
                    x1 = X_minority[i]  # Use the instance itself as x1
                    x2 = neighborhood[np.random.choice(len(neighborhood))]  # Randomly select a neighbor
                    lambda_ = np.random.uniform(0, 1)
                    x_new = x1 + lambda_ * (x2 - x1)
                    synthetic_samples.append(x_new)
        else:
            # Fallback to linear interpolation if there are too few samples in the neighborhood
            for _ in range(n_samples_needed):
                x1 = X_minority[i]  # Use the instance itself as x1
                x2 = neighborhood[np.random.choice(len(neighborhood))]  # Randomly select a neighbor
                lambda_ = np.random.uniform(0, 1)
                x_new = x1 + lambda_ * (x2 - x1)
                synthetic_samples.append(x_new)

    # Combine synthetic samples with the original data
    synthetic_samples = np.array(synthetic_samples)
    X_balanced = np.vstack([X, synthetic_samples])
    y_balanced = np.hstack([y, np.full(len(synthetic_samples), minority_class)])

    # Convert back to pandas DataFrame and Series
    X_balanced = pd.DataFrame(X_balanced, columns=cols)
    y_balanced = pd.Series(y_balanced)

    return X_balanced, y_balanced

In [14]:
a,b=gaussian_based_smote_class_independent(X,y['Y'])

In [23]:
a

Unnamed: 0,X1,X2,X3,X4,X5,X6,X7,X8,X9,X10,...,X14,X15,X16,X17,X18,X19,X20,X21,X22,X23
0,20000.000000,2.000000,2.000000,1.000000,24.000000,2.000000,2.000000,-1.000000,-1.000000,-2.000000,...,689.000000,0.000000,0.000000,0.000000,0.000000,689.000000,0.000000,0.000000,0.000000,0.000000
1,120000.000000,2.000000,2.000000,2.000000,26.000000,-1.000000,2.000000,0.000000,0.000000,0.000000,...,2682.000000,3272.000000,3455.000000,3261.000000,0.000000,1000.000000,1000.000000,1000.000000,0.000000,2000.000000
2,90000.000000,2.000000,2.000000,2.000000,34.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,13559.000000,14331.000000,14948.000000,15549.000000,1518.000000,1500.000000,1000.000000,1000.000000,1000.000000,5000.000000
3,50000.000000,2.000000,2.000000,1.000000,37.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,49291.000000,28314.000000,28959.000000,29547.000000,2000.000000,2019.000000,1200.000000,1100.000000,1069.000000,1000.000000
4,50000.000000,1.000000,2.000000,1.000000,57.000000,-1.000000,0.000000,-1.000000,0.000000,0.000000,...,35835.000000,20940.000000,19146.000000,19131.000000,2000.000000,36681.000000,10000.000000,9000.000000,689.000000,679.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43267,19999.997754,1.590705,0.690469,2.000980,20.140812,-0.427931,-1.767432,-2.346084,-1.717785,0.000014,...,-2343.358514,18933.902872,19595.052821,19538.304847,4988.422335,-546.119572,23810.272459,148.529457,294.484036,293.191282
43268,86971.859065,1.353860,1.708047,1.999114,26.868859,-0.337955,-0.693410,-0.000248,-0.000434,0.000522,...,67707.924713,60457.667099,49660.394272,55211.826387,67551.083507,3381.324091,3472.497479,3162.551651,9440.956293,3334.792868
43269,70410.304003,1.968966,1.489951,1.999000,22.726311,0.363263,-0.605962,0.000176,-0.000506,0.000807,...,51855.362798,46788.393379,38264.989374,21628.928316,62030.914811,1766.358053,1695.613379,1312.018989,-2237.339539,817.783333
43270,50000.000468,2.210320,1.609737,2.207398,15.204226,-0.415965,-0.415813,-0.417778,-0.417467,-0.417081,...,43883.285731,39913.173032,30423.695417,22435.766820,3154.384491,2908.568465,1742.417325,2017.077081,-87.138071,1511.930330


In [33]:
import numpy as np
import pandas as pd
from sklearn.neighbors import NearestNeighbors
from scipy.stats import multivariate_normal

def gaussian_based_smote_minority_neighborhood(X, y, k=10, sampling_ratio=1.0, regularization=1e-6):
    """
    Gaussian-Based SMOTE with Sampling from Minority Class in Neighborhood.

    Parameters:
        X (pandas.DataFrame): Feature matrix of shape (n_samples, n_features).
        y (pandas.Series): Target labels of shape (n_samples,).
        k (int): Number of nearest neighbors to consider for each minority sample.
        sampling_ratio (float): Ratio of synthetic samples to generate relative to the imbalance.
        regularization (float): Small value to add to the diagonal of the covariance matrix for regularization.

    Returns:
        X_balanced (pandas.DataFrame): Balanced feature matrix.
        y_balanced (pandas.Series): Balanced target labels.

    
    """
    cols=X.columns

    # Convert X and y to numpy arrays for easier manipulation
    X = X.values
    y = y.values

    # Identify minority and majority classes
    minority_class = np.argmin(np.bincount(y))
    X_minority = X[y == minority_class]
    X_majority = X[y != minority_class]
    n_minority = X_minority.shape[0]
    n_majority = X_majority.shape[0]
    n_synthetic = int(sampling_ratio * (n_majority - n_minority))

    # Find k-nearest neighbors for each minority sample (including itself)
    nbrs = NearestNeighbors(n_neighbors=k + 1).fit(X)  # +1 to include itself
    _, indices = nbrs.kneighbors(X_minority)

    synthetic_samples = []
    for i in range(n_minority):
        # Get the neighborhood of x_i (excluding itself)
        neighborhood_indices = indices[i][1:]  # Exclude the first index (itself)
        neighborhood = X[neighborhood_indices]
        neighborhood_labels = y[neighborhood_indices]

        # Filter minority class instances in the neighborhood
        minority_neighborhood = neighborhood[neighborhood_labels == minority_class]

        # Skip if there are no minority instances in the neighborhood
        if len(minority_neighborhood) == 0:
            continue

        # Calculate the number of synthetic samples needed to balance the minority class
        n_samples_needed = int(n_synthetic / n_minority)

        # Fit a Gaussian distribution to the minority neighborhood
        if len(minority_neighborhood) > 1:  # Ensure there are enough samples to compute covariance
            try:
                # Compute mean and regularized covariance matrix
                mu = np.mean(minority_neighborhood, axis=0)
                sigma = np.cov(minority_neighborhood, rowvar=False) + regularization * np.eye(minority_neighborhood.shape[1])

                # Generate synthetic samples from the Gaussian distribution
                samples = multivariate_normal.rvs(mean=mu, cov=sigma, size=n_samples_needed)
                synthetic_samples.extend(samples)
            except:
                # Fallback to linear interpolation if Gaussian sampling fails
                for _ in range(n_samples_needed):
                    x1 = X_minority[i]  # Use the instance itself as x1
                    x2 = minority_neighborhood[np.random.choice(len(minority_neighborhood))]  # Randomly select a minority neighbor
                    lambda_ = np.random.uniform(0, 1)
                    x_new = x1 + lambda_ * (x2 - x1)
                    synthetic_samples.append(x_new)
        else:
            # Fallback to linear interpolation if there are too few minority samples in the neighborhood
            for _ in range(n_samples_needed):
                x1 = X_minority[i]  # Use the instance itself as x1
                x2 = minority_neighborhood[np.random.choice(len(minority_neighborhood))]  # Randomly select a minority neighbor
                lambda_ = np.random.uniform(0, 1)
                x_new = x1 + lambda_ * (x2 - x1)
                synthetic_samples.append(x_new)

    # Combine synthetic samples with the original data
    synthetic_samples = np.array(synthetic_samples)
    X_balanced = np.vstack([X, synthetic_samples])
    y_balanced = np.hstack([y, np.full(len(synthetic_samples), minority_class)])

    # Convert back to pandas DataFrame and Series
    X_balanced = pd.DataFrame(X_balanced, columns=cols)
    y_balanced = pd.Series(y_balanced)

    return X_balanced, y_balanced

