In [2]:
import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler

# Generate a synthetic, imbalanced binary classification dataset.
# `weights=[0.1, 0.9]` requests roughly 10% class 0 and 90% class 1;
# `flip_y=0` disables random label flipping.
X, y = make_classification(
    n_samples=1000,
    n_features=20,
    n_informative=3,
    n_redundant=1,
    n_classes=2,
    n_clusters_per_class=1,
    weights=[0.1, 0.9],
    class_sep=2,
    flip_y=0,
    random_state=42,
)

# Split data into train and test sets (80% / 20%).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print("Before undersampling:")
print("Number of samples in each class in the training set:")
for label in (0, 1):
    print(f"Class {label}:", np.count_nonzero(y_train == label))

# Apply random undersampling to the training set only; the test set is
# left untouched.
undersampler = RandomUnderSampler(random_state=42)
# Backward-compatible alias: the sampler was previously (misleadingly) named
# `oversampler`; keep the old name bound in case later code refers to it.
oversampler = undersampler
X_train_resampled, y_train_resampled = undersampler.fit_resample(X_train, y_train)

print("\nAfter undersampling:")
print("Number of samples in each class in the training set after undersampling:")
for label in (0, 1):
    print(f"Class {label}:", np.count_nonzero(y_train_resampled == label))


Before undersampling:
Number of samples in each class in the training set:
Class 0: 87
Class 1: 713

After undersampling:
Number of samples in each class in the training set after oversampling:
Class 0: 87
Class 1: 87
