In [32]:
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from collections import Counter
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTEENN

### Create severe imblanced dataset 

In [8]:
from sklearn.model_selection import train_test_split


X, y = make_classification(n_samples=10000, n_features=2, n_redundant=0, n_clusters_per_class=1, weights=[0.99], flip_y=0)
trainX, testX, trainy, testy = train_test_split(X, y, test_size=0.5, stratify=y)
model = LogisticRegression(solver='liblinear')
model.fit(trainX, trainy)
yhat = model.predict(testX)

### Model evaluation

In [14]:
print('Accuracy: %.3f' % accuracy_score(testy, yhat))
print('Precision: %.3f' % precision_score(testy, yhat))
print('Recall: %.3f' % recall_score(testy, yhat))
print('F-measure: %.3f' % f1_score(testy, yhat))
print('ROC AUC score: %.3f' % roc_auc_score(testy, yhat))

Accuracy: 0.991
Precision: 0.700
Recall: 0.140
F-measure: 0.233
ROC AUC score: 0.570


### Undersampling the Majority Class

In [23]:
# class distribution
print(f"Imbalanced class distribution: {Counter(y)}")

# define undersample strategy
undersample = RandomUnderSampler(sampling_strategy=0.5)
X_under, y_under = undersample.fit_resample(X, y)

print(f"Class distribution after undersampling the majority class: {Counter(y_under)}")

Imbalanced class distribution: Counter({0: 9900, 1: 100})
Class distribution after undersampling the majority class: Counter({0: 200, 1: 100})


In [28]:
trainX_under, testX_under, trainy_under, testy_under = train_test_split(X_under, y_under, test_size=0.5, stratify=y_under)
model_under = LogisticRegression(solver='liblinear')
model_under.fit(trainX_under, trainy_under)
yhat_under = model_under.predict(testX_under)
print('F-measure after undersampling: %.3f' % f1_score(testy_under, yhat_under))

F-measure after undersampling: 0.923


### Oversampling the Minority Class

In [26]:
oversample = SMOTE(sampling_strategy=0.5)
X_over, y_over = oversample.fit_resample(X, y)
print(f"Class distribution after oversampling the minority class: {Counter(y_over)}")

Class distribution after oversampling the minority class: Counter({0: 9900, 1: 4950})


In [29]:
trainX_over, testX_over, trainy_over, testy_over = train_test_split(X_over, y_over, test_size=0.5, stratify=y_over)
model_over = LogisticRegression(solver='liblinear')
model_over.fit(trainX_over, trainy_over)
yhat_over = model_over.predict(testX_over)
print('F-measure after oversampling: %.3f' % f1_score(testy_over, yhat_over))

F-measure after oversampling: 0.943


### Combine Oversampling and Undersampling

In [37]:
sample = SMOTEENN(sampling_strategy=0.5)
X_combine, y_combine = sample.fit_resample(X, y)
print(f"Class distribution after Combine: {Counter(y_combine)}")

Class distribution after Combine: Counter({0: 9853, 1: 4927})


In [38]:
trainX_combine, testX_combine, trainy_combine, testy_combine = train_test_split(X_combine, y_combine, test_size=0.5, stratify=y_combine)
model_combine = LogisticRegression(solver='liblinear')
model_combine.fit(trainX_combine, trainy_combine)
yhat_combine = model_combine.predict(testX_combine)
print('F-measure after Combine: %.3f' % f1_score(testy_combine, yhat_combine))

F-measure after Combine: 0.946


### Cost-Sensitive Algorithms


In [66]:
X, y = make_classification(n_samples=10000, n_features=2, n_redundant=0, n_clusters_per_class=1, weights=[0.99], flip_y=0)
trainX, testX, trainy, testy = train_test_split(X, y, test_size=0.5, stratify=y)
model_sensitive = LogisticRegression(solver='liblinear', class_weight="balanced")
model_sensitive.fit(trainX, trainy)
yhat_sensitive = model_sensitive.predict(testX)
print('F-measure after adding weight to the model: %.3f' % f1_score(testy, yhat_sensitive))

F-measure after adding weight to the model: 0.990
