In [8]:
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from collections import Counter
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTEENN
from numpy import mean
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from imblearn.pipeline import Pipeline

### Create severe imblanced dataset 

In [8]:
from sklearn.model_selection import train_test_split


X, y = make_classification(n_samples=10000, n_features=2, n_redundant=0, n_clusters_per_class=1, weights=[0.99], flip_y=0)
trainX, testX, trainy, testy = train_test_split(X, y, test_size=0.5, stratify=y)
model = LogisticRegression(solver='liblinear')
model.fit(trainX, trainy)
yhat = model.predict(testX)

### Model evaluation

In [14]:
print('Accuracy: %.3f' % accuracy_score(testy, yhat))
print('Precision: %.3f' % precision_score(testy, yhat))
print('Recall: %.3f' % recall_score(testy, yhat))
print('F-measure: %.3f' % f1_score(testy, yhat))
print('ROC AUC score: %.3f' % roc_auc_score(testy, yhat))

Accuracy: 0.991
Precision: 0.700
Recall: 0.140
F-measure: 0.233
ROC AUC score: 0.570


### Undersampling the Majority Class

In [23]:
# class distribution
print(f"Imbalanced class distribution: {Counter(y)}")

# define undersample strategy
undersample = RandomUnderSampler(sampling_strategy=0.5)
X_under, y_under = undersample.fit_resample(X, y)

print(f"Class distribution after undersampling the majority class: {Counter(y_under)}")

Imbalanced class distribution: Counter({0: 9900, 1: 100})
Class distribution after undersampling the majority class: Counter({0: 200, 1: 100})


In [28]:
trainX_under, testX_under, trainy_under, testy_under = train_test_split(X_under, y_under, test_size=0.5, stratify=y_under)
model_under = LogisticRegression(solver='liblinear')
model_under.fit(trainX_under, trainy_under)
yhat_under = model_under.predict(testX_under)
print('F-measure after undersampling: %.3f' % f1_score(testy_under, yhat_under))

F-measure after undersampling: 0.923


### Oversampling the Minority Class

In [26]:
oversample = SMOTE(sampling_strategy=0.5)
X_over, y_over = oversample.fit_resample(X, y)
print(f"Class distribution after oversampling the minority class: {Counter(y_over)}")

Class distribution after oversampling the minority class: Counter({0: 9900, 1: 4950})


In [29]:
trainX_over, testX_over, trainy_over, testy_over = train_test_split(X_over, y_over, test_size=0.5, stratify=y_over)
model_over = LogisticRegression(solver='liblinear')
model_over.fit(trainX_over, trainy_over)
yhat_over = model_over.predict(testX_over)
print('F-measure after oversampling: %.3f' % f1_score(testy_over, yhat_over))

F-measure after oversampling: 0.943


### Combine Oversampling and Undersampling

In [37]:
sample = SMOTEENN(sampling_strategy=0.5)
X_combine, y_combine = sample.fit_resample(X, y)
print(f"Class distribution after Combine: {Counter(y_combine)}")

Class distribution after Combine: Counter({0: 9853, 1: 4927})


In [38]:
trainX_combine, testX_combine, trainy_combine, testy_combine = train_test_split(X_combine, y_combine, test_size=0.5, stratify=y_combine)
model_combine = LogisticRegression(solver='liblinear')
model_combine.fit(trainX_combine, trainy_combine)
yhat_combine = model_combine.predict(testX_combine)
print('F-measure after Combine: %.3f' % f1_score(testy_combine, yhat_combine))

F-measure after Combine: 0.946


### Cost-Sensitive Algorithms


In [66]:
X, y = make_classification(n_samples=10000, n_features=2, n_redundant=0, n_clusters_per_class=1, weights=[0.99], flip_y=0)
trainX, testX, trainy, testy = train_test_split(X, y, test_size=0.5, stratify=y)
model_sensitive = LogisticRegression(solver='liblinear', class_weight="balanced")
model_sensitive.fit(trainX, trainy)
yhat_sensitive = model_sensitive.predict(testX)
print('F-measure after adding weight to the model: %.3f' % f1_score(testy, yhat_sensitive))

F-measure after adding weight to the model: 0.990


### K-Fold Cross-Validation


In [2]:


X, y = make_classification(n_samples=10000, n_features=2, n_redundant=0,
	n_clusters_per_class=1, weights=[0.99], flip_y=0, random_state=1)
kfold = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

for train_ix, test_ix in kfold.split(X, y):
	# select rows
	train_X, test_X = X[train_ix], X[test_ix]
	train_y, test_y = y[train_ix], y[test_ix]
	# summarize train and test composition
	train_0, train_1 = len(train_y[train_y==0]), len(train_y[train_y==1])
	test_0, test_1 = len(test_y[test_y==0]), len(test_y[test_y==1])
	print('>Train: 0=%d, 1=%d, Test: 0=%d, 1=%d' % (train_0, train_1, test_0, test_1))

>Train: 0=8910, 1=90, Test: 0=990, 1=10
>Train: 0=8910, 1=90, Test: 0=990, 1=10
>Train: 0=8910, 1=90, Test: 0=990, 1=10
>Train: 0=8910, 1=90, Test: 0=990, 1=10
>Train: 0=8910, 1=90, Test: 0=990, 1=10
>Train: 0=8910, 1=90, Test: 0=990, 1=10
>Train: 0=8910, 1=90, Test: 0=990, 1=10
>Train: 0=8910, 1=90, Test: 0=990, 1=10
>Train: 0=8910, 1=90, Test: 0=990, 1=10
>Train: 0=8910, 1=90, Test: 0=990, 1=10
>Train: 0=8910, 1=90, Test: 0=990, 1=10
>Train: 0=8910, 1=90, Test: 0=990, 1=10
>Train: 0=8910, 1=90, Test: 0=990, 1=10
>Train: 0=8910, 1=90, Test: 0=990, 1=10
>Train: 0=8910, 1=90, Test: 0=990, 1=10
>Train: 0=8910, 1=90, Test: 0=990, 1=10
>Train: 0=8910, 1=90, Test: 0=990, 1=10
>Train: 0=8910, 1=90, Test: 0=990, 1=10
>Train: 0=8910, 1=90, Test: 0=990, 1=10
>Train: 0=8910, 1=90, Test: 0=990, 1=10
>Train: 0=8910, 1=90, Test: 0=990, 1=10
>Train: 0=8910, 1=90, Test: 0=990, 1=10
>Train: 0=8910, 1=90, Test: 0=990, 1=10
>Train: 0=8910, 1=90, Test: 0=990, 1=10
>Train: 0=8910, 1=90, Test: 0=990, 1=10


In [7]:
model = DecisionTreeClassifier()
scores = cross_val_score(model, X, y, scoring='roc_auc', cv=kfold, n_jobs=-1)
print('Mean ROC AUC: %.3f' % mean(scores))

Mean ROC AUC: 0.771


### Pipeline for oversampling and undersampling for different k-neighbors

In [10]:
X, y = make_classification(n_samples=10000, n_features=2, n_redundant=0,
	n_clusters_per_class=1, weights=[0.99], flip_y=0, random_state=1)

k_values = [1, 2, 3, 4, 5, 6, 7]

for k in k_values:
	model = DecisionTreeClassifier()
	over = SMOTE(sampling_strategy=0.1, k_neighbors=k)
	under = RandomUnderSampler(sampling_strategy=0.5)
	steps = [('over', over), ('under', under), ('model', model)]
	pipeline = Pipeline(steps=steps)
	cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
	scores = cross_val_score(pipeline, X, y, scoring='roc_auc', cv=cv, n_jobs=-1)
	score = mean(scores)
	print('> k=%d, Mean ROC AUC: %.3f' % (k, score))

> k=1, Mean ROC AUC: 0.825
> k=2, Mean ROC AUC: 0.818
> k=3, Mean ROC AUC: 0.825
> k=4, Mean ROC AUC: 0.838
> k=5, Mean ROC AUC: 0.841
> k=6, Mean ROC AUC: 0.849
> k=7, Mean ROC AUC: 0.851
