# Imbalanced Data

In [1]:
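# Requires the imbalanced-learn package (imported as `imblearn`). Install it once per environment:
#   conda install -c conda-forge imbalanced-learn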

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import metrics 
from sklearn.model_selection import train_test_split 
from imblearn.over_sampling import SMOTE 
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from imblearn.over_sampling import ADASYN

ModuleNotFoundError: No module named 'imblearn'

In [None]:
# Synthetic binary classification problem: 10,000 samples, two features
# (none redundant), one cluster per class, and no label noise (flip_y=0).
# weights=[0.99] assigns about 99% of the samples to class 0.
X, y = make_classification(
    n_samples=10000,
    n_features=2,
    n_redundant=0,
    n_clusters_per_class=1,
    weights=[0.99],
    flip_y=0,
    random_state=1,
)
# Feature columns keep their integer labels (0 and 1); the target is appended as 'y'.
data = pd.DataFrame(X).assign(y=y)
data.head()

In [None]:
def plot_data(data, title=None):
    """Scatter-plot the two feature columns of ``data``, one series per class.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain feature columns labelled ``0`` and ``1`` and a class
        column labelled ``'y'``.
    title : str, optional
        Title drawn above the axes. Nothing is drawn when ``None`` (default).

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    # Explicit figure/axes so each call draws on a fresh canvas instead of
    # whatever pyplot currently considers the "current" axes.
    fig, ax = plt.subplots()
    # Classes are drawn in order of first appearance in the 'y' column.
    for label in data['y'].unique():
        rows = data[data['y'] == label]
        ax.scatter(rows[0], rows[1], label=str(label))
    ax.set_xlabel('feature 0')
    ax.set_ylabel('feature 1')
    if title is not None:
        ax.set_title(title)
    ax.legend(title='y')
    plt.show()

In [None]:
# Scatter plot of the full data set before any resampling.
plot_data(data)

## Baseline Methods 

In [None]:
# Number of rows in class 0 (n0) and class 1 (n1), counted from boolean masks.
n0, n1 = (int((data['y'] == label).sum()) for label in (0, 1))
print(n0, n1)

### Under-sampling
Removing samples from the majority class

In [None]:
# Keep a random subset of n1 class-0 rows (drawn without replacement) and
# every class-1 row, so both classes end up with n1 rows.
majority_rows = data.loc[data['y'] == 0]
minority_rows = data.loc[data['y'] == 1]
undersampled_data = pd.concat(
    [majority_rows.sample(n=n1, random_state=1), minority_rows]
)
print(undersampled_data['y'].value_counts())
plot_data(undersampled_data)

### Over-sampling
Adding more examples from the minority class.

In [None]:
# Keep every class-0 row and draw n0 class-1 rows with replacement, so the
# minority class is duplicated up to the size of the majority class.
majority_rows = data.loc[data['y'] == 0]
minority_resampled = data.loc[data['y'] == 1].sample(
    n=n0, replace=True, random_state=1
)
oversampled_data = pd.concat([majority_rows, minority_resampled])
print(oversampled_data['y'].value_counts())
plot_data(oversampled_data)

These methods balance the class distribution but do not provide any additional information to the model.

## SMOTE
SMOTE (Synthetic Minority Over-sampling Technique) is the most widely used approach to synthesizing new examples for the minority class. It is a type of data augmentation for tabular data and can be very effective.

In [None]:
# SMOTE with library-default settings and a fixed seed.
smote_oversample = SMOTE(random_state=1)
# fit_resample returns the resampled features and labels as a pair.
smote_resampled = smote_oversample.fit_resample(X, y)
smote_X, smote_y = smote_resampled

In [None]:
# Rebuild a frame shaped like `data` (feature columns 0 and 1 plus 'y') so
# the class counts and the scatter plot can be produced the same way.
smote_data = pd.DataFrame(smote_X).assign(y=smote_y)
print(smote_data['y'].value_counts())
plot_data(smote_data)

In [None]:
# Same as the default SMOTE run, but with k_neighbors set to 3.
smote_oversample = SMOTE(k_neighbors=3, random_state=1)
# fit_resample returns the resampled features and labels as a pair.
smote_resampled = smote_oversample.fit_resample(X, y)
smote_X, smote_y = smote_resampled

In [None]:
# Rebuild a frame shaped like `data` (feature columns 0 and 1 plus 'y') from
# the most recent SMOTE output, then show the class counts and scatter plot.
smote_data = pd.DataFrame(smote_X).assign(y=smote_y)
print(smote_data['y'].value_counts())
plot_data(smote_data)

The original paper on SMOTE suggested combining SMOTE with random undersampling of the majority class.<br>
We can first oversample the minority class using SMOTE, then use random undersampling to reduce the number of examples in the majority class.

In [None]:
# Two-stage resampling: SMOTE first, then random under-sampling.
# With a float sampling_strategy, imbalanced-learn targets that
# minority/majority ratio after resampling (see the imbalanced-learn docs),
# i.e. 0.2 after SMOTE and 0.5 after the under-sampler.
over = SMOTE(random_state=1, sampling_strategy=0.2)
under = RandomUnderSampler(random_state=1, sampling_strategy=0.5)
steps = [
    ('o', over),
    ('u', under),
]
pipeline = Pipeline(steps=steps)
smote2_X, smote2_y = pipeline.fit_resample(X, y)

# Same frame layout as `data`: feature columns 0 and 1 plus the label 'y'.
smote2_data = pd.DataFrame(smote2_X).assign(y=smote2_y)
print(smote2_data['y'].value_counts())
plot_data(smote2_data)

### SMOTE for Classification

First, let's evaluate a decision tree classifier on the original dataset.

In [None]:
# Baseline: decision tree on the original (imbalanced) data.
# The tree is seeded so the reported score is reproducible on re-run;
# scikit-learn's tree randomly permutes features at each split, so an
# unseeded tree can give slightly different results between runs.
clf = DecisionTreeClassifier(random_state=1)
# 10-fold stratified cross-validation, repeated 3 times (30 scores in total).
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
scores = cross_val_score(clf, X, y, scoring='roc_auc', cv=cv, n_jobs=-1)
print('Mean ROC AUC: %.3f' % np.mean(scores))
# Optional: the same evaluation scored with accuracy instead of ROC AUC.
# scores = cross_val_score(clf, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
# print('Mean Accuracy: %.3f' % np.mean(scores))

Now, let's use a SMOTE transformed version of the dataset.<br>
When using k-fold cross-validation, oversampling should be applied to the training folds only, and the model should then be evaluated on the untransformed test fold.

In [None]:
# SMOTE + decision tree evaluated with cross-validation.
# Because SMOTE is a pipeline step, cross_val_score refits it on the training
# portion of every split rather than on the full data set.
# Both estimators are seeded (random_state=1, matching the rest of the
# notebook) so the reported score is reproducible on re-run.
steps = [
    ('over', SMOTE(random_state=1)),
    ('model', DecisionTreeClassifier(random_state=1)),
]
pipeline = Pipeline(steps=steps)
# 10-fold stratified cross-validation, repeated 3 times.
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
scores = cross_val_score(pipeline, X, y, scoring='roc_auc', cv=cv, n_jobs=-1)
print('Mean ROC AUC: %.3f' % np.mean(scores))

In [None]:
# SMOTE + random under-sampling + decision tree, evaluated with cross-validation.
# Every stochastic component is seeded (random_state=1, matching the rest of
# the notebook) so the reported score is reproducible on re-run.
clf = DecisionTreeClassifier(random_state=1)
over = SMOTE(sampling_strategy=0.1, random_state=1)
under = RandomUnderSampler(sampling_strategy=0.5, random_state=1)
steps = [('over', over), ('under', under), ('model', clf)]
pipeline = Pipeline(steps=steps)
# 10-fold stratified cross-validation, repeated 3 times.
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
scores = cross_val_score(pipeline, X, y, scoring='roc_auc', cv=cv, n_jobs=-1)
print('Mean ROC AUC: %.3f' % np.mean(scores))

SMOTE uses k-nearest neighbors to create the new synthetic examples. <br>
The default is k=5, although larger or smaller values will influence the types of examples created, and may impact the performance of the model.


In [None]:
# Sweep SMOTE's k_neighbors from 1 to 10 and record the mean ROC AUC for each.
# All stochastic components are seeded so that differences between k values
# reflect k itself rather than run-to-run randomness.
mean_score = []
# The CV splitter does not depend on k, so it is built once; with an integer
# random_state it yields the same folds for every cross_val_score call.
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
for k in range(1, 11):
    model = DecisionTreeClassifier(random_state=1)
    over = SMOTE(sampling_strategy=0.1, k_neighbors=k, random_state=1)
    under = RandomUnderSampler(sampling_strategy=0.5, random_state=1)
    steps = [('over', over), ('under', under), ('model', model)]
    pipeline = Pipeline(steps=steps)
    scores = cross_val_score(pipeline, X, y, scoring='roc_auc', cv=cv, n_jobs=-1)
    score = np.mean(scores)
    mean_score.append(score)
    print('k=%d, Mean ROC AUC: %.3f' % (k, score))

In [None]:
# Plot the mean cross-validated score against k.
# The x values are derived from len(mean_score) so they always match the
# number of recorded scores; this assumes the sweep starts at k=1.
fig, ax = plt.subplots()
ax.plot(range(1, len(mean_score) + 1), mean_score)
ax.set_xlabel('k')
ax.set_ylabel('mean score')
ax.set_title('Mean ROC AUC vs. SMOTE k_neighbors')
# plt.show() renders the figure and keeps the Text repr out of the cell output.
plt.show()

## Adaptive Synthetic Sampling (ADASYN)
ADASYN generates synthetic samples in inverse proportion to the density of the examples in the minority class.

In [None]:
# ADASYN with library-default settings and a fixed seed.
oversample = ADASYN(random_state=1)
adasyn_X, adasyn_y = oversample.fit_resample(X, y)

# Same frame layout as `data`: feature columns 0 and 1 plus the label 'y'.
adasyn_data = pd.DataFrame(adasyn_X).assign(y=adasyn_y)
print(adasyn_data['y'].value_counts())
plot_data(adasyn_data)

In [None]:
# Same as the default ADASYN run, but with n_neighbors set to 3.
oversample = ADASYN(n_neighbors=3, random_state=1)
adasyn_X, adasyn_y = oversample.fit_resample(X, y)

# Same frame layout as `data`: feature columns 0 and 1 plus the label 'y'.
adasyn_data = pd.DataFrame(adasyn_X).assign(y=adasyn_y)
print(adasyn_data['y'].value_counts())
plot_data(adasyn_data)

In [None]:
# ADASYN + decision tree, evaluated with cross-validation.
# The tree is seeded like the sampler so the reported score is reproducible
# on re-run.
clf = DecisionTreeClassifier(random_state=1)
oversample = ADASYN(random_state=1)
steps = [('over', oversample), ('model', clf)]
pipeline = Pipeline(steps=steps)
# 10-fold stratified cross-validation, repeated 3 times.
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
scores = cross_val_score(pipeline, X, y, scoring='roc_auc', cv=cv, n_jobs=-1)
print('Mean ROC AUC: %.3f' % np.mean(scores))