# Balance datasets

## Imports

In [6]:
import pandas as pd
from collections import Counter
from sklearn.datasets import load_iris
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from imblearn.datasets import make_imbalance
from imblearn.under_sampling import NearMiss
from imblearn.pipeline import make_pipeline
from imblearn.metrics import classification_report_imbalanced

## Multiclass classification with under-sampling

Authors: Guillaume Lemaitre <g.lemaitre58@gmail.com>

License: MIT

#### Load data and split for validation

In [16]:
# Single seed reused for every call below that accepts `random_state`.
rs = 42
# Per-class sample counts requested from make_imbalance (class label -> count).
ss = {0: 25, 1: 50, 2: 50}

iris = load_iris()

# Resample the iris data to the requested class counts, then hold out a test split.
X, y = make_imbalance(
    iris.data,
    iris.target,
    sampling_strategy=ss,
    random_state=rs,
)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rs)

# Report the class distribution of each split.
print(f'Training target statistics: {Counter(y_train)}')
print(f'Testing target statistics: {Counter(y_test)}')

Training target statistics: Counter({1: 38, 2: 38, 0: 17})
Testing target statistics: Counter({1: 12, 2: 12, 0: 8})


#### Create pipeline, classify and report results

In [17]:
# Chain NearMiss (version 2) under-sampling with a linear SVM classifier.
pip = make_pipeline(
    NearMiss(version=2),
    LinearSVC(random_state=rs, max_iter=1000),
)
pip.fit(X_train, y_train)

# Score the fitted pipeline on the held-out split and print the per-class report.
y_pred = pip.predict(X_test)
report = classification_report_imbalanced(y_test, y_pred)
print(report)

                   pre       rec       spe        f1       geo       iba       sup

          0       1.00      1.00      1.00      1.00      1.00      1.00         8
          1       1.00      0.83      1.00      0.91      0.91      0.82        12
          2       0.86      1.00      0.90      0.92      0.95      0.91        12

avg / total       0.95      0.94      0.96      0.94      0.95      0.90        32





## Credits & Links

- imbalanced-learn source repository: https://github.com/scikit-learn-contrib/imbalanced-learn
- imbalanced-learn example gallery: https://imbalanced-learn.readthedocs.io/en/stable/auto_examples/