<a href="https://colab.research.google.com/github/nickprock/corso_data_science/blob/master/imbalanced_classification/imbalanced_learn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imbalanced Dataset
## Imblearn

[Imbalanced-learn](https://imbalanced-learn.org/stable/index.html) is an open-source, MIT-licensed library that builds on scikit-learn and provides tools for classification problems with imbalanced classes.

<br>

![logo](https://imbalanced-learn.org/stable/_static/logo.png)

<br>

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
import warnings
# Silence every warning for the rest of the session to keep cell output clean.
# NOTE(review): this also hides deprecation warnings emitted by the libraries used in later cells.
warnings.filterwarnings("ignore")

In [None]:
# Shared seed, passed as `random_state` to the dataset generator and to the seeded samplers.
random_state = 1

In [None]:
from sklearn.datasets import make_classification

In [None]:
# Synthetic binary problem: 1000 points, two informative features,
# one cluster per class, requested class weights of 90% / 10%.
X, y = make_classification(
    n_samples=1000,
    n_classes=2,
    weights=[0.90, 0.10],
    n_features=2,
    n_informative=2,
    n_redundant=0,
    n_repeated=0,
    n_clusters_per_class=1,
    random_state=random_state,
)

In [None]:
# Scatter the two features, coloured by class label, to visualise the imbalance.
ax = plt.figure(figsize=(18, 10)).gca()
ax.scatter(X[:, 0], X[:, 1], c=y, s=100)
ax.set_title("Imbalanced Dataset?\n 90% - 10%")
plt.show()

In [None]:
# Class sizes in the original data: label 1 first, then label 0.
print(np.count_nonzero(y == 1), np.count_nonzero(y == 0), sep="\n")

# Undersampling

## RandomUnderSampler

In [None]:
from imblearn.under_sampling import RandomUnderSampler

In [None]:
# Random under-sampler: a float `sampling_strategy` is the desired
# minority/majority ratio after resampling; majority rows are drawn with replacement.
rus = RandomUnderSampler(
    sampling_strategy=0.3,
    replacement=True,
    random_state=random_state,
)

In [None]:
# Apply the random under-sampler to the original data; X and y themselves are not reassigned.
rus_x, rus_y = rus.fit_resample(X, y)

In [None]:
# Scatter of the randomly under-sampled data, coloured by class label.
ax = plt.figure(figsize=(18, 10)).gca()
ax.scatter(rus_x[:, 0], rus_x[:, 1], c=rus_y, s=100)
ax.set_title("Random Undersampling")
plt.show()

In [None]:
# Shapes of the original feature matrix and label vector, for comparison.
print(X.shape, y.shape, sep="\n")

In [None]:
# Shapes after random under-sampling.
print(rus_x.shape, rus_y.shape, sep="\n")

In [None]:
# Class sizes after random under-sampling: label 1 first, then label 0.
print(np.count_nonzero(rus_y == 1), np.count_nonzero(rus_y == 0), sep="\n")

## NearMiss

In [None]:
from imblearn.under_sampling import NearMiss 

In [None]:
# NearMiss, variant 3, targeting a minority/majority ratio of 0.3.
# NOTE(review): per the imbalanced-learn docs `n_neighbors_ver3` is only consulted by version 3.
nm = NearMiss(
    version=3,
    sampling_strategy=0.3,
    n_neighbors=3,
    n_neighbors_ver3=7,
    n_jobs=-1,
)
nm_x, nm_y = nm.fit_resample(X, y)

In [None]:
# Scatter of the current NearMiss result (nm_x, nm_y), titled as variant 3.
ax = plt.figure(figsize=(18, 10)).gca()
ax.scatter(nm_x[:, 0], nm_x[:, 1], c=nm_y, s=100)
ax.set_title("NearMiss: 3")
plt.show()

In [None]:
# NearMiss, variant 2, same target ratio; rebinds nm, nm_x and nm_y.
nm = NearMiss(
    version=2,
    sampling_strategy=0.3,
    n_neighbors=3,
    n_neighbors_ver3=7,
    n_jobs=-1,
)
nm_x, nm_y = nm.fit_resample(X, y)

In [None]:
# Scatter of the current NearMiss result (nm_x, nm_y), titled as variant 2.
ax = plt.figure(figsize=(18, 10)).gca()
ax.scatter(nm_x[:, 0], nm_x[:, 1], c=nm_y, s=100)
ax.set_title("NearMiss: 2")
plt.show()

In [None]:
# NearMiss, variant 1, same target ratio; rebinds nm, nm_x and nm_y.
nm = NearMiss(
    version=1,
    sampling_strategy=0.3,
    n_neighbors=3,
    n_neighbors_ver3=7,
    n_jobs=-1,
)
nm_x, nm_y = nm.fit_resample(X, y)

In [None]:
# Scatter of the current NearMiss result (nm_x, nm_y), titled as variant 1.
ax = plt.figure(figsize=(18, 10)).gca()
ax.scatter(nm_x[:, 0], nm_x[:, 1], c=nm_y, s=100)
ax.set_title("NearMiss: 1")
plt.show()

In [None]:
# Shapes, then class counts (label 1, label 0), of whichever NearMiss result
# nm_x / nm_y currently hold (the version=1 run when cells are executed top to bottom).
print(nm_x.shape, nm_y.shape, sep="\n")
print(np.count_nonzero(nm_y == 1), np.count_nonzero(nm_y == 0), sep="\n")

## Tomek Links

In [None]:
from imblearn.under_sampling import TomekLinks

In [None]:
# Tomek links cleaner restricted to the majority class, using all available cores.
tl = TomekLinks(
    sampling_strategy="majority",
    n_jobs=-1,
)

In [None]:
# Apply the Tomek links cleaner to the original data.
tl_x, tl_y = tl.fit_resample(X, y)

In [None]:
# Scatter of the data after Tomek links cleaning, coloured by class label.
ax = plt.figure(figsize=(18, 10)).gca()
ax.scatter(tl_x[:, 0], tl_x[:, 1], c=tl_y, s=100)
ax.set_title("Tomek's Link")
plt.show()

In [None]:
# Shapes, then class counts (label 1, label 0), after Tomek links cleaning.
print(tl_x.shape, tl_y.shape, sep="\n")
print(np.count_nonzero(tl_y == 1), np.count_nonzero(tl_y == 0), sep="\n")

## KNN - Based

### RepeatedEditedNearestNeighbours

In [None]:
from imblearn.under_sampling import RepeatedEditedNearestNeighbours

In [None]:
# Repeated ENN on the majority class: 3 neighbours, "mode" selection rule, all cores.
renn = RepeatedEditedNearestNeighbours(
    sampling_strategy="majority",
    n_neighbors=3,
    kind_sel="mode",
    n_jobs=-1,
)

In [None]:
# Apply repeated edited nearest neighbours to the original data.
renn_x, renn_y = renn.fit_resample(X, y)

In [None]:
# Scatter of the data after repeated ENN cleaning, coloured by class label.
ax = plt.figure(figsize=(18, 10)).gca()
ax.scatter(renn_x[:, 0], renn_x[:, 1], c=renn_y, s=100)
ax.set_title("RepeatedEditedNearestNeighbours")
plt.show()

In [None]:
# Shapes, then class counts (label 1, label 0), after repeated ENN cleaning.
print(renn_x.shape, renn_y.shape, sep="\n")
print(np.count_nonzero(renn_y == 1), np.count_nonzero(renn_y == 0), sep="\n")

### AllKNN

In [None]:
from imblearn.under_sampling import AllKNN

In [None]:
# AllKNN on the majority class with the "mode" selection rule;
# allow_minority is explicitly left off.
aknn = AllKNN(
    sampling_strategy="majority",
    kind_sel="mode",
    allow_minority=False,
    n_jobs=-1,
)

In [None]:
# Apply AllKNN to the original data.
aknn_x, aknn_y = aknn.fit_resample(X, y)

In [None]:
# Scatter of the data after AllKNN cleaning, coloured by class label.
ax = plt.figure(figsize=(18, 10)).gca()
ax.scatter(aknn_x[:, 0], aknn_x[:, 1], c=aknn_y, s=100)
ax.set_title("AllKNN")
plt.show()

In [None]:
# Shapes, then class counts (label 1, label 0), after AllKNN cleaning.
print(aknn_x.shape, aknn_y.shape, sep="\n")
print(np.count_nonzero(aknn_y == 1), np.count_nonzero(aknn_y == 0), sep="\n")

# Oversampling

## RandomOverSampler

In [None]:
from imblearn.over_sampling import RandomOverSampler

In [None]:
# Random over-sampler targeting a minority/majority ratio of 0.3, applied to the original data.
ros = RandomOverSampler(
    sampling_strategy=0.3,
    random_state=random_state,
)
ros_x, ros_y = ros.fit_resample(X, y)

In [None]:
# Scatter of the randomly over-sampled data, coloured by class label.
ax = plt.figure(figsize=(18, 10)).gca()
ax.scatter(ros_x[:, 0], ros_x[:, 1], c=ros_y, s=100)
ax.set_title("Random Oversampling")
plt.show()

In [None]:
# Shapes, then class counts (label 1, label 0), after random over-sampling.
print(ros_x.shape, ros_y.shape, sep="\n")
print(np.count_nonzero(ros_y == 1), np.count_nonzero(ros_y == 0), sep="\n")

## SMOTE and ADASYN

In [None]:
from imblearn.over_sampling import SMOTE, ADASYN

In [None]:
# SMOTE over-sampler targeting a minority/majority ratio of 0.3.
# `n_jobs` is deliberately not passed: it is deprecated for SMOTE since
# imbalanced-learn 0.10 and does not affect the generated samples.
smote = SMOTE(sampling_strategy=0.3, random_state=random_state)

In [None]:
# Apply SMOTE to the original data.
smote_x, smote_y = smote.fit_resample(X, y)

In [None]:
# Scatter of the SMOTE-resampled data, coloured by class label.
ax = plt.figure(figsize=(18, 10)).gca()
ax.scatter(smote_x[:, 0], smote_x[:, 1], c=smote_y, s=100)
ax.set_title("SMOTE")
plt.show()

In [None]:
# Shapes, then class counts (label 1, label 0), after SMOTE.
print(smote_x.shape, smote_y.shape, sep="\n")
print(np.count_nonzero(smote_y == 1), np.count_nonzero(smote_y == 0), sep="\n")

In [None]:
# ADASYN over-sampler targeting a minority/majority ratio of 0.3.
# `n_jobs` is deliberately not passed: it is deprecated for ADASYN since
# imbalanced-learn 0.10 and does not affect the generated samples.
adasyn = ADASYN(sampling_strategy=0.3, random_state=random_state)

In [None]:
# Apply ADASYN to the original data.
as_x, as_y = adasyn.fit_resample(X, y)

In [None]:
# Scatter of the ADASYN-resampled data, coloured by class label.
ax = plt.figure(figsize=(18, 10)).gca()
ax.scatter(as_x[:, 0], as_x[:, 1], c=as_y, s=100)
ax.set_title("ADASYN")
plt.show()

In [None]:
# Shapes, then class counts (label 1, label 0), after ADASYN.
print(as_x.shape, as_y.shape, sep="\n")
print(np.count_nonzero(as_y == 1), np.count_nonzero(as_y == 0), sep="\n")

## Combination of over- and under-sampling

### Combine SMOTE and RandomUnderSampler

In [None]:
# Two-stage resampling: SMOTE first, then random under-sampling.
# A float `sampling_strategy` is the minority/majority ratio wanted *after* that
# step, so the ratio must grow along the chain. The previous 0.6 -> 0.3 ordering
# asked the under-sampler to enlarge the majority class, which imbalanced-learn
# rejects with a ValueError. Here SMOTE raises the ratio to 0.3 and the
# under-sampler then trims the majority class until the ratio reaches 0.6.
# `n_jobs` is not passed to SMOTE because it is deprecated since imbalanced-learn 0.10.
step1 = SMOTE(sampling_strategy=0.3, random_state=random_state)
step2 = RandomUnderSampler(sampling_strategy=0.6, random_state=random_state, replacement=True)

In [None]:
# Stage 1: resample the original data with step1.
X1, y1 = step1.fit_resample(X,y)
# Stage 2: feed the stage-1 output into step2.
X2, y2 = step2.fit_resample(X1, y1)

In [None]:
# Scatter of the final output of the two-stage chain (X2, y2), coloured by class label.
ax = plt.figure(figsize=(18, 10)).gca()
ax.scatter(X2[:, 0], X2[:, 1], c=y2, s=100)
ax.set_title("SMOTE+RUS")
plt.show()

In [None]:
# Report feature/label shapes and per-class counts after each stage of the chain.
# The label shape is now printed next to the feature shape; previously the
# feature shape was printed twice for both stages.
print("Dimensioni dopo il primo step: SMOTE")
print("\n")
print(X1.shape)
print(y1.shape)
print("y=1: ", len(y1[y1==1]))
print("y=0: ", len(y1[y1==0]))
print("\n")
print("Dimensioni dopo il secondo step: RandomUnderSampler")
print("\n")
print(X2.shape)
print(y2.shape)
print("y=1: ", len(y2[y2==1]))
print("y=0: ", len(y2[y2==0]))
