In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Dependencies
import numpy as np
from collections import Counter

from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report
import sklearn.metrics as sk_metrics
from sklearn.ensemble import RandomForestClassifier

from imblearn.datasets import fetch_datasets
from imblearn.pipeline import make_pipeline as make_pipeline_imb
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import NearMiss
from imblearn.metrics import classification_report_imbalanced

In [3]:
def print_scores(headline, true_value, pred):
    print(headline)
    print(f"accuracy: {accuracy_score(true_value, pred)}")
    print(f"precision: {precision_score(true_value, pred)}")
    print(f"recall: {recall_score(true_value, pred)}")
    print(f"f1: {f1_score(true_value, pred)}")

In [4]:
# our model
classifier = RandomForestClassifier

In [5]:
data = fetch_datasets()['wine_quality']
print(f"{data.keys()}")

dict_keys(['data', 'target', 'DESCR'])


In [6]:
data['DESCR']

'wine_quality'

In [7]:
Counter(data['target'])

Counter({-1: 4715, 1: 183})

In [8]:
# columns = chemical properties of wines grown in the same region in Italy but derived from three different cultivars
data['data'][-1]

array([6.0000e+00, 2.1000e-01, 3.8000e-01, 8.0000e-01, 2.0000e-02,
       2.2000e+01, 9.8000e+01, 9.8941e-01, 3.2600e+00, 3.2000e-01,
       1.1800e+01])

In [9]:
# split the data into test and train
X_train, X_test, y_train, y_test = train_test_split(data['data'], data['target'], random_state=42)

## Normal Model

In [10]:
pipeline = make_pipeline(classifier(random_state=42))
model = pipeline.fit(X_train, y_train)

In [11]:
prediction = model.predict(X_test)

## Balanced Models

In [12]:
# Use SMOTE oversampling
smote_pipeline = make_pipeline_imb(SMOTE(random_state=42), classifier(random_state=42))
smote_model = smote_pipeline.fit(X_train, y_train)
smote_prediction = smote_model.predict(X_test)

In [13]:
# Use NearMiss undersampling
nearmiss_pipeline = make_pipeline_imb(NearMiss(random_state=42), classifier(random_state=42))
nearmiss_model = nearmiss_pipeline.fit(X_train, y_train)
nearmiss_prediction = nearmiss_model.predict(X_test)

## Compare Distributions
After applying SMOTE and NearMiss how many events are left compared to doing no balancing

In [14]:
print(f"normal: {Counter(data['target'])}")
X_smote, y_smote = SMOTE().fit_sample(data['data'], data['target'])
print(f"SMOTE: {Counter(y_smote)}")
X_nearmiss, y_nearmiss = NearMiss().fit_sample(data['data'], data['target'])
print(f"NearMiss: {Counter(y_nearmiss)}")

normal: Counter({-1: 4715, 1: 183})
SMOTE: Counter({-1: 4715, 1: 4715})
NearMiss: Counter({-1: 183, 1: 183})


In [15]:
len(nearmiss_prediction)

1225

In [16]:
# classification report
print("Normal Classification Report")
print(classification_report(y_test, prediction))

Normal Classification Report
              precision    recall  f1-score   support

          -1       0.97      1.00      0.99      1186
           1       0.67      0.21      0.31        39

   micro avg       0.97      0.97      0.97      1225
   macro avg       0.82      0.60      0.65      1225
weighted avg       0.96      0.97      0.96      1225



In [17]:
print("SMOTE Classification Report")
print(classification_report_imbalanced(y_test, smote_prediction))

SMOTE Classification Report
                   pre       rec       spe        f1       geo       iba       sup

         -1       0.98      0.97      0.44      0.98      0.65      0.45      1186
          1       0.33      0.44      0.97      0.38      0.65      0.40        39

avg / total       0.96      0.95      0.45      0.96      0.65      0.44      1225



Notice the improvement in the Recall score for predicting good wines.

The F1 score for predicting bad wines drops by 1% but the prediction on good wines increases by 7%

In [18]:
print(f"Accuracy (normal pipeline): {pipeline.score(X_test,y_test)*100}%")
print(f"Accuracy (SMOTE pipeline): {smote_pipeline.score(X_test,y_test)*100}%")
print(f"Accurcay (NearMiss pipeline): {nearmiss_pipeline.score(X_test,y_test)*100}%")

Accuracy (normal pipeline): 97.14285714285714%
Accuracy (SMOTE pipeline): 95.42857142857143%
Accurcay (NearMiss pipeline): 34.69387755102041%


# K-Fold Cross Validation

Splitting data just one way can lead to bad estimations on the quality of the model. One technique for splitting the data for better evaluating models is [K-Fold cross validation](https://en.wikipedia.org/wiki/Cross-validation_(statistics)#k-fold_cross-validation)

In [19]:
from sklearn.model_selection import KFold

In [20]:
kf = KFold(n_splits=5, random_state=42)
accuracy = []
precision = []
recall = []
f1 = []
auc = []

## Cross validation with Over or Under Sampled datasets

When doing over sampling we are synthetically generating data so we need to careful when do that with cross validation. The entire dataset is first split and for each iteration the over or under sampling technique is applied

# The Correct Method

Below is the right way to use K-Fold cross validation with over/under sampling methods

Split raw data and then over/under sample

In [21]:
for train, test in kf.split(X_train, y_train):
    pipeline = make_pipeline_imb(SMOTE(), classifier(random_state=42))
    model = pipeline.fit(X_train[train], y_train[train])
    prediction = model.predict(X_train[test])
    
    accuracy.append(pipeline.score(X_train[test], y_train[test]))
    precision.append(sk_metrics.precision_score(y_train[test], prediction))
    recall.append(sk_metrics.recall_score(y_train[test], prediction))
    f1.append(sk_metrics.f1_score(y_train[test], prediction))
    auc.append(sk_metrics.roc_auc_score(y_train[test], prediction))

print(f"accuracy: {np.mean(accuracy)}")
print(f"precision: {np.mean(precision)}")
print(f"recall: {np.mean(recall)}")
print(f"f1: {np.mean(f1)}")

accuracy: 0.9458184581734601
precision: 0.32365203424026956
recall: 0.32849492461561425
f1: 0.318705450879618


# Wrong Way To Split Data with Over/Under Sampling
Don't do this as it brings in synthetic data into your testing data (data leakage)

In [22]:
# GOTCHA
X_bad, y_bad = SMOTE().fit_sample(X_train, y_train)
for train, test in kf.split(X_bad, y_bad):
    pipeline = make_pipeline(classifier(random_state=42))
    model = pipeline.fit(X_bad[train], y_bad[train])
    prediction = model.predict(X_bad[test])
    
    accuracy.append(pipeline.score(X_bad[test], y_bad[test]))
    precision.append(sk_metrics.precision_score(y_bad[test], prediction))
    recall.append(sk_metrics.recall_score(y_bad[test], prediction))
    f1.append(sk_metrics.f1_score(y_bad[test], prediction))

print(f"accuracy: {np.mean(accuracy)}")
print(f"precision: {np.mean(precision)}")
print(f"recall: {np.mean(recall)}")
print(f"f1: {np.mean(f1)}")

accuracy: 0.9462710984940378
precision: 0.5441232084418782
recall: 0.6384699254281219
f1: 0.5682206821921938
