# Imbalanced Classification

In [2]:
import numpy as np
import pandas as pd

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, recall_score, classification_report

In [3]:
# Load dataset
cancer = load_breast_cancer()

data = pd.DataFrame(cancer.data, columns = cancer.feature_names)
data['target'] = cancer.target

X = data.drop('target', axis = 1)
y = data['target']

# Split train and test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42, stratify = y)

# Check target
print('Full', data.target.value_counts(True), end = '\n\n')
print('Train', y_train.value_counts(True), end = '\n\n')
print('Test', y_test.value_counts(True))

Full target
1    0.627417
0    0.372583
Name: proportion, dtype: float64

Train target
1    0.626374
0    0.373626
Name: proportion, dtype: float64

Test target
1    0.631579
0    0.368421
Name: proportion, dtype: float64


In [10]:
print(X_train.value_counts().sum())
print(y_train.value_counts().sum())

455
455


In [4]:
# Baseline pipeline
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('clf', LogisticRegression())
])

# SMOTE pipeline
pipeline_SMOTE = Pipeline([
    ('smote', SMOTE(random_state = 42)),
    ('scaler', StandardScaler()),
    ('clf', LogisticRegression())
])

In [None]:
# Hyperparameter optimization
param_grid = {
    'clf__penalty': ['l1', 'l2', 'elasticnet', 'none'],
    'clf__C': [0.01, 0.1, 1, 10, 100],
    'clf__solver': ['newton-cg', 'lbfgs', 'liblinear', 'saga'],
    'clf__max_iter': [200, 500, 1000]
}

grid_search = GridSearchCV(pipeline, param_grid, cv = 5, scoring = 'accuracy', n_jobs = -1)
grid_search.fit(X_train, y_train)

# Best estimator results
print('Best Cross-Validation Score:', grid_search.best_score_)
print('Best Parameters:', grid_search.best_params_)

# Evaluation
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_te)
print('\nClassification Report for Best Model:\n', classification_report(y_train, y_pred))

750 fits failed out of a total of 1200.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
75 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/maria/.pyenv/versions/3.12.4/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/maria/.pyenv/versions/3.12.4/lib/python3.12/site-packages/sklearn/base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/maria/.pyenv/versions/3.12.4/lib/python3.12/site-packages/imblearn/pipeline.py", line 333, in fit
    self._final_estimator.fit(Xt, yt, **last_step_params["fit"])


Best Cross-Validation Score: 0.9802197802197803
Best Parameters: {'clf__C': 0.1, 'clf__max_iter': 100, 'clf__penalty': 'l2', 'clf__solver': 'newton-cg'}


ValueError: Found input variables with inconsistent numbers of samples: [455, 569]

In [None]:
# Cross-validation scores
scores = cross_val_score(pipeline, X_train, y_train, cv = 5, scoring = 'balanced_accuracy')
scores_SMOTE = cross_val_score(pipeline_SMOTE, X_train, y_train, cv = 5, scoring = 'balanced_accuracy')

print(f'Baseline cross_val balanced_accuracy (weighted): {np.mean(scores)}')
print(f'SMOTE cross_val balanced_accuracy (weighted): {np.mean(scores_SMOTE)}')

cross_val balanced_accuracy SMOTE (weighted): 0.9718266253869968
cross_val balanced_accuracy (weighted): 0.9770897832817337


[Errno 2] No such file or directory: 'sysctl'
  File "/Users/maria/.pyenv/versions/3.12.4/lib/python3.12/site-packages/joblib/externals/loky/backend/context.py", line 270, in _count_physical_cores
    cpu_info = subprocess.run(
               ^^^^^^^^^^^^^^^
  File "/Users/maria/.pyenv/versions/3.12.4/lib/python3.12/subprocess.py", line 548, in run
    with Popen(*popenargs, **kwargs) as process:
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/maria/.pyenv/versions/3.12.4/lib/python3.12/subprocess.py", line 1026, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
  File "/Users/maria/.pyenv/versions/3.12.4/lib/python3.12/subprocess.py", line 1955, in _execute_child
    raise child_exception_type(errno_num, err_msg, err_filename)


In [7]:
pipeline_SMOTE.fit(X_train, y_train)
pipeline.fit(X_train, y_train)

In [8]:
# Baseline predictions
y_preds = pipeline.predict(X_test)
y_preds_train = pipeline.predict(X_train)

# SMOTE predictions
y_preds_SMOTE = pipeline_SMOTE.predict(X_test)
y_preds_SMOTE_train = pipeline_SMOTE.predict(X_train)

In [9]:
# Baseline scores
test_f1 = f1_score(y_test, y_preds, average = 'weighted')
train_recall = recall_score(y_train, y_preds_train, average = 'weighted')
test_recall = recall_score(y_test, y_preds, average = 'weighted')
print('Baseline scores:')
print(f'Test F1 score: {test_f1}')
print(f'Train recall: {train_recall}')
print(f'Test recall: {test_recall}', end = '\n\n')


# SMOTE scores
train_recall_SMOTE = recall_score(y_train, y_preds_SMOTE_train, average = 'weighted')
test_recall_SMOTE = recall_score(y_test, y_preds_SMOTE, average = 'weighted')

print('SMOTE scores:')
print(f'Train recall SMOTE: {train_recall_SMOTE}')
print(f'Test recall SMOTE: {test_recall_SMOTE}')

Baseline scores:
Test F1 score: 0.9824561403508771
Train recall: 0.989010989010989
Test recall: 0.9824561403508771

SMOTE scores:
Train recall SMOTE: 0.989010989010989
Test recall SMOTE: 0.9649122807017544


In [None]:
print('Baseline Test', classification_report(y_test, y_preds))
print('SMOTE Test', classification_report(y_test, y_preds_SMOTE))

Test SMOTE               precision    recall  f1-score   support

           0       0.93      0.98      0.95        42
           1       0.99      0.96      0.97        72

    accuracy                           0.96       114
   macro avg       0.96      0.97      0.96       114
weighted avg       0.97      0.96      0.97       114

Test               precision    recall  f1-score   support

           0       0.98      0.98      0.98        42
           1       0.99      0.99      0.99        72

    accuracy                           0.98       114
   macro avg       0.98      0.98      0.98       114
weighted avg       0.98      0.98      0.98       114



In [None]:
print('Baseline Train', classification_report(y_train, y_preds_train))
print('SMOTE Train', classification_report(y_train, y_preds_SMOTE_train))

Train SMOTE               precision    recall  f1-score   support

           0       0.99      0.98      0.99       170
           1       0.99      1.00      0.99       285

    accuracy                           0.99       455
   macro avg       0.99      0.99      0.99       455
weighted avg       0.99      0.99      0.99       455

Train               precision    recall  f1-score   support

           0       0.99      0.98      0.99       170
           1       0.99      1.00      0.99       285

    accuracy                           0.99       455
   macro avg       0.99      0.99      0.99       455
weighted avg       0.99      0.99      0.99       455

