In [1]:
# The adult dataset contains data about adult persons extracted from the 
# 1994 Census database.
# The task is to determine whether a person makes over 50K a year.
from sklearn.datasets import fetch_openml

df, y = fetch_openml("adult", version=2, as_frame=True, return_X_y=True)
df = df.drop(columns=["fnlwgt", "education-num"]) #not needed

In [2]:
classes_count = y.value_counts()
classes_count

class
<=50K    37155
>50K     11687
Name: count, dtype: int64

In [3]:
# To better highlight the effect of learning from an imbalanced dataset,
# we increase its ratio to 30:1

from imblearn.datasets import make_imbalance

df, y = make_imbalance(df, y, sampling_strategy={classes_count.idxmin(): classes_count.max() // 30},)

y.value_counts()

class
<=50K    37155
>50K      1238
Name: count, dtype: int64

In [4]:
# data preprocessing
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import make_pipeline

num_pipe = make_pipeline(
    StandardScaler(), SimpleImputer(strategy="mean", add_indicator=True)
)
cat_pipe = make_pipeline(
    SimpleImputer(strategy="constant", fill_value="missing"),
    OneHotEncoder(handle_unknown="ignore"),
)

from sklearn.compose import make_column_transformer
from sklearn.compose import make_column_selector as selector

preprocessor_linear = make_column_transformer(
    (num_pipe, selector(dtype_include="number")),
    (cat_pipe, selector(dtype_include="category")),
    n_jobs=2,
)

In [5]:
df = preprocessor_linear.fit_transform(df)

In [6]:
# We calculate the balanced accuracy to deal with imbalanced datasets. 
# It is defined as the average of recall obtained on each class.

from sklearn.model_selection import cross_validate

def performance(clf):
    ret = cross_validate(clf, df, y, scoring=["accuracy", "balanced_accuracy"])

    print("accuracy=%.2f; balanced accuracy=%.2f" % (ret['test_accuracy'].mean(), ret["test_balanced_accuracy"].mean()))

In [7]:
# We will compare the performance of different classifiers 
# We start with a DummyClassifier which always predicts the majority class

from sklearn.dummy import DummyClassifier

clf = DummyClassifier(strategy="most_frequent")

performance(clf)

accuracy=0.97; balanced accuracy=0.50


In [8]:
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression(max_iter=1000)

performance(clf)

accuracy=0.97; balanced accuracy=0.57


In [9]:
# Now let's try a RandomForest

from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(random_state=42, n_jobs=2)
performance(clf)

accuracy=0.97; balanced accuracy=0.62


In [10]:
# Let's try to use Event-Weighting 

print("LR with Event-Weighting:")
clf = LogisticRegression(class_weight="balanced", max_iter=1000)
performance(clf)

print("RF with Event-Weighting:")
clf = RandomForestClassifier(class_weight="balanced", random_state=42, n_jobs=2)
performance(clf)

LR with Event-Weighting:
accuracy=0.80; balanced accuracy=0.81
RF with Event-Weighting:
accuracy=0.96; balanced accuracy=0.62


In [11]:
# Next let's try the specialized RF from imbalanced-learn

from imblearn.ensemble import BalancedRandomForestClassifier

clf = BalancedRandomForestClassifier(random_state=42, n_jobs=2)
performance(clf)

accuracy=0.84; balanced accuracy=0.79


In [12]:
# Finally, we try Under-sampling

from imblearn.under_sampling import RandomUnderSampler
rus = RandomUnderSampler(random_state=42)
df, y = rus.fit_resample(df,y)

print("Undersampling + LR:")
clf = LogisticRegression(max_iter=1000)
performance(clf)

print("Undersampling + RF:")
clf = RandomForestClassifier(random_state=42, n_jobs=2)
performance(clf)

Undersampling + LR:
accuracy=0.79; balanced accuracy=0.79
Undersampling + RF:
accuracy=0.79; balanced accuracy=0.79
