In [32]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/bri-data-hackathon-pa/sample_submission.csv
/kaggle/input/bri-data-hackathon-pa/data_description.csv
/kaggle/input/bri-data-hackathon-pa/train.csv
/kaggle/input/bri-data-hackathon-pa/test.csv


Hello everyone!

This notebook presents an example of how to deal with imbalanced data. I will start with several oversampling methods, combination methods, and balanced classifiers.

If you have any questions regarding the code, please comment below. I will update the notebook accordingly.

**Please do upvote the notebook if this notebook helps you, as it will be a benchmark for me to do more work in the future. Thank you :)**

**Note: I do not do the feature engineering here, so the result may sub-optimal**

In [70]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from imblearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score

import warnings
warnings.filterwarnings('ignore')

In [38]:
# Read train and test set
train = pd.read_csv("/kaggle/input/bri-data-hackathon-pa/train.csv")
test = pd.read_csv("/kaggle/input/bri-data-hackathon-pa/test.csv")

In [46]:
# Drop null values
train = train.dropna()

In [47]:
# Split train set into dependent variables and independent variable
y = train['Best Performance']
X = train.drop('Best Performance', axis=1)

In [62]:
y.value_counts()

0    9515
1    1637
Name: Best Performance, dtype: int64

In [48]:
# Convert to dummy variables
X = pd.get_dummies(X)
test = pd.get_dummies(test)

In [49]:
# Extract the common features between train and test set and use it to filter the train and test set
common = list(set(X.columns).intersection(set(test.columns)))
X = X[common]
test = test[common]

## Oversampling

In [51]:
# ADASYN, BorderlineSMOTE, KMeansSMOTE, RandomOverSampler, SMOTE, SVMSMOTE
from imblearn.over_sampling import *

In [83]:
over_methods = [
    ADASYN(random_state=7),
    BorderlineSMOTE(random_state=7),
    RandomOverSampler(random_state=7),
    SMOTE(random_state=7),
    SVMSMOTE(random_state=7)
]

In [84]:
classifiers =[
    RandomForestClassifier(n_estimators=1000),
    ExtraTreesClassifier(n_estimators=1000)
]

In [87]:
# Benchmarks
for classifier in classifiers:
        
    steps = [('model', classifier)]

    pipeline = Pipeline(steps=steps)

    cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=7)
    scores = cross_val_score(pipeline, X, y, scoring='roc_auc', cv=cv, n_jobs=-1)
    
    print("ROC-AUC Score for", classifier, "without over-sampling is", round(np.mean(scores),5))

ROC-AUC Score for RandomForestClassifier(n_estimators=1000) without over-sampling is 0.57059
ROC-AUC Score for ExtraTreesClassifier(n_estimators=1000) without over-sampling is 0.55623


In [86]:
# Iterates over classifiers and over_methods
for classifier in classifiers:
    for method in over_methods:
        
        steps = [('over', method),
                 ('model', classifier)]

        pipeline = Pipeline(steps=steps)

        cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=7)
        scores = cross_val_score(pipeline, X, y, scoring='roc_auc', cv=cv, n_jobs=-1)
    
        print("ROC-AUC Score for", classifier, "and", method, "is", round(np.mean(scores),5))

ROC-AUC Score for RandomForestClassifier(n_estimators=1000) and ADASYN(random_state=7) is 0.55706
ROC-AUC Score for RandomForestClassifier(n_estimators=1000) and BorderlineSMOTE(random_state=7) is 0.55706
ROC-AUC Score for RandomForestClassifier(n_estimators=1000) and RandomOverSampler(random_state=7) is 0.574
ROC-AUC Score for RandomForestClassifier(n_estimators=1000) and SMOTE(random_state=7) is 0.55566
ROC-AUC Score for RandomForestClassifier(n_estimators=1000) and SVMSMOTE(random_state=7) is 0.56108
ROC-AUC Score for ExtraTreesClassifier(n_estimators=1000) and ADASYN(random_state=7) is 0.53394
ROC-AUC Score for ExtraTreesClassifier(n_estimators=1000) and BorderlineSMOTE(random_state=7) is 0.535
ROC-AUC Score for ExtraTreesClassifier(n_estimators=1000) and RandomOverSampler(random_state=7) is 0.55442
ROC-AUC Score for ExtraTreesClassifier(n_estimators=1000) and SMOTE(random_state=7) is 0.53567
ROC-AUC Score for ExtraTreesClassifier(n_estimators=1000) and SVMSMOTE(random_state=7) is 

## Combination Sampling
#### Combination of over- and under-sampling methods


In [88]:
from imblearn.combine import SMOTEENN, SMOTETomek

In [89]:
combinations = [
    SMOTEENN(random_state=7),
    SMOTETomek(random_state=7)
]

In [90]:
classifiers =[
    RandomForestClassifier(n_estimators=1000),
    ExtraTreesClassifier(n_estimators=1000)
]

In [93]:
# Iterates over classifiers and combinations
for classifier in classifiers:
    for combination in combinations:
        
        steps = [('comb', combination),
                 ('model', classifier)]

        pipeline = Pipeline(steps=steps)

        cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=7)
        scores = cross_val_score(pipeline, X, y, scoring='roc_auc', cv=cv, n_jobs=-1)
    
        print("ROC-AUC Score for", classifier, "and", combination, "is", round(np.mean(scores),5))

ROC-AUC Score for RandomForestClassifier(n_estimators=1000) and SMOTEENN(random_state=7) is 0.55594
ROC-AUC Score for RandomForestClassifier(n_estimators=1000) and SMOTETomek(random_state=7) is 0.5553
ROC-AUC Score for ExtraTreesClassifier(n_estimators=1000) and SMOTEENN(random_state=7) is 0.53167
ROC-AUC Score for ExtraTreesClassifier(n_estimators=1000) and SMOTETomek(random_state=7) is 0.53454


## Ensemble of Samplers
#### Classifier including inner balancing samplers
For more info: https://imbalanced-learn.org/stable/ensemble.html

In [97]:
from imblearn.ensemble import BalancedBaggingClassifier, RUSBoostClassifier, BalancedRandomForestClassifier

In [98]:
# Balanced Bagging Classifier
bbc = BalancedBaggingClassifier(base_estimator=DecisionTreeClassifier(),
                                n_estimators = 1000,
                                sampling_strategy='auto',
                                replacement=False,
                                random_state=0)
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=7)
scores = cross_val_score(bbc, X, y, scoring='roc_auc', cv=cv, n_jobs=-1)
print("ROC-AUC Score for Balanced Bagging Classifier is", round(np.mean(scores),5))

ROC-AUC Score for Balanced Bagging Classifier is 0.58633


In [99]:
# Balanced Random Forest Classifier
brf = BalancedRandomForestClassifier(n_estimators = 1000,
                                     sampling_strategy='auto',
                                     replacement=False,
                                     random_state=0)
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=7)
scores = cross_val_score(brf, X, y, scoring='roc_auc', cv=cv, n_jobs=-1)
print("ROC-AUC Score for Balanced Random Forest Classifier is", round(np.mean(scores),5))

ROC-AUC Score for Balanced Random Forest Classifier is 0.57839


In [100]:
# RUSBoostClassifier
rusbc = RUSBoostClassifier(n_estimators = 1000,
                           random_state = 0)
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=7)
scores = cross_val_score(rusbc, X, y, scoring='roc_auc', cv=cv, n_jobs=-1)
print("ROC-AUC Score for RUSBoostClassifier is", round(np.mean(scores),5))

ROC-AUC Score for RUSBoostClassifier is 0.54722


## Summary

Now we obtain the highest score are:
- Balanced Bagging Classifier is 0.58633
- Balanced Random Forest Classifier is 0.57839
- RandomForestClassifier(n_estimators=1000) and RandomOverSampler() is 0.574
- RandomForestClassifier(n_estimators=1000) without over-sampling is 0.57059 [BENCHMARK]

### What's next?

- Try to do feature engineering first or try another encoder method and run all the code to calculate the scores.
- Tune the classifier with GridSearchCV, RandomizedSearchCV, or Bayesian Optimization. See my other notebook here: https://www.kaggle.com/yevonnaelandrew/starter-xgboost-bayesian-optimization
- Try other classification algorithms, like XGBoost, CatBoost, LGBM.