In [1]:
import time
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost
from imblearn.pipeline import make_pipeline
from sklearn.metrics import matthews_corrcoef, make_scorer
from sklearn.model_selection import cross_validate
from sklearn.preprocessing import LabelEncoder

# Notebook-level random seed constant.
SEED: int = 42

# Plot styling: white-grid theme first, then the "Paired" colour palette on top.
sns.set_theme(style="whitegrid")
sns.set_palette("Paired")

# Load Data
We load the already-processed data from the `data/processed` directory.

In [2]:
# Cleaned feature table, indexed by building_id.
train_data = pd.read_csv(
    "../../data/processed/train_data_cleaned.csv",
    index_col="building_id",
)

# Label table: only the id and the damage_grade target are read, indexed by
# building_id as well.
y_train = pd.read_csv(
    "../../data/processed/train_labels.csv",
    usecols=["building_id", "damage_grade"],
    index_col="building_id",
)

In [3]:
# Pair labels with features by building_id rather than by row position, so a
# differently ordered label file cannot silently misalign the targets. A
# building_id that is missing from the label file raises KeyError here.
aligned_damage = y_train.loc[train_data.index, "damage_grade"]

# Encode the damage grades as integer class codes 0..n_classes-1.
encoder = LabelEncoder()
train_labels = encoder.fit_transform(aligned_damage.to_numpy())

In [4]:
# Class distribution of the encoded training labels.
label_counts = Counter(train_labels)
print('Original counts of labels %s' % (label_counts,))

Original counts of labels Counter({1: 148259, 2: 87218, 0: 25124})


In [5]:
# Gradient-boosted classifier shared by every cross-validation run below.
# The seed comes from the notebook-level SEED constant instead of a repeated
# literal, so changing SEED in one place re-seeds the model.
model = xgboost.XGBClassifier(
    n_estimators=100,
    max_depth=20,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=SEED,
    n_jobs=-1,
)

# Matthews correlation coefficient wrapped as a scorer for cross_validate.
scoring = make_scorer(matthews_corrcoef)

In [6]:
%%time

cv_results = cross_validate(model, train_data, train_labels, cv=5,
                            scoring=scoring,
                            n_jobs=-1,
                            return_train_score=True)

CPU times: user 200 ms, sys: 167 ms, total: 366 ms
Wall time: 13min 25s


In [7]:
# Mean and standard deviation of the per-fold test MCC, rounded to 4 decimals.
baseline_test_scores = cv_results['test_score']
baseline_mean = round(np.mean(baseline_test_scores), 4)
baseline_std = round(np.std(baseline_test_scores), 4)
print(f"CV Test: {baseline_mean} +/- {baseline_std} MCC")

CV Test: 0.5125 +/- 0.0047 MCC


# Random Undersampling

In [8]:
from imblearn.under_sampling import RandomUnderSampler

In [9]:
# Random undersampling of the training set. The sampler is seeded from the
# notebook-level SEED constant rather than a repeated literal.
rus = RandomUnderSampler(random_state=SEED)
X_rus, y_rus = rus.fit_resample(train_data, train_labels)

# Class counts after resampling.
print('Resampled dataset shape %s' % Counter(y_rus))

Resampled dataset shape Counter({0: 25124, 1: 25124, 2: 25124})


In [10]:
%%time

cv_results_rus = cross_validate(model, X_rus, y_rus, cv=5,
                                scoring=scoring,  #"accuracy",
                                n_jobs=-1,
                                return_train_score=True,
                                verbose=1)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:  3.2min remaining:  4.8min


CPU times: user 150 ms, sys: 175 ms, total: 325 ms
Wall time: 3min 17s


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  3.3min finished


In [11]:
# Summarise the undersampling run: mean +/- std of the per-fold test MCC.
rus_test_scores = cv_results_rus['test_score']
rus_mean, rus_std = (round(stat(rus_test_scores), 4) for stat in (np.mean, np.std))
print(f"CV Test: {rus_mean} +/- {rus_std} MCC")

CV Test: 0.5765 +/- 0.0016 MCC


# Random Oversampling

In [12]:
from imblearn.over_sampling import RandomOverSampler

In [13]:
# Random oversampling of the training set. The sampler is seeded from the
# notebook-level SEED constant rather than a repeated literal.
ros = RandomOverSampler(random_state=SEED)
X_ros, y_ros = ros.fit_resample(train_data, train_labels)

# Class counts after resampling.
print('Resampled dataset shape %s' % Counter(y_ros))

Resampled dataset shape Counter({2: 148259, 1: 148259, 0: 148259})


In [14]:
%%time

cv_results_ros = cross_validate(model, X_ros, y_ros, cv=5,
                                scoring=scoring,
                                n_jobs=-1,
                                return_train_score=True, 
                                verbose=0)

CPU times: user 213 ms, sys: 97.8 ms, total: 311 ms
Wall time: 26min 12s


In [15]:
# Summarise the oversampling run: mean +/- std of the per-fold test MCC.
ros_test_scores = cv_results_ros['test_score']
ros_summary = {
    "mean": round(np.mean(ros_test_scores), 4),
    "std": round(np.std(ros_test_scores), 4),
}
print(f"CV Test: {ros_summary['mean']} +/- {ros_summary['std']} MCC")

CV Test: 0.8056 +/- 0.029 MCC


# Combinations of over- and undersampling

## SMOTEENN

In [16]:
from imblearn.combine import SMOTEENN

In [17]:
%%time

sme = SMOTEENN(random_state=42)
X_sme, y_sme = sme.fit_resample(train_data, train_labels)

In [18]:
# Class counts after SMOTEENN resampling.
sme_counts = Counter(y_sme)
print('Resampled dataset shape %s' % (sme_counts,))

Resampled dataset shape Counter({0: 125951, 2: 82828, 1: 50540})


In [19]:
%%time

cv_results_ros = cross_validate(model, X_sme, y_sme, cv=5,
                                scoring=scoring,
                                n_jobs=-1,
                                return_train_score=True, 
                                verbose=0)

CPU times: user 175 ms, sys: 156 ms, total: 331 ms
Wall time: 14min 26s


In [20]:
print(f"CV Test: {round(np.mean(cv_results_ros['test_score']), 4)} +/- {round(np.std(cv_results_ros['test_score']), 4)} MCC")

CV Test: 0.9066 +/- 0.0542 MCC


# Conclusion

Positive results: 
- Random undersampling increased the MCC by about 0.06