In [1]:
import time
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost

from imblearn.ensemble import (
    BalancedBaggingClassifier,
    BalancedRandomForestClassifier,
    EasyEnsembleClassifier,
    RUSBoostClassifier,
)
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import matthews_corrcoef, make_scorer
from sklearn.model_selection import cross_validate

# Random seed constant for the notebook.
SEED: int = 42

# Plot styling: white-grid theme, then switch the colour cycle to "Paired".
sns.set_theme(style="whitegrid")
sns.set_palette("Paired")

In [2]:
# Features and labels are stored in separate CSV files, both indexed by building_id.
train_data = pd.read_csv(
    "../../data/processed/train_data_cleaned.csv",
    index_col="building_id",
)
y_train = pd.read_csv(
    "../../data/processed/train_labels.csv",
    index_col="building_id",
    usecols=["building_id", "damage_grade"],
)

# The cells below pass features and labels to cross_validate as two separate
# arrays, which are paired row-by-row by position. Fail fast if the two files
# do not list the same buildings in the same order, instead of silently
# training on mismatched labels.
assert train_data.index.equals(y_train.index), (
    "train_data and y_train are not aligned on building_id"
)

In [3]:
# Encode the damage_grade column as integer class ids; the fitted encoder is
# kept so predictions can be mapped back to the original grades.
encoder = LabelEncoder()
damage_grades = y_train["damage_grade"].to_numpy()
train_labels = encoder.fit_transform(damage_grades)

In [4]:
# Show how many samples fall into each encoded class.
label_counts = Counter(train_labels)
print('Original counts of labels %s' % (label_counts,))

Original counts of labels Counter({1: 148259, 2: 87218, 0: 25124})


In [5]:
# XGBoost classifier that is cross-validated first, before the
# imbalanced-learn ensembles further down.
model = xgboost.XGBClassifier(
    n_estimators=100,
    max_depth=20,
    learning_rate=0.1,
    subsample=0.8,       # row subsampling per tree
    colsample_bytree=0.8,  # column subsampling per tree
    random_state=SEED,   # reuse the notebook-wide seed instead of a second hard-coded 42
    n_jobs=-1,
)
# NOTE(review): the model and the cross_validate calls both request n_jobs=-1;
# this may oversubscribe the CPU when folds run in parallel — confirm on the
# target machine.

# Every experiment in this notebook is scored with the Matthews correlation
# coefficient (MCC).
scoring = make_scorer(matthews_corrcoef)

In [None]:
 %%time

# 5-fold cross-validation of `model`, scored with `scoring`; train-fold scores
# are recorded alongside the test-fold scores.
cv_results = cross_validate(
    estimator=model,
    X=train_data,
    y=train_labels,
    scoring=scoring,
    cv=5,
    n_jobs=-1,
    return_train_score=True,
)

In [None]:
# Summarise the per-fold test scores as mean +/- standard deviation.
fold_scores = cv_results['test_score']
print(f"CV Test: {round(np.mean(fold_scores), 4)} +/- {round(np.std(fold_scores), 4)} MCC")

# Ensemble methods

## EasyEnsembleClassifier

In [None]:
%%time

# 5-fold cross-validation of imbalanced-learn's EasyEnsembleClassifier with
# default hyper-parameters, scored with MCC (`scoring`).
eec = EasyEnsembleClassifier(random_state=SEED)  # notebook-wide seed, not a literal 42
cv_results = cross_validate(
    eec,
    train_data,
    train_labels,
    cv=5,
    scoring=scoring,
    n_jobs=-1,
    return_train_score=True,
    verbose=1,
)

# Mean +/- standard deviation of the per-fold test scores.
print(f"CV Test: {round(np.mean(cv_results['test_score']), 4)} +/- {round(np.std(cv_results['test_score']), 4)} MCC")

## RUSBoostClassifier

In [None]:
%%time

# 5-fold cross-validation of imbalanced-learn's RUSBoostClassifier with
# default hyper-parameters, scored with MCC (`scoring`).
# Note: this rebinds `model`, replacing the XGBoost classifier defined earlier.
model = RUSBoostClassifier(random_state=SEED)  # notebook-wide seed, not a literal 42
cv_results = cross_validate(
    model,
    train_data,
    train_labels,
    cv=5,
    scoring=scoring,
    n_jobs=-1,
    return_train_score=True,
    verbose=1,
)

# Mean +/- standard deviation of the per-fold test scores.
print(f"CV Test: {round(np.mean(cv_results['test_score']), 4)} +/- {round(np.std(cv_results['test_score']), 4)} MCC")

## BalancedBaggingClassifier

In [None]:
%%time

# 5-fold cross-validation of imbalanced-learn's BalancedBaggingClassifier with
# default hyper-parameters, scored with MCC (`scoring`).
bbc = BalancedBaggingClassifier(random_state=SEED)  # notebook-wide seed, not a literal 42
cv_results = cross_validate(
    bbc,
    train_data,
    train_labels,
    cv=5,
    scoring=scoring,
    n_jobs=-1,
    return_train_score=True,
    verbose=1,
)

# Mean +/- standard deviation of the per-fold test scores.
print(f"CV Test: {round(np.mean(cv_results['test_score']), 4)} +/- {round(np.std(cv_results['test_score']), 4)} MCC")

## BalancedRandomForestClassifier

In [None]:
%%time

# 5-fold cross-validation of imbalanced-learn's BalancedRandomForestClassifier
# with default hyper-parameters, scored with MCC (`scoring`).
# Note: this rebinds `model` once more; later cells that read `model` get this
# estimator.
model = BalancedRandomForestClassifier(random_state=SEED)  # notebook-wide seed, not a literal 42
cv_results = cross_validate(
    model,
    train_data,
    train_labels,
    cv=5,
    scoring=scoring,
    n_jobs=-1,
    return_train_score=True,
    verbose=1,
)

# Mean +/- standard deviation of the per-fold test scores.
print(f"CV Test: {round(np.mean(cv_results['test_score']), 4)} +/- {round(np.std(cv_results['test_score']), 4)} MCC")

# Using hold out set

## EasyEnsembleClassifier