In [5]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost

from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_validate
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier

# Notebook-wide plotting defaults: seaborn "whitegrid" theme, then the "Paired" palette.
sns.set_theme(style="whitegrid")
sns.set_palette(palette="Paired")
# NOTE(review): presumably the shared random seed, but the cells visible here
# hard-code 42 instead of reading SEED — confirm it is used further down.
SEED: int = 42

In [6]:
# Features and labels are stored in separate CSV files, both keyed by building_id.
train_data = pd.read_csv("../../data/processed/train_data_cleaned.csv", index_col="building_id")
y_train = pd.read_csv("../../data/processed/train_labels.csv", index_col="building_id", usecols=["building_id", "damage_grade"])

# The labels are later converted to a plain NumPy array, which discards the
# building_id index, so features and labels end up paired purely by row position.
# Reorder/restrict the labels to the feature rows here so that pairing is correct
# even if the two files differ in row order or the cleaned features dropped rows.
# Raises KeyError if a building in train_data has no label.
y_train = y_train.loc[train_data.index]

In [7]:
# Encode the damage grades as integer class ids; the fitted encoder stays
# available as `encoder` for the rest of the notebook.
encoder = LabelEncoder().fit(y_train["damage_grade"].to_numpy())
train_labels = encoder.transform(y_train["damage_grade"].to_numpy())

In [8]:
def hyperparameter_optimization(model=None, hyperparameter_grid=None, train_data=None, train_labels=None, scoring=None):
    """Cross-validate one of the supported classifiers and refit it on all training data.

    Despite its name, this function does not search over hyperparameters yet: it
    builds the estimator selected by ``model`` with fixed settings, evaluates it
    with 5-fold cross-validation, prints mean +/- std of every train/test metric
    and finally fits the estimator on the complete training set.

    Parameters
    ----------
    model : str
        Name of the estimator to build: "Dummy", "RandomForest", "DecisionTree"
        or "XGBoost".
    hyperparameter_grid : optional
        Currently unused; accepted so existing callers keep working.
    train_data
        Feature matrix handed to ``cross_validate`` and ``fit``.
    train_labels
        Object whose ``["damage_grade"]`` item is a pandas Series of class labels.
        For "XGBoost" the labels are re-encoded with ``LabelEncoder`` before
        fitting, so the returned estimator predicts the encoded ids rather than
        the original grades (the encoder itself is not returned).
    scoring
        Forwarded unchanged to ``cross_validate``.

    Returns
    -------
    tuple
        ``(estimator, cv_results)``: the estimator fitted on all of
        ``train_data`` and the dict returned by ``cross_validate``.

    Raises
    ------
    ValueError
        If ``model`` is not one of the supported names.
    """
    supported_models = ("Dummy", "RandomForest", "DecisionTree", "XGBoost")
    if model not in supported_models:
        # Fail early with a clear message instead of passing a str to cross_validate.
        raise ValueError(
            f"Unknown model {model!r}; expected one of {', '.join(supported_models)}."
        )

    # Get train labels in the shape that .fit() expects (1-D array).
    labels = train_labels["damage_grade"].to_numpy()
    if model == "XGBoost":
        # XGBoost expects [0, 1, 2] instead of [1, 2, 3]
        labels = LabelEncoder().fit_transform(labels)

    # Parallelise the cross-validation folds by default.
    cv_n_jobs = -1

    # Choose estimator based on input
    if model == "Dummy":
        estimator = DummyClassifier(strategy="most_frequent")
    elif model == "RandomForest":
        print("Fitting RandomForest ...")
        estimator = RandomForestClassifier(random_state=42)
    elif model == "DecisionTree":
        print("Fitting DecisionTree ...")
        estimator = DecisionTreeClassifier(random_state=42)
    else:  # "XGBoost"
        print("Fitting XGBoost ...")
        # NOTE(review): max_depth=20 is very deep for boosted trees and makes
        # every fit expensive — confirm this is intended for a baseline.
        estimator = xgboost.XGBClassifier(n_estimators=100,
                                          max_depth=20,
                                          learning_rate=0.1,
                                          subsample=0.8,
                                          colsample_bytree=0.8,
                                          random_state=42,
                                          n_jobs=-1)
        # The estimator already uses every core; running the folds in parallel
        # as well would oversubscribe the CPU and multiply memory use, so the
        # folds are evaluated one after another instead.
        cv_n_jobs = 1

    cv_results = cross_validate(estimator, train_data, labels, cv=5,
                                scoring=scoring,
                                n_jobs=cv_n_jobs,
                                return_train_score=True)
    # cross_validate fits clones only; fit the returned estimator on all data.
    estimator.fit(train_data, labels)

    # Summarise whichever metrics were actually computed instead of assuming
    # "accuracy" and "matthews_corrcoef" are always present in cv_results.
    short_names = {"accuracy": "ACC", "matthews_corrcoef": "MCC"}
    print("")
    for key in cv_results:
        if not key.startswith("test_"):
            continue
        metric = key[len("test_"):]
        label = short_names.get(metric, metric)
        train_scores = cv_results[f"train_{metric}"]
        test_scores = cv_results[key]
        print(f"CV Training {label}: {round(np.mean(train_scores), 4)} +/- {round(np.std(train_scores), 4)} ")
        print(f"CV Test {label}: {round(np.mean(test_scores), 4)} +/- {round(np.std(test_scores), 4)}")
        print("")

    return estimator, cv_results

# Baseline (XGBoost)

In [9]:
# Baseline run: XGBoost with its fixed settings, scored on accuracy and
# Matthews correlation coefficient.
model, cv_results = hyperparameter_optimization(
    model="XGBoost",
    train_data=train_data,
    train_labels=y_train,
    scoring=["accuracy", "matthews_corrcoef"],
)

Fitting XGBoost ...


KeyboardInterrupt: 