In [1]:
import pandas as pd
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from dataloader import dataloader
from encoder import get_basen_encoder, get_onehot_encoder, encode_labels, get_target_encoder
from sklearn.compose import ColumnTransformer



In [2]:
X_train, y_train, X_test = dataloader()
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# adjust labels for XGBoost
y_train = encode_labels(y_train)

In [3]:
# columns for base-n encoding
base_n_columns = []

# columns for base-n encoding
target_columns = ['geo_level_1_id', 'geo_level_2_id', 'geo_level_3_id']

# columns for one-hot encoding
one_hot_columns = ['foundation_type', 'land_surface_condition', 'ground_floor_type', 'other_floor_type', 'position', 'plan_configuration', 'legal_ownership_status', 'roof_type']

# delete columns
X_train.drop(columns=['building_id',], inplace=True)
X_valid.drop(columns=['building_id',], inplace=True)

base_pipeline = get_basen_encoder(base_n_columns)
onehot_pipeline = get_onehot_encoder(one_hot_columns)
target_pipeline = get_target_encoder(target_columns)

In [4]:
params = {
    'booster': 'gbtree', 'colsample_bytree': 0.7000000000000001, 
    'eta': 0.025, 'eval_metric': 'auc', 'gamma': 0.9, 'max_depth': 11, 
    'min_child_weight': 6.0, 'n_estimators': 969, 'nthread': 6, 
    'seed': 42, 'subsample': 0.8, 'device': 'cuda'
}

In [5]:
from sklearn.pipeline import Pipeline

# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ("oh_pipe", onehot_pipeline, one_hot_columns),
        ('base_pipe', base_pipeline, base_n_columns),
        ('target_pipe', target_pipeline ,target_columns)
    ],remainder='passthrough'
)

# Bundle preprocessing and modeling code in a pipeline
my_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                              ('model', XGBClassifier(**params))
])

my_pipeline.fit(X_train, y_train['damage_grade'])

In [6]:
preds = my_pipeline.predict(X_valid)

# remap predictions to original labels
preds = encode_labels(preds, reverse=True)

In [7]:
from sklearn.metrics import f1_score
f1_score(y_valid.damage_grade, preds, average='micro')

0.7473954835862704

# Feature importances

In [8]:
xgb_model = my_pipeline.named_steps['model']
pd.DataFrame(zip(X_train.columns, xgb_model.feature_importances_), columns=['feature', 'importance']).sort_values(by='importance', ascending=False)

Unnamed: 0,feature,importance
37,has_secondary_use_other,0.088421
9,roof_type,0.080752
3,count_floors_pre_eq,0.074867
0,geo_level_1_id,0.057082
15,has_superstructure_mud_mortar_stone,0.015083
16,has_superstructure_stone_flag,0.012199
2,geo_level_3_id,0.010624
23,has_superstructure_rc_engineered,0.009988
8,foundation_type,0.009641
1,geo_level_2_id,0.009183


# Predict on test set and prepare submission

In [9]:
test_preds = my_pipeline.predict(X_test)
test_preds = encode_labels(test_preds, reverse=True)

In [10]:
submission = pd.DataFrame({'building_id': X_test.building_id,
                           'damage_grade': test_preds})

In [11]:
submission.to_csv('submission.csv', index=False)