In [3]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier

from dataloader import dataloader
from encoder import get_basen_encoder, get_onehot_encoder, encode_labels

In [4]:
# Load the data; the unsplit training features/labels get their own names so
# this cell does not overwrite its own inputs.
X_full, y_full, X_test = dataloader()

# Hold out 20% of the training rows for validation, with a fixed seed so the
# split is repeatable.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_full,
    y_full,
    test_size=0.2,
    random_state=42,
)

# adjust labels for XGBoost
# Only the training labels are encoded here; y_valid keeps the labels as loaded.
y_train = encode_labels(y_train)

In [5]:
# Columns handed to the base-n encoder.
base_n_columns = [
    'geo_level_1_id',
    'geo_level_2_id',
]

# Columns handed to the one-hot encoder.
one_hot_columns = [
    'foundation_type',
    'land_surface_condition',
    'ground_floor_type',
    'other_floor_type',
    'position',
    'plan_configuration',
    'legal_ownership_status',
]

# Build one encoder object per column group using the project's encoder
# helpers; both are plugged into the ColumnTransformer in a later cell.
# NOTE(review): the helpers' internals are not visible here — presumably each
# returns an sklearn-compatible transformer/pipeline; confirm in encoder.py.
base_pipeline = get_basen_encoder(base_n_columns)
onehot_pipeline = get_onehot_encoder(one_hot_columns)



In [6]:
# Encode the categorical columns. Only the columns listed in the two
# transformers below are kept: ColumnTransformer's default remainder='drop'
# discards every other column (including building_id) before the model sees it.
# NOTE(review): if the remaining features are meant to reach the model, pass
# remainder='passthrough' (and drop identifier columns first) — confirm intent
# before changing, since it alters the trained model and the recorded score.
preprocessor = ColumnTransformer(
    transformers=[
        ('oh_pipe', onehot_pipeline, one_hot_columns),
        ('base_pipe', base_pipeline, base_n_columns),
    ],
)

# Chain preprocessing and the classifier so that fit and predict apply the
# same encoding to whichever frame is passed in.
my_pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('model', XGBClassifier(n_estimators=1000)),
    ],
)

# Fit on the encoded training labels (y_train went through encode_labels).
my_pipeline.fit(X_train, y_train['damage_grade'])

In [7]:
# Predict on the hold-out split; these are still in the encoded label space
# the model was trained on.
encoded_valid_preds = my_pipeline.predict(X_valid)

# remap predictions to original labels
preds = encode_labels(encoded_valid_preds, reverse=True)

In [8]:
# Micro-averaged F1 on the hold-out split. preds has already been mapped back
# to the original labels, so it is compared against y_valid as loaded.
f1_score(y_valid['damage_grade'], preds, average='micro')

0.7151244220179965

# Predict on test set and prepare submission

In [9]:
# Predict on the test set and map the model's encoded classes back to the
# original damage_grade labels, the same way the validation predictions are
# remapped; without this the submission would carry the encoded labels the
# model was trained on rather than the original ones.
test_preds = encode_labels(my_pipeline.predict(X_test), reverse=True)

In [10]:
# Two-column submission frame: the building ids from the test set alongside
# the value predicted for each of those rows.
submission_columns = {
    'building_id': X_test['building_id'],
    'damage_grade': test_preds,
}
submission = pd.DataFrame(submission_columns)

In [11]:
# Write the submission to disk; index=False keeps the DataFrame index out of
# the CSV so the file contains only the two submission columns.
submission.to_csv(path_or_buf='submission.csv', index=False)