In [1]:
import pandas as pd
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from dataloader import dataloader
from data_cleaner import clean_data
from encoder import get_basen_encoder, get_onehot_encoder, encode_labels, get_target_encoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score
import warnings

## Settings

In [2]:
# Silence library warnings so the cell outputs stay readable.
# NOTE(review): this is a blanket filter and also hides pandas/sklearn warnings.
warnings.filterwarnings("ignore")

# Run the Optuna study instead of using the stored hyperparameters.
PARAMETER_TUNING = False
# Hold out part of the training data and report a validation score.
MODEL_VALIDATION = True
# Pickle the fitted model and pipeline to disk.
SAVE_MODEL = False
# Predict on the test set and write the submission CSV.
DO_SUBMISSSION = False

## Loading Data

In [3]:
# Load the training features, training labels and test features, then pass
# only the training portion through clean_data.
X_train, y_train, X_test = dataloader()
X_train, y_train = clean_data(X_train, y_train)

# Hold out 20% of the cleaned training data for validation (fixed seed so the
# split is reproducible). X_valid / y_valid are only defined when
# MODEL_VALIDATION is True.
if MODEL_VALIDATION:
    X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# adjust labels for XGBoost
# NOTE(review): presumably maps damage_grade to the 0-based class ids XGBoost
# expects -- confirm in encoder.encode_labels.
y_train = encode_labels(y_train)

if MODEL_VALIDATION:
    y_valid = encode_labels(y_valid)

## Building Preprocessor

In [4]:
# columns for base-n encoding (currently none)
base_n_columns = []

# columns for target encoding
target_columns = ['geo_level_1_id', 'geo_level_2_id', 'geo_level_3_id']

# columns for one-hot encoding
one_hot_columns = ['foundation_type', 'land_surface_condition', 'ground_floor_type', 'other_floor_type', 
                   'position', 'plan_configuration', 'legal_ownership_status', 'roof_type']

# Drop the identifier column so it is not fed to the model as a feature.
# Reassigning (instead of inplace=True) avoids mutating a train_test_split
# slice, and errors='ignore' keeps the cell safe to re-run.
X_train = X_train.drop(columns=['building_id'], errors='ignore')
# X_valid only exists when a validation split was made.
if MODEL_VALIDATION:
    X_valid = X_valid.drop(columns=['building_id'], errors='ignore')

# Get pipelines for encoding
base_pipeline = get_basen_encoder(base_n_columns)
onehot_pipeline = get_onehot_encoder(one_hot_columns)
target_pipeline = get_target_encoder(target_columns)

In [5]:
# Route each column group to its encoder pipeline; every column not listed
# in one of the groups is passed through unchanged.
encoding_steps = [
    ("oh_pipe", onehot_pipeline, one_hot_columns),
    ('base_pipe', base_pipeline, base_n_columns),
    ('target_pipe', target_pipeline, target_columns),
]
preprocessor = ColumnTransformer(transformers=encoding_steps, remainder='passthrough')

## Optuna Hyperparameter tuning

In [6]:
# Hyperparameters that are pinned no matter where the remaining ones come from.
fixed_params = {'eta': 0.025, 'eval_metric': 'auc', 'nthread': 6,
                'seed': 42, 'subsample': 0.8, 'device': 'cuda'}

if PARAMETER_TUNING:
    # Search the remaining hyperparameters with the project's Optuna study.
    from optuna_tuning import do_study, transform_data
    params = do_study(X_train, y_train, X_valid, y_valid, preprocessor).params
else:
    # Stored hyperparameter values used when tuning is switched off.
    params = {
        'booster': 'gbtree',
        'colsample_bytree': 0.7000000000000001,
        'gamma': 0.9,
        'max_depth': 11,
        'min_child_weight': 6.0,
        'n_estimators': 969,
    }

# The pinned values always win over tuned/stored ones.
params.update(fixed_params)

## Model

In [7]:
# Chain the column preprocessing and the XGBoost classifier so both are
# fitted (and later applied) together.
xgb_classifier = XGBClassifier(**params)
my_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', xgb_classifier),
])

my_pipeline.fit(X_train, y_train['damage_grade'])

## Scoring the Model

In [8]:
# Micro-averaged F1 on the data the pipeline was fitted on.
train_preds = my_pipeline.predict(X_train)
train_f1 = f1_score(y_train.damage_grade, train_preds, average='micro')
print(f"f1_score on training data  : {train_f1}")

# Same metric on the held-out split, when one was created.
if MODEL_VALIDATION:
    valid_preds = my_pipeline.predict(X_valid)
    valid_f1 = f1_score(y_valid.damage_grade, valid_preds, average='micro')
    print(f"f1_score on validation data: {valid_f1}")

f1_score on training data: 0.7869488059874233
f1_score on validation data: 0.7510560731439153


## Save model and pipeline to file

In [9]:
if SAVE_MODEL:
    import pickle
    from pathlib import Path

    # Make sure the output directory exists; open(..., 'wb') does not create
    # missing parent directories and would raise FileNotFoundError.
    models_dir = Path('models')
    models_dir.mkdir(parents=True, exist_ok=True)

    # Save the bare XGBoost model (without the preprocessing step) to a file
    xgb_model = my_pipeline.named_steps['model']
    with open(models_dir / 'xgb_model.pkl', 'wb') as file:
        pickle.dump(xgb_model, file)

    # Save the full pipeline (preprocessor + model) to a file
    with open(models_dir / 'xgb_pipeline_0.751.pkl', 'wb') as file:
        pickle.dump(my_pipeline, file)

## Predict on test set and prepare submission

In [10]:
if DO_SUBMISSSION:
    # The pipeline was fitted on frames without the identifier column, so
    # predict on the same feature set. X_test itself keeps building_id,
    # which is still needed for the submission file below.
    test_features = X_test.drop(columns=['building_id'])
    test_preds = my_pipeline.predict(test_features)
    # Map the model's class ids back to the original damage_grade labels.
    test_preds = encode_labels(test_preds, reverse=True)

    submission = pd.DataFrame({'building_id': X_test.building_id,
                               'damage_grade': test_preds})

    submission.to_csv('submission.csv', index=False)