In [4]:
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from Richters_Predictor.dataloader import dataloader

In [5]:
# dataloader() yields three objects, unpacked here as the labelled training
# features, their labels, and the test features.
X_train, y_train, X_test = dataloader()

# Hold out 20% of the labelled rows for validation; the seed keeps the split
# identical from one run to the next.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=42,
)

In [6]:
from Richters_Predictor.encoder import get_basen_encoder, get_onehot_encoder
from sklearn.compose import ColumnTransformer

# Columns handled by the base-n encoder.
base_n_columns = ["geo_level_1_id", "geo_level_2_id"]

# Columns handled by the one-hot encoder.
one_hot_columns = [
    "foundation_type",
    "land_surface_condition",
    "ground_floor_type",
    "other_floor_type",
    "position",
    "plan_configuration",
    "legal_ownership_status",
]

# Build one encoder per column group using the helpers from
# Richters_Predictor.encoder.
base_pipeline = get_basen_encoder(base_n_columns)
onehot_pipeline = get_onehot_encoder(one_hot_columns)

In [7]:
from sklearn.pipeline import Pipeline


# Route each group of categorical columns to its encoder. Only the columns
# listed below reach the model: ColumnTransformer's default remainder="drop"
# discards every other column of the input frame.
# NOTE(review): if the remaining features are meant to be used, set
# remainder="passthrough" (they must already be numeric) -- confirm intent.
preprocessor = ColumnTransformer(
    transformers=[
        ("oh_pipe", onehot_pipeline, one_hot_columns),
        ("base_pipe", base_pipeline, base_n_columns),
    ]
)

# Chain preprocessing and the classifier so both are fitted together.
# random_state is fixed (same seed as the train/validation split) so that a
# fresh Restart & Run All rebuilds the same forest and the same validation
# score; without it every run produces a different model.
my_pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("model", RandomForestClassifier(random_state=42)),
    ]
)

# y_train holds the target in its 'damage_grade' column; pass it as a 1-D
# series. The fitted pipeline is the cell's displayed value.
my_pipeline.fit(X_train, y_train["damage_grade"])

In [8]:
# Predict with the fitted pipeline on the held-out validation features.
rfc_preds = my_pipeline.predict(X_valid)

In [10]:
from sklearn.metrics import f1_score
# Micro-averaged F1 of the validation predictions against the held-out
# 'damage_grade' labels; left as the last expression so the cell displays it.
f1_score(
    y_true=y_valid["damage_grade"],
    y_pred=rfc_preds,
    average="micro",
)

0.710155215747971