In [5]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor
from sklearn.preprocessing import FunctionTransformer, TargetEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

train_data = pd.read_csv('datasets/dataset_train_final.csv')
test_data = pd.read_csv('datasets/dataset_test_final.csv')



train_data = train_data.dropna(subset=['TOTAL_FIRE_SIZE'])
X = train_data.drop(columns=['TOTAL_FIRE_SIZE'])
y = train_data['TOTAL_FIRE_SIZE']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

preprocessor = ColumnTransformer(
    transformers=[
            ('target_encoder', TargetEncoder(target_type='continuous'), ['STATE'])
        ],
        remainder='passthrough'
    )


# Define a pipeline with a target encoder and XGBoost regressor using squared log error.
pipeline = Pipeline([
    ('preprocessing', preprocessor),
    ('regressor', XGBRegressor(random_state=42, objective='reg:squaredlogerror'))
])

param_grid = {
    'regressor__max_depth': [3, 5, 7],
    'regressor__n_estimators': [100, 200],
    'regressor__learning_rate': [0.01, 0.1, 0.2]
}

grid = GridSearchCV(pipeline, param_grid, cv=5, scoring='neg_mean_squared_error')
grid.fit(X_train, y_train)

print("Best parameters:", grid.best_params_)
print("Best score (negative MSE):", grid.best_score_)

best_estimator = grid.best_estimator_
y_pred = best_estimator.predict(X_test)

def calc_log_clamped_score_np(y_true, y_pred):
    eps = 1e-15
    y_pred = np.maximum(y_pred, eps)
    y_true = np.maximum(y_true, eps)
    log_errors = np.abs(np.log(y_pred / y_true))
    log_errors_clamped = np.minimum(log_errors, 10.0)
    return np.mean(log_errors_clamped)

score = calc_log_clamped_score_np(y_test, y_pred)
print("Score:", score)


Best parameters: {'regressor__learning_rate': 0.2, 'regressor__max_depth': 3, 'regressor__n_estimators': 200}
Best score (negative MSE): -8090533752.824155
Score: 2.33913416400103
