In [28]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor
from sklearn.preprocessing import TargetEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

train_data = pd.read_csv('datasets/dataset_train_final.csv')
train_data = train_data.dropna(subset=['TOTAL_FIRE_SIZE'])

X = train_data.drop(columns=['TOTAL_FIRE_SIZE'])
y = train_data['TOTAL_FIRE_SIZE']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

preprocessor = ColumnTransformer(
    transformers=[
            ('target_encoder', TargetEncoder(target_type='continuous'), ['STATE'])
        ],
        remainder='passthrough'
    )

# Define a pipeline with a target encoder and XGBoost regressor using squared log error.
pipeline = Pipeline([
    ('preprocessing', preprocessor),
    ('regressor', XGBRegressor(random_state=42, objective='reg:squaredlogerror'))
])

param_grid = {
    'regressor__max_depth': [3, 5, 7, 9],
    'regressor__n_estimators': [100, 200, 300, 500, 1000],
    'regressor__learning_rate': [0.01, 0.1, 0.2]
}

grid = GridSearchCV(pipeline, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
grid.fit(X_train, y_train)

print("Best parameters:", grid.best_params_)
print("Best score (negative MSE):", grid.best_score_)

best_estimator = grid.best_estimator_
y_pred = best_estimator.predict(X_test)

def calc_log_clamped_score_np(y_true, y_pred):
    eps = 1e-15
    y_pred = np.maximum(y_pred, eps)
    y_true = np.maximum(y_true, eps)
    log_errors = np.abs(np.log(y_pred / y_true))
    log_errors_clamped = np.minimum(log_errors, 10.0)
    return np.mean(log_errors_clamped)

score = calc_log_clamped_score_np(y_test, y_pred)
print("Score:", score)


Best parameters: {'regressor__learning_rate': 0.2, 'regressor__max_depth': 9, 'regressor__n_estimators': 1000}
Best score (negative MSE): -8087188358.709325
Score: 2.2943942732297504


In [14]:
test_data = pd.read_csv('datasets/dataset_test_final.csv')
test_actual = test_data[test_data['YEAR'] <= 2015]
print(len(test_actual))

2160


In [None]:

predictions = best_estimator.predict(test_actual)

In [27]:
predictions_copy = pd.DataFrame(predictions.copy())
# predictions_copy.head()

predictions_copy = predictions_copy.rename(columns={0: 'total_fire_size'})
predictions_copy['ID'] = range(len(predictions_copy))
predictions_copy['STATE'] = test_actual['STATE']
predictions_copy['MONTH'] = test_actual['YEAR'].astype(str) + '-' + train_data['MONTH'].astype(str).str.zfill(2)

# Reorder the DataFrame so that 'ID' is the first column
cols = ['ID'] + [col for col in predictions_copy.columns if col != 'ID']
submission_df = predictions_copy[cols]

null_entries = submission_df.isnull()

# # Display the DataFrame with True/False indicating null entries
# print("Null entries in the DataFrame:")
# print(null_entries)
#
# # If you want to see the rows with any null values
# rows_with_nulls = submission_df[submission_df.isnull().any(axis=1)]
# print("\nRows with any null values:")
# print(rows_with_nulls)

submission_df = submission_df.dropna()
submission_df.to_csv('submission_final_1.csv', index=False)
print("Submission file saved as 'submission.csv'.")
print(len(submission_df))

Submission file saved as 'submission.csv'.
997
