In [6]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from xgboost import XGBRegressor
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from skopt import BayesSearchCV
from skopt.space import Real, Integer

# Load the data
file_path = 'forestfires.csv'
forest_fires_df = pd.read_csv(file_path)

# Handle missing values (if any) by dropping every row that contains a NaN
forest_fires_df = forest_fires_df.dropna()

# Log transform the 'area' to handle skewness.
# log1p(0) == 0, so a zero burned area stays finite after the transform.
forest_fires_df['log_area'] = np.log1p(forest_fires_df['area'])

# Select only the most important features
important_features = ['FFMC', 'DMC', 'DC', 'ISI', 'temp']

# Feature matrix and (log-scale) target
X = forest_fires_df[important_features]
y = forest_fires_df['log_area']

# Split the data into training and testing sets BEFORE any target-based
# filtering. Filtering first would pick the test rows by looking at their
# labels and would let test targets influence the outlier fences, which
# makes the held-out metrics optimistic.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Remove target outliers from the TRAINING partition only, using fences
# (1.5 * IQR beyond the quartiles) computed from the training targets alone.
# The test partition is left untouched so it reflects the real distribution.
q1 = y_train.quantile(0.25)
q3 = y_train.quantile(0.75)
iqr = q3 - q1
lower_bound = q1 - 1.5 * iqr
upper_bound = q3 + 1.5 * iqr
# Build the mask once; X_train and y_train share the same index after the split.
train_inliers = y_train.between(lower_bound, upper_bound)
X_train = X_train.loc[train_inliers]
y_train = y_train.loc[train_inliers]

# All selected features are treated as continuous: each one is standardized
# and then fed into the polynomial expansion.
continuous_features = important_features

# Column-wise preprocessing: standard-scale the continuous columns.
scaling_step = ('num', StandardScaler(), continuous_features)
preprocessor = ColumnTransformer(transformers=[scaling_step])

# Degree-2 polynomial expansion without the constant (bias) column.
poly = PolynomialFeatures(degree=2, include_bias=False)

# Full modelling pipeline: preprocessing -> polynomial features -> XGBoost.
# The step names are part of the hyperparameter keys (e.g. 'model__max_depth'),
# so they must stay exactly as written.
pipeline_steps = [
    ('preprocessor', preprocessor),
    ('poly', poly),
    ('model', XGBRegressor(random_state=42)),
]
xgb_pipeline = Pipeline(steps=pipeline_steps)

# Hyperparameter ranges explored by the Bayesian search. Every key is
# prefixed with 'model__' so it is routed to the pipeline's XGBoost step.
search_spaces = {
    # Tree ensemble size and depth
    'model__n_estimators': Integer(100, 300),
    'model__max_depth': Integer(3, 10),
    # Step size, sampled on a log scale
    'model__learning_rate': Real(0.01, 0.2, prior='log-uniform'),
    # Row / column subsampling fractions
    'model__subsample': Real(0.7, 1.0),
    'model__colsample_bytree': Real(0.7, 1.0),
    # L1 / L2 regularization strengths
    'model__reg_alpha': Real(0, 1),
    'model__reg_lambda': Real(0, 1)
}

# Shuffled 5-fold cross-validation with a fixed seed for repeatable folds.
cv_splitter = KFold(n_splits=5, shuffle=True, random_state=42)

# Configure the Bayesian optimizer: 100 iterations, all available cores,
# verbose progress output, and a fixed seed.
bayes_search = BayesSearchCV(
    estimator=xgb_pipeline,
    search_spaces=search_spaces,
    n_iter=100,
    cv=cv_splitter,
    n_jobs=-1,
    verbose=2,
    random_state=42
)

# Run the search on the training partition only.
bayes_search.fit(X_train, y_train)

# Pipeline selected by the search (exposed by BayesSearchCV as best_estimator_)
best_xgb_model = bayes_search.best_estimator_

# Predict on the held-out test set. y_test is log1p(area), so every metric
# below is expressed on the log scale, not in the original area units.
y_test_pred_xgb = best_xgb_model.predict(X_test)

test_mse_xgb = mean_squared_error(y_test, y_test_pred_xgb)
test_rmse_xgb = np.sqrt(test_mse_xgb)
test_mae_xgb = mean_absolute_error(y_test, y_test_pred_xgb)
test_r2_xgb = r2_score(y_test, y_test_pred_xgb)

# Report the test metrics followed by the winning hyperparameters.
print(f'Test RMSE (XGBoost with BayesSearchCV): {test_rmse_xgb}')
print(f'Test MAE (XGBoost with BayesSearchCV): {test_mae_xgb}')
print(f'Test R² (XGBoost with BayesSearchCV): {test_r2_xgb}')
print(f'Best Parameters (XGBoost with BayesSearchCV): {bayes_search.best_params_}')


Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fi