In [525]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.stats import randint, uniform
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import KFold, RandomizedSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, PolynomialFeatures, StandardScaler
from xgboost import XGBRegressor, plot_importance


In [526]:
# Read the forest-fires dataset and discard any row that contains a missing value
forest_fires_df = pd.read_csv('forestfires.csv').dropna()

In [527]:
# Model log(1 + area) rather than the raw burned area to reduce skewness, then
# discard the columns that are not used as predictors (including the raw
# 'area' column the new target was derived from).
columns_to_drop = ['day', 'area', 'rain', 'X', 'Y', 'month']
forest_fires_df = (
    forest_fires_df
    .assign(log_area=np.log1p(forest_fires_df['area']))
    .drop(columns=columns_to_drop)
)

In [528]:
# Separate the regression target (log-transformed area) from the predictors
y = forest_fires_df['log_area']
X = forest_fires_df.drop(columns=['log_area'])

In [529]:
# Keep only the observations whose target lies inside the 1.5 * IQR fences,
# i.e. within [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR] of the target distribution.
q1, q3 = y.quantile(0.25), y.quantile(0.75)
iqr = q3 - q1
lower_bound, upper_bound = q1 - 1.5 * iqr, q3 + 1.5 * iqr

# Build the mask once, from the unfiltered target, so that X and y are
# filtered with exactly the same rows (between() is inclusive on both ends).
inlier_mask = y.between(lower_bound, upper_bound)
X, y = X[inlier_mask], y[inlier_mask]

In [530]:
# Numeric predictors that will be standardized before modelling
continuous_features = [
    'FFMC',
    'DMC',
    'DC',
    'ISI',
    'temp',
]

In [531]:
# Standardize the continuous features; every column not listed in
# `continuous_features` is passed through unchanged.
preprocessor = ColumnTransformer(
    transformers=[('num', StandardScaler(), continuous_features)],
    remainder='passthrough',
)

# Degree-2 polynomial expansion of the preprocessed features.
# include_bias=False leaves out the constant column.
# NOTE: PolynomialFeatures lives in sklearn.preprocessing and must be imported
# in the imports cell, otherwise this line raises NameError on a fresh kernel.
poly_features = PolynomialFeatures(degree=2, include_bias=False)

In [532]:
# Chain the preprocessor, the polynomial feature expansion and the XGBoost
# regressor so the same transformations are applied at fit and predict time.
xgb_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    # The transformer is bound to `poly_features`; the previous reference to
    # `poly` pointed at a name that is never defined and raised NameError.
    ('poly', poly_features),
    ('model', XGBRegressor(random_state=42))
])

In [533]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [534]:
# Sampling distributions for the randomized search. Keys use the
# '<pipeline step>__<parameter>' convention to reach the 'model' step.
# randint draws integers from [low, high); uniform draws from [loc, loc + scale].
param_dist = {
    'model__n_estimators': randint(low=100, high=300),
    'model__max_depth': randint(low=3, high=10),
    'model__learning_rate': uniform(loc=0.01, scale=0.2),
    'model__subsample': uniform(loc=0.7, scale=0.3),
    'model__colsample_bytree': uniform(loc=0.7, scale=0.3),
}

# Shuffled 5-fold cross-validation with a fixed seed, used to score every
# sampled hyper-parameter configuration.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Draw 100 configurations from `param_dist` and evaluate them in parallel on
# all available cores. No `scoring` is given, so the estimator's default
# score method is used to rank candidates.
random_search = RandomizedSearchCV(
    estimator=xgb_pipeline,
    param_distributions=param_dist,
    n_iter=100,
    cv=kf,
    n_jobs=-1,
    verbose=2,
    random_state=42,
)

# Run the search on the training split only
random_search.fit(X_train, y_train)

# Pipeline carrying the best-scoring configuration found by the search
best_xgb_model = random_search.best_estimator_

# Score the tuned pipeline on the held-out test split. The target is
# log1p(area), so every metric below is expressed on that log scale.
y_test_pred_xgb = best_xgb_model.predict(X_test)

test_r2_xgb = r2_score(y_test, y_test_pred_xgb)
test_mae_xgb = mean_absolute_error(y_test, y_test_pred_xgb)
test_rmse_xgb = np.sqrt(mean_squared_error(y_test, y_test_pred_xgb))

# Report the test metrics followed by the winning hyper-parameters, one per line
print(
    f'Test RMSE (XGBoost with RandomizedSearchCV): {test_rmse_xgb}',
    f'Test MAE (XGBoost with RandomizedSearchCV): {test_mae_xgb}',
    f'Test R² (XGBoost with RandomizedSearchCV): {test_r2_xgb}',
    f'Best Parameters (XGBoost with RandomizedSearchCV): {random_search.best_params_}',
    sep='\n',
)


Fitting 5 folds for each of 100 candidates, totalling 500 fits
Test RMSE (XGBoost with RandomizedSearchCV): 1.2997128091199317
Test MAE (XGBoost with RandomizedSearchCV): 1.0934636921681715
Test R² (XGBoost with RandomizedSearchCV): -0.030981559691388938
Best Parameters (XGBoost with RandomizedSearchCV): {'model__colsample_bytree': 0.8834959481464842, 'model__learning_rate': 0.011413261043943482, 'model__max_depth': 3, 'model__n_estimators': 148, 'model__subsample': 0.8574323980775167}
