In [442]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import uniform, randint
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor


In [443]:
# Read the raw CSV and discard any row that contains a missing value.
# Done as a single expression so re-running the cell always starts from the file.
forest_fires_df = pd.read_csv('forestfires.csv').dropna()

In [444]:
# Columns that are not used as model inputs ('area' is replaced by 'log_area').
columns_to_drop = ['day', 'area', 'rain', 'X', 'Y', 'month']

# Add log1p(area) as the target column (reduces skewness; log1p is defined at 0),
# then remove the unused columns in the same chain.
forest_fires_df = (
    forest_fires_df
    .assign(log_area=lambda frame: np.log1p(frame['area']))
    .drop(columns=columns_to_drop)
)

In [445]:
# Target: the log-transformed 'area' column; features: every other remaining column.
y = forest_fires_df['log_area']
X = forest_fires_df.drop(columns=['log_area'])

In [446]:
# Numeric columns that will be standardized before modelling.
continuous_features = [
    'FFMC',
    'DMC',
    'DC',
    'ISI',
    'temp',
    'RH',
    'wind',
]

In [447]:
# Standardize the continuous features; any column not listed there is
# forwarded to the model unchanged (remainder='passthrough').
preprocessor = ColumnTransformer(
    [('num', StandardScaler(), continuous_features)],
    remainder='passthrough',
)

In [448]:
# Two-step pipeline: preprocessing first, then the XGBoost regressor.
# The step names ('preprocessor', 'model') are the prefixes used when tuning
# hyperparameters, e.g. 'model__max_depth'.
xgb_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', XGBRegressor(random_state=42)),
])

In [449]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [450]:
# Hyperparameter distributions for the randomized search.
# Keys are prefixed with 'model__' so they are routed to the pipeline's 'model' step.
# scipy conventions: randint(low, high) draws integers in [low, high);
# uniform(loc, scale) draws floats in [loc, loc + scale].
param_dist = {
    'model__n_estimators': randint(50, 200),
    'model__max_depth': randint(2, 10),
    'model__learning_rate': uniform(0.01, 0.2),
    'model__subsample': uniform(0.6, 0.4),
    'model__colsample_bytree': uniform(0.6, 0.4)
}

# The search below already parallelizes across candidates/folds (n_jobs=-1).
# XGBoost is multi-threaded by default as well, and running both at once makes
# the workers compete for the same cores and can slow the search down badly.
# Pin the model to a single thread on a clone so xgb_pipeline itself is not mutated.
search_pipeline = clone(xgb_pipeline).set_params(model__n_jobs=1)

# Randomized search: 100 sampled candidates x 3-fold CV = 300 fits.
# verbose=1 keeps the log to a summary instead of one line per fit.
random_search = RandomizedSearchCV(
    estimator=search_pipeline,
    param_distributions=param_dist,
    n_iter=100,
    cv=3,
    n_jobs=-1,
    verbose=1,
    random_state=42,
)

# Fit the model with RandomizedSearchCV
random_search.fit(X_train, y_train)

# Best pipeline found by the search (refit on the full training set by default)
best_xgb_model = random_search.best_estimator_

# Evaluate on the held-out test data.
# NOTE: y is log1p(area), so these metrics are on the log scale, not in raw area units.
y_test_pred_xgb = best_xgb_model.predict(X_test)
test_rmse_xgb = np.sqrt(mean_squared_error(y_test, y_test_pred_xgb))
test_mae_xgb = mean_absolute_error(y_test, y_test_pred_xgb)
test_r2_xgb = r2_score(y_test, y_test_pred_xgb)

print(f'Test RMSE (XGBoost with RandomizedSearchCV): {test_rmse_xgb}')
print(f'Test MAE (XGBoost with RandomizedSearchCV): {test_mae_xgb}')
print(f'Test R² (XGBoost with RandomizedSearchCV): {test_r2_xgb}')
print(f'Best Parameters (XGBoost with RandomizedSearchCV): {random_search.best_params_}')

Fitting 3 folds for each of 100 candidates, totalling 300 fits


KeyboardInterrupt: 