In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from skopt import BayesSearchCV
from skopt.space import Real
from skopt.space import Real, Integer

# Read the forest-fires data and discard every row containing a missing value
forest_fires_df = pd.read_csv('forestfires.csv').dropna()

# Add log(1 + area) as 'log_area'; the transform is applied to handle the
# skewness of the raw 'area' column
forest_fires_df = forest_fires_df.assign(
    log_area=np.log1p(forest_fires_df['area'])
)

# Predictors retained for modelling
important_features = ['FFMC', 'DMC', 'DC', 'ISI', 'temp']

# Target is the log-transformed area; the design matrix keeps only the
# retained predictors
y = forest_fires_df['log_area']
X = forest_fires_df[important_features]

# Split the data into training and testing sets BEFORE any target-based
# filtering. Filtering first would trim the held-out rows by their own label
# and let test-set statistics shape the preprocessing, which makes the
# reported test metrics look better than they are.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Remove target outliers from the TRAINING rows only, using the 1.5 * IQR rule.
# The fences are estimated from y_train alone so that nothing about the test
# targets influences which rows the model is fitted on.
q1 = y_train.quantile(0.25)
q3 = y_train.quantile(0.75)
iqr = q3 - q1
lower_bound = q1 - 1.5 * iqr
upper_bound = q3 + 1.5 * iqr

# Build the mask once (inclusive on both ends) and apply it to both objects so
# X_train and y_train stay row-aligned.
inlier_mask = y_train.between(lower_bound, upper_bound)
X_train = X_train[inlier_mask]
y_train = y_train[inlier_mask]

# All selected features are treated as continuous, so the same list drives
# the standardization step
continuous_features = important_features

# Polynomial expansion (degree 2 here) without the constant bias column
poly = PolynomialFeatures(degree=2, include_bias=False)

# Column transformer that applies a StandardScaler to the continuous columns
preprocessor = ColumnTransformer(
    [('num', StandardScaler(), continuous_features)]
)

# Full model: standardize -> polynomial expansion -> Ridge regression
ridge_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('poly', poly),
    ('model', Ridge()),
])

# Hyperparameters explored by BayesSearchCV, keyed by '<pipeline step>__<param>'
search_spaces = dict(
    poly__degree=Integer(1, 3),  # polynomial degrees from 1 to 3
    model__alpha=Real(1e-6, 1e+6, prior='log-uniform'),  # Ridge alpha, log-uniform prior
)

# Configure the search over the Ridge pipeline: 50 iterations, each evaluated
# with a shuffled 5-fold split; seeds are fixed for reproducibility
bayes_search = BayesSearchCV(
    ridge_pipeline,
    search_spaces,
    n_iter=50,  # number of search iterations; adjust as needed
    cv=KFold(n_splits=5, shuffle=True, random_state=42),
    n_jobs=-1,
    verbose=2,
    random_state=42,
)

# Run the search on the training split
bayes_search.fit(X_train, y_train)

# Best estimator selected by the search
best_ridge_model = bayes_search.best_estimator_

# Predict on the held-out test split and score the predictions.
# The target is log1p(area), so RMSE and MAE are expressed in log units.
y_test_pred_ridge = best_ridge_model.predict(X_test)
test_r2_ridge = r2_score(y_test, y_test_pred_ridge)
test_mae_ridge = mean_absolute_error(y_test, y_test_pred_ridge)
test_rmse_ridge = np.sqrt(mean_squared_error(y_test, y_test_pred_ridge))

# Report the metrics and the winning hyperparameters, one per line
print(
    f'Test RMSE (Ridge with BayesSearchCV): {test_rmse_ridge}',
    f'Test MAE (Ridge with BayesSearchCV): {test_mae_ridge}',
    f'Test R² (Ridge with BayesSearchCV): {test_r2_ridge}',
    f'Best Parameters (Ridge with BayesSearchCV): {bayes_search.best_params_}',
    sep='\n',
)


Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fi