In [20]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.model_selection import train_test_split, RandomizedSearchCV, KFold
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import StackingRegressor
from scipy.stats import uniform, randint
import matplotlib.pyplot as plt

# Load the dataset
file_path = 'forestfires.csv'
df = pd.read_csv(file_path)

# Remove the specified columns
df.drop(columns=['rain', 'X', 'Y', 'day'], inplace=True)

# Convert months to a binary summer category
def is_summer(month):
    return int(month in ['jun', 'jul', 'aug'])

df['is_summer'] = df['month'].apply(is_summer)
df.drop(columns='month', inplace=True)

# Function to remove outliers using IQR method
def remove_outliers_iqr(df, columns):
    Q1 = df[columns].quantile(0.25)
    Q3 = df[columns].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    return df[~((df[columns] < lower_bound) | (df[columns] > upper_bound)).any(axis=1)]

# Remove outliers
numeric_features = ['FFMC', 'DMC', 'DC', 'ISI', 'temp', 'RH', 'wind', 'area']
df = remove_outliers_iqr(df, numeric_features)

# Apply log transformation to the target variable
df['log_area'] = np.log1p(df['area'])  # log1p is used to handle log(0)

# Create new features
df['temp_RH_interaction'] = df['temp'] * df['RH']
df['wind_ISI_interaction'] = df['wind'] * df['ISI']

# Generate polynomial features
poly = PolynomialFeatures(degree=2, include_bias=False)
poly_features = poly.fit_transform(df[numeric_features[:-1]])
poly_feature_names = [f"poly_{i}" for i in range(poly_features.shape[1])]

# Combine polynomial features with the original data
df_poly = pd.DataFrame(poly_features, columns=poly_feature_names)
df = pd.concat([df, df_poly], axis=1)

# Check for and handle NaN or infinite values
df.replace([np.inf, -np.inf], np.nan, inplace=True)
df.dropna(inplace=True)

# Split the data into features and target
X = df.drop(columns=['area', 'log_area'])
y = df['log_area']

# Normalize the numeric features
scaler = StandardScaler()
X[numeric_features[:-1]] = scaler.fit_transform(X[numeric_features[:-1]])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the XGBoost model
xgb_model = xgb.XGBRegressor()

# Set up the parameter grid for RandomizedSearchCV
param_grid = {
    'colsample_bytree': np.linspace(0.1, 1.0, 10),
    'gamma': np.linspace(0, 0.5, 10),
    'learning_rate': np.linspace(0.01, 0.3, 20),
    'max_depth': range(3, 12),
    'n_estimators': range(50, 300, 20),
    'subsample': np.linspace(0.1, 1.0, 10)
}

# Perform RandomizedSearchCV
random_search = RandomizedSearchCV(
    xgb_model, 
    param_distributions=param_grid, 
    n_iter=300,  # Increase the number of iterations
    cv=5, 
    scoring='neg_mean_squared_error', 
    verbose=1, 
    n_jobs=-1, 
    random_state=42
)
random_search.fit(X_train, y_train)

# Get the best model
best_xgb_model = random_search.best_estimator_

# Predict on the test set
y_pred = best_xgb_model.predict(X_test)

# Evaluate the model
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print(f"Best Parameters: {random_search.best_params_}")
print(f"Test RMSE: {rmse}")
print(f"Test MAE: {mae}")
print(f"Test R²: {r2}")

# Compare with RandomForestRegressor
rf_model = RandomForestRegressor(random_state=42)
rf_param_grid = {
    'n_estimators': randint(50, 300),
    'max_depth': randint(3, 12),
    'min_samples_split': randint(2, 20),
    'min_samples_leaf': randint(1, 20)
}

rf_random_search = RandomizedSearchCV(
    rf_model,
    param_distributions=rf_param_grid,
    n_iter=300,
    cv=5,
    scoring='neg_mean_squared_error',
    verbose=1,
    n_jobs=-1,
    random_state=42
)
rf_random_search.fit(X_train, y_train)

best_rf_model = rf_random_search.best_estimator_
y_pred_rf = best_rf_model.predict(X_test)

# Evaluate the RandomForest model
mae_rf = mean_absolute_error(y_test, y_pred_rf)
mse_rf = mean_squared_error(y_test, y_pred_rf)
rmse_rf = np.sqrt(mse_rf)
r2_rf = r2_score(y_test, y_pred_rf)

print(f"Random Forest Best Parameters: {rf_random_search.best_params_}")
print(f"Random Forest Test RMSE: {rmse_rf}")
print(f"Random Forest Test MAE: {mae_rf}")
print(f"Random Forest Test R²: {r2_rf}")

# Ensemble method (Stacking Regressor)
estimators = [
    ('xgb', best_xgb_model),
    ('rf', best_rf_model)
]
stacking_regressor = StackingRegressor(
    estimators=estimators, 
    final_estimator=LinearRegression(),
    cv=5,
    n_jobs=-1
)

stacking_regressor.fit(X_train, y_train)
y_pred_ensemble = stacking_regressor.predict(X_test)

# Evaluate the ensemble method
mae_ensemble = mean_absolute_error(y_test, y_pred_ensemble)
mse_ensemble = mean_squared_error(y_test, y_pred_ensemble)
rmse_ensemble = np.sqrt(mse_ensemble)
r2_ensemble = r2_score(y_test, y_pred_ensemble)

print(f"Stacking Regressor Test RMSE: {rmse_ensemble}")
print(f"Stacking Regressor Test MAE: {mae_ensemble}")
print(f"Stacking Regressor Test R²: {r2_ensemble}")


Fitting 5 folds for each of 300 candidates, totalling 1500 fits
Best Parameters: {'subsample': 0.7000000000000001, 'n_estimators': 250, 'max_depth': 9, 'learning_rate': 0.07105263157894737, 'gamma': 0.16666666666666666, 'colsample_bytree': 0.1}
Test RMSE: 0.8262663352228432
Test MAE: 0.6717272779675366
Test R²: 0.06358847756388497
Fitting 5 folds for each of 300 candidates, totalling 1500 fits
Random Forest Best Parameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 4, 'n_estimators': 265}
Random Forest Test RMSE: 0.7933749545155441
Random Forest Test MAE: 0.6531715111970238
Random Forest Test R²: 0.1366565374854688
Stacking Regressor Test RMSE: 0.8228626495970558
Stacking Regressor Test MAE: 0.6642860686230873
Stacking Regressor Test R²: 0.07128741334933386
