In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import joblib

# Baseline experiment: fit a random forest that predicts log(1 + area) from
# every other column of the forest-fires CSV, save it, and report hold-out
# metrics. Names defined here (data, X, y, X_train, X_test, y_train, y_test,
# model, predictions, rmse, mae, r2) stay in the kernel namespace.
import os  # cell-local: only needed to create the model output directory

data = pd.read_csv('../dataset/forestfires.csv')

# Integer-encode the two categorical columns.
# NOTE(review): pandas assigns category codes in sorted label order, so the
# codes are presumably alphabetical (not calendar order) -- confirm that any
# code feeding the saved model at inference time uses the same mapping.
data['month'] = data['month'].astype('category').cat.codes
data['day'] = data['day'].astype('category').cat.codes

# log1p(x) == log(1 + x); it keeps zero areas finite and is more accurate than
# np.log(x + 1) for small x.
data['log_area'] = np.log1p(data['area'])

# Drop both the raw and the transformed target so neither leaks into X.
X = data.drop(columns=['area', 'log_area'])
y = data['log_area']

# Fixed seed so the 80/20 split is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# joblib.dump does not create missing parent directories; make sure the
# target directory exists so a fresh checkout does not fail after training.
os.makedirs('../models', exist_ok=True)
joblib.dump(model, '../models/model.pkl')

# All metrics below are on the log(1 + area) scale, not in the units of `area`.
predictions = model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, predictions))
mae = mean_absolute_error(y_test, predictions)
r2 = r2_score(y_test, predictions)

print(f'Root Mean Squared Error: {rmse}')
print(f'Mean Absolute Error: {mae}')
print(f'R-squared: {r2}')


Root Mean Squared Error: 1.5237110838614512
Mean Absolute Error: 1.2159976105646635
R-squared: -0.056342899945638125


In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import joblib

# Gradient-boosting experiment: tune a GradientBoostingRegressor with an
# exhaustive grid search, save the best estimator, and report hold-out
# metrics. X_train / X_test / y_train / y_test defined here remain in the
# kernel namespace after the cell runs.
import os  # cell-local: only needed to create the model output directory

# Load dataset
data = pd.read_csv('../dataset/forestfires.csv')

# Data Preprocessing
# NOTE(review): pandas assigns category codes in sorted label order, so the
# codes are presumably alphabetical (not calendar order) -- confirm that any
# code feeding the saved model at inference time uses the same mapping.
data['month'] = data['month'].astype('category').cat.codes
data['day'] = data['day'].astype('category').cat.codes
# log1p(x) == log(1 + x); it keeps zero areas finite and is more accurate than
# np.log(x + 1) for small x.
data['log_area'] = np.log1p(data['area'])

# Split data into features and target (both target columns are dropped from X
# so the target cannot leak into the features)
X = data.drop(columns=['area', 'log_area'])
y = data['log_area']

# Split dataset (fixed seed -> reproducible 80/20 split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Set up the model and parameter grid for tuning
# 2 * 3 * 3 = 18 candidate settings, each evaluated with 5-fold CV.
model = GradientBoostingRegressor(random_state=42)
param_grid = {
    'n_estimators': [100, 200],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7]
}

# Perform Grid Search with Cross-Validation
# Candidates are ranked by negated MSE (higher is better); the search only
# ever sees the training split.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Use the best model from Grid Search
best_model = grid_search.best_estimator_
# joblib.dump does not create missing parent directories; make sure the
# target directory exists so a fresh checkout does not fail after the search.
os.makedirs('../models', exist_ok=True)
joblib.dump(best_model, '../models/gradient_boosting_model.pkl')

# Evaluate the model
# All metrics below are on the log(1 + area) scale, not in the units of `area`.
predictions = best_model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, predictions))
mae = mean_absolute_error(y_test, predictions)
r2 = r2_score(y_test, predictions)

print(f'Best Parameters: {grid_search.best_params_}')
print(f'Root Mean Squared Error: {rmse}')
print(f'Mean Absolute Error: {mae}')
print(f'R-squared: {r2}')


Best Parameters: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 100}
Root Mean Squared Error: 1.4618640558994027
Mean Absolute Error: 1.1921377871243348
R-squared: 0.027670102269433228


In [7]:
import xgboost as xgb
from sklearn.model_selection import RandomizedSearchCV

# XGBoost experiment: tune an XGBRegressor with a randomized search, save the
# best estimator, and report hold-out metrics.
#
# This cell does not load or split data itself: it reuses X_train, X_test,
# y_train and y_test -- plus np, joblib and the sklearn metric functions --
# from an earlier cell, so that cell must have been run first in this kernel.
import os  # cell-local: only needed to create the model output directory

# Set up the XGBoost model and parameter grid
xgb_model = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)

param_dist = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'max_depth': [3, 5, 7, 10],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0]
}

# RandomizedSearchCV for quicker tuning
# Samples 20 of the 3 * 4 * 4 * 3 * 3 = 432 possible settings; the fixed
# random_state makes the sampled candidates reproducible.
random_search = RandomizedSearchCV(
    estimator=xgb_model,
    param_distributions=param_dist,
    scoring='neg_mean_squared_error',
    n_iter=20,
    cv=5,
    verbose=1,
    random_state=42,
    n_jobs=-1,
)
random_search.fit(X_train, y_train)

# Select the best model from Random Search
best_xgb_model = random_search.best_estimator_
# joblib.dump does not create missing parent directories; make sure the
# target directory exists so a fresh checkout does not fail after the search.
os.makedirs('../models', exist_ok=True)
joblib.dump(best_xgb_model, '../models/best_xgb_model.pkl')

# Evaluate the model
# Metrics are on whatever scale y_test uses (set by the cell that built it).
xgb_predictions = best_xgb_model.predict(X_test)
xgb_rmse = np.sqrt(mean_squared_error(y_test, xgb_predictions))
xgb_mae = mean_absolute_error(y_test, xgb_predictions)
xgb_r2 = r2_score(y_test, xgb_predictions)

print(f'Best Parameters: {random_search.best_params_}')
print(f'Root Mean Squared Error: {xgb_rmse}')
print(f'Mean Absolute Error: {xgb_mae}')
print(f'R-squared: {xgb_r2}')


Fitting 5 folds for each of 20 candidates, totalling 100 fits
Best Parameters: {'subsample': 0.6, 'n_estimators': 100, 'max_depth': 5, 'learning_rate': 0.01, 'colsample_bytree': 0.6}
Root Mean Squared Error: 1.4778049086585063
Mean Absolute Error: 1.2009715519117807
R-squared: 0.006349000561891027
