In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.manifold import TSNE
from sklearn.metrics import root_mean_squared_error
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [None]:
# Columns treated as numerical features.
forestfire_numerical_features = [
    "X", "Y", "FFMC", "DMC", "DC", "ISI", "temp", "RH", "wind", "rain",
]

# Read the raw data and expand the categorical "month" and "day" columns
# into integer-valued indicator (one-hot) columns.
forest_fires1 = pd.get_dummies(
    pd.read_csv("forestfires.csv"),
    columns=["month", "day"],
    dtype="int",
)

In [None]:
# Split into train and test set.
# The predictors and the target ("area") are taken from forest_fires1, the
# one-hot encoded frame created in this notebook; the previously referenced
# name `forest_fires` is never defined and fails on a fresh kernel.
forestfire_X1 = forest_fires1.drop(columns="area")
forestfire_y1 = forest_fires1["area"]

# 80% of the rows go to training; the fixed random_state makes the split
# reproducible between runs.
(
    forestfire_X_train1,
    forestfire_X_test1,
    forestfire_y_train1,
    forestfire_y_test1,
) = train_test_split(forestfire_X1, forestfire_y1, train_size=0.8, random_state=10)

In [None]:
# Pipeline for preprocessing forest fires data.
# The numerical columns are standardised. remainder="passthrough" forwards every
# other column (the one-hot encoded month/day indicators) unchanged; with
# ColumnTransformer's default remainder="drop" those columns would be silently
# discarded and the models would never see them.
numeric_transformer = Pipeline([("scaler", StandardScaler())])
transformer = ColumnTransformer(
    transformers=[("numeric", numeric_transformer, forestfire_numerical_features)],
    remainder="passthrough",
)

In [None]:
# Every model is a two-step pipeline (the shared preprocessing transformer followed
# by the regressor) wrapped in a 5-fold grid search scored by negative mean squared
# error. Parameter names are prefixed with the pipeline step name ("knn__", ...).

#K-Nearest Neighbours Model
forestfire_knn_pipeline = Pipeline(
    steps=[("transformer", transformer), ("knn", KNeighborsRegressor())]
)
#Hyperparameter tuning with grid search: 1..20 neighbours, both weighting schemes
knn_parameters = {
    "knn__n_neighbors": list(range(1, 21)),
    "knn__weights": ["uniform", "distance"],
}
knn_gridsearch = GridSearchCV(
    forestfire_knn_pipeline,
    param_grid=knn_parameters,
    scoring="neg_mean_squared_error",
    cv=5,
)

#Random Forest Model
forestfire_rf_pipeline = Pipeline(
    steps=[("transformer", transformer), ("rf", RandomForestRegressor(random_state=42))]
)
#Hyperparameter tuning with grid search: 3, 13, ..., 43 trees and depths 2..9
rf_parameters = {
    "rf__n_estimators": list(range(3, 50, 10)),
    "rf__max_depth": list(range(2, 10)),
}
rf_gridsearch = GridSearchCV(
    forestfire_rf_pipeline,
    param_grid=rf_parameters,
    scoring="neg_mean_squared_error",
    cv=5,
)

#Gradient Tree Boosting Model
forestfire_gtb_pipeline = Pipeline(
    steps=[("transformer", transformer), ("gtb", GradientBoostingRegressor(random_state=42))]
)
#Hyperparameter tuning with grid search: same grid as the random forest
gtb_parameters = {
    "gtb__n_estimators": list(range(3, 50, 10)),
    "gtb__max_depth": list(range(2, 10)),
}
gtb_gridsearch = GridSearchCV(
    forestfire_gtb_pipeline,
    param_grid=gtb_parameters,
    scoring="neg_mean_squared_error",
    cv=5,
)

models = [knn_gridsearch, rf_gridsearch, gtb_gridsearch]
for model in models:
    # Run the cross-validated grid search for each model on the training split.
    # The split variables carry a "1" suffix (forestfire_X_train1 / forestfire_y_train1);
    # the unsuffixed names are never defined in this notebook.
    model.fit(X=forestfire_X_train1, y=forestfire_y_train1)

In [None]:
# Display names, in the same order as the entries of `models`.
model_names = ["knn", "random_forest", "gradient_tree_boosting"]
# Maps model name -> RMSE on the held-out test split.
forestfire_rmse_results = {}

for model_name, model in zip(model_names, models):
    # Predict on the held-out test split with the best estimator found by the search.
    # The test split variables carry a "1" suffix; the unsuffixed names are undefined.
    y_pred = model.predict(X=forestfire_X_test1)
    rmse_val = root_mean_squared_error(y_true=forestfire_y_test1, y_pred=y_pred)
    forestfire_rmse_results[model_name] = rmse_val

    print("Regression results for model ", model_name,": ")
    print("Best parameter combination: ", model.best_params_)
    # best_score_ is expressed in the search's scoring metric (negated MSE here),
    # so values closer to zero are better.
    print("Mean cross validation score of best estimator: ", model.best_score_)
    # cv_results_["std_test_score"] is a standard deviation across folds, not a variance.
    print("Standard deviation of cross validation test score of best estimator:", model.cv_results_["std_test_score"][model.best_index_] )
    print("RMSE:", rmse_val,"\n")