In [None]:
#Importing Libraries

import numpy as np
from numpy import mean, absolute
import pandas as pd
import matplotlib.pyplot as pl

#Importing libraries for pre-processing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import log_loss

#Importing libraries for Regressors
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.pipeline import make_pipeline
from sklearn.metrics import make_scorer
from sklearn import tree

In [None]:
# Load the ForestFires data set (path is relative to the notebook's working directory)
forestfire_data = pd.read_csv('./forestfires.csv')

# One LabelEncoder instance is reused for both categorical columns; after this
# cell it is left fitted on the 'day' column.
encodeData = LabelEncoder()

# Convert months to integer codes
forestfire_data['encoded_months'] = encodeData.fit_transform(forestfire_data['month'])

# Convert days to integer codes
forestfire_data['encoded_days'] = encodeData.fit_transform(forestfire_data['day'])

# FEATURE SELECTION
# Three features picked by the author as having the highest correlation with the
# target (the correlation analysis itself is not part of this cell).
featureSelected = ['temp', 'FFMC', 'DMC']
dataX = forestfire_data[featureSelected]

# The burned area is the regression target
dataY = forestfire_data['area']

# Split into training data (80%) and testing data (20%).
# random_state is fixed so that "Restart Kernel -> Run All" always yields the
# same split and therefore the same reported metrics.
trainingData, testingData, trainingArea, testingArea = train_test_split(
    dataX, dataY, test_size=0.2, random_state=1
)

# Reshape the training target into a single-column 2-D array, the layout
# StandardScaler requires when the target is scaled later on.
trainingArea = trainingArea.values.reshape(-1, 1)

In [None]:
#Define function for calculating root mean squared error
def root_mean_squared_error(givenValues, predictedValues):
    """Return the root mean squared error between two sets of values.

    Parameters
    ----------
    givenValues : array-like
        Ground-truth target values.
    predictedValues : array-like
        Values predicted by a model, aligned with ``givenValues``.

    Returns
    -------
    float
        Square root of ``mean_squared_error(givenValues, predictedValues)``.
    """
    squaredErrorMean = mean_squared_error(givenValues, predictedValues)
    rmse = np.sqrt(squaredErrorMean)
    return rmse

#Define function for calculating mean absolute deviation
def mean_absolute_deviation(predictedValues):
    """Return the mean absolute deviation of the values around their own mean.

    Note that only the predictions are used: this measures how spread out the
    predictions are, not how far they are from any ground truth.

    Parameters
    ----------
    predictedValues : numpy.ndarray
        Values whose dispersion is measured.

    Returns
    -------
    float
        Mean of ``|x - mean(x)|`` over all values.
    """
    centre = np.mean(predictedValues)
    distancesFromCentre = np.absolute(predictedValues - centre)
    return np.mean(distancesFromCentre)

#Define function for calculating negative log likelihood
def negative_log_likelihood(givenValues, predictedValues):
    """Return the Gaussian negative log-likelihood of the observations.

    The value computed is

        (N / 2) * log(2 * pi * var) + SSE / (2 * var)

    where ``N`` is the number of samples, ``SSE`` is the sum of squared
    differences between ``givenValues`` and ``predictedValues`` and ``var`` is
    the variance of ``predictedValues``.

    The previous implementation computed ``SSE - SSE / (2 * var)`` for the
    residual term and returned the (mis-computed) log-likelihood without
    negating it.

    Parameters
    ----------
    givenValues : array-like
        Observed target values.
    predictedValues : array-like
        Model predictions, one per observed value.

    Returns
    -------
    float
        The negative log-likelihood. If the predictions have zero variance the
        expression is undefined and the result is not finite.

    Raises
    ------
    ValueError
        If the two inputs do not contain the same number of values.
    """
    # Flatten to plain 1-D float arrays so that positional alignment is used
    # regardless of whether a list, ndarray, column vector or Series is passed.
    observed = np.asarray(givenValues, dtype=float).ravel()
    predicted = np.asarray(predictedValues, dtype=float).ravel()
    if observed.shape != predicted.shape:
        raise ValueError(
            'givenValues and predictedValues must contain the same number of values'
        )

    sampleCount = observed.shape[0]
    # NOTE(review): the variance of the predictions is kept as the Gaussian
    # variance, as in the original code; confirm this is the intended estimate
    # (the variance of the residuals is the other common choice).
    variance = np.var(predicted)
    squaredErrorSum = np.sum((observed - predicted) ** 2)

    logLikelihood = (
        -(sampleCount / 2) * np.log(2 * np.pi * variance)
        - squaredErrorSum / (2 * variance)
    )
    return -logLikelihood

In [None]:
#Implementing DECISION TREE REGRESSOR

def DecisionTree_Regressor():
    """Tune, fit and evaluate a decision tree regressor on the forest fire data.

    Reads the module-level ``trainingData``, ``testingData``, ``trainingArea``
    and ``testingArea``. Features and target are standardised with separate
    scalers fitted on the training split only; the test features are
    transformed with the training feature scaler and the predictions are
    mapped back to the original target scale before the metrics are computed.

    Previously a single scaler was refitted on the target after the features,
    the model predicted on unscaled test features, and the scaled-space
    predictions were scored against the unscaled test target.

    Returns
    -------
    tuple
        ``(RMSE, MAD, NLL)`` as computed by ``root_mean_squared_error``,
        ``mean_absolute_deviation`` and ``negative_log_likelihood`` on the
        test split.
    """
    #Initialize one scaler per quantity so neither overwrites the other's statistics
    featureScaler_DTR = StandardScaler()
    targetScaler_DTR = StandardScaler()

    #Define parameter grid for decision tree regressor (max_depth 1..21)
    parameterGrid_DTR = {'decisiontreeregressor__max_depth': list(range(1, 22)),
                         'decisiontreeregressor__min_samples_leaf': [1, 5, 10, 20, 50, 100]}

    #Define scorer (RMSE is an error, so lower is better)
    scorerRMSE = make_scorer(root_mean_squared_error, greater_is_better = False)
    pipeTree = make_pipeline(tree.DecisionTreeRegressor(random_state = 1))

    #Scale the training split; the target is flattened to 1-D for the estimator
    scaledTrainingData = featureScaler_DTR.fit_transform(trainingData)
    scaledTrainingArea = targetScaler_DTR.fit_transform(
        np.asarray(trainingArea).reshape(-1, 1)).ravel()

    #Implement grid search cross validation using 10 folds
    DTR_grid = GridSearchCV(estimator = pipeTree, param_grid = parameterGrid_DTR, scoring = scorerRMSE, cv = 10)
    DTR_grid.fit(scaledTrainingData, scaledTrainingArea)

    #Predict on test features scaled with the training statistics, then undo
    #the target scaling so predictions are in the same units as testingArea
    scaledPredictions = DTR_grid.predict(featureScaler_DTR.transform(testingData))
    predictedValues = targetScaler_DTR.inverse_transform(
        scaledPredictions.reshape(-1, 1)).ravel()

    #Calculate Root Mean Squared Error
    DTR_RMSE = root_mean_squared_error(testingArea, predictedValues)

    #Calculate Mean Absolute Deviation
    DTR_MAD = mean_absolute_deviation(predictedValues)

    #Calculate negative log likelihood
    DTR_NLL = negative_log_likelihood(np.asarray(testingArea), predictedValues)

    return DTR_RMSE, DTR_MAD, DTR_NLL
    

In [None]:
#Implementing RANDOM FOREST REGRESSOR

def RandomForest_Regressor():
    """Tune, fit and evaluate a random forest regressor on the forest fire data.

    Reads the module-level ``trainingData``, ``testingData``, ``trainingArea``
    and ``testingArea``. Features and target are standardised with separate
    scalers fitted on the training split only; the test features are
    transformed with the training feature scaler and the predictions are
    mapped back to the original target scale before the metrics are computed.

    Previously a single scaler was refitted on the target after the features,
    the model predicted on unscaled test features, and the scaled-space
    predictions were scored against the unscaled test target.

    Returns
    -------
    tuple
        ``(RMSE, MAD, NLL)`` as computed by ``root_mean_squared_error``,
        ``mean_absolute_deviation`` and ``negative_log_likelihood`` on the
        test split.
    """
    #Initialize one scaler per quantity so neither overwrites the other's statistics
    featureScaler_RFR = StandardScaler()
    targetScaler_RFR = StandardScaler()

    #Define parameter grid for random forest regressor
    parameterGrid_RFR = {'max_depth': [5, 10, 15, 20, 50], 'max_leaf_nodes': [2, 5, 10],
                         'min_samples_leaf': [2, 5, 10], 'min_samples_split': [2, 5, 10]}

    #Define scorer (RMSE is an error, so lower is better)
    scorerRMSE = make_scorer(root_mean_squared_error, greater_is_better = False)

    #Scale the training split; the target is flattened to 1-D for the estimator
    scaledTrainingData = featureScaler_RFR.fit_transform(trainingData)
    scaledTrainingArea = targetScaler_RFR.fit_transform(
        np.asarray(trainingArea).reshape(-1, 1)).ravel()

    #Implement grid search cross validation using 10 folds.
    #The forest is seeded so repeated runs report the same metrics.
    RFR_grid = GridSearchCV(RandomForestRegressor(random_state = 1), param_grid = parameterGrid_RFR,
                            refit = True, verbose = 0, scoring = scorerRMSE, cv = 10)
    RFR_grid.fit(scaledTrainingData, scaledTrainingArea)

    #Predict on test features scaled with the training statistics, then undo
    #the target scaling so predictions are in the same units as testingArea
    scaledPredictions = RFR_grid.predict(featureScaler_RFR.transform(testingData))
    predictedValues = targetScaler_RFR.inverse_transform(
        scaledPredictions.reshape(-1, 1)).ravel()

    #Calculate Root Mean Squared Error
    RFR_RMSE = root_mean_squared_error(testingArea, predictedValues)

    #Calculate Mean Absolute Deviation
    RFR_MAD = mean_absolute_deviation(predictedValues)

    #Calculate negative log likelihood
    RFR_NLL = negative_log_likelihood(np.asarray(testingArea), predictedValues)

    return RFR_RMSE, RFR_MAD, RFR_NLL
    

In [None]:
# Banner and prompt for the interactive model picker
print('FOREST FIRES REGRESSION:\n\n')
print('\nPlease enter your choice and press enter \n')

# Read the menu entry typed by the user
makeSelection = input('1) Decision Tree Regressor\n2) Random Forest Regressor\n')

if makeSelection not in ('1', '2'):

    # Anything other than the two menu entries is rejected
    print('Invalid Choice')

elif makeSelection == '2':

    # Random Forest Regressor: run the model and report its three metrics
    print('Random Forest Regressor:')
    RFR_RMSE, RFR_MAD, RFR_NLL = RandomForest_Regressor()

    print('Root Mean Squared Error for Random Forest Regressor: ', RFR_RMSE)
    print('Mean Absolute Deviation for Random Forest Regressor: ', RFR_MAD)
    print('Negative Log Likelihood for Random Forest Regressor: ', RFR_NLL)

else:

    # Decision Tree Regressor: run the model and report its three metrics
    print('Decision Tree Regressor:')
    DTR_RMSE, DTR_MAD, DTR_NLL = DecisionTree_Regressor()

    print('Root Mean Squared Error for Decision Tree Regressor: ', DTR_RMSE)
    print('Mean Absolute Deviation for Decision Tree Regressor: ', DTR_MAD)
    print('Negative Log Likelihood for Decision Tree Regressor: ', DTR_NLL)