# Predicting the burned area of forest fires

In [None]:
import numpy as np
import pandas as pd

from sklearn import linear_model
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

import matplotlib.pyplot as plt
%matplotlib inline

### Reading the dataset

In [None]:
# Source: UCI Machine Learning Repository, "Forest Fires" dataset
# (https://archive.ics.uci.edu/ml/datasets/forest+fires).
# The file is comma-separated, which is pandas' default delimiter.
forest_df = pd.read_csv('./data/forestfires.csv')

In [None]:
# Dimensions of the loaded table as (rows, columns).
forest_df.shape

In [None]:
# Data type pandas inferred for each column when parsing the CSV.
forest_df.dtypes

In [None]:
# Preview the first rows of the table (head() shows 5 rows by default).
forest_df.head()

### Analyzing the target attribute

In [None]:
# Summary statistics for the target column. The 95th and 99th percentiles are
# requested in addition to the quartiles to show how the upper tail behaves.
forest_df['area'].describe(percentiles = [.25, .5, .75, .95, .99])

In [None]:
# Horizontal box plot of the raw `area` values on a wide, short canvas,
# used to eyeball extreme values before any filtering.
fig, ax = plt.subplots(figsize=(28, 3))
ax.boxplot(forest_df['area'], vert=False)
plt.show()

In [None]:
# Treat rows whose `area` exceeds the 99th percentile as outliers and drop them.
# NOTE(review): the threshold is computed on the full dataset, i.e. before the
# train/test split below, so test rows influence it — confirm this is acceptable.
area_cap = forest_df['area'].quantile(0.99)
within_cap = forest_df['area'] <= area_cap
forest_no_outliers_df = forest_df.loc[within_cap]

In [None]:
# (rows, columns) after the outlier filter; compare with forest_df.shape to see
# how many rows were removed.
forest_no_outliers_df.shape

In [None]:
# Same horizontal box plot as for the raw data, now on the filtered frame, to
# check how the distribution of `area` looks once the top 1% is removed.
fig, ax = plt.subplots(figsize=(28, 3))
ax.boxplot(forest_no_outliers_df['area'], vert=False)
plt.show()

### Splitting train and test datasets

In [None]:
# Columns used as model inputs; every other column of the frame is ignored.
# NOTE(review): OneHotEncoder is imported at the top of the notebook but never
# used — if the CSV has categorical columns, confirm that leaving them out is
# intentional.
feature_columns = [
    'X', 'Y', 'FFMC', 'DMC', 'DC', 'ISI', 'temp', 'RH', 'wind', 'rain',
]
X = forest_no_outliers_df[feature_columns]

In [None]:
# Regression target: the `area` column of the outlier-filtered frame.
Y = forest_no_outliers_df['area']

In [None]:
# Hold out 30% of the rows for evaluation and train on the remaining 70%.
# A fixed random_state makes the shuffle deterministic, so the split (and
# every metric computed below) is reproducible when the notebook is re-run.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.3, random_state=42
)

In [None]:
# (rows, columns) of the training features.
X_train.shape

In [None]:
# (rows, columns) of the held-out test features.
X_test.shape

### Training the model

In [None]:
# Ordinary least-squares linear regression with scikit-learn's default settings.
regr = linear_model.LinearRegression()

In [None]:
# Fit on the training split only; the test split stays unseen for evaluation.
regr.fit(X_train, Y_train)

### Evaluating the model

In [None]:
# Predicted `area` for each row of the held-out test features.
predictions = regr.predict(X_test)

In [None]:
# Distribution of the residuals (actual minus predicted) on the test split,
# drawn as a horizontal box plot.
residuals = Y_test - predictions
fig, ax = plt.subplots(figsize=(28, 3))
ax.boxplot(residuals, vert=False)
plt.show()

In [None]:
# MAE: Mean Absolute Error — the average absolute difference between actual and
# predicted values, expressed in the same units as the target.
mean_absolute_error(Y_test, predictions)

In [None]:
# RMSE: Root Mean Squared Error — the square root of the MSE, which brings the
# error back to the target's units and penalizes large misses more than MAE does.
np.sqrt(mean_squared_error(Y_test, predictions))

In [None]:
# R2: Coefficient of determination — 1.0 is a perfect fit, 0.0 is no better than
# always predicting the mean of Y_test, and negative values are worse than that.
r2_score(Y_test, predictions)