# Pipeline

In [128]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import KNNImputer
from tensorflow.keras.models import Sequential
from tensorflow.keras import layers
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor


from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense ,Dropout

from sklearn import set_config

set_config(display='diagram')

In [120]:
#Import data as df
df = pd.read_csv('../wildfire_prediction/data/merged_file.csv', index_col=0)

In [121]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 40971 entries, 0 to 40970
Data columns (total 49 columns):
 #   Column                                                    Non-Null Count  Dtype  
---  ------                                                    --------------  -----  
 0   Date_x                                                    40971 non-null  object 
 1   Region                                                    40971 non-null  object 
 2   count()[unit: km^2]                                       40971 non-null  float64
 3   max() Precipitation                                       40964 non-null  float64
 4   max() RelativeHumidity                                    40929 non-null  float64
 5   max() SoilWaterContent                                    40971 non-null  float64
 6   max() SolarRadiation                                      40957 non-null  float64
 7   max() Temperature                                         40957 non-null  float64
 8   max() WindSpeed 

In [122]:
df.isna().sum()

Date_x                                                          0
Region                                                          0
count()[unit: km^2]                                             0
max() Precipitation                                             7
max() RelativeHumidity                                         42
max() SoilWaterContent                                          0
max() SolarRadiation                                           14
max() Temperature                                              14
max() WindSpeed                                                28
mean() Precipitation                                            7
mean() RelativeHumidity                                        42
mean() SoilWaterContent                                         0
mean() SolarRadiation                                          14
mean() Temperature                                             14
mean() WindSpeed                                               28
min() Prec

In [123]:
df.Estimated_fire_area.fillna(0, inplace=True)
df.Mean_estimated_fire_brightness.fillna(0, inplace=True)
df.Mean_estimated_fire_radiative_power.fillna(0, inplace=True)

In [124]:
df.shape

(40971, 49)

In [109]:
#KNN imputer
imputer = KNNImputer()

# model = Sequential()
# model.add(layers.SimpleRNN(units=2, activation='tanh'))
# model.add(layers.Dense(1, activation="linear"))

# # The compilation
# model.compile(loss='mse', 
#               optimizer='rmsprop')

model = KNeighborsRegressor()

In [110]:
X = df.drop(columns=['Estimated_fire_area', 'Date_x'])
y = df.Estimated_fire_area

X_train, X_test, y_train, y_test = train_test_split(X,y)

In [111]:
X.dtypes[(df.dtypes == 'float64') | (X.dtypes == 'int64')].index

Index(['count()[unit: km^2]', 'max() Precipitation', 'max() RelativeHumidity',
       'max() SoilWaterContent', 'max() SolarRadiation', 'max() Temperature',
       'max() WindSpeed', 'mean() Precipitation', 'mean() RelativeHumidity',
       'mean() SoilWaterContent', 'mean() SolarRadiation',
       'mean() Temperature', 'mean() WindSpeed', 'min() Precipitation',
       'min() RelativeHumidity', 'min() SoilWaterContent',
       'min() SolarRadiation', 'min() Temperature', 'min() WindSpeed',
       'variance() Precipitation', 'variance() RelativeHumidity',
       'variance() SoilWaterContent', 'variance() SolarRadiation',
       'variance() Temperature', 'variance() WindSpeed', 'Year', 'Month',
       'Day', 'Mean_estimated_fire_brightness',
       'Mean_estimated_fire_radiative_power', 'Vegetation_index_mean',
       'Vegetation_index_variance', 'Shrubs', 'Herbaceous vegetation',
       'Cultivated and managed vegetation/agriculture (cropland)',
       'Urban / built up', 'Bare / sparse

In [112]:
numeric_features = X.dtypes[(X.dtypes == 'float64') | (X.dtypes == 'int64')].index
categorical_features = X.dtypes[(X.dtypes != 'float64') | (X.dtypes != 'int64')].index


numeric_transformer = Pipeline(steps=[
    ('imputer', KNNImputer()),
    ('scaler', StandardScaler())])

categorical_transformer = OneHotEncoder(handle_unknown='ignore')

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ],
        remainder='passthrough'
        )



pipe = Pipeline([
    ('pre', preprocessor),
    ('model', model),
])

In [113]:
pipe.fit(X_train, y_train)

In [114]:
pipe.score(X_test, y_test)

0.6835340693756753

In [116]:
mse = np.square(np.subtract(y_test,pipe.predict(X_test))).mean()

In [117]:
import math 

rmse = math.sqrt(mse)

In [118]:
rmse

145.02813114989223