In [19]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, RobustScaler
from sklearn.impute import KNNImputer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR


In [20]:
df = pd.read_csv('../../wildfire_prediction/data/merged_file.csv', index_col=0)

In [21]:
len(df)

40971

In [22]:
df.drop(columns=['max() Precipitation', 
                 'max() RelativeHumidity',
                 'max() SoilWaterContent',
                 'max() SolarRadiation',
                 'max() WindSpeed',
                 'mean() SoilWaterContent',
                 'min() Precipitation', 
                 'min() RelativeHumidity',
                 'min() SoilWaterContent',
                 'min() SolarRadiation',
                 'min() WindSpeed',
                 'variance() Precipitation', 
                 'variance() RelativeHumidity',
                 'variance() SoilWaterContent',
                 'variance() SolarRadiation',
                 'variance() WindSpeed',
                 'variance() Temperature',
                 'Mean_estimated_fire_brightness',
                 'Mean_estimated_fire_radiative_power',
                 'Year',
                 'Month',
                 'Day'
                 ], inplace=True)

In [23]:
#Replacing na by 0
df.Estimated_fire_area.fillna(0, inplace=True)
# df.Mean_estimated_fire_brightness.fillna(0, inplace=True)
# df.Mean_estimated_fire_radiative_power.fillna(0, inplace=True)

#Solar Radiation to UV
df['mean() SolarRadiation'] = df['mean() SolarRadiation']*0.1
#Defining features and target
X = df.drop(columns=['Estimated_fire_area', 'Date_x'])
y = df.Estimated_fire_area


#Region Encoder
encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')
encoded = encoder.fit_transform(X[['Region']])
encoder.categories_
for i, col in enumerate(encoder.categories_[0]):
    X[col] = pd.DataFrame(encoded)[i]
X.drop(columns='Region', inplace=True)

#Imputing null values
numerical_features = X.dtypes[(df.dtypes == 'float64') | (X.dtypes == 'int64')].index
imputer = KNNImputer()
values = imputer.fit_transform(X[X.dtypes[(df.dtypes == 'float64') | (X.dtypes == 'int64')].index])
X[X.dtypes[(df.dtypes == 'float64') | (X.dtypes == 'int64')].index] = values

#Scaler
scaler = RobustScaler()
scaled = scaler.fit_transform(X[X.dtypes[(df.dtypes == 'float64') | (X.dtypes == 'int64')].index])
X[X.dtypes[(df.dtypes == 'float64') | (X.dtypes == 'int64')].index] = scaled

#Train Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.3, random_state=0)

In [24]:
svr = SVR()
grid_search = GridSearchCV(
    [svr,d_tree], 
    param_grid={
        'kernel': ['linear', 'poly', 'rbf'],
#         'coef0' : [0.001, 0.01, 0.1],
#         'C': [200, 300, 500 , 900],
        'epsilon': [0.00001, 0.0001, 0.01]},
    n_jobs=-1,
    cv=5,
    scoring="r2")

grid_search.fit(X_train, y_train)
grid_search.best_params_

NameError: name 'd_tree' is not defined

In [None]:
svr_model = grid_search.best_estimator_
svr_model.fit(X_train, y_train)
print(svr_model.score(X_test, y_test))
y_pred = svr_model.predict(X_test)
print(np.sqrt((y_test-y_pred)**2).mean())
cross_val_score(svr_model, X_test, y_test).mean()

In [None]:
d_tree = DecisionTreeRegressor()
d_tree.fit(X_train, y_train)
print(d_tree.score(X_test, y_test))
y_pred = d_tree.predict(X_test)
print('rmse', np.sqrt((y_test-y_pred)**2).mean())
cross_val_score(d_tree, X_test, y_test).mean()

In [None]:
lasso = Lasso()
lasso.fit(X_train, y_train)
print(lasso.score(X_test, y_test))
y_pred = lasso.predict(X_test)
print('rmse', np.sqrt((y_test-y_pred)**2).mean())
cross_val_score(lasso, X_test, y_test).mean()

In [None]:
ridge = Ridge()
ridge.fit(X_train, y_train)
print(ridge.score(X_test, y_test))
y_pred = ridge.predict(X_test)
print('rmse', np.sqrt((y_test-y_pred)**2).mean())
cross_val_score(ridge, X_test, y_test).mean()

In [None]:
X_train.info()

In [25]:
rdf = RandomForestRegressor()
grid_search = GridSearchCV(
    rdf, 
    param_grid={
        'n_estimators' : [60, 100, 200],
        'max_features' : ["auto", "sqrt", "log2"],
        'criterion' : ["mae"]},
    n_jobs=-1,
    cv=5,
    scoring="r2")

grid_search.fit(X_train, y_train)
grid_search.best_params_

{'criterion': 'mae', 'max_features': 'auto', 'n_estimators': 200}

In [None]:
rdf = grid_search.best_estimator_
rdf.fit(X_train, y_train)
print(rdf.score(X_test, y_test))
y_pred = rdf.predict(X_test)
print(np.sqrt((y_test-y_pred)**2).mean())
cross_val_score(rdf, X_test, y_test).mean()

In [18]:
rdf = RandomForestRegressor(criterion= 'mae', max_features= 'log2', n_estimators= 100)
rdf.fit(X_train, y_train)

print(rdf.score(X_test, y_test))
y_pred = rdf.predict(X_test)
print(np.sqrt((y_test-y_pred)**2).mean())
print(cross_val_score(rdf, X_test, y_test).mean())

0.5552042984865755
60.07776647304248
0.47943986231394015
