In [16]:
# importing libraries

import numpy as np
import pandas as pd
import os
import pandas_tfrecords as pdtfr

# Modelling
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, ConfusionMatrixDisplay, classification_report
from sklearn.model_selection import RandomizedSearchCV, train_test_split, GridSearchCV
from scipy.stats import randint
from sklearn.metrics import roc_auc_score
from sklearn.neural_network import MLPClassifier


In [17]:
# checking filenames and setting dataset path
dataset_path = '../../datasets/next_day_wildfire_compiled/'

for subdirs, dirs, files in os.walk(dataset_path):
    for file in files:
      print(file)
    
df_train = pd.read_csv(dataset_path + "next_day_wildfire_train.csv")
df_test = pd.read_csv(dataset_path + "next_day_wildfire_test.csv")
df_eval = pd.read_csv(dataset_path + "next_day_wildfire_eval.csv")

df_train = df_train.loc[:, ~df_train.columns.str.contains('^Unnamed')]
df_test = df_test.loc[:, ~df_test.columns.str.contains('^Unnamed')]
df_eval = df_eval.loc[:, ~df_eval.columns.str.contains('^Unnamed')]

next_day_wildfire_test.csv
next_day_wildfire_eval.csv
next_day_wildfire_train.csv


In [18]:
display(df_train)

Unnamed: 0,FireMask,NDVI,PrevFireMask,elevation,erc,pdsi,population,pr,sph,th,tmmn,tmmx,vs
0,0.0,7056.0,0.0,428.0,73.261055,-3.394206,0.000000,0.000000,0.005614,324.948303,286.561035,305.208984,5.077084
1,0.0,2868.0,0.0,1716.0,102.291351,-0.996114,0.162685,0.000000,0.002494,226.580185,286.417297,307.309601,3.292861
2,0.0,5191.0,0.0,379.0,48.888779,-0.712426,3.007233,0.000000,0.004208,106.675331,276.637939,295.362274,3.990284
3,0.0,8238.0,0.0,93.0,29.164780,1.214143,0.999623,0.000000,0.009241,165.663513,285.633179,301.712311,4.095358
4,0.0,6424.0,0.0,690.0,84.601860,-2.687754,38.195526,0.000000,0.006435,136.512070,286.821198,300.153137,3.095662
...,...,...,...,...,...,...,...,...,...,...,...,...,...
14974,0.0,7603.0,0.0,15.0,20.404251,4.108630,0.593232,0.486105,0.013636,80.315552,291.995392,301.733337,2.712262
14975,0.0,3022.0,0.0,1533.0,108.958961,-2.842595,0.094746,0.000000,0.001563,121.595604,282.154327,307.580811,3.079950
14976,0.0,3361.0,0.0,2815.0,81.171783,-3.095725,0.239165,0.000067,0.000917,296.743988,265.049835,284.043213,5.425783
14977,0.0,8032.0,0.0,127.0,31.529081,2.172393,27.643627,0.000000,0.009275,193.973862,284.530640,303.075012,2.775929


In [19]:
# The column headings you provided are related to forest fire data. Here's a brief explanation of each column:

# FireMask: This column likely contains information about the presence or absence of a forest fire. It could be binary, indicating whether a fire is detected or not.

# NDVI (Normalized Difference Vegetation Index): NDVI is a measure of vegetation greenness or health. It is calculated from the difference between the near-infrared (NIR) and red reflectance of vegetation. Higher NDVI values generally indicate healthier vegetation.

# PrevFireMask (Previous FireMask): This column might contain information about the presence or absence of a forest fire in the previous time period. Similar to FireMask, it could be binary.

# Elevation: The elevation of the location. This could have an impact on fire behavior and spread.

# ERC (Energy Release Component): ERC is a measure of the potential energy released from a wildfire. It takes into account fuel moisture and is often used in fire danger rating systems.

# PDSI (Palmer Drought Severity Index): PDSI is a measure of long-term drought. It indicates whether an area is experiencing wetter or drier than normal conditions.

# Population: The population of the area. Human presence can influence fire risk and management strategies.

# PR (Precipitation): The amount of precipitation in the area. It is a crucial factor in determining fuel moisture and fire risk.

# SPH (Specific Humidity): Specific humidity is a measure of the water vapor content in the air.

# TH (Temperature/Humidity): This could represent a combination of temperature and humidity, both of which are important factors in fire behavior.

# TMMN (Minimum Temperature): The minimum temperature recorded in the area.

# TMMX (Maximum Temperature): The maximum temperature recorded in the area.

# VS (Wind Speed): The wind speed in the area. Wind speed is a critical factor in determining the spread of a forest fire.

# These variables collectively provide a comprehensive set of features that can be used to understand and model forest fire behavior. When using machine learning models like RandomForestClassifier, these features are used to predict the target variable, which in this case might be whether a forest fire is present or not.


In [30]:
y_train = df_train['FireMask']
X_train = df_train.drop(columns='FireMask')

# X_train_balanced = df_train.drop(df_train[df_train['FireMask'] == 1].index)
X_train_0, X_train_m1 = df_train[df_train['FireMask'] == 0], df_train[df_train['FireMask'] == -1]
X_train_0 = X_train_0.head(500)
X_train_balanced = pd.concat([X_train_0, X_train_m1], axis=0)
y_train_balanced = X_train_balanced['FireMask']
X_train_balanced = X_train_balanced.drop(columns='FireMask')

y_test = df_test['FireMask']
X_test = df_test.drop(columns='FireMask')

y_eval = df_eval['FireMask']
X_eval = df_eval.drop(columns='FireMask')

display(y_train)
display(X_train)
display(y_train.value_counts())
display(y_test.value_counts())
display(y_eval.value_counts())

# display(y_train_balanced.value_counts())
# display(len(X_train_0), len(X_train_m1))
display(len(X_train_balanced))

0        0.0
1        0.0
2        0.0
3        0.0
4        0.0
        ... 
14974    0.0
14975    0.0
14976    0.0
14977    0.0
14978    0.0
Name: FireMask, Length: 14979, dtype: float64

Unnamed: 0,NDVI,PrevFireMask,elevation,erc,pdsi,population,pr,sph,th,tmmn,tmmx,vs
0,7056.0,0.0,428.0,73.261055,-3.394206,0.000000,0.000000,0.005614,324.948303,286.561035,305.208984,5.077084
1,2868.0,0.0,1716.0,102.291351,-0.996114,0.162685,0.000000,0.002494,226.580185,286.417297,307.309601,3.292861
2,5191.0,0.0,379.0,48.888779,-0.712426,3.007233,0.000000,0.004208,106.675331,276.637939,295.362274,3.990284
3,8238.0,0.0,93.0,29.164780,1.214143,0.999623,0.000000,0.009241,165.663513,285.633179,301.712311,4.095358
4,6424.0,0.0,690.0,84.601860,-2.687754,38.195526,0.000000,0.006435,136.512070,286.821198,300.153137,3.095662
...,...,...,...,...,...,...,...,...,...,...,...,...
14974,7603.0,0.0,15.0,20.404251,4.108630,0.593232,0.486105,0.013636,80.315552,291.995392,301.733337,2.712262
14975,3022.0,0.0,1533.0,108.958961,-2.842595,0.094746,0.000000,0.001563,121.595604,282.154327,307.580811,3.079950
14976,3361.0,0.0,2815.0,81.171783,-3.095725,0.239165,0.000067,0.000917,296.743988,265.049835,284.043213,5.425783
14977,8032.0,0.0,127.0,31.529081,2.172393,27.643627,0.000000,0.009275,193.973862,284.530640,303.075012,2.775929


FireMask
 0.0    14505
-1.0      459
 1.0       15
Name: count, dtype: int64

FireMask
 0.0    1636
-1.0      53
Name: count, dtype: int64

FireMask
 0.0    1820
-1.0      55
 1.0       2
Name: count, dtype: int64

959

In [23]:
# MODEL

rf_classifier = RandomForestClassifier(random_state=42, n_estimators=100, max_depth=30)

# param_grid = {
#     'n_estimators': [50, 100, 150],
#     'max_depth': [None, 10, 20, 30],
#     'min_samples_split': [2, 5, 10],
#     'min_samples_leaf': [1, 2, 4]
# }

# Create a GridSearchCV instance
# grid_search = GridSearchCV(estimator=rf_classifier, param_grid=param_grid, cv=5, scoring='accuracy')

# Fit the GridSearchCV instance to the training data
# grid_search.fit(X_train, y_train)

rf_classifier.fit(X_train, y_train)

# Get the best parameters
# best_params = grid_search.best_params_
# print(f"Best Hyperparameters: {best_params}")

# Get the best model
# best_model = grid_search.best_estimator_

# Make predictions on the testing data using the best model

y_pred = rf_classifier.predict(X_test)

In [7]:
# Evaluate the best model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# You can also print other metrics such as classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

print(confusion_matrix(y_test, y_pred))

Accuracy: 0.97
Classification Report:
              precision    recall  f1-score   support

        -1.0       0.50      0.04      0.07        53
         0.0       0.97      1.00      0.98      1636

    accuracy                           0.97      1689
   macro avg       0.73      0.52      0.53      1689
weighted avg       0.95      0.97      0.96      1689

[[   2   51]
 [   2 1634]]


In [9]:
# Make predictions on the testing data
y_pred_proba = rf_classifier.predict_proba(X_test)[:, 1]  # Probability of the positive class

# Calculate the AUC score
auc_score = roc_auc_score(y_test, y_pred_proba)
print(f"AUC Score: {auc_score:.2f}")

AUC Score: 0.75


In [10]:
# Create a LogisticRegression model and train the model
logreg = LogisticRegression(random_state=42, multi_class='auto')
logreg.fit(X_train, y_train)

# Make predictions on the testing data
y_pred = logreg.predict(X_test)

# Calculate the confusion matrix
cm = confusion_matrix(y_test, y_pred)
print("Classification Report:")
print(classification_report(y_test, y_pred))
print(cm)
# Create a ConfusionMatrixDisplay object
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=logreg.classes_)

Classification Report:
              precision    recall  f1-score   support

        -1.0       0.00      0.00      0.00        53
         0.0       0.97      1.00      0.98      1636

    accuracy                           0.97      1689
   macro avg       0.48      0.50      0.49      1689
weighted avg       0.94      0.97      0.95      1689

[[   0   53]
 [   2 1634]]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [12]:
# Make predictions on the testing data
y_pred_proba = logreg.predict_proba(X_test)[:, 1]  # Probability of the positive class

# Calculate the AUC score
auc_score = roc_auc_score(y_test, y_pred_proba)
print(f"AUC Score: {auc_score:.2f}")

AUC Score: 0.68


In [13]:
# Create an MLPClassifier
mlp = MLPClassifier(hidden_layer_sizes=(100,), max_iter=10000, random_state=42)

# Train the classifier on the training data
mlp.fit(X_train, y_train)

# Make predictions on the testing data
y_pred = mlp.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Print the classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

cm = confusion_matrix(y_pred, y_test)
print(cm)

Accuracy: 0.97
Classification Report:
              precision    recall  f1-score   support

        -1.0       0.00      0.00      0.00        53
         0.0       0.97      1.00      0.98      1636

    accuracy                           0.97      1689
   macro avg       0.48      0.50      0.49      1689
weighted avg       0.94      0.97      0.95      1689

[[   0    2]
 [  53 1634]]


In [14]:
# Make predictions on the testing data
y_pred_proba = mlp.predict_proba(X_test)[:, 1]  # Probability of the positive class

# Calculate the AUC score
auc_score = roc_auc_score(y_test, y_pred_proba)
print(f"AUC Score: {auc_score:.2f}")

AUC Score: 0.61


In [32]:
# BALANCED MODEL

rf_classifier = RandomForestClassifier(random_state=42, n_estimators=100, max_depth=30)

# param_grid = {
#     'n_estimators': [50, 100, 150],
#     'max_depth': [None, 10, 20, 30],
#     'min_samples_split': [2, 5, 10],
#     'min_samples_leaf': [1, 2, 4]
# }

# Create a GridSearchCV instance
# grid_search = GridSearchCV(estimator=rf_classifier, param_grid=param_grid, cv=5, scoring='accuracy')

# Fit the GridSearchCV instance to the training data
# grid_search.fit(X_train, y_train)

rf_classifier.fit(X_train_balanced, y_train_balanced)

# Get the best parameters
# best_params = grid_search.best_params_
# print(f"Best Hyperparameters: {best_params}")

# Get the best model
# best_model = grid_search.best_estimator_

# Make predictions on the testing data using the best model

y_pred = rf_classifier.predict(X_test)

In [33]:
# Evaluate the best model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# You can also print other metrics such as classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

print("CONFUSION MATRIX\n", confusion_matrix(y_test, y_pred))

Accuracy: 0.60
Classification Report:
              precision    recall  f1-score   support

        -1.0       0.05      0.64      0.09        53
         0.0       0.98      0.60      0.74      1636

    accuracy                           0.60      1689
   macro avg       0.51      0.62      0.42      1689
weighted avg       0.95      0.60      0.72      1689

[[ 34  19]
 [661 975]]


In [34]:
# Create a LogisticRegression model and train the model
logreg = LogisticRegression(random_state=42, multi_class='auto', max_iter=10000)
logreg.fit(X_train_balanced, y_train_balanced)

# Make predictions on the testing data
y_pred = logreg.predict(X_test)

# Calculate the confusion matrix
cm = confusion_matrix(y_test, y_pred)
print("Classification Report:")
print(classification_report(y_test, y_pred))
print("CONFUSION MATRIX\n", cm)
# Create a ConfusionMatrixDisplay object
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=logreg.classes_)

Classification Report:
              precision    recall  f1-score   support

        -1.0       0.07      0.49      0.12        53
         0.0       0.98      0.77      0.86      1636

    accuracy                           0.76      1689
   macro avg       0.52      0.63      0.49      1689
weighted avg       0.95      0.76      0.84      1689

[[  26   27]
 [ 373 1263]]


In [35]:
# Create an MLPClassifier
mlp = MLPClassifier(hidden_layer_sizes=(100,), max_iter=10000, random_state=42)

# Train the classifier on the training data
mlp.fit(X_train_balanced, y_train_balanced)

# Make predictions on the testing data
y_pred = mlp.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Print the classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

cm = confusion_matrix(y_pred, y_test)
print("CONFUSION MATRIX\n", cm)

Accuracy: 0.69
Classification Report:
              precision    recall  f1-score   support

        -1.0       0.06      0.62      0.11        53
         0.0       0.98      0.70      0.81      1636

    accuracy                           0.69      1689
   macro avg       0.52      0.66      0.46      1689
weighted avg       0.95      0.69      0.79      1689

[[  33  498]
 [  20 1138]]
