In [2]:
# https://github.com/matoaster/CT4101-assignment1.git
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [3]:
# Training and testing data
training_data = pd.read_csv("wildfires_training.csv")
test_data = pd.read_csv("wildfires_test.csv")

In [12]:
# Split the data between the fire feature, the feature we want to predict 
# and the other features
X_training = training_data.drop(columns=['fire'])
y_training = training_data['fire']
X_testing = test_data.drop(columns=['fire'])
y_testing = test_data['fire']

# Tried to see if SVM's would work without first scaling data. Numerous sources say 
# is crucial and my accuracy was stuck at 0.56 without any preprocessing.

# Source: https://www.geeksforgeeks.org/machine-learning/how-to-make-better-models-in-python-using-svm-classifier-and-rbf-kernel/
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_training_scaled = scaler.fit_transform(X_training)
X_testing_scaled = scaler.fit_transform(X_testing)

from sklearn.svm import SVC

# Highest accuracy I can obtain with 'rbf' kernel = 0.82
model = SVC(kernel='rbf', C=100, gamma=0.001)
#Trying fitting with a linear kernel. Accuracy seems much higher @ 0.9.
model2 = SVC(kernel='linear', C=10, gamma=1)
model2.fit(X_training_scaled, y_training)
model.fit(X_training_scaled, y_training)

from sklearn.metrics import accuracy_score

y_pred = model.predict(X_testing_scaled)
y2_pred = model2.predict(X_testing_scaled)
accuracy = accuracy_score(y_testing, y_pred)
accuracy2 = accuracy_score(y_testing, y2_pred)
print("Accuracy: ", accuracy, "\n Accuracy: ", accuracy2)

# comparing classification reports with linear and rbf kernel.
classification_rep_svm1 = classification_report(y_testing, y_pred)
classification_rep_svm2 = classification_report(y_testing, y2_pred)

print(f'{classification_rep_svm1} \n')
print(f'{classification_rep_svm2} \n')

Accuracy:  0.82 
 Accuracy:  0.9
              precision    recall  f1-score   support

          no       0.76      0.86      0.81        22
         yes       0.88      0.79      0.83        28

    accuracy                           0.82        50
   macro avg       0.82      0.82      0.82        50
weighted avg       0.83      0.82      0.82        50
 

              precision    recall  f1-score   support

          no       0.95      0.82      0.88        22
         yes       0.87      0.96      0.92        28

    accuracy                           0.90        50
   macro avg       0.91      0.89      0.90        50
weighted avg       0.90      0.90      0.90        50
 



In [9]:
# Now try all different hyperparameters together for SVM.
# Test C and gamma on logarithmic scale

C_vals = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
gamma_vals = [0.0001, 0.001, 0.01, 0.1, 1]

for C in C_vals:
    for gamma in gamma_vals:
        model_1 = SVC(kernel='rbf', C=C, gamma=gamma)
        model_1.fit(X_training_scaled, y_training)
        accuracy_score = model_1.score(X_testing_scaled, y_testing)
        print(f"C={C}, gamma={gamma}, accuracy={accuracy_score:.3f}")
print('\n--------------------------------------')
for C in C_vals:
    for gamma in gamma_vals:
        model_2 = SVC(kernel='linear', C=C, gamma=gamma)
        model_2.fit(X_training_scaled, y_training)
        accuracy_score = model_2.score(X_testing_scaled, y_testing)
        print(f"C={C}, gamma={gamma}, accuracy={accuracy_score:.3f}")

# We can see from these results here that when the kernel=rbf, accuracy peaks at 0.88 with C = 1000, gamma=0.001
# and with kernel=linear, accuracy peaks at 0.900 with numerous different entries. Now it's time to plot them

C=0.001, gamma=0.0001, accuracy=0.560
C=0.001, gamma=0.001, accuracy=0.560
C=0.001, gamma=0.01, accuracy=0.560
C=0.001, gamma=0.1, accuracy=0.560
C=0.001, gamma=1, accuracy=0.560
C=0.01, gamma=0.0001, accuracy=0.560
C=0.01, gamma=0.001, accuracy=0.560
C=0.01, gamma=0.01, accuracy=0.560
C=0.01, gamma=0.1, accuracy=0.560
C=0.01, gamma=1, accuracy=0.560
C=0.1, gamma=0.0001, accuracy=0.560
C=0.1, gamma=0.001, accuracy=0.560
C=0.1, gamma=0.01, accuracy=0.740
C=0.1, gamma=0.1, accuracy=0.700
C=0.1, gamma=1, accuracy=0.560
C=1, gamma=0.0001, accuracy=0.560
C=1, gamma=0.001, accuracy=0.740
C=1, gamma=0.01, accuracy=0.740
C=1, gamma=0.1, accuracy=0.800
C=1, gamma=1, accuracy=0.640
C=10, gamma=0.0001, accuracy=0.740
C=10, gamma=0.001, accuracy=0.740
C=10, gamma=0.01, accuracy=0.820
C=10, gamma=0.1, accuracy=0.740
C=10, gamma=1, accuracy=0.620
C=100, gamma=0.0001, accuracy=0.740
C=100, gamma=0.001, accuracy=0.820
C=100, gamma=0.01, accuracy=0.820
C=100, gamma=0.1, accuracy=0.700
C=100, gamma=1, a

In [11]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Looking simple code to understand how Random Forest works from a programming point of view.
rf_classifier = RandomForestClassifier(n_estimators=10, max_depth=None, random_state=5)
rf_classifier.fit(X_training, y_training)
y_pred_rf = rf_classifier.predict(X_testing)

# From code I have seen online the procedure between SVM and random forest
# seems similar.
rf_accuracy = accuracy_score(y_testing, y_pred_rf)
classification_rep = classification_report(y_testing, y_pred_rf)

#i.e accuracy - classification report
print(f"Accuracy: {accuracy:.2f}")
print("\n Classification Report: \n", classification_rep)

# take a sample from dataset to predict.
sample = X_testing.iloc[4:5]
prediction = rf_classifier.predict(sample)

# TODO: need to figure out how to make random forest a better predictor. At the moment with this code 
# It sits at a 64% accuracy. Which is slightly better than guessing, but not by much.
sample_dict = sample.iloc[0].to_dict()
print(f"\nSample: {sample_dict}")

Accuracy: 0.82

 Classification Report: 
               precision    recall  f1-score   support

          no       0.76      1.00      0.86        22
         yes       1.00      0.75      0.86        28

    accuracy                           0.86        50
   macro avg       0.88      0.88      0.86        50
weighted avg       0.89      0.86      0.86        50


Sample: {'year': 2017.0, 'temp': 31.0, 'humidity': 72.0, 'rainfall': 0.3, 'drought_code': 30.47, 'buildup_index': 5.63, 'day': 7.0, 'month': 9.0, 'wind_speed': 17.0}

Predicted Fire: No Fire
