In [21]:
# https://github.com/matoaster/CT4101-assignment1.git
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score


In [22]:
# Training and testing data
training_data = pd.read_csv("wildfires_training.csv")
test_data = pd.read_csv("wildfires_test.csv")

In [23]:
# Split the data between the fire feature, the feature we want to predict 
# and the other features
X_training = training_data.drop(columns=['fire'])
y_training = training_data['fire']
X_testing = test_data.drop(columns=['fire'])
y_testing = test_data['fire']

# Scaling the data "standardizes" it 
# by removing the mean and scaling to the unit variance.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_training_scaled = scaler.fit_transform(X_training)
X_testing_scaled = scaler.transform(X_testing)
# Source: https://www.geeksforgeeks.org/machine-learning/how-to-make-better-models-in-python-using-svm-classifier-and-rbf-kernel/


In [32]:
# Now try all different hyperparameters together for SVM.
# Test C and gamma on logarithmic scale

C_vals = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
gamma_vals = [0.0001, 0.001, 0.01, 0.1, 1]

# create a d
SVM_results = []

for C in C_vals:
    for gamma in gamma_vals:
        model_1 = SVC(kernel='rbf', C=C, gamma=gamma)
        model_1.fit(X_training_scaled, y_training)
        testing_accuracy= model_1.score(X_testing_scaled, y_testing)
        training_accuracy= model_1.score(X_training_scaled, y_training)
        #print(f"C={C}, gamma={gamma}, testing accuracy={testing_accuracy:.3f}, training_accuracy ={training_accuracy:.3f}")
        SVM_results.append({
            "kernel":"rbf","C":C,"gamma":gamma,"training_accuracy":training_accuracy,"testing_accuracy":testing_accuracy})
        
for C in C_vals:
        model_2 = SVC(kernel='linear', C=C)
        model_2.fit(X_training_scaled, y_training)
        testing_accuracy = model_2.score(X_testing_scaled, y_testing)
        training_accuracy = model_2.score(X_training_scaled, y_training)
        #print(f"C={C}, testing accuracy={testing_accuracy:.3f}, training accuracy={training_accuracy:.3f}")
        SVM_results.append({
            "kernel":"linear","C":C,"gamma":np.nan,"training_accuracy":training_accuracy,"testing_accuracy":testing_accuracy})
# We can see from these results here that when the kernel=rbf, accuracy peaks at 0.88 with C = 1000, gamma=0.001
# and with kernel=linear, accuracy peaks at 0.900 with numerous different entries. Now it's time to plot them

# Save the results from SVM algorithms to a dataframe
SVM_df = pd.DataFrame(SVM_results)

# Pick out best results from rbf and linear kernels
best_test_result_rbf = SVM_df[SVM_df.kernel=="rbf"].sort_values("testing_accuracy", ascending = False).iloc[0]
best_train_result_rbf = SVM_df[SVM_df.kernel=="rbf"].sort_values("training_accuracy", ascending = False).iloc[0]

best_test_result_linear = SVM_df[SVM_df.kernel=="linear"].sort_values("testing_accuracy", ascending = False).iloc[0]
best_train_result_linear = SVM_df[SVM_df.kernel=="linear"].sort_values("training_accuracy", ascending = False).iloc[0]

# Print out best achieved results for each testing, training, linear rbf. sort in des
print(f"Best Testing RBF Results: C={best_test_result_rbf.C}, gamma={best_test_result_rbf.gamma}, accuracy={best_test_result_rbf.testing_accuracy}")
print(f"Best Training RBF Results: C={best_train_result_rbf.C}, gamma={best_train_result_rbf.gamma}, accuracy={best_test_result_rbf.training_accuracy}")
print("---------------------------------------")
print(f"Best Testing Linear Results: C={best_test_result_linear.C}, accuracy={best_test_result_linear.testing_accuracy}")
print(f"Best Training Linear Results: C={best_train_result_linear.C}, accuracy={best_test_result_linear.training_accuracy}")

Best Testing RBF Results: C=1000.0, gamma=0.001, accuracy=0.88
Best Training RBF Results: C=1000.0, gamma=0.1, accuracy=0.9155844155844156
---------------------------------------
Best Testing Linear Results: C=10.0, accuracy=0.9
Best Training Linear Results: C=100.0, accuracy=0.922077922077922


In [9]:


# Looking simple code to understand how Random Forest works from a programming point of view.
rf_classifier = RandomForestClassifier(n_estimators=10, max_depth=None, random_state=5)
rf_classifier.fit(X_training, y_training)
y_pred_rf = rf_classifier.predict(X_testing)

# From code I have seen online the procedure between SVM and random forest
# seems similar.
rf_accuracy = accuracy_score(y_testing, y_pred_rf)
classification_rep = classification_report(y_testing, y_pred_rf)

#i.e accuracy - classification report
print(f"Accuracy: {accuracy:.2f}")
print("\n Classification Report: \n", classification_rep)

# take a sample from dataset to predict.
sample = X_testing.iloc[4:5]
prediction = rf_classifier.predict(sample)

# TODO: need to figure out how to make random forest a better predictor. At the moment with this code 
# It sits at a 64% accuracy. Which is slightly better than guessing, but not by much.
sample_dict = sample.iloc[0].to_dict()
print(f"\nSample: {sample_dict}")

Accuracy: 0.82

 Classification Report: 
               precision    recall  f1-score   support

          no       0.76      1.00      0.86        22
         yes       1.00      0.75      0.86        28

    accuracy                           0.86        50
   macro avg       0.88      0.88      0.86        50
weighted avg       0.89      0.86      0.86        50


Sample: {'year': 2017.0, 'temp': 31.0, 'humidity': 72.0, 'rainfall': 0.3, 'drought_code': 30.47, 'buildup_index': 5.63, 'day': 7.0, 'month': 9.0, 'wind_speed': 17.0}
