In [1]:
import importlib

import copy
from tabulate import tabulate
import random

import mysklearn.myutils
importlib.reload(mysklearn.myutils)
import mysklearn.myutils as myutils

import mysklearn.mypytable
importlib.reload(mysklearn.mypytable)
from mysklearn.mypytable import MyPyTable 

import mysklearn.myclassifiers
importlib.reload(mysklearn.myclassifiers)
from mysklearn.myclassifiers import MyKNeighborsClassifier, MySimpleLinearRegressor, MyNaiveBayesClassifier, MyDecisionTreeClassifier, MyRandomForestClassifier

import mysklearn.myevaluation
importlib.reload(mysklearn.myevaluation)
import mysklearn.myevaluation as myevaluation

In [2]:
wildfire_fname = os.path.join("data","clean_fire_data.csv")
wildfire_table = MyPyTable()
wildfire_table.load_from_file(wildfire_fname)

fire_date = wildfire_table.get_column("date")
county = wildfire_table.get_column("county")
acres = wildfire_table.get_column("acres")
cause = wildfire_table.get_column("cause")
lat = wildfire_table.get_column("lat")
lng = wildfire_table.get_column("lon")
binlat = wildfire_table.get_column("binlat")
binlon = wildfire_table.get_column("binlon")
binacres = wildfire_table.get_column("binacres")

wildfire_X = [[fire_date[i], county[i], cause[i], binlat[i], binlon[i]] for i in range(len(fire_date))]
wildfire_y = [x for x in binacres]





In [3]:
print(len(wildfire_X))
print(set(fire_date))
print(set(county))
print(set(cause))
print(set(binlat))
print(set(binlon))
print(set(binacres))

3737
{'Aug', 'May', 'Dec', 'Sep', 'Jul', 'Jan', 'Nov', 'Oct', 'Jun', 'Apr', 'Feb', 'Mar'}
{'No Data', 'Cowlitz', 'Walla Walla', 'Franklin', 'Columbia', 'Spokane', 'King', 'Clallam', 'Ferry', 'Klickitat', 'Wahkiakum', 'Lincoln', 'Whatcom', 'Grays Harbor', 'Yakima', 'Adams', 'Chelan', 'Kittitas', 'Mason', 'Island', 'Asotin', 'Lewis', 'Skamania', 'Benton', 'Skagit', 'Pend Oreille', 'Whitman', 'Pierce', 'Pacific', 'Okanogan', 'Clark', 'Garfield', 'Grant', 'San Juan', 'Douglas', 'Kitsap', 'Stevens', 'Thurston', 'Snohomish', 'Jefferson'}
{'Lightning', 'Undetermined', 'Miscellaneou', 'Debris Burn', 'Recreation', 'None', 'Smoker', 'Under Invest', 'Logging', 'Arson', 'Railroad', 'Children'}
{1.0, 2.0, 3.0, 4.0}
{1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0}
{2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0}


## Finding the best tree

The below code tries to find a better 'best tree' than previously found
so far the best tree that we have found is 

F: 1 N: 5 M: 4 Accuracy: 0.433

In [4]:
X_train, X_test, y_train, y_test = myevaluation.train_test_split(copy.deepcopy(wildfire_X), copy.deepcopy(wildfire_y), test_size=1000)

best_trees = []
max_accuracy = .433

for i in range(20):
    F = random.randint(1,5)
    N = random.randint(1,30)
    M = random.randint(1,N)

    rf = MyRandomForestClassifier()
    rf.fit(wildfire_X, wildfire_y, F=1, N=5, M=4)

    predictions = []
    for i, x in enumerate(X_test):
        prediction = rf.predict([x])
        # print(prediction, y_test[i])
        # print(prediction)
        predictions.append(int(prediction[0] == y_test[i]))
    if sum(predictions)/len(predictions) > max_accuracy:
        print("F:", F, "N:", N, "M:", M, "Accuracy:", sum(predictions)/len(predictions))
        f = open("best_tree.txt", "w")
        f.write(str(rf.trees))
        f.close()
        max_accuracy = sum(predictions)/len(predictions)
        best_trees = rf.trees

print("done")

done


## Testing of best tree

next we test the best tree with some random samples

In [5]:
import ast

if best_trees == []:
    with open("best_tree.txt", "r") as data:
        best_trees = ast.literal_eval(data.read())

In [6]:
test_size = 1000

X_train, X_test, y_train, y_test = myevaluation.train_test_split(copy.deepcopy(wildfire_X), copy.deepcopy(wildfire_y), test_size=test_size, shuffle=True)

rf = MyRandomForestClassifier()
rf.trees = copy.deepcopy(best_trees)
predictions = []
for i, x in enumerate(X_test):
    sys.stdout.write("\r" + str(i) + "/" + str(len(X_test) -1) + "    ")
    sys.stdout.flush()
    prediction = rf.predict([x])
    predictions.append(prediction[0])

print()
acc = round(sum([int(x==y) for x,y in zip(predictions, y_test)])/len(predictions), 2)
print("Random Forest: accuracy = " + str(acc) + " error rate = " + str(1-acc))


999/999    
Random Forest: accuracy = 0.42 error rate = 0.5800000000000001


In [7]:
headers = ["acres", "1.0", "2.0", "3.0", "4.0", "5.0", "6.0", "7.0", "8.0", "9.0", "total", "recognition %"]
mat = myevaluation.confusion_matrix(predictions, y_test, [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0])
myutils.build_confusion_matrix(mat)
print("Random Forests Results")
print(tabulate(mat, headers))

Random Forests Results
  acres    1.0    2.0    3.0    4.0    5.0    6.0    7.0    8.0    9.0    total    recognition %
-------  -----  -----  -----  -----  -----  -----  -----  -----  -----  -------  ---------------
      1      0      0      0      0      0      0      0      0      0        0             0
      2      0     83     59      9     12      4      2      1      0      170            48.82
      3      0    200    319     85    111     37     13     10      7      782            40.79
      4      0      2      6      5      2      1      2      0      0       18            27.78
      5      0      2      5      1      4      0      2      0      0       14            28.57
      6      0      0      4      2      2      3      0      0      2       13            23.08
      7      0      0      0      0      0      1      2      0      0        3            66.67
      8      0      0      0      0      0      0      0      0      0        0             0
      9      

## KNN
The code below tests intstances of the dataset using the the knn classifier

In [8]:
test_size = 1000

X_train, X_test, y_train, y_test = myevaluation.train_test_split(copy.deepcopy(wildfire_X), copy.deepcopy(wildfire_y), test_size=test_size, shuffle=True)

nb = MyKNeighborsClassifier()
nb.fit(X_train, y_train)
predictions = []
for i, x in enumerate(X_test):
    sys.stdout.write("\r" + str(i) + "/" + str(len(X_test) -1) + "    ")
    sys.stdout.flush()
    prediction = nb.predict([x])
    predictions.append(prediction[0])

print()
acc = round(sum([int(x==y) for x,y in zip(predictions, y_test)])/len(predictions), 2)
print("KNN: accuracy = " + str(acc) + " error rate = " + str(1-acc))

999/999    
KNN: accuracy = 0.31 error rate = 0.69


In [9]:
headers = ["acres", "1.0", "2.0", "3.0", "4.0", "5.0", "6.0", "7.0", "8.0", "9.0", "total", "recognition %"]
mat = myevaluation.confusion_matrix(predictions, y_test, [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0])
myutils.build_confusion_matrix(mat)
print("KNN Results")
print(tabulate(mat, headers))

KNN Results
  acres    1.0    2.0    3.0    4.0    5.0    6.0    7.0    8.0    9.0    total    recognition %
-------  -----  -----  -----  -----  -----  -----  -----  -----  -----  -------  ---------------
      1      0      0      0      0      0      0      0      0      0        0             0
      2      0    104    128     22     34     14      8      3      2      315            33.02
      3      0    137    171     44     40     19      5      3      1      420            40.71
      4      0     21     22      5      3      2      0      0      0       53             9.43
      5      0     26     47     12     21      4      3      1      0      114            18.42
      6      0     10     23      7     17      8      2      2      0       69            11.59
      7      0      3      6      2      6      4      2      0      0       23             8.7
      8      0      0      0      0      1      0      0      0      0        1             0
      9      0      0    

## Naive Bayes

The code below tests the acuracy of the dataset using the Naive Bayes classifier

In [10]:
test_size = 1000

X_train, X_test, y_train, y_test = myevaluation.train_test_split(copy.deepcopy(wildfire_X), copy.deepcopy(wildfire_y), test_size=test_size, shuffle=True)

nb = MyNaiveBayesClassifier()
nb.fit(X_train, y_train)
predictions = []
for i, x in enumerate(X_test):
    sys.stdout.write("\r" + str(i) + "/" + str(len(X_test) -1) + "    ")
    sys.stdout.flush()
    prediction = nb.predict([x])
    predictions.append(prediction[0])

print()
acc = round(sum([int(x==y) for x,y in zip(predictions, y_test)])/len(predictions), 2)
print("Naive Bayes: accuracy = " + str(acc) + " error rate = " + str(1-acc))

999/999    
Naive Bayes: accuracy = 0.36 error rate = 0.64


In [11]:
headers = ["acres", "1.0", "2.0", "3.0", "4.0", "5.0", "6.0", "7.0", "8.0", "9.0", "total", "recognition %"]
mat = myevaluation.confusion_matrix(predictions, y_test, [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0])
myutils.build_confusion_matrix(mat)
print("KNN Results")
print(tabulate(mat, headers))

KNN Results
  acres    1.0    2.0    3.0    4.0    5.0    6.0    7.0    8.0    9.0    total    recognition %
-------  -----  -----  -----  -----  -----  -----  -----  -----  -----  -------  ---------------
      1      0      0      0      0      0      0      0      0      0        0             0
      2      0    127    120     16     26      5      1      1      0      296            42.91
      3      0    140    200     49     64     21      6      6      0      486            41.15
      4      0      0      2      0      3      0      0      0      0        5             0
      5      0     22     34      9     22      7      8      4      2      108            20.37
      6      0     11     13      8      4      5      1      2      1       45            11.11
      7      0      2      5      1      1      2      2      0      0       13            15.38
      8      0      1      3      1      1      2      3      1      1       13             7.69
      9      0      6   