In [1]:
import importlib

import copy
from tabulate import tabulate
import random

import mysklearn.myutils
importlib.reload(mysklearn.myutils)
import mysklearn.myutils as myutils

import mysklearn.mypytable
importlib.reload(mysklearn.mypytable)
from mysklearn.mypytable import MyPyTable 

import mysklearn.myclassifiers
importlib.reload(mysklearn.myclassifiers)
from mysklearn.myclassifiers import MyKNeighborsClassifier, MySimpleLinearRegressor, MyNaiveBayesClassifier, MyDecisionTreeClassifier, MyRandomForestClassifier

import mysklearn.myevaluation
importlib.reload(mysklearn.myevaluation)
import mysklearn.myevaluation as myevaluation

In [2]:
wildfire_fname = os.path.join("data","clean_fire_data.csv")
wildfire_table = MyPyTable()
wildfire_table.load_from_file(wildfire_fname)

fire_date = wildfire_table.get_column("date")
county = wildfire_table.get_column("county")
acres = wildfire_table.get_column("acres")
cause = wildfire_table.get_column("cause")
lat = wildfire_table.get_column("lat")
lng = wildfire_table.get_column("lon")
binlat = wildfire_table.get_column("binlat")
binlon = wildfire_table.get_column("binlon")
binacres = wildfire_table.get_column("binacres")

wildfire_X = [[fire_date[i], county[i], cause[i], binlat[i], binlon[i]] for i in range(len(fire_date))]
wildfire_y = [x for x in binacres]





In [3]:
print(len(wildfire_X))
print(set(fire_date))
print(set(county))
print(set(cause))
print(set(binlat))
print(set(binlon))
print(set(binacres))

3737
{'Mar', 'Sep', 'Jan', 'Oct', 'May', 'Jul', 'Feb', 'Dec', 'Jun', 'Apr', 'Nov', 'Aug'}
{'Okanogan', 'Yakima', 'No Data', 'Jefferson', 'Snohomish', 'Wahkiakum', 'Grays Harbor', 'Garfield', 'Stevens', 'Benton', 'Mason', 'Columbia', 'Spokane', 'Klickitat', 'Lincoln', 'Island', 'Whitman', 'Ferry', 'Thurston', 'Lewis', 'Cowlitz', 'Chelan', 'Walla Walla', 'Kittitas', 'Pacific', 'Franklin', 'Pierce', 'Kitsap', 'Skamania', 'King', 'Clark', 'Adams', 'Douglas', 'San Juan', 'Whatcom', 'Grant', 'Clallam', 'Pend Oreille', 'Skagit', 'Asotin'}
{'Lightning', 'Debris Burn', 'Railroad', 'Smoker', 'Under Invest', 'Undetermined', 'Recreation', 'Logging', 'Miscellaneou', 'None', 'Children', 'Arson'}
{1.0, 2.0, 3.0, 4.0}
{1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0}
{2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0}


## Finding the best tree

The below code tries to find a better 'best tree' than previously found
so far the best tree that we have found is 

F: 1 N: 5 M: 4 Accuracy: 0.42

In [8]:
X_train, X_test, y_train, y_test = myevaluation.train_test_split(copy.deepcopy(wildfire_X), copy.deepcopy(wildfire_y), test_size=1000)

best_trees = []
max_accuracy = .4

for i in range(20):
    F = random.randint(1,5)
    N = random.randint(1,30)
    M = random.randint(1,N)

    rf = MyRandomForestClassifier()
    rf.fit(wildfire_X, wildfire_y, F=F, N=N, M=M)

    predictions = []
    for i, x in enumerate(X_test):
        prediction = rf.predict([x])
        # print(prediction, y_test[i])
        # print(prediction)
        predictions.append(int(prediction[0] == y_test[i]))
    if sum(predictions)/len(predictions) > max_accuracy:
        print("F:", F, "N:", N, "M:", M, "Accuracy:", sum(predictions)/len(predictions))
        f = open("best_tree.txt", "w")
        f.write(str(rf.trees))
        f.close()
        max_accuracy = sum(predictions)/len(predictions)
        best_trees = rf.trees

print("done")

F: 5 N: 15 M: 12 Accuracy: 0.414
done


## Testing of best tree

next we test the best tree with some random samples

In [5]:
test_size = 1000

X_train, X_test, y_train, y_test = myevaluation.train_test_split(copy.deepcopy(wildfire_X), copy.deepcopy(wildfire_y), test_size=test_size, shuffle=True)

rf = MyRandomForestClassifier()
rf.trees = copy.deepcopy(best_trees)
predictions = []
for i, x in enumerate(X_test):
    sys.stdout.write("\r" + str(i) + "/" + str(len(X_test) -1) + "    ")
    sys.stdout.flush()
    prediction = rf.predict([x])
    predictions.append(prediction[0])

print()
acc = round(sum([int(x==y) for x,y in zip(predictions, y_test)])/len(predictions), 2)
print("Random Forest: accuracy = " + str(acc) + " error rate = " + str(1-acc))


999/999    
Random Forest: accuracy = 0.42 error rate = 0.5800000000000001


## KNN
The code below tests intstances of the dataset using the the knn classifier

In [6]:
test_size = 1000

X_train, X_test, y_train, y_test = myevaluation.train_test_split(copy.deepcopy(wildfire_X), copy.deepcopy(wildfire_y), test_size=test_size, shuffle=True)

nb = MyKNeighborsClassifier()
nb.fit(X_train, y_train)
predictions = []
for i, x in enumerate(X_test):
    sys.stdout.write("\r" + str(i) + "/" + str(len(X_test) -1) + "    ")
    sys.stdout.flush()
    prediction = nb.predict([x])
    predictions.append(prediction[0])

print()
acc = round(sum([int(x==y) for x,y in zip(predictions, y_test)])/len(predictions), 2)
print("KNN: accuracy = " + str(acc) + " error rate = " + str(1-acc))

999/999    
KNN: accuracy = 0.34 error rate = 0.6599999999999999


## Naive Bayes

The code below tests the acuracy of the dataset using the Naive Bayes classifier

In [7]:
test_size = 1000

X_train, X_test, y_train, y_test = myevaluation.train_test_split(copy.deepcopy(wildfire_X), copy.deepcopy(wildfire_y), test_size=test_size, shuffle=True)

nb = MyNaiveBayesClassifier()
nb.fit(X_train, y_train)
predictions = []
for i, x in enumerate(X_test):
    sys.stdout.write("\r" + str(i) + "/" + str(len(X_test) -1) + "    ")
    sys.stdout.flush()
    prediction = nb.predict([x])
    predictions.append(prediction[0])

print()
acc = round(sum([int(x==y) for x,y in zip(predictions, y_test)])/len(predictions), 2)
print("Naive Bayes: accuracy = " + str(acc) + " error rate = " + str(1-acc))

999/999    
Naive Bayes: accuracy = 0.37 error rate = 0.63
