In [11]:
# This tells matplotlib not to try opening a new window for each plot.
%matplotlib inline

# General libraries.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.colors import LogNorm

# SK-learn libraries for learning.
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.mixture import GaussianMixture
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier

# SK-learn libraries for evaluation.
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score

## Import Data

In [5]:
# Read cleaned_wildfire_data.csv (path is relative to the notebook's working
# directory) into a DataFrame, then show it as the cell output.
cleaned_df = pd.read_csv(filepath_or_buffer='cleaned_wildfire_data.csv')
cleaned_df

Unnamed: 0.1,Unnamed: 0,_id,Event,Latitude,Longitude,Size,log_sizes,size_category,temperature_14,temperature_13,...,windSpeed5,windSpeed6,windSpeed7,windSpeed8,windSpeed9,windSpeed10,windSpeed11,windSpeed12,windSpeed13,windSpeed14
0,0,"ObjectId(""5caeb38ca6e44a5b0027cc15"")",CA-CNF-000102,33.66810,-117.50139,100.0,4.605170,class_d,48.221250,52.325833,...,3.646667,2.427500,4.275833,1.949583,2.096250,1.935833,1.428750,2.269583,1.633333,1.618333
1,1,"ObjectId(""5caeb38ca6e44a5b0027cc16"")",CA-MVU-955,33.03330,-116.83389,25.0,3.218876,class_c,48.305833,46.069583,...,1.526667,1.640417,1.833333,5.261250,4.941250,1.846250,2.011250,7.476250,5.292083,2.992083
2,2,"ObjectId(""5caeb38ca6e44a5b0027cc1b"")",CA-BDU-2403,37.36830,-118.35833,840.0,6.733402,class_e,40.872500,41.335833,...,3.337500,2.846250,5.654583,4.394583,2.933750,2.271250,2.413333,2.819167,5.687500,6.950417
3,3,"ObjectId(""5caeb38ca6e44a5b0027cc27"")",CA-RRU-29527,33.86920,-117.68361,892.0,6.793466,class_e,55.987500,59.727083,...,5.224583,4.270417,2.530000,4.327917,5.611250,3.257917,3.257083,3.121667,2.863750,2.850417
4,4,"ObjectId(""5caeb38ca6e44a5b0027cc2c"")",CA-MVU-003100,32.86890,-116.86667,15.0,2.708050,class_c,54.468750,53.435833,...,3.899167,4.002500,3.761667,6.027083,3.868333,2.800000,3.557500,3.774583,4.250417,3.922083
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
676,676,"ObjectId(""5caeb3d4a6e44a5b0027d325"")",2017-CALAC-362441,34.45282,-118.58139,6049.0,8.707648,class_g,74.087500,77.037500,...,6.338333,4.172500,4.346667,2.132917,4.606667,1.978750,1.973750,7.612917,2.361250,1.133333
677,677,"ObjectId(""5caeb3d4a6e44a5b0027d326"")",2017-CALFD-030179,34.08222,-118.47528,422.0,6.045005,class_e,72.603750,77.289167,...,3.350000,1.769167,1.842917,0.922917,1.963333,1.386667,1.410417,4.534167,1.746250,0.946667
678,678,"ObjectId(""5caeb3d4a6e44a5b0027d331"")",2018-CALAC-166129,34.51998,-118.28181,1352.0,7.209340,class_f,54.496667,57.502500,...,4.163750,3.302500,2.339583,3.526250,4.370000,5.186667,4.914167,5.712083,5.370417,3.529167
679,679,"ObjectId(""5caeb3d4a6e44a5b0027d335"")",2018-CAMVU-008372,32.63000,-116.47000,265.0,5.579730,class_d,57.593333,56.679167,...,2.763750,2.772500,3.166250,3.325833,2.831667,4.111667,4.872083,3.710417,2.650000,2.435833


In [7]:
# Separate the target from the predictors:
#   y -> the `size_category` label column
#   X -> every other column of cleaned_df
# NOTE(review): X still contains `Size` and `log_sizes` (size_category looks
# like it is derived from Size -- confirm), as well as `_id`, `Event` and the
# `Unnamed: 0*` index columns. These probably need to be excluded before
# fitting a classifier to avoid label leakage / non-numeric features.
y = cleaned_df['size_category']
X = cleaned_df.drop('size_category', axis='columns')

In [8]:
# Hold out 25% of the rows as a test set. The fixed random_state makes the
# shuffle (and therefore the split) reproducible across kernel restarts.
# train_test_split is imported with the other sklearn imports in the
# notebook's top import cell.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

In [9]:
# Preview the first five training rows; the row labels are still the
# original (shuffled) index values carried over from cleaned_df.
X_train.head(n=5)

Unnamed: 0.1,Unnamed: 0,_id,Event,Latitude,Longitude,Size,log_sizes,temperature_14,temperature_13,temperature_12,...,windSpeed5,windSpeed6,windSpeed7,windSpeed8,windSpeed9,windSpeed10,windSpeed11,windSpeed12,windSpeed13,windSpeed14
487,487,"ObjectId(""5caeb3bda6e44a5b0027d0cb"")",CA-RRU-04351,33.62,-117.197,15.0,2.70805,38.714167,42.222917,43.121667,...,1.175833,2.575417,1.39875,1.134583,2.774583,1.42625,1.275417,2.75125,1.889583,1.44625
104,104,"ObjectId(""5caeb39ca6e44a5b0027cdab"")",CA-MCP-002533,33.3692,-117.35083,800.0,6.684612,65.834583,67.34,64.0125,...,3.692917,3.689583,2.20375,1.665417,2.969167,4.668333,2.7775,2.0075,2.6325,1.9075
114,114,"ObjectId(""5caeb39ca6e44a5b0027cdb5"")",CA-CNF-003056,32.9856,-116.7275,273246.0,12.518128,69.81625,71.999583,72.169583,...,2.215417,4.809583,3.578333,1.724167,2.114583,3.409167,1.565,1.040833,1.110833,1.22875
610,610,"ObjectId(""5caeb3cda6e44a5b0027d260"")",2016-CASKU-007392,41.7789,-122.6094,710.0,6.565265,70.994167,76.185,79.989167,...,1.852083,1.844583,2.168333,2.167083,2.339167,2.673333,2.615,1.640417,2.028333,1.540417
250,250,"ObjectId(""5caeb3a7a6e44a5b0027cece"")",CA-MVU-6006,32.632,-116.764,160.0,5.075174,70.88875,72.21125,73.6075,...,2.234167,2.260833,2.012083,1.890833,2.39125,2.443333,2.449583,2.375,2.498333,2.456667


In [10]:
# Give each split a fresh 0..n-1 index (the old shuffled labels are discarded,
# not kept as a column) and bind the results to the names used for modelling.
train_data, test_data, train_labels, test_labels = (
    split.reset_index(drop=True)
    for split in (X_train, X_test, y_train, y_test)
)
train_data.head()

Unnamed: 0.1,Unnamed: 0,_id,Event,Latitude,Longitude,Size,log_sizes,temperature_14,temperature_13,temperature_12,...,windSpeed5,windSpeed6,windSpeed7,windSpeed8,windSpeed9,windSpeed10,windSpeed11,windSpeed12,windSpeed13,windSpeed14
0,487,"ObjectId(""5caeb3bda6e44a5b0027d0cb"")",CA-RRU-04351,33.62,-117.197,15.0,2.70805,38.714167,42.222917,43.121667,...,1.175833,2.575417,1.39875,1.134583,2.774583,1.42625,1.275417,2.75125,1.889583,1.44625
1,104,"ObjectId(""5caeb39ca6e44a5b0027cdab"")",CA-MCP-002533,33.3692,-117.35083,800.0,6.684612,65.834583,67.34,64.0125,...,3.692917,3.689583,2.20375,1.665417,2.969167,4.668333,2.7775,2.0075,2.6325,1.9075
2,114,"ObjectId(""5caeb39ca6e44a5b0027cdb5"")",CA-CNF-003056,32.9856,-116.7275,273246.0,12.518128,69.81625,71.999583,72.169583,...,2.215417,4.809583,3.578333,1.724167,2.114583,3.409167,1.565,1.040833,1.110833,1.22875
3,610,"ObjectId(""5caeb3cda6e44a5b0027d260"")",2016-CASKU-007392,41.7789,-122.6094,710.0,6.565265,70.994167,76.185,79.989167,...,1.852083,1.844583,2.168333,2.167083,2.339167,2.673333,2.615,1.640417,2.028333,1.540417
4,250,"ObjectId(""5caeb3a7a6e44a5b0027cece"")",CA-MVU-6006,32.632,-116.764,160.0,5.075174,70.88875,72.21125,73.6075,...,2.234167,2.260833,2.012083,1.890833,2.39125,2.443333,2.449583,2.375,2.498333,2.456667


**Models:**

1. Naive Bayes (Pan)
    - Bernoulli
    - Multinomial
2. k-NN & k-means (Pauline)
3. Gaussian mixture model (GMM) and PCA (Hannah)
4. Logistic Regression (Cisco)
5. Linear Regression (Josh) (optional)



## Naive Bayes (Pan)