In [1]:
# This tells matplotlib not to try opening a new window for each plot.
%matplotlib inline

# General libraries.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# SK-learn libraries for learning.
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
from matplotlib.colors import LogNorm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB

# SK-learn libraries for evaluation.
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score

## Import Data

In [2]:
# Load the cleaned wildfire CSV and drop the 'Unnamed: 0', '_id' and 'Event'
# columns, then preview the first rows.
cleaned_df = (
    pd.read_csv('cleaned_wildfire_data.csv')
    .drop(columns=['Unnamed: 0', '_id', 'Event'])
)
cleaned_df.head()

Unnamed: 0,Latitude,Longitude,Size,log_sizes,size_category,temperature_14,temperature_13,temperature_12,temperature_11,temperature_10,...,windSpeed5,windSpeed6,windSpeed7,windSpeed8,windSpeed9,windSpeed10,windSpeed11,windSpeed12,windSpeed13,windSpeed14
0,33.6681,-117.50139,100.0,4.60517,class_d,48.22125,52.325833,55.35375,52.667083,48.797917,...,3.646667,2.4275,4.275833,1.949583,2.09625,1.935833,1.42875,2.269583,1.633333,1.618333
1,33.0333,-116.83389,25.0,3.218876,class_c,48.305833,46.069583,44.835417,39.776667,37.474167,...,1.526667,1.640417,1.833333,5.26125,4.94125,1.84625,2.01125,7.47625,5.292083,2.992083
2,37.3683,-118.35833,840.0,6.733402,class_e,40.8725,41.335833,49.464583,50.744583,55.579167,...,3.3375,2.84625,5.654583,4.394583,2.93375,2.27125,2.413333,2.819167,5.6875,6.950417
3,33.8692,-117.68361,892.0,6.793466,class_e,55.9875,59.727083,60.797917,62.654167,64.84125,...,5.224583,4.270417,2.53,4.327917,5.61125,3.257917,3.257083,3.121667,2.86375,2.850417
4,32.8689,-116.86667,15.0,2.70805,class_c,54.46875,53.435833,55.019167,58.447083,62.486667,...,3.899167,4.0025,3.761667,6.027083,3.868333,2.8,3.5575,3.774583,4.250417,3.922083


In [6]:
# Separate the target from the predictors: y holds the size_category labels,
# X holds every remaining column of cleaned_df.
# NOTE(review): X still contains 'Size' and 'log_sizes'. If size_category is
# derived from Size, these columns leak the label into the features -- confirm
# before fitting any classifier on X.
y = cleaned_df['size_category']
X = cleaned_df.drop('size_category', axis=1)

In [20]:
# Shuffle the feature rows once (fixed seed for reproducibility) and cut them
# by position into 60% train / 20% dev / 20% test.
# Positional .iloc slices replace np.split: on a DataFrame np.split goes through
# DataFrame.swapaxes, which is deprecated in recent pandas releases.
shuffled_X = X.sample(frac=1, random_state=42)
train_end, dev_end = int(.6*len(X)), int(.8*len(X))
train_d = shuffled_X.iloc[:train_end]
dev_d = shuffled_X.iloc[train_end:dev_end]
test_d = shuffled_X.iloc[dev_end:]


In [21]:
# Preview the first five rows of the shuffled training features.
train_d.head(5)

Unnamed: 0,Latitude,Longitude,Size,log_sizes,temperature_14,temperature_13,temperature_12,temperature_11,temperature_10,temperature_9,...,windSpeed5,windSpeed6,windSpeed7,windSpeed8,windSpeed9,windSpeed10,windSpeed11,windSpeed12,windSpeed13,windSpeed14
648,33.68194,-117.39972,188.0,5.236442,73.776667,73.557917,74.04,73.537917,72.35125,73.099167,...,2.2625,2.1125,1.738333,1.844167,1.8375,1.867083,2.002083,1.99625,1.927917,2.248333
296,34.311,-117.322,4100.0,8.318742,64.466667,57.21625,55.025,50.965417,47.996667,55.32125,...,1.065833,1.405833,2.0025,2.440417,2.578333,2.952917,2.266667,4.62375,2.554167,2.53625
63,36.6839,-119.21611,150.0,5.010635,87.258333,87.747917,84.630417,81.019167,80.775,81.9375,...,2.191667,1.915,1.138333,1.995833,1.85625,1.87375,1.972083,1.659583,2.586667,2.27875
320,34.779,-120.09,25000.0,10.126631,59.75875,61.035417,62.762083,60.1475,58.26625,60.17125,...,2.47125,1.929167,2.868333,2.99625,2.430833,2.020833,2.11875,2.98125,2.673333,2.871667
101,33.4436,-117.42389,200.0,5.298317,66.60625,64.455833,65.207917,65.526667,65.628333,64.03375,...,1.88,2.130833,2.150417,2.744167,1.79875,1.59,1.427917,1.802083,1.68375,1.74625


In [22]:
# Shuffle the labels with the same seed (42) used for the feature shuffle so the
# label order matches the feature order, then cut by position into
# 60% train / 20% dev / 20% test.
# Positional .iloc slices replace np.split: on a pandas object np.split goes
# through the deprecated swapaxes method.
shuffled_y = y.sample(frac=1, random_state=42)
label_train_end, label_dev_end = int(.6*len(y)), int(.8*len(y))
train_l = shuffled_y.iloc[:label_train_end]
dev_l = shuffled_y.iloc[label_train_end:label_dev_end]
test_l = shuffled_y.iloc[label_dev_end:]


In [23]:
# Preview the first five shuffled training labels.
train_l.head(5)

648    class_d
296    class_f
63     class_d
320    class_g
101    class_d
Name: size_category, dtype: object

In [24]:
# Give every split a fresh 0..n-1 index so features and labels can be paired
# by position after the shuffle.
train_data = train_d.reset_index(drop=True)
train_labels = train_l.reset_index(drop=True)
dev_data = dev_d.reset_index(drop=True)
# Fix: dev labels must come from dev_l; they were previously copied from
# test_l, which paired dev features with test labels.
dev_labels = dev_l.reset_index(drop=True)
test_data = test_d.reset_index(drop=True)
test_labels = test_l.reset_index(drop=True)

# Sanity check: each feature split must have exactly one label per row.
assert len(train_data) == len(train_labels)
assert len(dev_data) == len(dev_labels)
assert len(test_data) == len(test_labels)

In [25]:
# Preview the first five training feature rows after the index reset.
train_data.head(5)

Unnamed: 0,Latitude,Longitude,Size,log_sizes,temperature_14,temperature_13,temperature_12,temperature_11,temperature_10,temperature_9,...,windSpeed5,windSpeed6,windSpeed7,windSpeed8,windSpeed9,windSpeed10,windSpeed11,windSpeed12,windSpeed13,windSpeed14
0,33.68194,-117.39972,188.0,5.236442,73.776667,73.557917,74.04,73.537917,72.35125,73.099167,...,2.2625,2.1125,1.738333,1.844167,1.8375,1.867083,2.002083,1.99625,1.927917,2.248333
1,34.311,-117.322,4100.0,8.318742,64.466667,57.21625,55.025,50.965417,47.996667,55.32125,...,1.065833,1.405833,2.0025,2.440417,2.578333,2.952917,2.266667,4.62375,2.554167,2.53625
2,36.6839,-119.21611,150.0,5.010635,87.258333,87.747917,84.630417,81.019167,80.775,81.9375,...,2.191667,1.915,1.138333,1.995833,1.85625,1.87375,1.972083,1.659583,2.586667,2.27875
3,34.779,-120.09,25000.0,10.126631,59.75875,61.035417,62.762083,60.1475,58.26625,60.17125,...,2.47125,1.929167,2.868333,2.99625,2.430833,2.020833,2.11875,2.98125,2.673333,2.871667
4,33.4436,-117.42389,200.0,5.298317,66.60625,64.455833,65.207917,65.526667,65.628333,64.03375,...,1.88,2.130833,2.150417,2.744167,1.79875,1.59,1.427917,1.802083,1.68375,1.74625


Models:

1. Naive Bayes (Pan)
    - Bernoulli
    - Multinomial
2. k-NN & K-means (Pauline)
3. Gaussian and PCA (Hannah)
4. Logistic Regression (Cisco)
5. Linear Regression (Josh) (optional)



## Naive Bayes (Pan)

## k-NN & K-means (Pauline)

## Gaussian and PCA (Hannah)

## Logistic Regression (Cisco)

## Linear Regression (Josh) 