In [1]:
# This tells matplotlib not to try opening a new window for each plot.
%matplotlib inline

# General libraries.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# SK-learn libraries for learning.
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
from matplotlib.colors import LogNorm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB

# SK-learn libraries for evaluation.
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score

## Import Data

In [5]:
cleaned_df=pd.read_csv('cleaned_wildfire_data.csv').drop(['Unnamed: 0','_id','Event'],axis=1)
cleaned_df.head()

Unnamed: 0,Latitude,Longitude,Size,log_sizes,size_category,temperature_14,temperature_13,temperature_12,temperature_11,temperature_10,...,windSpeed5,windSpeed6,windSpeed7,windSpeed8,windSpeed9,windSpeed10,windSpeed11,windSpeed12,windSpeed13,windSpeed14
0,33.6681,-117.50139,100.0,4.60517,class_d,48.22125,52.325833,55.35375,52.667083,48.797917,...,3.646667,2.4275,4.275833,1.949583,2.09625,1.935833,1.42875,2.269583,1.633333,1.618333
1,33.0333,-116.83389,25.0,3.218876,class_c,48.305833,46.069583,44.835417,39.776667,37.474167,...,1.526667,1.640417,1.833333,5.26125,4.94125,1.84625,2.01125,7.47625,5.292083,2.992083
2,37.3683,-118.35833,840.0,6.733402,class_e,40.8725,41.335833,49.464583,50.744583,55.579167,...,3.3375,2.84625,5.654583,4.394583,2.93375,2.27125,2.413333,2.819167,5.6875,6.950417
3,33.8692,-117.68361,892.0,6.793466,class_e,55.9875,59.727083,60.797917,62.654167,64.84125,...,5.224583,4.270417,2.53,4.327917,5.61125,3.257917,3.257083,3.121667,2.86375,2.850417
4,32.8689,-116.86667,15.0,2.70805,class_c,54.46875,53.435833,55.019167,58.447083,62.486667,...,3.899167,4.0025,3.761667,6.027083,3.868333,2.8,3.5575,3.774583,4.250417,3.922083


In [6]:
#Split data into X's (features) and Y's (labels)
X= cleaned_df.drop(columns='size_category')
y= cleaned_df['size_category']

In [7]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [8]:
X_train.head()

Unnamed: 0,Latitude,Longitude,Size,log_sizes,temperature_14,temperature_13,temperature_12,temperature_11,temperature_10,temperature_9,...,windSpeed5,windSpeed6,windSpeed7,windSpeed8,windSpeed9,windSpeed10,windSpeed11,windSpeed12,windSpeed13,windSpeed14
487,33.62,-117.197,15.0,2.70805,38.714167,42.222917,43.121667,44.44875,46.117083,50.07625,...,1.175833,2.575417,1.39875,1.134583,2.774583,1.42625,1.275417,2.75125,1.889583,1.44625
104,33.3692,-117.35083,800.0,6.684612,65.834583,67.34,64.0125,64.894167,68.23875,70.247917,...,3.692917,3.689583,2.20375,1.665417,2.969167,4.668333,2.7775,2.0075,2.6325,1.9075
114,32.9856,-116.7275,273246.0,12.518128,69.81625,71.999583,72.169583,72.37125,69.380833,69.422083,...,2.215417,4.809583,3.578333,1.724167,2.114583,3.409167,1.565,1.040833,1.110833,1.22875
610,41.7789,-122.6094,710.0,6.565265,70.994167,76.185,79.989167,81.58125,79.566667,79.768333,...,1.852083,1.844583,2.168333,2.167083,2.339167,2.673333,2.615,1.640417,2.028333,1.540417
250,32.632,-116.764,160.0,5.075174,70.88875,72.21125,73.6075,74.344167,75.09875,76.4125,...,2.234167,2.260833,2.012083,1.890833,2.39125,2.443333,2.449583,2.375,2.498333,2.456667


In [None]:
train_data = X_train.reset_index(drop=True)
test_data = X_test.reset_index(drop=True)
train_labels = y_train.reset_index(drop=True)
test_labels = y_test.reset_index(drop=True)
train_data.head()

Unnamed: 0.1,Unnamed: 0,_id,Event,Latitude,Longitude,Size,log_sizes,temperature_14,temperature_13,temperature_12,...,windSpeed5,windSpeed6,windSpeed7,windSpeed8,windSpeed9,windSpeed10,windSpeed11,windSpeed12,windSpeed13,windSpeed14
0,487,"ObjectId(""5caeb3bda6e44a5b0027d0cb"")",CA-RRU-04351,33.62,-117.197,15.0,2.70805,38.714167,42.222917,43.121667,...,1.175833,2.575417,1.39875,1.134583,2.774583,1.42625,1.275417,2.75125,1.889583,1.44625
1,104,"ObjectId(""5caeb39ca6e44a5b0027cdab"")",CA-MCP-002533,33.3692,-117.35083,800.0,6.684612,65.834583,67.34,64.0125,...,3.692917,3.689583,2.20375,1.665417,2.969167,4.668333,2.7775,2.0075,2.6325,1.9075
2,114,"ObjectId(""5caeb39ca6e44a5b0027cdb5"")",CA-CNF-003056,32.9856,-116.7275,273246.0,12.518128,69.81625,71.999583,72.169583,...,2.215417,4.809583,3.578333,1.724167,2.114583,3.409167,1.565,1.040833,1.110833,1.22875
3,610,"ObjectId(""5caeb3cda6e44a5b0027d260"")",2016-CASKU-007392,41.7789,-122.6094,710.0,6.565265,70.994167,76.185,79.989167,...,1.852083,1.844583,2.168333,2.167083,2.339167,2.673333,2.615,1.640417,2.028333,1.540417
4,250,"ObjectId(""5caeb3a7a6e44a5b0027cece"")",CA-MVU-6006,32.632,-116.764,160.0,5.075174,70.88875,72.21125,73.6075,...,2.234167,2.260833,2.012083,1.890833,2.39125,2.443333,2.449583,2.375,2.498333,2.456667


Model:\
1) Naive Bayes (Pan)\
    -Bernoulli
    -Multinomial
2) k-nn & K-means (Pauline)
3) Gaussian and PCA (Hannah)
4) Logistic Regression (Cisco)
5) Linear Regression (Josh) (optional)



## Naive Bayes (Pan)

## k-nn & K-means (Pauline)

## Gaussian and PCA (Hannah)

## Logistic Regression (Cisco)

## Linear Regression (Josh) 