# Loading Modules

In [1]:
import sqlite3
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
from sklearn import tree, preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LinearRegression
import pickle
from sklearn.metrics import confusion_matrix
import plotly.express as px
import dash_table

        STATE  FIRE_YEAR   STAT_CAUSE_DESCR FIRE_SIZE_CLASS  MONTH  \
0          CA       2005      Miscellaneous               A      2   
1          CA       2004          Lightning               A      5   
2          CA       2004     Debris Burning               A      5   
3          CA       2004          Lightning               A      6   
4          CA       2004          Lightning               A      6   
...       ...        ...                ...             ...    ...   
1880460    CA       2015  Missing/Undefined               A      9   
1880461    CA       2015      Miscellaneous               A     10   
1880462    CA       2015  Missing/Undefined               A      5   
1880463    CA       2015  Missing/Undefined               B     10   
1880464    CA       2015      Miscellaneous               A      3   

                 Arson               category  
0        Non-Malicious  Miscellaneous/Missing  
1        Non-Malicious                Natural  
2        Non-Ma

In [2]:
def insert_colon(x):
    """Turn an 'HHMM' time string into 'HH:MM'.

    Parameters
    ----------
    x : str
        Clock time written as digits, hours first (e.g. ``"1345"``).

    Returns
    -------
    str
        ``x`` with a colon inserted after the first two characters.

    Values shorter than four characters are left-padded with zeros first,
    so a time whose leading zero was lost (``"130"``) becomes ``"01:30"``
    instead of the malformed ``"13:0"``.
    """
    x = x.zfill(4)
    return x[0:2] + ":" + x[2:]

In [65]:
# pickle_df = pd.read_pickle("wildfire_df.p")

In [66]:
# print(pickle_df)

        STATE  FIRE_YEAR   STAT_CAUSE_DESCR FIRE_SIZE_CLASS  MONTH  \
0          CA       2005      Miscellaneous               A      2   
1          CA       2004          Lightning               A      5   
2          CA       2004     Debris Burning               A      5   
3          CA       2004          Lightning               A      6   
4          CA       2004          Lightning               A      6   
...       ...        ...                ...             ...    ...   
1880460    CA       2015  Missing/Undefined               A      9   
1880461    CA       2015      Miscellaneous               A     10   
1880462    CA       2015  Missing/Undefined               A      5   
1880463    CA       2015  Missing/Undefined               B     10   
1880464    CA       2015      Miscellaneous               A      3   

                 Arson               category  
0        Non-Malicious  Miscellaneous/Missing  
1        Non-Malicious                Natural  
2        Non-Ma

In [5]:
# def arson_column(cause):
#     if cause == "Arson":
#         return "Malicious"
#     else:
#         return "Non-Malicious"

In [7]:
# def categorize(cause):
#     others = ['Structure','Fireworks','Powerline','Railroad','Smoking',\
#                   'Children','Campfire','Equipment Use','Debris Burning']
#     if cause == 'Lightning':
#         return "Natural"
#     elif cause in others:
#         return "Accidental"
#     elif cause == 'Arson':
#         return "Malicious"
#     else:
#         return "Miscellaneous/Missing"

In [9]:
# fig = px.sunburst(
#     data_frame=df,
#     path=['Arson', 'category', 'STAT_CAUSE_DESCR'],
#     color = 'category',
#     maxdepth=-1
# )
# fig.show()

# Basic Dataframe Setup

In [11]:
# set up data frame
# Pull only the columns used for modelling from the Fires table.
con = sqlite3.connect("wildfire_data.sqlite")
try:
    df = pd.read_sql_query(
        'SELECT STATE, FIRE_YEAR, STAT_CAUSE_DESCR, FIRE_SIZE_CLASS, '
        'FIRE_SIZE, CONT_DATE, CONT_TIME, DISCOVERY_DATE, LATITUDE, LONGITUDE, '
        'DISCOVERY_TIME from Fires',
        con,
    )
finally:
    # Always release the database handle, even if the query raises.
    con.close()

# mask = df['DISCOVERY_TIME'].isna() | df['DISCOVERY_DATE'].isna() | df['CONT_TIME'].isna() | df['CONT_DATE'].isna()

# df_null = df[mask]

# Burn time needs all four date/time fields, so drop rows missing any of them.
df = df.dropna(axis=0, subset = ['DISCOVERY_TIME', 'DISCOVERY_DATE', 'CONT_TIME', 'CONT_DATE'])

# Convert Julian date to date
# pd.Timestamp(0) is the Unix epoch; subtracting its Julian day number turns a
# Julian date into "days since 1970-01-01", which unit='D' then interprets.
df['DISCOVERY_DATE'] = pd.to_datetime(df['DISCOVERY_DATE'] - pd.Timestamp(0).to_julian_date(), unit='D')
df['CONT_DATE'] = pd.to_datetime(df['CONT_DATE'] - pd.Timestamp(0).to_julian_date(), unit='D')

# Extract Month and Day from date
# MONTH is the calendar month number; DAY is the day of the week (Monday = 0).
df['MONTH'] = df['DISCOVERY_DATE'].dt.month
df['DAY'] = df['DISCOVERY_DATE'].dt.dayofweek

# Convert time to datetime
# df['DISCOVERY_TIME'] = pd.to_datetime(df.DISCOVERY_TIME, format='%H%M').dt.time
# df['CONT_TIME'] = pd.to_datetime(df.CONT_TIME, format='%H%M').dt.time
# Rewrite the time fields as 'HH:MM' text so they can be appended to the date.
df['DISCOVERY_TIME'] = df['DISCOVERY_TIME'].apply(lambda x: insert_colon(str(x)))
df['CONT_TIME'] = df['CONT_TIME'].apply(lambda x: insert_colon(str(x)))

#Combine date and time to full datetime
# The date is stringified, joined with the 'HH:MM' text and parsed again.
df['DISCOVERY_DATETIME'] = pd.to_datetime(df['DISCOVERY_DATE'].apply(str) + ' ' + df['DISCOVERY_TIME'].apply(str))
df['CONT_DATETIME'] = pd.to_datetime(df['CONT_DATE'].apply(str) + ' ' + df['CONT_TIME'].apply(str))

# Burn Time calculation in Hours
# Elapsed time from discovery to containment; 3600 seconds per hour.
df['BURN_TIME'] = (df['CONT_DATETIME'] - df['DISCOVERY_DATETIME']).dt.total_seconds() / 3600.0


In [12]:
# df_null['MONTH'] = np.nan
# df_null['DAY'] = np.nan
# df_null['BURN_TIME'] = np.nan
# print(df_null)

In [13]:
# final_df = pd.concat([df_null, df], ignore_index=True)
# final_df['Arson'] = final_df['STAT_CAUSE_DESCR'].apply(lambda x: arson_column(x))
# final_df['category'] = final_df['STAT_CAUSE_DESCR'].apply(lambda x:categorize(x))
# print(final_df)

In [14]:
# pd.to_pickle(final_df, "browse_df.p")

In [15]:
# fig = px.sunburst(
#     data_frame=final_df,
#     path=['Arson', 'category', 'STAT_CAUSE_DESCR'],
#     color = 'category',
#     maxdepth=-1
# )
# fig.show()

In [64]:
# df_excel = pd.read_excel('evaluation.xlsx')
# print(df_excel)

                  Predicted | Data  Random Forest      KNN  Decision Tree  \
0        Specific Cause | All Data        0.65769  0.59261        0.55403   
1     Specific Cause | State, Date        0.56784  0.53186        0.56667   
2     Categorized Cause | All Data        0.79012  0.73671        0.71726   
3  Categorized Cause | State, Date        0.71362  0.68513        0.71309   
4             Malicious | All Data        0.88573  0.86005        0.83781   
5          Malicious | State, Date        0.85711  0.84573        0.85713   

   Naïve Bayes  
0      0.24248  
1      0.47857  
2      0.55206  
3      0.61265  
4      0.66574  
5      0.84521  


# Categorical Data -> Numeric Data

In [16]:
# turn data into numeric
# Rows without a usable cause label are excluded from the "specific cause" task.
# Filtering first and then calling .copy() gives df_num its own data, so the
# column assignments below do not trigger pandas' SettingWithCopyWarning, and
# the full frame is no longer duplicated before filtering.
df_num = df[~df['STAT_CAUSE_DESCR'].isin(['Missing/Undefined', 'Miscellaneous'])].copy()
encoder = preprocessing.LabelEncoder()
df_num['STAT_CAUSE_DESCR'] = encoder.fit_transform(df_num['STAT_CAUSE_DESCR'])
# Record cause label -> integer code now: the same encoder object is refit on
# other columns below, which overwrites encoder.classes_.
cause_mapping = {l: i for i, l in enumerate(encoder.classes_)}
print(cause_mapping)
df_num['STATE'] = encoder.fit_transform(df_num['STATE'])
df_num['FIRE_SIZE_CLASS'] = encoder.fit_transform(df_num['FIRE_SIZE_CLASS'])
# Drop the raw date/time columns; MONTH, DAY and BURN_TIME derived from them stay.
df_num = df_num.drop(columns = ['DISCOVERY_DATE', 'CONT_DATE', 'DISCOVERY_TIME', 'CONT_TIME', 'DISCOVERY_DATETIME', 'CONT_DATETIME'])

{'Arson': 0, 'Campfire': 1, 'Children': 2, 'Debris Burning': 3, 'Equipment Use': 4, 'Fireworks': 5, 'Lightning': 6, 'Powerline': 7, 'Railroad': 8, 'Smoking': 9, 'Structure': 10}


In [17]:
# # Correlation Matrix
# corr = df_num.corr()
# fig, ax = plt.subplots(figsize=(10, 10))
# ax.matshow(corr,cmap=plt.cm.Blues)
# plt.xticks(range(len(corr.columns)), ["State", "Year", "Cause", "Fire Size Class", "Fire Size", "Latitude", "Longitude",\
#                                      "Month", "Day", "Burn Time"])
# plt.yticks(range(len(corr.columns)), ["State", "Year", "Cause", "Fire Size Class", "Fire Size", "Latitude", "Longitude",\
#                                      "Month", "Day", "Burn Time"])
# for tick in ax.get_xticklabels():
#     tick.set_rotation(45)    
# ax.tick_params(axis='x', colors='white')
# ax.tick_params(axis='y', colors='white')
# plt.show()

# Specific Cause | All Data

In [18]:
# Create test train split for advanced ML
# Features are every df_num column except the encoded cause, which is the target.
x = df_num.drop(['STAT_CAUSE_DESCR'], axis='columns').values
y = df_num['STAT_CAUSE_DESCR'].values
# Hold out 30% of the rows for scoring. The seed is fixed so the same rows land
# in each partition after a kernel restart, keeping reported scores repeatable.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0)

#### Models

In [19]:
# Random forest for full advanced ML
# 50 trees; seeded so the fitted forest is repeatable for a given training set.
all_adv_rf = sklearn.ensemble.RandomForestClassifier(n_estimators=50, random_state=0)
all_adv_rf = all_adv_rf.fit(x_train, y_train)
# Mean accuracy on the held-out rows.
print(all_adv_rf.score(x_test,y_test))
filename = "specific_all_rf.sav"
# The context manager closes the file so the pickle is flushed and the handle released.
with open(filename, 'wb') as model_file:
    pickle.dump(all_adv_rf, model_file)

0.6575352761024369


In [20]:
# y_pred = all_adv_rf.predict(x_test)
# cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
# cmn = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
# fig,ax = plt.subplots(figsize=(10,10))
# ax.matshow(cmn,cmap=plt.cm.Oranges,alpha=0.7)
# plt.xticks(np.arange(11), ['Arson', 'Campfire', 'Children', 'Debris Burning', 'Equipment Use', 'Fireworks',\
#                            'Lightning', 'Powerline', 'Railroad', 'Smoking', 'Structure'], rotation=20)
# plt.yticks(np.arange(11), ['Arson', 'Campfire', 'Children', 'Debris Burning', 'Equipment Use', 'Fireworks',\
#                            'Lightning', 'Powerline', 'Railroad', 'Smoking', 'Structure'])
# for i in range(cmn.shape[0]):
#     for j in range(cmn.shape[1]):
#         ax.text(x=j,y=i,s=round(cmn[i,j],2),va='center',ha='center')
# plt.xlabel('Predicted')
# plt.ylabel('Actual')
# ax.tick_params(axis='x', colors='white')
# ax.tick_params(axis='y', colors='white')
# ax.xaxis.label.set_color('white')
# ax.yaxis.label.set_color('white')
# plt.show()

In [21]:
# KNeighbors for full advanced ML
# Classifies each row by a vote among its 5 nearest training rows.
all_adv_knn = sklearn.neighbors.KNeighborsClassifier(n_neighbors = 5)
all_adv_knn = all_adv_knn.fit(x_train, y_train)
# Mean accuracy on the held-out rows.
print(all_adv_knn.score(x_test,y_test))
filename = "specific_all_knn.sav"
# The context manager closes the file so the pickle is flushed and the handle released.
with open(filename, 'wb') as model_file:
    pickle.dump(all_adv_knn, model_file)

0.5916661969449298


In [22]:
# y_pred = all_adv_knn.predict(x_test)
# cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
# cmn = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
# fig,ax = plt.subplots(figsize=(10,10))
# ax.matshow(cmn,cmap=plt.cm.Oranges,alpha=0.7)
# plt.xticks(np.arange(11), ['Arson', 'Campfire', 'Children', 'Debris Burning', 'Equipment Use', 'Fireworks',\
#                            'Lightning', 'Powerline', 'Railroad', 'Smoking', 'Structure'], rotation=20)
# plt.yticks(np.arange(11), ['Arson', 'Campfire', 'Children', 'Debris Burning', 'Equipment Use', 'Fireworks',\
#                            'Lightning', 'Powerline', 'Railroad', 'Smoking', 'Structure'])
# for i in range(cmn.shape[0]):
#     for j in range(cmn.shape[1]):
#         ax.text(x=j,y=i,s=round(cmn[i,j],2),va='center',ha='center')
# plt.xlabel('Predicted')
# plt.ylabel('Actual')
# ax.tick_params(axis='x', colors='white')
# ax.tick_params(axis='y', colors='white')
# ax.xaxis.label.set_color('white')
# ax.yaxis.label.set_color('white')
# plt.show()

In [23]:
# Decision Tree for full advanced ML
# random_state is pinned so the fitted tree is repeatable for a given training set.
all_adv_dt = sklearn.tree.DecisionTreeClassifier(random_state = 0)
all_adv_dt = all_adv_dt.fit(x_train, y_train)
# Mean accuracy on the held-out rows.
print(all_adv_dt.score(x_test, y_test))
filename = "specific_all_dt.sav"
# The context manager closes the file so the pickle is flushed and the handle released.
with open(filename, 'wb') as model_file:
    pickle.dump(all_adv_dt, model_file)

0.5551829096443267


In [24]:
# y_pred = all_adv_dt.predict(x_test)
# cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
# cmn = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
# fig,ax = plt.subplots(figsize=(10,10))
# ax.matshow(cmn,cmap=plt.cm.Oranges,alpha=0.7)
# plt.xticks(np.arange(11), ['Arson', 'Campfire', 'Children', 'Debris Burning', 'Equipment Use', 'Fireworks',\
#                            'Lightning', 'Powerline', 'Railroad', 'Smoking', 'Structure'], rotation=20)
# plt.yticks(np.arange(11), ['Arson', 'Campfire', 'Children', 'Debris Burning', 'Equipment Use', 'Fireworks',\
#                            'Lightning', 'Powerline', 'Railroad', 'Smoking', 'Structure'])
# for i in range(cmn.shape[0]):
#     for j in range(cmn.shape[1]):
#         ax.text(x=j,y=i,s=round(cmn[i,j],2),va='center',ha='center')
# plt.xlabel('Predicted')
# plt.ylabel('Actual')
# ax.tick_params(axis='x', colors='white')
# ax.tick_params(axis='y', colors='white')
# ax.xaxis.label.set_color('white')
# ax.yaxis.label.set_color('white')
# plt.show()

In [25]:
# Naive Bayes for full advanced ML
# Gaussian naive Bayes with default settings.
all_adv_nb = sklearn.naive_bayes.GaussianNB()
all_adv_nb = all_adv_nb.fit(x_train, y_train)
# Mean accuracy on the held-out rows.
print(all_adv_nb.score(x_test, y_test))
filename = "specific_all_nb.sav"
# The context manager closes the file so the pickle is flushed and the handle released.
with open(filename, 'wb') as model_file:
    pickle.dump(all_adv_nb, model_file)

0.25315183285421716


In [26]:
# y_pred = all_adv_nb.predict(x_test)
# cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
# cmn = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
# fig,ax = plt.subplots(figsize=(10,10))
# ax.matshow(cmn,cmap=plt.cm.Oranges,alpha=0.7)
# plt.xticks(np.arange(11), ['Arson', 'Campfire', 'Children', 'Debris Burning', 'Equipment Use', 'Fireworks',\
#                            'Lightning', 'Powerline', 'Railroad', 'Smoking', 'Structure'], rotation=20)
# plt.yticks(np.arange(11), ['Arson', 'Campfire', 'Children', 'Debris Burning', 'Equipment Use', 'Fireworks',\
#                            'Lightning', 'Powerline', 'Railroad', 'Smoking', 'Structure'])
# for i in range(cmn.shape[0]):
#     for j in range(cmn.shape[1]):
#         ax.text(x=j,y=i,s=round(cmn[i,j],2),va='center',ha='center')
# plt.xlabel('Predicted')
# plt.ylabel('Actual')
# ax.tick_params(axis='x', colors='white')
# ax.tick_params(axis='y', colors='white')
# ax.xaxis.label.set_color('white')
# ax.yaxis.label.set_color('white')
# plt.show()

# Specific Cause | State, Month, Day, Year

In [27]:
# Simple train test split
# Restrict the features to state, month, weekday and year; the target is
# still the encoded specific cause.
simple_df = df_num[['STATE', 'MONTH', 'DAY', 'FIRE_YEAR', 'STAT_CAUSE_DESCR']].copy()
x_simple = simple_df.drop(['STAT_CAUSE_DESCR'], axis='columns').values
y_simple = simple_df['STAT_CAUSE_DESCR'].values
# Hold out 30% of the rows for scoring. The seed is fixed so the same rows land
# in each partition after a kernel restart, keeping reported scores repeatable.
x_simple_train, x_simple_test, y_simple_train, y_simple_test = train_test_split(x_simple, y_simple, test_size=0.3, random_state=0)
print(simple_df)

         STATE  MONTH  DAY  FIRE_YEAR  STAT_CAUSE_DESCR
1            4      5    2       2004                 6
2            4      5    0       2004                 3
3            4      6    0       2004                 6
4            4      6    0       2004                 6
5            4      6    2       2004                 6
...        ...    ...  ...        ...               ...
1880443      4      9    6       2015                 4
1880448      4      6    4       2015                 6
1880451      4      6    5       2015                 4
1880457      4      9    2       2015                 0
1880458      4      8    5       2015                 6

[709640 rows x 5 columns]


#### Models

In [28]:
# Random Forest for full simple ML
# 50 trees; seeded so the fitted forest is repeatable for a given training set.
all_basic_rf = sklearn.ensemble.RandomForestClassifier(n_estimators=50, random_state=0)
all_basic_rf = all_basic_rf.fit(x_simple_train, y_simple_train)
# Mean accuracy on the held-out rows.
print(all_basic_rf.score(x_simple_test,y_simple_test))
filename = "specific_basic_rf.sav"
# The context manager closes the file so the pickle is flushed and the handle released.
with open(filename, 'wb') as model_file:
    pickle.dump(all_basic_rf, model_file)

0.567372188715405


In [29]:
# KNeighbors for full simple ML
# Classifies each row by a vote among its 5 nearest training rows.
all_basic_knn = sklearn.neighbors.KNeighborsClassifier(n_neighbors = 5)
all_basic_knn = all_basic_knn.fit(x_simple_train, y_simple_train)
# Mean accuracy on the held-out rows.
print(all_basic_knn.score(x_simple_test,y_simple_test))
filename = "specific_basic_knn.sav"
# The context manager closes the file so the pickle is flushed and the handle released.
with open(filename, 'wb') as model_file:
    pickle.dump(all_basic_knn, model_file)

0.5318659226274355


In [30]:
# Decision Tree for full simple ML
# random_state is pinned so the fitted tree is repeatable for a given training set.
all_basic_dt = sklearn.tree.DecisionTreeClassifier(random_state = 0)
all_basic_dt = all_basic_dt.fit(x_simple_train, y_simple_train)
# Mean accuracy on the held-out rows.
print(all_basic_dt.score(x_simple_test, y_simple_test))
filename = "specific_basic_dt.sav"
# The context manager closes the file so the pickle is flushed and the handle released.
with open(filename, 'wb') as model_file:
    pickle.dump(all_basic_dt, model_file)

0.5666723033275087


In [31]:
# Naive Bayes for full simple ML
# Gaussian naive Bayes with default settings.
all_basic_nb = sklearn.naive_bayes.GaussianNB()
all_basic_nb = all_basic_nb.fit(x_simple_train, y_simple_train)
# Mean accuracy on the held-out rows.
print(all_basic_nb.score(x_simple_test, y_simple_test))
filename = "specific_basic_nb.sav"
# The context manager closes the file so the pickle is flushed and the handle released.
with open(filename, 'wb') as model_file:
    pickle.dump(all_basic_nb, model_file)

0.47857129436521806


# Cause Category | All Data

In [32]:
def reduce(cause):
    """Collapse a specific fire cause into a coarse category code.

    Returns 1 for 'Lightning', 2 for any of the listed human-activity
    causes, 3 for 'Arson', and 4 for anything else.
    """
    if cause == 'Lightning':
        return 1
    human_activity = ('Structure', 'Fireworks', 'Powerline', 'Railroad', 'Smoking',
                      'Children', 'Campfire', 'Equipment Use', 'Debris Burning')
    if cause in human_activity:
        return 2
    return 3 if cause == 'Arson' else 4

In [33]:
# turn data into numeric
# Rows without a usable cause label are excluded from the cause-category task.
# Filtering first and then calling .copy() gives red_df its own data, so the
# column assignments below do not trigger pandas' SettingWithCopyWarning, and
# the full frame is no longer duplicated before filtering.
red_df = df[~df['STAT_CAUSE_DESCR'].isin(['Missing/Undefined', 'Miscellaneous'])].copy()
# Replace each specific cause with its category code from reduce().
red_df['STAT_CAUSE_DESCR'] = red_df['STAT_CAUSE_DESCR'].apply(reduce)
encoder = preprocessing.LabelEncoder()
# The encoder is refit per column; only the integer codes are kept.
red_df['STATE'] = encoder.fit_transform(red_df['STATE'])
red_df['FIRE_SIZE_CLASS'] = encoder.fit_transform(red_df['FIRE_SIZE_CLASS'])
# Drop the raw date/time columns; MONTH, DAY and BURN_TIME derived from them stay.
red_df = red_df.drop(columns = ['DISCOVERY_DATE', 'CONT_DATE', 'DISCOVERY_TIME', 'CONT_TIME', 'DISCOVERY_DATETIME', 'CONT_DATETIME'])
# print(red_df)

In [34]:
# Features are every red_df column except the cause category, which is the target.
x_red = red_df.drop(['STAT_CAUSE_DESCR'], axis='columns').values
y_red = red_df['STAT_CAUSE_DESCR'].values
# Hold out 30% of the rows for scoring. The seed is fixed so the same rows land
# in each partition after a kernel restart, keeping reported scores repeatable.
x_train_red, x_test_red, y_train_red, y_test_red = train_test_split(x_red, y_red, test_size=0.3, random_state=0)

#### Models

In [35]:
# Random forest for reduced advanced ML
# 50 trees; seeded so the fitted forest is repeatable for a given training set.
red_adv_rf = sklearn.ensemble.RandomForestClassifier(n_estimators=50, random_state=0)
red_adv_rf = red_adv_rf.fit(x_train_red, y_train_red)
# Mean accuracy on the held-out rows.
print(red_adv_rf.score(x_test_red,y_test_red))
filename = "group_all_rf.sav"
# The context manager closes the file so the pickle is flushed and the handle released.
with open(filename, 'wb') as model_file:
    pickle.dump(red_adv_rf, model_file)

0.7901236307611371


In [36]:
# y_pred = red_adv_rf.predict(x_test_red)
# cm = confusion_matrix(y_true=y_test_red, y_pred=y_pred)
# cmn = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
# fig,ax = plt.subplots(figsize=(10,10))
# ax.matshow(cmn,cmap=plt.cm.Oranges,alpha=0.7)
# plt.xticks(np.arange(3), ['Natural', 'Accidental', 'Malicious'])
# plt.yticks(np.arange(3), ['Natural', 'Accidental', 'Malicious'])
# for i in range(cmn.shape[0]):
#     for j in range(cmn.shape[1]):
#         ax.text(x=j,y=i,s=round(cmn[i,j],2),va='center',ha='center')
# plt.xlabel('Predicted')
# plt.ylabel('Actual')
# ax.tick_params(axis='x', colors='white')
# ax.tick_params(axis='y', colors='white')
# ax.xaxis.label.set_color('white')
# ax.yaxis.label.set_color('white')
# plt.show()

In [37]:
# KNeighbors for reduced advanced ML
# Classifies each row by a vote among its 5 nearest training rows.
red_adv_knn = sklearn.neighbors.KNeighborsClassifier(n_neighbors = 5)
red_adv_knn = red_adv_knn.fit(x_train_red, y_train_red)
# Mean accuracy on the held-out rows.
print(red_adv_knn.score(x_test_red,y_test_red))
filename = "group_all_knn.sav"
# The context manager closes the file so the pickle is flushed and the handle released.
with open(filename, 'wb') as model_file:
    pickle.dump(red_adv_knn, model_file)

0.736702177629972


In [38]:
# y_pred = red_adv_knn.predict(x_test_red)
# cm = confusion_matrix(y_true=y_test_red, y_pred=y_pred)
# cmn = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
# fig,ax = plt.subplots(figsize=(10,10))
# ax.matshow(cmn,cmap=plt.cm.Oranges,alpha=0.7)
# plt.xticks(np.arange(3), ['Natural', 'Accidental', 'Malicious'])
# plt.yticks(np.arange(3), ['Natural', 'Accidental', 'Malicious'])
# for i in range(cmn.shape[0]):
#     for j in range(cmn.shape[1]):
#         ax.text(x=j,y=i,s=round(cmn[i,j],2),va='center',ha='center')
# plt.xlabel('Predicted')
# plt.ylabel('Actual')
# ax.tick_params(axis='x', colors='white')
# ax.tick_params(axis='y', colors='white')
# ax.xaxis.label.set_color('white')
# ax.yaxis.label.set_color('white')
# plt.show()

In [39]:
# Decision Tree for reduced advanced ML
# random_state is pinned so the fitted tree is repeatable for a given training set.
red_adv_dt = sklearn.tree.DecisionTreeClassifier(random_state = 0)
red_adv_dt = red_adv_dt.fit(x_train_red, y_train_red)
# Mean accuracy on the held-out rows.
print(red_adv_dt.score(x_test_red, y_test_red))
filename = "group_all_dt.sav"
# The context manager closes the file so the pickle is flushed and the handle released.
with open(filename, 'wb') as model_file:
    pickle.dump(red_adv_dt, model_file)

0.7172603949420363


In [40]:
# y_pred = red_adv_dt.predict(x_test_red)
# cm = confusion_matrix(y_true=y_test_red, y_pred=y_pred)
# cmn = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
# fig,ax = plt.subplots(figsize=(10,10))
# ax.matshow(cmn,cmap=plt.cm.Oranges,alpha=0.7)
# plt.xticks(np.arange(3), ['Natural', 'Accidental', 'Malicious'])
# plt.yticks(np.arange(3), ['Natural', 'Accidental', 'Malicious'])
# for i in range(cmn.shape[0]):
#     for j in range(cmn.shape[1]):
#         ax.text(x=j,y=i,s=round(cmn[i,j],2),va='center',ha='center')
# plt.xlabel('Predicted')
# plt.ylabel('Actual')
# ax.tick_params(axis='x', colors='white')
# ax.tick_params(axis='y', colors='white')
# ax.xaxis.label.set_color('white')
# ax.yaxis.label.set_color('white')
# plt.show()

In [41]:
# Naive Bayes for reduced advanced ML
# Gaussian naive Bayes with default settings.
red_adv_nb = sklearn.naive_bayes.GaussianNB()
red_adv_nb = red_adv_nb.fit(x_train_red, y_train_red)
# Mean accuracy on the held-out rows.
print(red_adv_nb.score(x_test_red, y_test_red))
filename = "group_all_nb.sav"
# The context manager closes the file so the pickle is flushed and the handle released.
with open(filename, 'wb') as model_file:
    pickle.dump(red_adv_nb, model_file)

0.552068654529057


In [42]:
# y_pred = red_adv_nb.predict(x_test_red)
# cm = confusion_matrix(y_true=y_test_red, y_pred=y_pred)
# cmn = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
# fig,ax = plt.subplots(figsize=(10,10))
# ax.matshow(cmn,cmap=plt.cm.Oranges,alpha=0.7)
# plt.xticks(np.arange(3), ['Natural', 'Accidental', 'Malicious'])
# plt.yticks(np.arange(3), ['Natural', 'Accidental', 'Malicious'])
# for i in range(cmn.shape[0]):
#     for j in range(cmn.shape[1]):
#         ax.text(x=j,y=i,s=round(cmn[i,j],2),va='center',ha='center')
# plt.xlabel('Predicted')
# plt.ylabel('Actual')
# ax.tick_params(axis='x', colors='white')
# ax.tick_params(axis='y', colors='white')
# ax.xaxis.label.set_color('white')
# ax.yaxis.label.set_color('white')
# plt.show()

# Cause Category | State, Month, Day, Year

In [43]:
# Restrict the features to state, month, weekday and year; the target is the
# cause category.
red_basic_df = red_df[['STATE', 'MONTH', 'DAY', 'FIRE_YEAR', 'STAT_CAUSE_DESCR']].copy()
x_red_basic = red_basic_df.drop(['STAT_CAUSE_DESCR'], axis='columns').values
y_red_basic = red_basic_df['STAT_CAUSE_DESCR'].values
# Hold out 30% of the rows for scoring. The seed is fixed so the same rows land
# in each partition after a kernel restart, keeping reported scores repeatable.
x_train_red_basic, x_test_red_basic, y_train_red_basic, y_test_red_basic = train_test_split(x_red_basic, y_red_basic, test_size=0.3, random_state=0)

#### Models

In [44]:
# Random forest for reduced basic ML
# 50 trees; seeded so the fitted forest is repeatable for a given training set.
red_basic_rf = sklearn.ensemble.RandomForestClassifier(n_estimators=50, random_state=0)
red_basic_rf = red_basic_rf.fit(x_train_red_basic, y_train_red_basic)
# Mean accuracy on the held-out rows.
print(red_basic_rf.score(x_test_red_basic,y_test_red_basic))
filename = "group_basic_rf.sav"
# The context manager closes the file so the pickle is flushed and the handle released.
with open(filename, 'wb') as model_file:
    pickle.dump(red_basic_rf, model_file)

0.7136247486988708


In [45]:
# KNeighbors for reduced basic ML
# Classifies each row by a vote among its 5 nearest training rows.
red_basic_knn = sklearn.neighbors.KNeighborsClassifier(n_neighbors = 5)
red_basic_knn = red_basic_knn.fit(x_train_red_basic, y_train_red_basic)
# Mean accuracy on the held-out rows.
print(red_basic_knn.score(x_test_red_basic,y_test_red_basic))
filename = "group_basic_knn.sav"
# The context manager closes the file so the pickle is flushed and the handle released.
with open(filename, 'wb') as model_file:
    pickle.dump(red_basic_knn, model_file)

0.6851361253593371


In [46]:
# Decision Tree for reduced basic ML
# random_state is pinned so the fitted tree is repeatable for a given training set.
red_basic_dt = sklearn.tree.DecisionTreeClassifier(random_state = 0)
red_basic_dt = red_basic_dt.fit(x_train_red_basic, y_train_red_basic)
# Mean accuracy on the held-out rows.
print(red_basic_dt.score(x_test_red_basic, y_test_red_basic))
filename = "group_basic_dt.sav"
# The context manager closes the file so the pickle is flushed and the handle released.
with open(filename, 'wb') as model_file:
    pickle.dump(red_basic_dt, model_file)

0.7130939631362381


In [47]:
# Naive Bayes for reduced basic ML
# Gaussian naive Bayes with default settings.
red_basic_nb = sklearn.naive_bayes.GaussianNB()
red_basic_nb = red_basic_nb.fit(x_train_red_basic, y_train_red_basic)
# Mean accuracy on the held-out rows.
print(red_basic_nb.score(x_test_red_basic, y_test_red_basic))
filename = "group_basic_nb.sav"
# The context manager closes the file so the pickle is flushed and the handle released.
with open(filename, 'wb') as model_file:
    pickle.dump(red_basic_nb, model_file)

0.6126533641470793


# Arson | All Data

In [48]:
def arson(cause):
    """Binary arson flag: 1 when ``cause`` equals 'Arson', otherwise 0."""
    return 1 if cause == 'Arson' else 0

In [49]:
# Build the arson-vs-rest frame from every row of df (no cause is filtered out).
arson_df = df.copy()
# Target becomes 1 for arson, 0 for any other cause.
arson_df['STAT_CAUSE_DESCR'] = arson_df['STAT_CAUSE_DESCR'].apply(arson)
encoder = preprocessing.LabelEncoder()
# Integer-encode the two categorical features, refitting the encoder for each.
for categorical_col in ('STATE', 'FIRE_SIZE_CLASS'):
    arson_df[categorical_col] = encoder.fit_transform(arson_df[categorical_col])
# Drop the raw date/time columns; the features derived from them stay.
arson_df = arson_df.drop(columns=[
    'DISCOVERY_DATE', 'CONT_DATE',
    'DISCOVERY_TIME', 'CONT_TIME',
    'DISCOVERY_DATETIME', 'CONT_DATETIME',
])

In [50]:
# Features are every arson_df column except the binary arson flag, which is the target.
x_arson = arson_df.drop(['STAT_CAUSE_DESCR'], axis='columns').values
y_arson = arson_df['STAT_CAUSE_DESCR'].values
# Hold out 30% of the rows for scoring. The seed is fixed so the same rows land
# in each partition after a kernel restart, keeping reported scores repeatable.
x_train_arson, x_test_arson, y_train_arson, y_test_arson = train_test_split(x_arson, y_arson, test_size=0.3, random_state=0)

#### Models

In [51]:
# Random forest for arson ML
# 50 trees; seeded so the fitted forest is repeatable for a given training set.
arson_rf = sklearn.ensemble.RandomForestClassifier(n_estimators=50, random_state=0)
arson_rf = arson_rf.fit(x_train_arson, y_train_arson)
# Mean accuracy on the held-out rows.
print(arson_rf.score(x_test_arson,y_test_arson))
filename = "arson_all_rf.sav"
# The context manager closes the file so the pickle is flushed and the handle released.
with open(filename, 'wb') as model_file:
    pickle.dump(arson_rf, model_file)

0.8857337174844826


In [52]:
# y_pred = arson_rf.predict(x_test_arson)
# cm = confusion_matrix(y_true=y_test_arson, y_pred=y_pred)
# cmn = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
# fig,ax = plt.subplots(figsize=(10,10))
# ax.matshow(cmn,cmap=plt.cm.Oranges,alpha=0.7)
# plt.xticks(np.arange(2), ['Non-Malicious', 'Malicious'])
# plt.yticks(np.arange(2), ['Non-Malicious', 'Malicious'])
# for i in range(cmn.shape[0]):
#     for j in range(cmn.shape[1]):
#         ax.text(x=j,y=i,s=round(cmn[i,j],2),va='center',ha='center')
# plt.xlabel('Predicted')
# plt.ylabel('Actual')
# ax.tick_params(axis='x', colors='white')
# ax.tick_params(axis='y', colors='white')
# ax.xaxis.label.set_color('white')
# ax.yaxis.label.set_color('white')
# plt.show()

In [53]:
# KNeighbors for arson ML
# Classifies each row by a vote among its 5 nearest training rows.
arson_knn = sklearn.neighbors.KNeighborsClassifier(n_neighbors = 5)
arson_knn = arson_knn.fit(x_train_arson, y_train_arson)
# Mean accuracy on the held-out rows.
print(arson_knn.score(x_test_arson,y_test_arson))
filename = "arson_all_knn.sav"
# The context manager closes the file so the pickle is flushed and the handle released.
with open(filename, 'wb') as model_file:
    pickle.dump(arson_knn, model_file)

0.8600576226723916


In [54]:
# y_pred = arson_knn.predict(x_test_arson)
# cm = confusion_matrix(y_true=y_test_arson, y_pred=y_pred)
# cmn = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
# fig,ax = plt.subplots(figsize=(10,10))
# ax.matshow(cmn,cmap=plt.cm.Oranges,alpha=0.7)
# plt.xticks(np.arange(2), ['Non-Malicious', 'Malicious'])
# plt.yticks(np.arange(2), ['Non-Malicious', 'Malicious'])
# for i in range(cmn.shape[0]):
#     for j in range(cmn.shape[1]):
#         ax.text(x=j,y=i,s=round(cmn[i,j],2),va='center',ha='center')
# plt.xlabel('Predicted')
# plt.ylabel('Actual')
# ax.tick_params(axis='x', colors='white')
# ax.tick_params(axis='y', colors='white')
# ax.xaxis.label.set_color('white')
# ax.yaxis.label.set_color('white')
# plt.show()

In [55]:
# Decision Tree for arson ML
# random_state is pinned so the fitted tree is repeatable for a given training set.
arson_dt = sklearn.tree.DecisionTreeClassifier(random_state = 0)
arson_dt = arson_dt.fit(x_train_arson, y_train_arson)
# Mean accuracy on the held-out rows.
print(arson_dt.score(x_test_arson,y_test_arson))
filename = "arson_all_dt.sav"
# The context manager closes the file so the pickle is flushed and the handle released.
with open(filename, 'wb') as model_file:
    pickle.dump(arson_dt, model_file)

0.8378082457969455


In [56]:
# y_pred = arson_dt.predict(x_test_arson)
# cm = confusion_matrix(y_true=y_test_arson, y_pred=y_pred)
# cmn = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
# fig,ax = plt.subplots(figsize=(10,10))
# ax.matshow(cmn,cmap=plt.cm.Oranges,alpha=0.7)
# plt.xticks(np.arange(2), ['Non-Malicious', 'Malicious'])
# plt.yticks(np.arange(2), ['Non-Malicious', 'Malicious'])
# for i in range(cmn.shape[0]):
#     for j in range(cmn.shape[1]):
#         ax.text(x=j,y=i,s=round(cmn[i,j],2),va='center',ha='center')
# plt.xlabel('Predicted')
# plt.ylabel('Actual')
# ax.tick_params(axis='x', colors='white')
# ax.tick_params(axis='y', colors='white')
# ax.xaxis.label.set_color('white')
# ax.yaxis.label.set_color('white')
# plt.show()

In [57]:
# Naive Bayes for arson ML
# Gaussian naive Bayes with default settings.
arson_nb = sklearn.naive_bayes.GaussianNB()
arson_nb = arson_nb.fit(x_train_arson, y_train_arson)
# Mean accuracy on the held-out rows.
print(arson_nb.score(x_test_arson,y_test_arson))
filename = "arson_all_nb.sav"
# The context manager closes the file so the pickle is flushed and the handle released.
with open(filename, 'wb') as model_file:
    pickle.dump(arson_nb, model_file)

0.6657436575823141


In [58]:
# y_pred = arson_nb.predict(x_test_arson)
# cm = confusion_matrix(y_true=y_test_arson, y_pred=y_pred)
# cmn = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
# fig,ax = plt.subplots(figsize=(10,10))
# ax.matshow(cmn,cmap=plt.cm.Oranges,alpha=0.7)
# plt.xticks(np.arange(2), ['Non-Malicious', 'Malicious'])
# plt.yticks(np.arange(2), ['Non-Malicious', 'Malicious'])
# for i in range(cmn.shape[0]):
#     for j in range(cmn.shape[1]):
#         ax.text(x=j,y=i,s=round(cmn[i,j],2),va='center',ha='center')
# plt.xlabel('Predicted')
# plt.ylabel('Actual')
# ax.tick_params(axis='x', colors='white')
# ax.tick_params(axis='y', colors='white')
# ax.xaxis.label.set_color('white')
# ax.yaxis.label.set_color('white')
# plt.show()

# Arson | State, Month, Day, Year

In [59]:
# Restrict the features to state, month, weekday and year; the target is the
# binary arson flag.
arson_df_basic = arson_df[['STATE', 'MONTH', 'DAY', 'FIRE_YEAR', 'STAT_CAUSE_DESCR']].copy()
x_arson_basic = arson_df_basic.drop(['STAT_CAUSE_DESCR'], axis='columns').values
y_arson_basic = arson_df_basic['STAT_CAUSE_DESCR'].values
# Hold out 30% of the rows for scoring. The seed is fixed so the same rows land
# in each partition after a kernel restart, keeping reported scores repeatable.
x_train_arson_basic, x_test_arson_basic, y_train_arson_basic, y_test_arson_basic = train_test_split(x_arson_basic, y_arson_basic, test_size=0.3, random_state=0)
print(arson_df_basic)

         STATE  MONTH  DAY  FIRE_YEAR  STAT_CAUSE_DESCR
0            4      2    2       2005                 0
1            4      5    2       2004                 0
2            4      5    0       2004                 0
3            4      6    0       2004                 0
4            4      6    0       2004                 0
...        ...    ...  ...        ...               ...
1880456      4      6    6       2015                 0
1880457      4      9    2       2015                 1
1880458      4      8    5       2015                 0
1880459      4      5    3       2015                 0
1880460      4      9    5       2015                 0

[892007 rows x 5 columns]


In [60]:
# Random forest for basic arson ML
# 50 trees; seeded so the fitted forest is repeatable for a given training set.
arson_basic_rf = sklearn.ensemble.RandomForestClassifier(n_estimators=50, random_state=0)
arson_basic_rf = arson_basic_rf.fit(x_train_arson_basic, y_train_arson_basic)
# Mean accuracy on the held-out rows.
print(arson_basic_rf.score(x_test_arson_basic,y_test_arson_basic))
filename = "arson_basic_rf.sav"
# The context manager closes the file so the pickle is flushed and the handle released.
with open(filename, 'wb') as model_file:
    pickle.dump(arson_basic_rf, model_file)

0.857109225232901


In [61]:
# KNeighbors for basic arson ML
# Classifies each row by a vote among its 5 nearest training rows.
arson_basic_knn = sklearn.neighbors.KNeighborsClassifier(n_neighbors = 5)
arson_basic_knn = arson_basic_knn.fit(x_train_arson_basic, y_train_arson_basic)
# Mean accuracy on the held-out rows.
print(arson_basic_knn.score(x_test_arson_basic,y_test_arson_basic))
filename = "arson_basic_knn.sav"
# The context manager closes the file so the pickle is flushed and the handle released.
with open(filename, 'wb') as model_file:
    pickle.dump(arson_basic_knn, model_file)

0.8457304290310647


In [62]:
# Decision Tree for basic arson ML
# random_state is pinned so the fitted tree is repeatable for a given training set.
arson_basic_dt = sklearn.tree.DecisionTreeClassifier(random_state = 0)
arson_basic_dt = arson_basic_dt.fit(x_train_arson_basic, y_train_arson_basic)
# Mean accuracy on the held-out rows.
print(arson_basic_dt.score(x_test_arson_basic,y_test_arson_basic))
filename = "arson_basic_dt.sav"
# The context manager closes the file so the pickle is flushed and the handle released.
with open(filename, 'wb') as model_file:
    pickle.dump(arson_basic_dt, model_file)

0.8571391202639731


In [63]:
# Naive Bayes for basic arson ML
# Gaussian naive Bayes with default settings.
arson_basic_nb = sklearn.naive_bayes.GaussianNB()
arson_basic_nb = arson_basic_nb.fit(x_train_arson_basic, y_train_arson_basic)
# Mean accuracy on the held-out rows.
print(arson_basic_nb.score(x_test_arson_basic,y_test_arson_basic))
filename = "arson_basic_nb.sav"
# The context manager closes the file so the pickle is flushed and the handle released.
with open(filename, 'wb') as model_file:
    pickle.dump(arson_basic_nb, model_file)

0.8452147397450701
