In [1]:
# Import packages and connect to database
# Standard library
import datetime
import pickle
import sqlite3

# Third-party
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats as st
import seaborn as sns
import statsmodels.api as sm
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, confusion_matrix, roc_curve
from sklearn.metrics import precision_score, recall_score, precision_recall_curve,f1_score, fbeta_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

%config InlineBackend.figure_formats = ['svg']
%matplotlib inline

# Open the SQLite database file. Printing total_changes (rows modified since the
# connection was opened) is a quick check that the connection object is usable.
DB_FILE = "Wildfires.sqlite"
conn = sqlite3.connect(DB_FILE)
print(conn.total_changes)

0


In [2]:
# Load every row and column of the Fires table into a DataFrame, then summarise its schema
FIRES_QUERY = "SELECT * FROM Fires;"
df = pd.read_sql_query(FIRES_QUERY, conn)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1880465 entries, 0 to 1880464
Data columns (total 39 columns):
 #   Column                      Dtype  
---  ------                      -----  
 0   OBJECTID                    int64  
 1   FOD_ID                      int64  
 2   FPA_ID                      object 
 3   SOURCE_SYSTEM_TYPE          object 
 4   SOURCE_SYSTEM               object 
 5   NWCG_REPORTING_AGENCY       object 
 6   NWCG_REPORTING_UNIT_ID      object 
 7   NWCG_REPORTING_UNIT_NAME    object 
 8   SOURCE_REPORTING_UNIT       object 
 9   SOURCE_REPORTING_UNIT_NAME  object 
 10  LOCAL_FIRE_REPORT_ID        object 
 11  LOCAL_INCIDENT_ID           object 
 12  FIRE_CODE                   object 
 13  FIRE_NAME                   object 
 14  ICS_209_INCIDENT_NUMBER     object 
 15  ICS_209_NAME                object 
 16  MTBS_ID                     object 
 17  MTBS_FIRE_NAME              object 
 18  COMPLEX_NAME                object 
 19  FIRE_YEAR            

In [3]:
# Keep only fires strictly larger than one acre (FIRE_SIZE > 1).
# Fires recorded at exactly 1 acre are excluded by this filter.
larger_than_one_acre = df['FIRE_SIZE'].gt(1)
fires_all = df.loc[larger_than_one_acre]
fires_all.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 746709 entries, 13 to 1880463
Data columns (total 39 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   OBJECTID                    746709 non-null  int64  
 1   FOD_ID                      746709 non-null  int64  
 2   FPA_ID                      746709 non-null  object 
 3   SOURCE_SYSTEM_TYPE          746709 non-null  object 
 4   SOURCE_SYSTEM               746709 non-null  object 
 5   NWCG_REPORTING_AGENCY       746709 non-null  object 
 6   NWCG_REPORTING_UNIT_ID      746709 non-null  object 
 7   NWCG_REPORTING_UNIT_NAME    746709 non-null  object 
 8   SOURCE_REPORTING_UNIT       746709 non-null  object 
 9   SOURCE_REPORTING_UNIT_NAME  746709 non-null  object 
 10  LOCAL_FIRE_REPORT_ID        118376 non-null  object 
 11  LOCAL_INCIDENT_ID           432045 non-null  object 
 12  FIRE_CODE                   92070 non-null   object 
 13  FIRE_NAME   

In [4]:
# Discard the columns judged to have too much missing data to be useful.
# drop() returns a new frame, so fires_all itself is left untouched.
columns_with_gaps = [
    'LOCAL_INCIDENT_ID', 'FIRE_CODE', 'ICS_209_INCIDENT_NUMBER',
    'ICS_209_NAME', 'MTBS_ID', 'MTBS_FIRE_NAME', 'COMPLEX_NAME',
    'COUNTY', 'FIPS_CODE', 'FIPS_NAME',
]
fires = fires_all.drop(columns=columns_with_gaps)
fires.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 746709 entries, 13 to 1880463
Data columns (total 29 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   OBJECTID                    746709 non-null  int64  
 1   FOD_ID                      746709 non-null  int64  
 2   FPA_ID                      746709 non-null  object 
 3   SOURCE_SYSTEM_TYPE          746709 non-null  object 
 4   SOURCE_SYSTEM               746709 non-null  object 
 5   NWCG_REPORTING_AGENCY       746709 non-null  object 
 6   NWCG_REPORTING_UNIT_ID      746709 non-null  object 
 7   NWCG_REPORTING_UNIT_NAME    746709 non-null  object 
 8   SOURCE_REPORTING_UNIT       746709 non-null  object 
 9   SOURCE_REPORTING_UNIT_NAME  746709 non-null  object 
 10  LOCAL_FIRE_REPORT_ID        118376 non-null  object 
 11  FIRE_NAME                   305330 non-null  object 
 12  FIRE_YEAR                   746709 non-null  int64  
 13  DISCOVERY_DA

In [5]:
# Cache the filtered frame on disk so a later session can reload it directly
# instead of re-reading the SQLite table and re-applying the filters above.
# (pickle is imported in the notebook's import cell.)
with open('More_Than_1Acre.pickle', 'wb') as to_write:
    pickle.dump(fires, to_write)

In [None]:
# Restore the cached frame written by the dump cell.
# Only unpickle files this notebook produced: pickle.load can execute arbitrary code.
with open('More_Than_1Acre.pickle', 'rb') as cached_frame:
    fires = pickle.load(cached_frame)

In [29]:
# Number of fires in each FIRE_SIZE_CLASS value, most frequent first
size_class_counts = fires['FIRE_SIZE_CLASS'].value_counts()
size_class_counts

B    472539
C    220077
D     28427
E     14107
F      7786
G      3773
Name: FIRE_SIZE_CLASS, dtype: int64

In [30]:
# Binary target: 0 for size class 'B', 1 for every other class
is_class_b = fires['FIRE_SIZE_CLASS'] == 'B'
fires['FIRE_SIZE_CLASS_BIN'] = np.where(is_class_b, 0, 1)

In [31]:
# Number of fires per STATE code, most frequent first
state_counts = fires['STATE'].value_counts()
state_counts

GA    74555
TX    73300
MS    55729
FL    48202
AL    44588
SC    43719
CA    39563
NC    35088
OK    32655
AR    21808
KY    20343
LA    20304
TN    18277
NY    17242
MN    16256
AZ    15342
NM    12313
MO    12242
SD    10470
PR    10185
WV    10027
ID     9976
MT     9914
OR     8997
VA     8778
WA     7106
UT     7053
WI     6598
ND     6203
CO     5034
NV     5019
WY     4771
MI     4504
AK     4326
NE     3838
KS     3525
PA     3137
NJ     3104
IA     2554
ME     2450
OH     1454
HI     1222
IL     1036
MD      987
CT      986
NH      682
IN      639
MA      284
VT      160
RI      105
DE       52
DC        7
Name: STATE, dtype: int64

In [32]:
# NOAA climate regions (plus an 'Alaska_Hawaii' bucket).
# Each region lists the STATE codes it covers; the table is inverted into a
# state -> region lookup so the column is built in one pass instead of ten
# nested np.where(...isin...) calls.
REGION_STATES = {
    'Northwest': ['WA', 'OR', 'ID'],
    'West': ['CA', 'NV'],
    'Northern Rockies': ['MT', 'NE', 'ND', 'SD', 'WY'],
    'Alaska_Hawaii': ['AK', 'HI'],
    'Southwest': ['AZ', 'NM', 'CO', 'UT'],
    'South': ['OK', 'TX', 'KS', 'AR', 'LA', 'MS'],
    'Southeast': ['AL', 'FL', 'GA', 'NC', 'SC', 'VA'],
    'Ohio Valley': ['IL', 'IN', 'KY', 'MO', 'TN', 'OH', 'WV'],
    'Upper Midwest': ['IA', 'MI', 'MN', 'WI'],
    'Northeast': ['CT', 'NY', 'ME', 'MA', 'MD', 'NH', 'NJ', 'PA', 'RI', 'VT', 'DE'],
}
STATE_TO_REGION = {
    state: region
    for region, states in REGION_STATES.items()
    for state in states
}

# Any STATE value not listed above (PR and DC in this data) gets the literal
# string 'None', exactly as the original fallback did.
fires['REGION'] = fires['STATE'].map(STATE_TO_REGION).fillna('None')

In [33]:
# Number of fires per REGION label, most frequent first
region_counts = fires['REGION'].value_counts()
region_counts

Southeast           254930
South               207321
Ohio Valley          64018
West                 44582
Southwest            39742
Northern Rockies     35196
Upper Midwest        29912
Northeast            29189
Northwest            26079
None                 10192
Alaska_Hawaii         5548
Name: REGION, dtype: int64

In [34]:
# Hold out 20% of the rows as a test set; the fixed random_state keeps the
# split identical across runs. (train_test_split is imported in the
# notebook's import cell.)
fires_train, fires_test = train_test_split(fires, test_size=0.2, random_state=42)

In [None]:
# Share of FIRE_SIZE_CLASS_BIN = 0 vs. 1 within each fire cause (each row sums to 1).
# The cause column is STAT_CAUSE_DESCR, the same name the one-hot-encoding cell uses.
data_crosstab = pd.crosstab(fires_train['STAT_CAUSE_DESCR'],
                            fires_train['FIRE_SIZE_CLASS_BIN'],
                            margins=False, normalize='index')
print(data_crosstab)

In [None]:
# TODO: box-and-whisker plot of fire size by cause

In [1]:
# Share of FIRE_SIZE_CLASS_BIN = 0 vs. 1 within each region (each row sums to 1).
# Needs the import cell and the train/test split to have run first; the saved
# NameError output came from executing this cell on a fresh kernel.
data_crosstab = pd.crosstab(
    index=fires_train['REGION'],
    columns=fires_train['FIRE_SIZE_CLASS_BIN'],
    margins=False,
    normalize='index',
)
print(data_crosstab)

NameError: name 'pd' is not defined

In [None]:
# TODO: box-and-whisker plot of fire size by region

In [None]:
# Fire size against days to contain.
# NOTE: combined_df and its DAYS_TO_CONTAIN column are created by the
# one-hot-encoding and date-variable cells further down; run those first.
sns.scatterplot(
    data=combined_df,
    x='DAYS_TO_CONTAIN',
    y='FIRE_SIZE',
    marker='o',
    s=25,
);

In [None]:
# One hot encoding fire cause (STAT_CAUSE_DESCR); the first category is dropped
# and acts as the baseline level.

# Rebind rather than sort in place: fires_train comes from train_test_split, and
# mutating it in place can raise SettingWithCopyWarning.
fires_train = fires_train.sort_values(by='STAT_CAUSE_DESCR')

cat = fires_train.loc[:, ['STAT_CAUSE_DESCR']]

# Newer scikit-learn renamed the `sparse` keyword to `sparse_output` and later
# removed the old name; try the new spelling first and fall back for older
# releases. Either way the encoder returns a dense array.
try:
    ohe = OneHotEncoder(drop='first', sparse_output=False)
except TypeError:
    ohe = OneHotEncoder(drop='first', sparse=False)

ohe_X = ohe.fit_transform(cat)

# Build "<feature>_<category>" labels from the fitted categories, skipping the
# dropped first one. These match what get_feature_names(['STAT_CAUSE_DESCR'])
# produced, without depending on that method (removed in newer scikit-learn).
columns = np.array(
    ['STAT_CAUSE_DESCR_' + str(category) for category in ohe.categories_[0][1:]],
    dtype=object,
)
ohe_X_df = pd.DataFrame(ohe_X, columns=columns, index=cat.index)

# Indexes match, so the dummy columns line up row-for-row with fires_train
combined_df = pd.concat([fires_train, ohe_X_df], axis=1)
combined_df.info()

In [None]:
# Creating new date variables from the Julian-day columns

# Containment duration in days; rows where either date is missing produce NaN
# and are recorded as 0.
# NOTE(review): an imputed 0 is indistinguishable from same-day containment - confirm this is intended.
days_to_contain = combined_df['CONT_DATE'] - combined_df['DISCOVERY_DATE']
combined_df['DAYS_TO_CONTAIN'] = days_to_contain.fillna(0)

# Convert each Julian day number into a pandas datetime column named <source>_DT
for julian_col in ('DISCOVERY_DATE', 'CONT_DATE'):
    combined_df[julian_col + '_DT'] = pd.to_datetime(
        combined_df[julian_col], unit='D', origin='julian'
    )