### Import libraries and Connect to Database

In [1]:
import sqlite3
import pandas as pd
import numpy as np
import scipy.stats as st
import matplotlib.pyplot as plt
import seaborn as sns
import datetime

conn = sqlite3.connect("Wildfires.sqlite")
print(conn.total_changes)

0


### Read and Filter Data

In [2]:
# Connect to fires table
df = pd.read_sql_query("SELECT * FROM Fires;", conn)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1880465 entries, 0 to 1880464
Data columns (total 39 columns):
 #   Column                      Dtype  
---  ------                      -----  
 0   OBJECTID                    int64  
 1   FOD_ID                      int64  
 2   FPA_ID                      object 
 3   SOURCE_SYSTEM_TYPE          object 
 4   SOURCE_SYSTEM               object 
 5   NWCG_REPORTING_AGENCY       object 
 6   NWCG_REPORTING_UNIT_ID      object 
 7   NWCG_REPORTING_UNIT_NAME    object 
 8   SOURCE_REPORTING_UNIT       object 
 9   SOURCE_REPORTING_UNIT_NAME  object 
 10  LOCAL_FIRE_REPORT_ID        object 
 11  LOCAL_INCIDENT_ID           object 
 12  FIRE_CODE                   object 
 13  FIRE_NAME                   object 
 14  ICS_209_INCIDENT_NUMBER     object 
 15  ICS_209_NAME                object 
 16  MTBS_ID                     object 
 17  MTBS_FIRE_NAME              object 
 18  COMPLEX_NAME                object 
 19  FIRE_YEAR            

In [3]:
# Only look at fires from 2000 or later
fires_all = df[df['FIRE_YEAR'] >= 2000]
fires_all.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1308317 entries, 0 to 1880464
Data columns (total 39 columns):
 #   Column                      Non-Null Count    Dtype  
---  ------                      --------------    -----  
 0   OBJECTID                    1308317 non-null  int64  
 1   FOD_ID                      1308317 non-null  int64  
 2   FPA_ID                      1308317 non-null  object 
 3   SOURCE_SYSTEM_TYPE          1308317 non-null  object 
 4   SOURCE_SYSTEM               1308317 non-null  object 
 5   NWCG_REPORTING_AGENCY       1308317 non-null  object 
 6   NWCG_REPORTING_UNIT_ID      1308317 non-null  object 
 7   NWCG_REPORTING_UNIT_NAME    1308317 non-null  object 
 8   SOURCE_REPORTING_UNIT       1308317 non-null  object 
 9   SOURCE_REPORTING_UNIT_NAME  1308317 non-null  object 
 10  LOCAL_FIRE_REPORT_ID        284168 non-null   object 
 11  LOCAL_INCIDENT_ID           762425 non-null   object 
 12  FIRE_CODE                   276762 non-null   object 
 1

In [4]:
# Drop columns with a lot of missing data
fires = fires_all.drop(columns = ['FOD_ID', 'FPA_ID', 'LOCAL_FIRE_REPORT_ID', 
                                'LOCAL_INCIDENT_ID', 'FIRE_CODE', 
                                'ICS_209_INCIDENT_NUMBER','ICS_209_NAME', 
                                'MTBS_ID', 'MTBS_FIRE_NAME', 'COMPLEX_NAME',
                                 'COUNTY', 'FIPS_CODE', 'FIPS_NAME'])
fires.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1308317 entries, 0 to 1880464
Data columns (total 26 columns):
 #   Column                      Non-Null Count    Dtype  
---  ------                      --------------    -----  
 0   OBJECTID                    1308317 non-null  int64  
 1   SOURCE_SYSTEM_TYPE          1308317 non-null  object 
 2   SOURCE_SYSTEM               1308317 non-null  object 
 3   NWCG_REPORTING_AGENCY       1308317 non-null  object 
 4   NWCG_REPORTING_UNIT_ID      1308317 non-null  object 
 5   NWCG_REPORTING_UNIT_NAME    1308317 non-null  object 
 6   SOURCE_REPORTING_UNIT       1308317 non-null  object 
 7   SOURCE_REPORTING_UNIT_NAME  1308317 non-null  object 
 8   FIRE_NAME                   704705 non-null   object 
 9   FIRE_YEAR                   1308317 non-null  int64  
 10  DISCOVERY_DATE              1308317 non-null  float64
 11  DISCOVERY_DOY               1308317 non-null  int64  
 12  DISCOVERY_TIME              703010 non-null   object 
 1

In [5]:
# Creating new date variables
fires['DAYS_TO_CONTAIN'] = fires['CONT_DATE'] - fires['DISCOVERY_DATE'] 
fires['DAYS_TO_CONTAIN'] = fires['DAYS_TO_CONTAIN'].fillna(0)
fires['DISCOVERY_DATE_D'] = pd.to_datetime(fires['DISCOVERY_DATE'], unit='D', origin='julian')
fires['CONT_DATE_D'] = pd.to_datetime(fires['CONT_DATE'], unit='D', origin='julian')

In [6]:
# NOAA climate regions (https://www.ncdc.noaa.gov/monitoring-references/maps/us-climate-regions.php)
fires['REGION'] = np.where(fires['STATE'].isin(['WA', 'OR', 'ID']), 'Northwest', 
                  np.where(fires['STATE'].isin(['CA', 'NV']), 'West', 
                  np.where(fires['STATE'].isin(['MT', 'NE', 'ND', 'SD', 'WY']), 'Northern Rockies', 
                  np.where(fires['STATE'].isin(['AK']), 'Alaska', 
                  np.where(fires['STATE'].isin(['AZ', 'NM', 'CO', 'UT']), 'Southwest',          
                  np.where(fires['STATE'].isin(['OK', 'TX', 'KS', 'AR', 'LA', 'MS']), 'South', 
                  np.where(fires['STATE'].isin(['AL', 'FL', 'GA', 'NC', 'SC', 'VA']), 'Southeast', 
                  np.where(fires['STATE'].isin(['IL', 'IN', 'KY', 'MO', 'TN', 'OH', 'WV']), 'Ohio Valley', 
                  np.where(fires['STATE'].isin(['IA',  'MI', 'MN', 'WI']), 'Upper Midwest', 
                  np.where(fires['STATE'].isin(['CT', 'NY', 'ME', 'MA', 'MD', 'NH', 'NJ', 'PA', 'RI', 'VT', 'DE']), 'Northeast', 'Other'        
                                  ))))))))))

In [None]:
# Export data for after 2000 to csv
fires.to_csv('Fires_During_After_2000.csv')

In [7]:
# Restricting data source to only records with discovery and containment times
fires_time = fires[ fires['DISCOVERY_TIME'].notna() & (fires['DISCOVERY_TIME']!=u'')]
fires_cont = fires_time[ fires_time['CONT_TIME'].notna() & (fires['CONT_TIME']!=u'')]
fires_cont.info()

  fires_cont = fires_time[ fires_time['CONT_TIME'].notna() & (fires['CONT_TIME']!=u'')]


<class 'pandas.core.frame.DataFrame'>
Int64Index: 623178 entries, 0 to 1880460
Data columns (total 30 columns):
 #   Column                      Non-Null Count   Dtype         
---  ------                      --------------   -----         
 0   OBJECTID                    623178 non-null  int64         
 1   SOURCE_SYSTEM_TYPE          623178 non-null  object        
 2   SOURCE_SYSTEM               623178 non-null  object        
 3   NWCG_REPORTING_AGENCY       623178 non-null  object        
 4   NWCG_REPORTING_UNIT_ID      623178 non-null  object        
 5   NWCG_REPORTING_UNIT_NAME    623178 non-null  object        
 6   SOURCE_REPORTING_UNIT       623178 non-null  object        
 7   SOURCE_REPORTING_UNIT_NAME  623178 non-null  object        
 8   FIRE_NAME                   493876 non-null  object        
 9   FIRE_YEAR                   623178 non-null  int64         
 10  DISCOVERY_DATE              623178 non-null  float64       
 11  DISCOVERY_DOY               623178 non

In [8]:
# Separating discovery hour and min as well as containment hour and min because original format
# was not allowing for transformation to date time
fires_cont['DISCOVERY_HOUR'] = fires_cont['DISCOVERY_TIME'].astype(str).str[:-2]
fires_cont['DISCOVERY_MIN'] = fires_cont['DISCOVERY_TIME'].astype(str).str[-2:]
fires_cont['CONT_HOUR'] = fires_cont['CONT_TIME'].astype(str).str[:-2]
fires_cont['CONT_MIN'] = fires_cont['CONT_TIME'].astype(str).str[-2:].str.strip()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fires_cont['DISCOVERY_HOUR'] = fires_cont['DISCOVERY_TIME'].astype(str).str[:-2]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fires_cont['DISCOVERY_MIN'] = fires_cont['DISCOVERY_TIME'].astype(str).str[-2:]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fires_cont['CONT_HOUR'] = fires_cont['CONT_T

In [9]:
# Create time variables
fires_cont['DISCOVERY_TIME_NEW'] = (pd.to_datetime(fires_cont['DISCOVERY_HOUR'].astype(str) + ':' + fires_cont['DISCOVERY_MIN'].astype(str), format='%H:%M'))
fires_cont['CONT_TIME_NEW'] = (pd.to_datetime(fires_cont['CONT_HOUR'].astype(str) + ':' + fires_cont['CONT_MIN'].astype(str), format='%H:%M'))
fires_cont.info()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fires_cont['DISCOVERY_TIME_NEW'] = (pd.to_datetime(fires_cont['DISCOVERY_HOUR'].astype(str) + ':' + fires_cont['DISCOVERY_MIN'].astype(str), format='%H:%M'))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fires_cont['CONT_TIME_NEW'] = (pd.to_datetime(fires_cont['CONT_HOUR'].astype(str) + ':' + fires_cont['CONT_MIN'].astype(str), format='%H:%M'))


<class 'pandas.core.frame.DataFrame'>
Int64Index: 623178 entries, 0 to 1880460
Data columns (total 36 columns):
 #   Column                      Non-Null Count   Dtype         
---  ------                      --------------   -----         
 0   OBJECTID                    623178 non-null  int64         
 1   SOURCE_SYSTEM_TYPE          623178 non-null  object        
 2   SOURCE_SYSTEM               623178 non-null  object        
 3   NWCG_REPORTING_AGENCY       623178 non-null  object        
 4   NWCG_REPORTING_UNIT_ID      623178 non-null  object        
 5   NWCG_REPORTING_UNIT_NAME    623178 non-null  object        
 6   SOURCE_REPORTING_UNIT       623178 non-null  object        
 7   SOURCE_REPORTING_UNIT_NAME  623178 non-null  object        
 8   FIRE_NAME                   493876 non-null  object        
 9   FIRE_YEAR                   623178 non-null  int64         
 10  DISCOVERY_DATE              623178 non-null  float64       
 11  DISCOVERY_DOY               623178 non

In [10]:
# Create datetime variables and hours til containment variable
fires_cont['DISCOVERY_DATE_DT'] = pd.to_datetime(fires_cont['DISCOVERY_DATE_D'].astype(str) + ' ' + fires_cont['DISCOVERY_TIME_NEW'].astype(str))
fires_cont['CONT_DATE_DT'] = pd.to_datetime(fires_cont['CONT_DATE_D'].astype(str) + ' ' + fires_cont['CONT_TIME_NEW'].astype(str))
fires_cont['CONT_HOURS'] = (fires_cont['CONT_DATE_DT'] - fires_cont['DISCOVERY_DATE_DT'])/ np.timedelta64(1,'h')
fires_cont.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fires_cont['DISCOVERY_DATE_DT'] = pd.to_datetime(fires_cont['DISCOVERY_DATE_D'].astype(str) + ' ' + fires_cont['DISCOVERY_TIME_NEW'].astype(str))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fires_cont['CONT_DATE_DT'] = pd.to_datetime(fires_cont['CONT_DATE_D'].astype(str) + ' ' + fires_cont['CONT_TIME_NEW'].astype(str))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/panda

Unnamed: 0,OBJECTID,SOURCE_SYSTEM_TYPE,SOURCE_SYSTEM,NWCG_REPORTING_AGENCY,NWCG_REPORTING_UNIT_ID,NWCG_REPORTING_UNIT_NAME,SOURCE_REPORTING_UNIT,SOURCE_REPORTING_UNIT_NAME,FIRE_NAME,FIRE_YEAR,...,REGION,DISCOVERY_HOUR,DISCOVERY_MIN,CONT_HOUR,CONT_MIN,DISCOVERY_TIME_NEW,CONT_TIME_NEW,DISCOVERY_DATE_DT,CONT_DATE_DT,CONT_HOURS
0,1,FED,FS-FIRESTAT,FS,USCAPNF,Plumas National Forest,511,Plumas National Forest,FOUNTAIN,2005,...,West,13,0,17,30,1900-01-01 13:00:00,1900-01-01 17:30:00,2005-02-02 13:00:00-01:00,2005-02-02 17:30:00-01:00,4.5
1,2,FED,FS-FIRESTAT,FS,USCAENF,Eldorado National Forest,503,Eldorado National Forest,PIGEON,2004,...,West,8,45,15,30,1900-01-01 08:45:00,1900-01-01 15:30:00,2004-05-12 08:45:00-01:00,2004-05-12 15:30:00-01:00,6.75
2,3,FED,FS-FIRESTAT,FS,USCAENF,Eldorado National Forest,503,Eldorado National Forest,SLACK,2004,...,West,19,21,20,24,1900-01-01 19:21:00,1900-01-01 20:24:00,2004-05-31 19:21:00-01:00,2004-05-31 20:24:00-01:00,1.05
3,4,FED,FS-FIRESTAT,FS,USCAENF,Eldorado National Forest,503,Eldorado National Forest,DEER,2004,...,West,16,0,14,0,1900-01-01 16:00:00,1900-01-01 14:00:00,2004-06-28 16:00:00-01:00,2004-07-03 14:00:00-01:00,118.0
4,5,FED,FS-FIRESTAT,FS,USCAENF,Eldorado National Forest,503,Eldorado National Forest,STEVENOT,2004,...,West,16,0,12,0,1900-01-01 16:00:00,1900-01-01 12:00:00,2004-06-28 16:00:00-01:00,2004-07-03 12:00:00-01:00,116.0


In [None]:
# Export data to csv
fires_cont.to_csv('Fires_During_After_2000_With_Containment.csv')

In [12]:
# Restrictions for final data set
fires_new = fires_cont[fires_cont['CONT_HOURS'] > 5]
fires_final = fires_new[ fires_new['FIRE_SIZE_CLASS'] != 'A'] # A is fire size class that is <= 0.25 acres
fires_final.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 88560 entries, 13 to 1880458
Data columns (total 39 columns):
 #   Column                      Non-Null Count  Dtype                                
---  ------                      --------------  -----                                
 0   OBJECTID                    88560 non-null  int64                                
 1   SOURCE_SYSTEM_TYPE          88560 non-null  object                               
 2   SOURCE_SYSTEM               88560 non-null  object                               
 3   NWCG_REPORTING_AGENCY       88560 non-null  object                               
 4   NWCG_REPORTING_UNIT_ID      88560 non-null  object                               
 5   NWCG_REPORTING_UNIT_NAME    88560 non-null  object                               
 6   SOURCE_REPORTING_UNIT       88560 non-null  object                               
 7   SOURCE_REPORTING_UNIT_NAME  88560 non-null  object                               
 8   FIRE_NAME    

In [13]:
# Export final data set used in models
# The other data sources were created for more flexibility as I was figuring out my data 
# restrictions, but this one is what I ultimately went with
fires_final.to_csv('Fires_Final_Data.csv')