In [2]:
#############
## Imports ##
#############

# General Use / Computation Packages
import os
import numpy as np
import pandas as pd
from math import *
from datetime import datetime
from tqdm import tqdm
import warnings
import re
import warnings
warnings.filterwarnings("ignore")

In [3]:
######################
## Reading the Data ##
######################
# Columns kept from each NFIRS table. The first five columns are shared by
# all three tables; the remaining ones are table specific.
incidents_cols = ['STATE', 'FDID', 'INC_DATE', 'INC_NO', 'EXP_NO', 'INC_TYPE']
fires_cols = ['STATE', 'FDID', 'INC_DATE', 'INC_NO', 'EXP_NO', 'AREA_ORIG', 'FIRST_IGN', 'CAUSE_IGN']
addresses_cols = ['STATE', 'FDID', 'INC_DATE', 'INC_NO', 'EXP_NO', 'CITY', 'STATE_ID', 'ZIP5']


def read_nfirs_table(year, table, usecols):
    """Read one NFIRS table for one year into a DataFrame.

    Parameters
    ----------
    year : int
        Data year; selects the folder './NFIRS_DATA_RAW/<year> data'.
    table : str
        File name without extension, e.g. 'basicincident'.
    usecols : list of str
        Columns to load from the file.

    Returns
    -------
    pandas.DataFrame
        The requested columns of that file.

    Notes
    -----
    Years before 2012 are read from '<table>.csv' with the default comma
    separator; 2012 and later are read from '<table>.txt' with '^' as the
    delimiter. The file is addressed by path instead of changing the working
    directory, so a failed read cannot leave the kernel in the wrong folder.
    """
    year_dir = os.path.join('.', 'NFIRS_DATA_RAW', '{} data'.format(year))
    if year < 2012:
        return pd.read_csv(
            os.path.join(year_dir, table + '.csv'),
            usecols = usecols,
            encoding = 'latin-1',
            low_memory = False
        )
    return pd.read_csv(
        os.path.join(year_dir, table + '.txt'),
        usecols = usecols,
        delimiter = '^',
        encoding = "ISO-8859-1",
        low_memory = False
    )


# Collect the yearly frames in lists and concatenate once per table.
# DataFrame.append no longer exists in pandas 2.x, and appending inside the
# loop re-copies everything read so far on every iteration.
incidents_by_year = []
fires_by_year = []
addresses_by_year = []

for year in tqdm(range(2004, 2017)): # Loop through the years (with progress bar)
    incidents_by_year.append(read_nfirs_table(year, 'basicincident', incidents_cols))
    fires_by_year.append(read_nfirs_table(year, 'fireincident', fires_cols))
    addresses_by_year.append(read_nfirs_table(year, 'incidentaddress', addresses_cols))

# Row labels are kept as read (no ignore_index), exactly as append did, so
# labels repeat across years in the stacked frames.
incidents_full = pd.concat(incidents_by_year)
fires_full = pd.concat(fires_by_year)
addresses_full = pd.concat(addresses_by_year)

100%|██████████| 13/13 [05:29<00:00, 25.33s/it]


In [4]:
# The unique identifier for these incident-associated records is the combination of the state,
# fire department ID, incident date, incident number, and exposure number
# (STATE, FDID, INC_DATE, INC_NO, and EXP_NO).
# Source: NFIRS Version 5.0 Fire Data Analysis Guidelines and Issues, page 8
# https://www.usfa.fema.gov/downloads/pdf/nfirs/nfirs_data_analysis_guidelines_issues.pdf
id_cols = ['STATE', 'FDID', 'INC_DATE', 'INC_NO', 'EXP_NO']

########################################
## Filling NaNs, Creating a Unique ID ##
## Not all years have INCIDENT_KEY    ##
########################################
# Because INCIDENT_KEY is not present in every year, the id is assembled by
# hand from the five key columns, with missing values replaced by ''.

def str2(x):
    """Turn one value into its upper-cased text form for use in an id string.

    The value is converted with ``str``. If that text is exactly 'nan' (the
    way a missing float prints) an empty string is returned; otherwise the
    text is returned in upper case.
    """
    text = str(x)
    return '' if text == 'nan' else text.upper()

# Create unique id
def build_incident_ids(frame, key_cols=id_cols):
    """Build the composite id string for every row of ``frame``.

    Parameters
    ----------
    frame : pandas.DataFrame
        Table containing all columns named in ``key_cols``.
    key_cols : list of str, optional
        Columns whose values make up the id, in order. Defaults to ``id_cols``.

    Returns
    -------
    list of str
        One id per row, in row order: the ``str2`` form of each key value
        joined with '_'.
    """
    key_values = zip(*(frame[col] for col in key_cols))
    return ['_'.join(str2(part) for part in parts) for parts in key_values]


# One helper driven by id_cols replaces three copies of the same comprehension,
# so the key definition lives in a single place.
# A plain list is assigned (not a Series) so the values are set by row position
# and row labels that repeat across years cannot interfere.
for table in (incidents_full, fires_full, addresses_full):
    table['id'] = build_incident_ids(table)

In [None]:
# Trim each table to the columns needed for the merge.
# Incidents contribute the incident type.
incidents_full = incidents_full.loc[:, ['id', 'INC_TYPE']]
# Fires contribute only the key; this table decides which ids are kept.
fires_full = fires_full.loc[:, ['id']]
# Addresses contribute the date, department and location fields.
addresses_full = addresses_full.loc[:, ['id', 'INC_DATE', 'FDID', 'CITY', 'STATE', 'ZIP5']]

In [None]:
# Left-join incidents and then addresses onto the fire records by 'id',
# so every fire row is kept whether or not a match exists.
fires_merged = (
    fires_full
    .merge(incidents_full, on = 'id', how = 'left')
    .merge(addresses_full, on = 'id', how = 'left')
)

In [43]:
# Conditions for a wildfire #
# INC_TYPE
# 14 Natural vegetation fire
# 140 Natural vegetation fire, other.
# 141 Forest, woods or wildland fire.
# 142 Brush or brush-and-grass mixture fire.
# 143 Grass fire.

# Each code is listed as a number and as a string so rows match in either form.
WILDFIRE_INC_TYPES = [14, 140, 141, 142, 143, '14', '140', '141', '142', '143']

# Filter and de-duplicate in one chain. The earlier form called
# drop_duplicates(inplace=True) on a slice of fires_merged, which is chained
# assignment on a copy and only ran quietly because warnings are suppressed.
wildfires = (
    fires_merged[fires_merged['INC_TYPE'].isin(WILDFIRE_INC_TYPES)]
    .drop_duplicates('id')  # keep the first row for each incident id
)

In [44]:
#wildfires.head()
#wildfires.to_csv('wildfires.csv')