In [1]:
import pandas as pd
from datetime import datetime

In [2]:
def print_all(arr):
    for item in arr:
        print(item)

In [3]:
# load the dataset with columns we intend to use
usedcols = ['IncidentDate', 'Zipcode', 'EstimatedPropertyLoss', 'EstimatedContentsLoss', 
     'SuppressionPersonnel', 'CivilianFatalities', 'CivilianInjuries', 'IgnitionCause', 
     'AreaofFireOrigin', 'PropertyUse', 'StructureStatus', 'DetectorsPresent']
ds = pd.read_csv('Fire_Incidents.csv', usecols=usedcols)
print("Count:", len(ds))

Count: 481531


In [4]:
# filter: only rows with some EstimatedPropertyLoss 
ds2 = ds[(ds['EstimatedPropertyLoss'].notna()) & (ds['EstimatedContentsLoss'].notna())]
print("Count:", len(ds2))

Count: 125989


In [5]:
# filter: remove row if there is nan in any other column
ds3 = ds2.dropna()
print("Count:", len(ds3))

Count: 6123


In [6]:
# print
ds4 = pd.DataFrame(ds3)
ds4[0:3]

Unnamed: 0,IncidentDate,Zipcode,SuppressionPersonnel,EstimatedPropertyLoss,EstimatedContentsLoss,CivilianFatalities,CivilianInjuries,PropertyUse,AreaofFireOrigin,IgnitionCause,StructureStatus,DetectorsPresent
38,05/30/2005,94124.0,4,0.0,0.0,0,0,807 - outside material storage area,"40 - storage area, other","0 - cause, other",-,-
81,02/25/2003,94109.0,22,3000.0,1000.0,0,0,"449 - hotel/motel, commercial","14 - common room, den, family/living room",2 - unintentional,2 -in normal use,1 -present
97,10/02/2005,94117.0,31,5000.0,0.0,0,0,429 - multifamily dwellings,"72 - exterior balcony, unenclosed porch",2 - unintentional,2 -in normal use,-


In [7]:
# convert: IncidentDate from date to month only
ds4['IncidentDate'] = ds4['IncidentDate'].apply(lambda row: datetime.strptime(row, '%m/%d/%Y').strftime('%-m'))

# convert: Zipcode to integer (it makes no sense for it to be float)
ds4['Zipcode'] = ds4['Zipcode'].astype('int64')

# convert: EstimatedPropertyLoss and EstimatedContentsLoss to integer (it makes no sense for it to be float)
ds4['EstimatedPropertyLoss'] = ds4['EstimatedPropertyLoss'].astype('int64')
ds4['EstimatedContentsLoss'] = ds4['EstimatedContentsLoss'].astype('int64')

In [8]:
# check: what are all possible values for PropertyUse?
#print_all(ds3['PropertyUse'].unique())

# convert: PropertyUse to categorical (integers)
ds4['PropertyUse'] = pd.factorize(ds4['PropertyUse'])[0]

In [9]:
# check: what are all possible values for AreaofFireOrigin?
#print_all(ds3['AreaofFireOrigin'].unique())

# filter: remove 'AreaofFireOrigin': -, 0
ds5 = ds4[(ds4['AreaofFireOrigin'] != '0 -') 
          & (ds4['AreaofFireOrigin'] != '-')]
print("Count:", len(ds5))

ds6 = pd.DataFrame(ds5)

# convert: AreaofFireOrigin to categorical (integers)
ds6['AreaofFireOrigin'] = pd.factorize(ds6['AreaofFireOrigin'])[0]

Count: 5639


In [10]:
# check: what are all possible values for IgnitionCause?
#print_all(ds6['IgnitionCause'].unique())

# filter: remove 'IgnitionCause': -
ds7 = ds6[(ds6['IgnitionCause'] != '-')]
print("Count:", len(ds7))

ds8 = pd.DataFrame(ds7)

# convert: IgnitionCause to categorical (integers)
ds8['IgnitionCause'] = pd.factorize(ds8['IgnitionCause'])[0]

Count: 5639


In [11]:
# check: what are all possible values for StructureStatus?
#print_all(ds8['StructureStatus'].unique())

# filter: remove 'StructureStatus': -
ds9 = ds8[(ds8['StructureStatus'] != '-')]
print("Count:", len(ds9))

ds10 = pd.DataFrame(ds9)

# convert: StructureStatus to categorical (integers)
ds10['StructureStatus'] = pd.factorize(ds10['StructureStatus'])[0]

Count: 5084


In [12]:
# check: what are all possible values for DetectorsPresent?
#print_all(ds10['DetectorsPresent'].unique())

# transform: it's either 0 = not present or 1 = present
ds10['DetectorsPresent'] = ds10['DetectorsPresent'].replace(to_replace='1 -present', value='1')
ds10['DetectorsPresent'] = ds10['DetectorsPresent'].replace(to_replace='1 present', value='1')
ds10['DetectorsPresent'] = ds10['DetectorsPresent'].replace(to_replace='n -not present', value='0')
ds10['DetectorsPresent'] = ds10['DetectorsPresent'].replace(to_replace='u -undetermined', value='0')
ds10['DetectorsPresent'] = ds10['DetectorsPresent'].replace(to_replace='n none present', value='0')
ds10['DetectorsPresent'] = ds10['DetectorsPresent'].replace(to_replace='u undetermined', value='0')

ds10['DetectorsPresent'] = pd.factorize(ds10['DetectorsPresent'])[0]

In [13]:
# print
ds10[0:10]

Unnamed: 0,IncidentDate,Zipcode,SuppressionPersonnel,EstimatedPropertyLoss,EstimatedContentsLoss,CivilianFatalities,CivilianInjuries,PropertyUse,AreaofFireOrigin,IgnitionCause,StructureStatus,DetectorsPresent
81,2,94109,22,3000,1000,0,0,1,1,1,0,0
97,10,94117,31,5000,0,0,0,2,2,1,0,1
276,3,94102,42,50,10,0,0,2,3,1,0,2
378,6,94110,32,0,0,0,0,2,4,1,0,2
428,3,94131,30,0,0,0,0,3,5,1,0,2
452,1,94105,32,20000,25000,0,0,4,6,1,1,2
453,2,94115,32,10000,5000,0,0,2,7,2,0,0
459,9,94122,4,2000,2000,0,0,5,6,1,0,2
580,4,94109,31,100,100,0,0,6,8,1,0,0
606,7,94102,31,0,0,0,0,1,4,1,0,0


In [14]:
# save dataset to disk
ds10.to_csv('sf_clean.csv', index=False)