# Exploratory data analysis on FEMA disasters list
Importing and cleaning data from this site:


https://www.fema.gov/openfema-dataset-disaster-declarations-summaries-v1


In [1]:
import pandas as pd

In [2]:
disaster_df = pd.read_csv('../data/DisasterDeclarationsSummaries.csv')

In [3]:
disaster_df.isna().sum().sum()

25234

In [4]:
disaster_df.isna().sum()

disasterNumber              0
ihProgramDeclared           0
iaProgramDeclared           0
paProgramDeclared           0
hmProgramDeclared           0
state                       0
declarationDate             0
fyDeclared                  0
disasterType                0
incidentType                0
title                       0
incidentBeginDate           0
incidentEndDate          8033
disasterCloseOutDate    16585
declaredCountyArea        235
placeCode                 235
hash                       73
lastRefresh                73
id                          0
dtype: int64

In [5]:
disaster_df.shape

(58771, 19)

In [6]:
def null_cleaning(df):
    null_cols = []
    problem_cols = []
    
    # looping through the columns
    for i in range(df.shape[1]):
        # counting the number of null vals in the column
        num_nulls = df.isnull().sum()[i]
        # if there are more than 0 null values, add the column to our list, and see what % of the data it is
        if num_nulls != 0:
            col_name = df.columns[i]
            percent_of_data = round(num_nulls/df.shape[0], 3)
            null_cols.append([col_name, num_nulls, percent_of_data])
            # if this column is over 20% nulls, then mark it as a problem column
            if percent_of_data >= 0.2:
                problem_cols.append(col_name)
                
    # what happens if we drop the problem columns?
    df_no_prob_cols = df.drop(columns = problem_cols)
    prob_cols_percent = df_no_prob_cols.shape[1] / df.shape[1]
    
    #what happens is we drop the problem rows too?
    df_no_nulls = df_no_prob_cols.dropna()
    prob_rows_percent = df_no_nulls.shape[0]/df_no_prob_cols.shape[0]
    
    print("The problem columns are: ", problem_cols)
    print("After dropping the problem columns, you are left with ", prob_cols_percent, "% of your columns.")
    print("After dropping the problem columns, and then dropping all rows containing nulls,\n you are left with ", 
          prob_rows_percent, "% of your rows.")
    return null_cols


In [7]:
null_cleaning(disaster_df)

The problem columns are:  ['disasterCloseOutDate']
After dropping the problem columns, you are left with  0.9473684210526315 % of your columns.
After dropping the problem columns, and then dropping all rows containing nulls,
 you are left with  0.8585356723554134 % of your rows.


[['incidentEndDate', 8033, 0.137],
 ['disasterCloseOutDate', 16585, 0.282],
 ['declaredCountyArea', 235, 0.004],
 ['placeCode', 235, 0.004],
 ['hash', 73, 0.001],
 ['lastRefresh', 73, 0.001]]

In [8]:
begin_vs_end_date = disaster_df[(disaster_df['incidentBeginDate']) != (disaster_df['incidentEndDate'])]

In [9]:
begin_vs_end_date.shape

(50576, 19)

In [10]:
# begin and end date are only different on 18 out of over 50,000 entries, 
# and end date has many nans
# so I will drop this col
disaster_df.drop(columns = ['incidentEndDate'], inplace = True)

In [11]:
# incidentType and title are redundant, incidentType is more generic and what would be an input into the model
# so let's drop title
disaster_df.drop(columns = ['title'], inplace = True)

In [12]:
# changing declaration date to date time
disaster_df['declarationDate'][0]

'1953-05-02T00:00:00.000Z'

In [13]:
first_time = pd.to_datetime(disaster_df['declarationDate'][0])

In [14]:
first_time.year

1953

In [15]:
first_time.month

5

In [16]:
first_time.month == 5

True

In [17]:
disaster_df['declaration_dt'] = pd.to_datetime(disaster_df['declarationDate'])

In [18]:
disaster_df['year'] = [date.year for date in disaster_df['declaration_dt']]

In [19]:
disaster_df['year'].head()

0    1953
1    1953
2    1953
3    1953
4    1953
Name: year, dtype: int64

In [20]:
disaster_df_1976_on = disaster_df[disaster_df['year'] > 1975]

In [21]:
disaster_df_1976_on.head()

Unnamed: 0,disasterNumber,ihProgramDeclared,iaProgramDeclared,paProgramDeclared,hmProgramDeclared,state,declarationDate,fyDeclared,disasterType,incidentType,incidentBeginDate,disasterCloseOutDate,declaredCountyArea,placeCode,hash,lastRefresh,id,declaration_dt,year
4478,495,0,1,1,0,MI,1976-03-19T00:00:00.000Z,1976,DR,Severe Storm(s),1976-03-19T00:00:00.000Z,1980-06-26T00:00:00.000Z,Gladwin (County),99051.0,efebbaba7017650dd16dc65a3ad8a718,2019-07-26T18:09:01.198Z,5d1bbd9e8bdcfa6efb330f30,1976-03-19 00:00:00+00:00,1976
4484,495,0,1,1,0,MI,1976-03-19T00:00:00.000Z,1976,DR,Severe Storm(s),1976-03-19T00:00:00.000Z,1980-06-26T00:00:00.000Z,Clare (County),99035.0,e296ea5530ecb58080328e8fee2c1a54,2019-07-26T18:09:01.198Z,5d1bbd9e8bdcfa6efb330f40,1976-03-19 00:00:00+00:00,1976
4485,494,0,1,1,0,NY,1976-03-19T00:00:00.000Z,1976,DR,Severe Ice Storm,1976-03-19T00:00:00.000Z,1991-09-27T00:00:00.000Z,Chautauqua (County),99013.0,14841f57e66b7b871250c134ee7791f3,2019-07-26T18:09:01.198Z,5d1bbd9e8bdcfa6efb330f27,1976-03-19 00:00:00+00:00,1976
4487,494,0,1,1,0,NY,1976-03-19T00:00:00.000Z,1976,DR,Severe Ice Storm,1976-03-19T00:00:00.000Z,1991-09-27T00:00:00.000Z,Genesee (County),99037.0,eea03bc9b9317a92e4a5dd549ba4b93a,2019-07-26T18:09:01.202Z,5d1bbd9e8bdcfa6efb330f29,1976-03-19 00:00:00+00:00,1976
4488,494,0,1,1,0,NY,1976-03-19T00:00:00.000Z,1976,DR,Severe Ice Storm,1976-03-19T00:00:00.000Z,1991-09-27T00:00:00.000Z,Wyoming (County),99121.0,a45ed7f6c6a3d9e4d3ec659fe73f9cab,2019-07-26T18:09:01.200Z,5d1bbd9e8bdcfa6efb330f2a,1976-03-19 00:00:00+00:00,1976


In [22]:
max(disaster_df_1976_on['year'])

2020

In [23]:
min(disaster_df_1976_on['year'])

1976

In [24]:
disaster_df_1976_on.shape

(54202, 19)

In [25]:
disaster_df.shape

(58771, 19)

In [26]:
df = disaster_df_1976_on.copy()

In [27]:
null_cleaning(df)

The problem columns are:  ['disasterCloseOutDate']
After dropping the problem columns, you are left with  0.9473684210526315 % of your columns.
After dropping the problem columns, and then dropping all rows containing nulls,
 you are left with  0.9977122615401646 % of your rows.


[['disasterCloseOutDate', 16585, 0.306],
 ['declaredCountyArea', 54, 0.001],
 ['placeCode', 54, 0.001],
 ['hash', 73, 0.001],
 ['lastRefresh', 73, 0.001]]

In [28]:
# we are just specifying state, not county, so I will drop the county col
# also don't need last refresh or hash
df.drop(columns = ['declaredCountyArea', 'lastRefresh', 'hash'], inplace = True)

In [29]:
df.drop(columns = ['id', 'placeCode','declarationDate', 'fyDeclared'], inplace = True)

In [30]:
null_cleaning(df)

The problem columns are:  ['disasterCloseOutDate']
After dropping the problem columns, you are left with  0.9166666666666666 % of your columns.
After dropping the problem columns, and then dropping all rows containing nulls,
 you are left with  1.0 % of your rows.


[['disasterCloseOutDate', 16585, 0.306]]

In [31]:
# what percent of our data is missing the closeout date?
16585/df.shape[0]

0.3059850190029888

In [32]:
# a third of our data would be lost so let's just drop the closeout date column
df.drop(columns = ['disasterCloseOutDate'], inplace = True)

In [33]:
df.isna().sum().sum()

0

In [34]:
# dropping nans is done!

In [35]:
# is df['ihProgramDeclared'] the same for every row?
df['ihProgramDeclared'].value_counts()

0    45035
1     9167
Name: ihProgramDeclared, dtype: int64

In [36]:
disaster_df[disaster_df['incidentBeginDate'] != disaster_df['declarationDate']].shape

(50484, 19)

In [37]:
# dropping incident begin date
# it's same as declaration date for all but 19 rows
df.drop(columns = ['incidentBeginDate'], inplace = True)

In [40]:
disaster_df[disaster_df['disasterNumber'] == 495].head()

Unnamed: 0,disasterNumber,ihProgramDeclared,iaProgramDeclared,paProgramDeclared,hmProgramDeclared,state,declarationDate,fyDeclared,disasterType,incidentType,incidentBeginDate,disasterCloseOutDate,declaredCountyArea,placeCode,hash,lastRefresh,id,declaration_dt,year
4478,495,0,1,1,0,MI,1976-03-19T00:00:00.000Z,1976,DR,Severe Storm(s),1976-03-19T00:00:00.000Z,1980-06-26T00:00:00.000Z,Gladwin (County),99051.0,efebbaba7017650dd16dc65a3ad8a718,2019-07-26T18:09:01.198Z,5d1bbd9e8bdcfa6efb330f30,1976-03-19 00:00:00+00:00,1976
4484,495,0,1,1,0,MI,1976-03-19T00:00:00.000Z,1976,DR,Severe Storm(s),1976-03-19T00:00:00.000Z,1980-06-26T00:00:00.000Z,Clare (County),99035.0,e296ea5530ecb58080328e8fee2c1a54,2019-07-26T18:09:01.198Z,5d1bbd9e8bdcfa6efb330f40,1976-03-19 00:00:00+00:00,1976
4490,495,0,1,1,0,MI,1976-03-19T00:00:00.000Z,1976,DR,Severe Storm(s),1976-03-19T00:00:00.000Z,1980-06-26T00:00:00.000Z,Allegan (County),99005.0,98cd5c9663b101763e19111e08120e48,2019-07-26T18:09:01.198Z,5d1bbd9e8bdcfa6efb330f2d,1976-03-19 00:00:00+00:00,1976
4494,495,0,1,1,0,MI,1976-03-19T00:00:00.000Z,1976,DR,Severe Storm(s),1976-03-19T00:00:00.000Z,1980-06-26T00:00:00.000Z,Clinton (County),99037.0,be0c73cf6cb57d74e2d51898bc613d90,2019-07-26T18:09:01.202Z,5d1bbd9e8bdcfa6efb330f33,1976-03-19 00:00:00+00:00,1976
4500,495,0,1,1,0,MI,1976-03-19T00:00:00.000Z,1976,DR,Severe Storm(s),1976-03-19T00:00:00.000Z,1980-06-26T00:00:00.000Z,St. Clair (County),99147.0,5a4c394af5ebb6a52fb5f690abccb73a,2019-07-26T18:09:01.198Z,5d1bbd9e8bdcfa6efb330f53,1976-03-19 00:00:00+00:00,1976


In [41]:
df['month'] = [date.month for date in df['declaration_dt']]

In [42]:
# dropping delcaration date because we already have the 
# year and month
df.drop(columns = ['declaration_dt'], inplace = True)

In [43]:
# dropping program declared cols
df.drop(columns = ['ihProgramDeclared','iaProgramDeclared','paProgramDeclared','hmProgramDeclared'], inplace = True)

In [44]:
df[df['disasterNumber'] == 495].head()

Unnamed: 0,disasterNumber,state,disasterType,incidentType,year,month
4478,495,MI,DR,Severe Storm(s),1976,3
4484,495,MI,DR,Severe Storm(s),1976,3
4490,495,MI,DR,Severe Storm(s),1976,3
4494,495,MI,DR,Severe Storm(s),1976,3
4500,495,MI,DR,Severe Storm(s),1976,3


In [45]:
df.drop_duplicates().shape

(3613, 6)

In [46]:
df.drop_duplicates(inplace = True)

In [47]:
df.head()

Unnamed: 0,disasterNumber,state,disasterType,incidentType,year,month
4478,495,MI,DR,Severe Storm(s),1976,3
4485,494,NY,DR,Severe Ice Storm,1976,3
4525,496,WI,DR,Flood,1976,3
4570,497,OK,DR,Tornado,1976,4
4574,498,AR,DR,Tornado,1976,4


In [48]:
df['disasterType'].value_counts()

DR    2050
FM     842
EM     508
FS     213
Name: disasterType, dtype: int64

In [49]:
df['incidentType'].value_counts()

Fire                1128
Severe Storm(s)      953
Flood                515
Hurricane            327
Snow                 163
Biological           151
Tornado              107
Severe Ice Storm      61
Typhoon               56
Drought               36
Earthquake            27
Coastal Storm         25
Other                 16
Freezing              16
Mud/Landslide          6
Toxic Substances       6
Volcano                5
Fishing Losses         4
Human Cause            3
Tsunami                3
Terrorist              2
Dam/Levee Break        2
Chemical               1
Name: incidentType, dtype: int64

In [50]:
# dropping disasterType
# incidentType is going to be input into model
df.drop(columns = ['disasterType'], inplace = True)

In [51]:
df.head()

Unnamed: 0,disasterNumber,state,incidentType,year,month
4478,495,MI,Severe Storm(s),1976,3
4485,494,NY,Severe Ice Storm,1976,3
4525,496,WI,Flood,1976,3
4570,497,OK,Tornado,1976,4
4574,498,AR,Tornado,1976,4


In [52]:
df.to_csv('../data/disasters_clean.csv')