# Exploration of the US disaster declaration dataset

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 

In [22]:
# Location of the declarations CSV, relative to the current working directory.
dataset_path = "./datasets/us_disaster_declarations.csv"
# Read the whole CSV into a DataFrame.
df = pd.read_csv(filepath_or_buffer=dataset_path)

In [3]:
# Preview the first five rows of the freshly loaded table.
df.head(n=5)

Unnamed: 0,fema_declaration_string,disaster_number,state,declaration_type,declaration_date,fy_declared,incident_type,declaration_title,ih_program_declared,ia_program_declared,...,place_code,designated_area,declaration_request_number,last_ia_filing_date,incident_id,region,designated_incident_types,last_refresh,hash,id
0,DR-1-GA,1,GA,DR,1953-05-02T00:00:00Z,1953,Tornado,Tornado,0,1,...,0,Statewide,53013,,53013,4,,2024-08-27T18:22:14Z,413ff808d79f08a6710f6b78f361d5a7de692711,8943dfcf-9786-4e51-8889-d62014034bb2
1,DR-2-TX,2,TX,DR,1953-05-15T00:00:00Z,1953,Tornado,Tornado & Heavy Rainfall,0,1,...,0,Statewide,53003,,53003,6,W,2024-08-27T18:22:14Z,8a8bc885c003cb873c201bb6a3a2771a6d84efb1,ff821327-6b90-4246-b19f-fff8c4b288a8
2,DR-3-LA,3,LA,DR,1953-05-29T00:00:00Z,1953,Flood,Flood,0,1,...,0,Statewide,53005,,53005,6,,2024-08-27T18:22:14Z,b6e6f19ae3c0d2383b7b873b8495bd2770f2ff9a,cd461e08-5ac9-4e70-8507-9c7a3cbff265
3,DR-4-MI,4,MI,DR,1953-06-02T00:00:00Z,1953,Tornado,Tornado,0,1,...,0,Statewide,53004,,53004,5,,2024-08-27T18:22:14Z,34f0061012c8069f145d56a3537cd327b7d4e49b,53be0c04-d2ae-42fb-b070-a01b0a50b7f6
4,DR-5-MT,5,MT,DR,1953-06-06T00:00:00Z,1953,Flood,Floods,0,1,...,0,Statewide,53006,,53006,8,,2024-08-27T18:22:14Z,3bdbec258e4640c3f02971dbc1f9dbc3ebbfc96a,4b3ed0ac-299b-49f0-80d4-9a2a6bacd5a4


## Data preparation

operations to be carried out:
- drop every col outside incident_type, declaration_title, declaration_date, state, place_code, incident_begin_date, incident_end_date
- remove all rows with NaN data or a missing incident_type
- encode state col
- filter only useful incident_type

drop useless cols

In [23]:
# Columns the analysis needs; everything else in the frame is discarded.
keep_cols = [
    'incident_type',
    'declaration_title',
    'declaration_date',
    'state',
    'place_code',
    'incident_begin_date',
    'incident_end_date',
]
# Index.difference yields the column labels that are NOT in keep_cols.
drop_cols = df.columns.difference(keep_cols)
# Remove those columns from df in place.
df.drop(labels=drop_cols, axis="columns", inplace=True)
df.head()

Unnamed: 0,state,declaration_date,incident_type,declaration_title,incident_begin_date,incident_end_date,place_code
0,GA,1953-05-02T00:00:00Z,Tornado,Tornado,1953-05-02T00:00:00Z,1953-05-02T00:00:00Z,0
1,TX,1953-05-15T00:00:00Z,Tornado,Tornado & Heavy Rainfall,1953-05-15T00:00:00Z,1953-05-15T00:00:00Z,0
2,LA,1953-05-29T00:00:00Z,Flood,Flood,1953-05-29T00:00:00Z,1953-05-29T00:00:00Z,0
3,MI,1953-06-02T00:00:00Z,Tornado,Tornado,1953-06-02T00:00:00Z,1953-06-02T00:00:00Z,0
4,MT,1953-06-06T00:00:00Z,Flood,Floods,1953-06-06T00:00:00Z,1953-06-06T00:00:00Z,0


remove NaN rows (start by counting the missing values per column)

In [24]:
# Number of missing values in each column.
df.isna().sum(axis=0)

state                    0
declaration_date         0
incident_type            0
declaration_title        0
incident_begin_date      0
incident_end_date      522
place_code               0
dtype: int64

filter out useless incident types

In [27]:
# Distinct incident types present in the data.
set(df.loc[:, "incident_type"])

{'Biological',
 'Chemical',
 'Coastal Storm',
 'Dam/Levee Break',
 'Drought',
 'Earthquake',
 'Fire',
 'Fishing Losses',
 'Flood',
 'Freezing',
 'Human Cause',
 'Hurricane',
 'Mud/Landslide',
 'Other',
 'Severe Ice Storm',
 'Severe Storm',
 'Snowstorm',
 'Straight-Line Winds',
 'Terrorist',
 'Tornado',
 'Toxic Substances',
 'Tropical Depression',
 'Tropical Storm',
 'Tsunami',
 'Typhoon',
 'Volcanic Eruption',
 'Winter Storm'}

In [28]:
# Incident types to keep; rows with any other incident_type are discarded.
# Each type appears exactly once in the list.
valuable_types = ['Coastal Storm', 'Drought', 'Fire', 'Flood', 'Freezing', 'Hurricane',
                  'Mud/Landslide', 'Severe Ice Storm', 'Snowstorm', 'Severe Storm', 'Tornado',
                  'Tropical Storm', 'Tsunami', 'Typhoon', 'Winter Storm']
# .copy() makes the filtered frame independent of the unfiltered one, so later
# column assignments on df do not raise SettingWithCopyWarning.
df = df[df['incident_type'].isin(valuable_types)].copy()
# Remaining rows per incident type, most frequent first.
df['incident_type'].value_counts()

incident_type
Severe Storm        19267
Hurricane           13721
Flood               11204
Fire                 3843
Snowstorm            3707
Severe Ice Storm     2956
Tornado              1623
Drought              1292
Tropical Storm       1059
Coastal Storm         637
Freezing              301
Winter Storm          149
Typhoon               130
Mud/Landslide          44
Tsunami                 9
Name: count, dtype: int64

In [33]:
# len(df) is the row count; no need to derive it from df.size, which would
# also divide by zero on a frame with no columns.
print("Final dataset size: ", len(df))

Final dataset size:  59942


## Data visualization