In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from random import choice
from matplotlib.collections import PatchCollection
from matplotlib.patches import Polygon
import seaborn as sns

# Notebook-wide display settings: white-grid seaborn theme for every figure,
# and allow pandas to render up to 500 rows before truncating a frame.
sns.set_style(style="whitegrid")
pd.options.display.max_rows = 500

In [3]:
# Column -> dtype mapping handed to pd.read_csv for the storm-events table.
# Plain `int` is used for columns read as non-nullable integers; the
# capitalised 'Int64' / 'Float64' strings select pandas' nullable extension
# dtypes, and `str` columns are read as text.
dtype = dict(
    # --- event timing / identifiers ---
    BEGIN_YEARMONTH=int,
    BEGIN_DAY=int,
    BEGIN_TIME='Int64',
    END_YEARMONTH=int,
    END_DAY=int,
    END_TIME=int,
    EPISODE_ID='Int64',
    EVENT_ID='Int64',
    # --- location / classification ---
    STATE=str,
    STATE_FIPS='Int64',
    YEAR='Int64',
    MONTH_NAME=str,
    EVENT_TYPE=str,
    CZ_TYPE=str,
    CZ_FIPS='Int64',
    CZ_NAME=str,
    WFO=str,
    BEGIN_DATE_TIME=str,
    CZ_TIMEZONE=str,
    END_DATE_TIME=str,
    # --- human and economic impact ---
    INJURIES_DIRECT='Int64',
    INJURIES_INDIRECT='Int64',
    DEATHS_DIRECT='Int64',
    DEATHS_INDIRECT='Int64',
    DAMAGE_PROPERTY=str,
    DAMAGE_CROPS=str,
    SOURCE=str,
    # --- magnitude / tornado details ---
    MAGNITUDE='Float64',
    MAGNITUDE_TYPE=str,
    FLOOD_CAUSE=str,
    TOR_F_SCALE=str,
    TOR_LENGTH='Float64',
    TOR_WIDTH='Float64',
    TOR_OTHER_WFO=str,
    TOR_OTHER_CZ_STATE=str,
    TOR_OTHER_CZ_FIPS='Int64',
    TOR_OTHER_CZ_NAME=str,
    # --- begin / end geometry ---
    BEGIN_RANGE='Float64',
    BEGIN_AZIMUTH=str,
    BEGIN_LOCATION=str,
    END_RANGE='Float64',
    END_AZIMUTH=str,
    END_LOCATION=str,
    BEGIN_LAT='Float64',
    BEGIN_LON='Float64',
    END_LAT='Float64',
    END_LON='Float64',
    # --- narratives and derived columns ---
    EPISODE_NARRATIVE=str,
    EVENT_NARRATIVE=str,
    DATA_SOURCE=str,
    TOR_F_SCALE_MAPPED='str',
    DAMAGE_CROPS_DESUFFIX='Float64',
    DAMAGE_PROPERTY_DESUFFIX='Float64',
)

# Load the working storm-events table, applying the explicit per-column
# dtypes from `dtype` so pandas does not have to infer them.
tdf = pd.read_csv('../StormEvents_details_WORKING.csv', dtype=dtype)

# Rows without a mapped tornado scale get the literal label 'N/A'.
tdf['TOR_F_SCALE_MAPPED'] = tdf['TOR_F_SCALE_MAPPED'].fillna('N/A')

# BEGIN_YEARMONTH is read as an integer. Splitting it arithmetically is the
# vectorized equivalent of slicing the first four / last two characters of
# its decimal string, and avoids a Python-level call per row.
# NOTE(review): assumes every value is a six-digit YYYYMM number - confirm.
tdf['BEGIN_YEAR'] = tdf['BEGIN_YEARMONTH'] // 100
tdf['BEGIN_DECADE'] = (tdf['BEGIN_YEAR'] // 10) * 10
tdf['BEGIN_MONTH'] = tdf['BEGIN_YEARMONTH'] % 100

# Total people affected per row: direct + indirect deaths and injuries.
casualty_columns = ['DEATHS_DIRECT', 'DEATHS_INDIRECT', 'INJURIES_DIRECT', 'INJURIES_INDIRECT']
tdf['CASUALTIES'] = tdf[casualty_columns].sum(axis=1)

# The CSV's unnamed leading column is used as the per-row storm identifier.
# Rebinding instead of inplace=True keeps the statement chain-friendly.
tdf = tdf.rename(columns={'Unnamed: 0': 'STORM_ID'})

# Number of storm rows recorded under each episode (one row per EPISODE_ID,
# columns EPISODE_ID and STORM_ID where STORM_ID holds the count). Only the
# STORM_ID column is counted rather than every column of the frame.
grouped = tdf.groupby('EPISODE_ID')['STORM_ID'].count().reset_index()

# Episodes that contain more than one storm row.
multi_episodes = grouped.loc[grouped['STORM_ID'] > 1, 'EPISODE_ID']

# Keep the episode id only for rows that belong to a multi-storm episode;
# every other row becomes missing.
# `value in some_series` tests the Series *index labels*, not its values, so
# a per-row `e in multi_episodes` would compare episode ids against leftover
# row positions. `isin` compares against the values, which is what is meant.
# The result keeps EPISODE_ID's nullable dtype, so "not in a cluster" is a
# pandas missing value (use .isna() / .notna() to test for it).
in_cluster = tdf['EPISODE_ID'].isin(multi_episodes)
tdf['CLUSTER_EPISODE_ID'] = tdf['EPISODE_ID'].where(in_cluster)



In [4]:
# County population table (read with pandas' inferred dtypes), followed by a
# random ten-row preview as the cell's displayed output.
counties = pd.read_csv(filepath_or_buffer='../assets/county_population.csv')
counties.sample(n=10)

Unnamed: 0.1,Unnamed: 0,fips,county,state,year,population,latitude,longitude
38821,38821,22019,Calcasieu,Louisiana,2018,203177,30.23,-93.358
29392,29392,22007,Assumption,Louisiana,2015,22898,29.9,-91.051
115348,115348,42029,Chester,Pennsylvania,1982,325773,39.974,-75.75
125676,125676,1037,Coosa,Alabama,1986,11129,32.931,-86.243
17533,17533,35049,Santa Fe,New Mexico,2011,145847,35.514,-105.966
113368,113368,8069,Larimer,Colorado,1982,157410,40.663,-105.482
66945,66945,20165,Rush,Kansas,2007,3169,38.524,-99.309
54108,54108,18009,Blackford,Indiana,2003,13722,40.473,-85.324
21443,21443,48135,Ector,Texas,2012,144495,31.865,-102.543
155885,155885,37181,Vance,North Carolina,1975,34000,36.365,-78.405
