In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [3]:
KM_PER_NM = 1.852
def calculate_ellipse_area(r_ne, r_se, r_sw, r_nw):
	if any(np.isnan(x) for x in [r_ne, r_se, r_sw, r_nw]):
		return np.nan
	a = KM_PER_NM * (r_ne + r_sw)
	b = KM_PER_NM * (r_nw + r_se)
	area = np.pi * a * b
	return np.nan if area == 0 else area

In [4]:
def calculate_circle_area(r):
	if np.isnan(r) or r == 0:
		return np.nan
	return np.pi * (KM_PER_NM * r) ** 2

In [6]:
df = pd.read_csv("../data/raw/hurdat2_cleaned.csv", parse_dates=["formation_datetime"])
df['year'] = df.formation_datetime.dt.year
df = df[df['year'] >= 2004]

In [7]:
speeds = [34, 50, 64]
for v in speeds:
	df[f'size_{v}kt_sqkm'] = df.apply(axis = 1, func = lambda row: calculate_ellipse_area(row[f'r_{v}kt_ne'], row[f'r_{v}kt_se'], row[f'r_{v}kt_sw'], row[f'r_{v}kt_nw']))

df['size_max_sqkm'] = df.apply(axis = 1, func = lambda row: calculate_circle_area(row['r_max_sus']))

In [8]:
gb = df.groupby('code').agg(
	{
		'size_34kt_sqkm': 'max',
		'size_50kt_sqkm': 'max',
		'size_64kt_sqkm': 'max',
		'size_max_sqkm': 'max'
	}
)
gb.reset_index(inplace = True)

In [9]:
lst = ['size_34kt_sqkm', 'size_50kt_sqkm', 'size_64kt_sqkm', 'size_max_sqkm']
gb.describe()

Unnamed: 0,size_34kt_sqkm,size_50kt_sqkm,size_64kt_sqkm,size_max_sqkm
count,343.0,234.0,160.0,110.0
mean,1104759.0,279026.6,80540.774086,114468.6
std,1863165.0,392722.2,104750.776762,235889.5
min,4310.144,269.384,1077.536121,269.384
25%,129304.3,38791.3,12930.433451,9697.825
50%,387913.0,101827.2,35558.691989,52799.27
75%,1280113.0,348044.2,111457.642505,107753.6
max,16757840.0,2143219.0,568939.071831,1724058.0


In [10]:
gb_filled = gb.fillna(0) # can't use nan in max

In [11]:
gb_filled['biggest'] = gb_filled[lst].max(axis=1)
len(gb_filled[gb_filled['biggest'] != 0])

349

In [12]:
results = gb_filled[['code','biggest']]
results.columns = ['code','peak_size_sqkm']
results

Unnamed: 0,code,peak_size_sqkm
0,AL012004,1.535489e+06
1,AL012005,2.941674e+05
2,AL012006,1.803526e+06
3,AL012007,3.025721e+06
4,AL012008,3.771376e+05
...,...,...
370,AL292020,6.465217e+05
371,AL302005,5.689391e+05
372,AL302020,8.350905e+05
373,AL312005,4.714221e+05


In [13]:
results.loc[results.peak_size_sqkm == 0,'peak_size_sqkm'] = np.nan
results

Unnamed: 0,code,peak_size_sqkm
0,AL012004,1.535489e+06
1,AL012005,2.941674e+05
2,AL012006,1.803526e+06
3,AL012007,3.025721e+06
4,AL012008,3.771376e+05
...,...,...
370,AL292020,6.465217e+05
371,AL302005,5.689391e+05
372,AL302020,8.350905e+05
373,AL312005,4.714221e+05


In [14]:
results.to_csv('../data/processed/outcomes/size.csv', index = False)