In [None]:
import pandas as pd
from autogluon.tabular import TabularPredictor
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import numpy as np
import math

In [127]:
stormevents2013 = pd.read_csv("../data/StormEvents_details-ftp_v1.0_d2013_c20250520.csv")

In [88]:
def convert_to_number(x):
    if pd.isna(x):
        return None

    if 'K' in x:
        return float(x.replace('K', '')) * 1000
    elif 'M' in x:
        return float(x.replace('M', '')) * 1000000
    else:
        try:
            return float(x)
        except ValueError:
            return None
    return x


In [128]:
stormevents2013['DAMAGE_PROPERTY_NUM'] = stormevents2013['DAMAGE_PROPERTY'].apply(convert_to_number)
stormevents2013['DAMAGE_CROPS_NUM'] = stormevents2013['DAMAGE_CROPS'].apply(convert_to_number)


In [129]:
#Drop both damages when NA, make 0 when only one is NA
stormevents2013 = stormevents2013.dropna(subset=['DAMAGE_PROPERTY_NUM', 'DAMAGE_CROPS_NUM'], how='all')
stormevents2013['was_missing_damages'] = stormevents2013[['DAMAGE_PROPERTY_NUM', 'DAMAGE_CROPS_NUM']].isna().any(axis=1).astype(int)
stormevents2013[['DAMAGE_PROPERTY_NUM', 'DAMAGE_CROPS_NUM']] = stormevents2013[['DAMAGE_PROPERTY_NUM', 'DAMAGE_CROPS_NUM']].fillna(0)


In [None]:
# Add total damages
stormevents2013['total_damages'] = stormevents2013['DAMAGE_PROPERTY_NUM'] + stormevents2013['DAMAGE_CROPS_NUM']

(52259, 55)

In [131]:
#Change be and end date time to dtype datetime and get duration of the event
stormevents2013['BEGIN_DATE_TIME'] = pd.to_datetime(stormevents2013['BEGIN_DATE_TIME'])
stormevents2013['END_DATE_TIME'] = pd.to_datetime(stormevents2013['END_DATE_TIME'])
stormevents2013['event_duration'] = stormevents2013['END_DATE_TIME'] - stormevents2013['BEGIN_DATE_TIME'] 
#Drop time columns but the date_time columns
stormevents2013 = stormevents2013.drop(['BEGIN_YEARMONTH', 'BEGIN_DAY', 'BEGIN_TIME', 'CZ_TIMEZONE', 'END_YEARMONTH', 'END_DAY', 'END_TIME', 'BEGIN_DATE_TIME', 'END_DATE_TIME', 'YEAR', 'MONTH_NAME'], axis = 1)


  stormevents2013['BEGIN_DATE_TIME'] = pd.to_datetime(stormevents2013['BEGIN_DATE_TIME'])
  stormevents2013['END_DATE_TIME'] = pd.to_datetime(stormevents2013['END_DATE_TIME'])


In [166]:
# Convert them to total seconds duration
stormevents2013['event_duration'] = stormevents2013['event_duration'].dt.total_seconds() # convert to seconds

# Confirm conversion
stormevents2013['event_duration'].head()

0    118800.0
1     57600.0
2    151200.0
3         0.0
4     93600.0
Name: event_duration, dtype: float64

In [132]:
#Drop the damages columns (singluar)
stormevents2013 = stormevents2013.drop(['DAMAGE_PROPERTY', 'DAMAGE_CROPS', 'DAMAGE_PROPERTY_NUM', 'DAMAGE_CROPS_NUM'], axis = 1)
#Drop the episode and event id
stormevents2013 = stormevents2013.drop(['EPISODE_ID', 'EVENT_ID'], axis = 1)

In [146]:
# --- Handle and engineer spatial features ---
for col in ['BEGIN_LAT','BEGIN_LON','END_LAT','END_LON']:
    stormevents2013[col] = pd.to_numeric(stormevents2013[col], errors='coerce')

stormevents2013['END_LAT'] = stormevents2013['END_LAT'].fillna(stormevents2013['BEGIN_LAT'])
stormevents2013['END_LON'] = stormevents2013['END_LON'].fillna(stormevents2013['BEGIN_LON'])

# Distance in km
#def haversine(lat1, lon1, lat2, lon2):
#    R = 6371
#    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
#    dlat, dlon = lat2 - lat1, lon2 - lon1
#    a = np.sin(dlat/2)*2 + np.cos(lat1)*np.cos(lat2)*np.sin(dlon/2)*2
#    return 2 * R * np.arcsin(np.sqrt(a))

def haversine(lat1, lon1, lat2, lon2):
    
    # distance between latitudes
    # and longitudes
    dLat = (lat2 - lat1) * math.pi / 180.0
    dLon = (lon2 - lon1) * math.pi / 180.0

    # convert to radians
    lat1 = (lat1) * math.pi / 180.0
    lat2 = (lat2) * math.pi / 180.0

    # apply formulae
    a = (pow(math.sin(dLat / 2), 2) + pow(math.sin(dLon / 2), 2) * math.cos(lat1) * math.cos(lat2))
    rad = 6371
    c = 2 * math.asin(math.sqrt(a))
    return rad * c


stormevents2013['EVENT_LENGTH_KM'] = stormevents2013.apply(lambda row: haversine(row['BEGIN_LAT'], row['BEGIN_LON'], row['END_LAT'], row['END_LON']), axis=1)

#stormevents2013 = stormevents2013.drop(columns=['BEGIN_LAT','BEGIN_LON','END_LAT','END_LON'], errors='ignore')

In [None]:
#from geopy.distance import geodesic

#stormevents2013['distance_km'] = stormevents2013.apply(
#    lambda row: geodesic((row.BEGIN_LAT, row.BEGIN_LON), (row.END_LAT, row.END_LON)).km, axis=1
#)

ModuleNotFoundError: No module named 'geopy'

In [167]:
train_stormevents2013, test_stormevents2013 = train_test_split(stormevents2013,
                                                    random_state=42, 
                                                    test_size = 1/3,
                                                    shuffle=True)

In [168]:
print(train_stormevents2013.shape)

print(test_stormevents2013.shape)
stormevents2013['event_duration']

(34839, 40)
(17420, 40)


0        118800.0
1         57600.0
2        151200.0
3             0.0
4         93600.0
           ...   
59980      6300.0
59981      6840.0
59983       600.0
59984         0.0
59985         0.0
Name: event_duration, Length: 52259, dtype: float64

In [None]:
predictor = TabularPredictor(label='total_damages').fit(train_data=train_stormevents2013, time_limit=600)

No path specified. Models will be saved in: "AutogluonModels\ag-20251015_151258"
Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.4.0
Python Version:     3.11.4
Operating System:   Windows
Platform Machine:   AMD64
Platform Version:   10.0.22631
CPU Count:          8
Memory Avail:       0.45 GB / 7.89 GB (5.7%)
Disk Space Avail:   821.76 GB / 931.51 GB (88.2%)
No presets specified! To achieve strong results with AutoGluon, it is recommended to use the available presets. Defaulting to `'medium'`...
	Recommended Presets (For more details refer to https://auto.gluon.ai/stable/tutorials/tabular/tabular-essentials.html#presets):
	presets='extreme' : New in v1.4: Massively better than 'best' on datasets <30000 samples by using new models meta-learned on https://tabarena.ai: TabPFNv2, TabICL, Mitra, and TabM. Absolute best accuracy. Requires a GPU. Recommended 64 GB CPU memory and 32+ GB GPU memory.
	presets='best'    : Maximize accuracy. Recommended for most users. Use in competitions a

In [None]:
predictor.leaderboard(silent=False)

These features in provided data are not utilized by the predictor and will be ignored: ['END_YEARMONTH', 'YEAR', 'DATA_SOURCE']
Computing feature importance via permutation shuffling for 49 features using 100 rows with 5 shuffle sets...


                 model     score_val              eval_metric  pred_time_val    fit_time  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0      RandomForestMSE -3.086176e+06  root_mean_squared_error       0.116967  288.641121                0.116967         288.641121            1       True          3
1  WeightedEnsemble_L2 -3.086176e+06  root_mean_squared_error       0.119331  288.749012                0.002364           0.107891            2       True          7
2        ExtraTreesMSE -3.898738e+06  root_mean_squared_error       0.068081  113.920583                0.068081         113.920583            1       True          4
3      NeuralNetFastAI -4.131118e+06  root_mean_squared_error       0.313870   40.374139                0.313870          40.374139            1       True          5
4           LightGBMXT -4.167347e+06  root_mean_squared_error       0.057502   54.101977                0.057502          54.101977            1       True          

	103.82s	= Expected runtime (20.76s per shuffle set)


KeyboardInterrupt: 

In [None]:
importance = predictor.feature_importance(test_stormevents2013, subsample_size = 100)

Unnamed: 0,importance,stddev,p_value,n,p99_high,p99_low
EPISODE_NARRATIVE,776942.34583,1436538.0,0.146556,5,3734793.0,-2180909.0
BEGIN_DATE_TIME,85235.11948,99708.02,0.064262,5,290535.3,-120065.1
DAMAGE_PROPERTY,67518.598879,95156.24,0.093894,5,263446.6,-128409.4
WFO,25549.971204,41340.85,0.119573,5,110671.4,-59571.41
MAGNITUDE,10217.075843,19601.92,0.154288,5,50577.71,-30143.56
END_DATE_TIME,8932.033948,22258.53,0.210142,5,54762.66,-36898.59
END_AZIMUTH,7448.510976,19395.39,0.219452,5,47383.88,-32486.86
EVENT_ID,5345.54577,10183.94,0.152813,5,26314.42,-15623.33
END_LOCATION,3921.301535,7009.882,0.13958,5,18354.75,-10512.14
BEGIN_RANGE,1412.298387,2799.124,0.161159,5,7175.733,-4351.136


In [None]:
print(importance)

In [None]:
pd.crosstab(stormevents2013['DAMAGE_PROPERTY_NUM'].isna(), stormevents2013['DAMAGE_CROPS_NUM'].isna())
#stormevents2013['DAMAGE_CROPS_NUM'].isna()
#plt.hist(stormevents2013['DAMAGE_CROPS_NUM'], bins = 10)
stormevents2013[['BEGIN_DATE_TIME', 'BEGIN_YEARMONTH', 'BEGIN_DAY', 'BEGIN_TIME']].head()

Unnamed: 0,BEGIN_DATE_TIME,BEGIN_YEARMONTH,BEGIN_DAY,BEGIN_TIME
0,23-FEB-13 19:00:00,201302,23,1900
1,14-DEC-13 21:00:00,201312,14,2100
2,07-MAR-13 15:00:00,201303,7,1500
3,07-OCT-13 18:30:00,201310,7,1830
4,08-FEB-13 15:00:00,201302,8,1500
