# HURDAT2 to ML Features Workflow

Transform HURDAT2 Atlantic hurricane data into census-tract-level features using the **Max Distance Envelope Approach**.

**Input**: Raw HURDAT2 text file  
**Output**: CSV where each row = one storm's impact on one census tract  
**Key Innovation**: Envelope polygon method for efficient wind field modeling

---

## Notebook Structure (7 Sections, 22 Cells)

1. **Data Acquisition & Basic Parsing** (Cells 1-3)
2. **Data Profiling & Understanding** (Cells 4-6) 
3. **Single Storm Envelope (Hurricane Ida Test)** (Cells 7-10)
4. **Census Tract Integration** (Cells 11-13)
5. **Wind Speed Calculations** (Cells 14-16)
6. **Scale to Multiple Storms** (Cells 17-19)
7. **Export & Validation** (Cells 20-22)

---

## Section 1: Data Acquisition & Basic Parsing

Parse raw HURDAT2 format → clean DataFrame

In [6]:
# Cell 1: Download HURDAT2 data
import os
import requests
import pandas as pd
import numpy as np
from pathlib import Path

# Set up paths (resolved relative to the notebook's working directory)
base_dir = Path("..").resolve()
input_dir = base_dir / "input_data"
output_dir = base_dir / "outputs"

# Create directories if they don't exist
input_dir.mkdir(exist_ok=True)
output_dir.mkdir(exist_ok=True)

# Download HURDAT2 Atlantic data if not present
# Alternative: use raw GitHub source or archive.org mirror
hurdat_url = "https://www.nhc.noaa.gov/data/hurdat/hurdat2-1851-2024-040425.txt"
hurdat_file = input_dir / "hurdat2-atlantic.txt"

if not hurdat_file.exists():
    print("Downloading HURDAT2 Atlantic data...")

    # Try with headers to mimic browser request
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    # Always pass a timeout: without one, requests waits forever on a stalled
    # connection and the notebook cell never returns.
    response = requests.get(hurdat_url, headers=headers, timeout=60)
    response.raise_for_status()

    # Check if we got HTML instead of text data.
    # Only leading whitespace matters here, so lstrip() is enough.
    if response.text.lstrip().startswith('<'):
        print("ERROR: Got HTML page instead of data file")
        print("Manual download required from: https://www.nhc.noaa.gov/data/hurdat/")
        print("Please download hurdat2-1851-2024-040425.txt manually")
    else:
        # Save the payload byte-for-byte. Writing response.text through a
        # text-mode handle would re-encode it with the platform default
        # encoding and translate newlines on Windows.
        hurdat_file.write_bytes(response.content)
        print(f"Downloaded to {hurdat_file}")
else:
    print(f"HURDAT2 file already exists: {hurdat_file}")

# Report the on-disk size whenever the file is available (fresh or cached)
if hurdat_file.exists():
    print(f"File size: {hurdat_file.stat().st_size:,} bytes")

HURDAT2 file already exists: /Users/Michael/hurricane-data-etl/hurdat2/input_data/hurdat2-atlantic.txt
File size: 7,034,638 bytes


In [None]:
# Cell 2: Parse HURDAT2 format and create initial DataFrame
import sys
import importlib

# Make the project's src directory importable from the notebook
sys.path.append('../src')

# Reload so edits to parse_raw.py are picked up without restarting the kernel
import parse_raw
importlib.reload(parse_raw)
from parse_raw import parse_hurdat2_file

# Run the parser over the downloaded file
print("Parsing HURDAT2 data...")
df_raw = parse_hurdat2_file(hurdat_file)

# Headline numbers: size, storm count, and temporal extent
storm_dates = df_raw['date']
storm_years = storm_dates.dt.year
print(f"\n=== PARSING DIAGNOSTICS ===")
print(f"Raw records parsed: {len(df_raw):,}")
print(f"Unique storms: {df_raw['storm_id'].nunique():,}")
print(f"Date range: {storm_dates.min()} to {storm_dates.max()}")
print(f"Years covered: {storm_years.min()} to {storm_years.max()}")

# Preview the first rows of the parsed frame
print(f"\n=== SAMPLE PARSED DATA ===")
print(df_raw.head())

# Report per-column null counts, listing only columns that have any
print(f"\n=== DATA QUALITY CHECKS ===")
null_counts = df_raw.isnull().sum()
columns_with_nulls = null_counts[null_counts > 0]
print("Null values per column:")
for column_name, n_missing in columns_with_nulls.items():
    print(f"  {column_name}: {n_missing:,}")

# Counts are non-negative, so "no column with nulls" is the same as sum == 0
if columns_with_nulls.empty:
    print("✅ No null values detected")

# Deep memory footprint of the frame, converted from bytes to MB
total_bytes = df_raw.memory_usage(deep=True).sum()
print(f"Memory usage: {total_bytes / 1024**2:.1f} MB")

In [None]:
# Cell 3: Data cleaning and validation
import sys
import importlib

# Add src directory and reload module
sys.path.append('../src')
import profile_clean
importlib.reload(profile_clean)
from profile_clean import clean_hurdat2_data

# Clean the data
print("Cleaning and validating HURDAT2 data...")
df_clean = clean_hurdat2_data(df_raw)

print(f"\n=== CLEANING RESULTS ===")
records_removed = len(df_raw) - len(df_clean)
# Guard the percentage: an empty raw frame would otherwise raise ZeroDivisionError
pct_removed = records_removed / len(df_raw) * 100 if len(df_raw) else 0.0
print(f"Records before cleaning: {len(df_raw):,}")
print(f"Records after cleaning: {len(df_clean):,}")
print(f"Records removed: {records_removed:,} ({pct_removed:.1f}%)")

# Data validation tests
print(f"\n=== VALIDATION TESTS ===")

# Tracks whether every range check below passed, so the final banner is honest
validation_ok = True

# Test 1: Wind speeds are reasonable
# NOTE(review): 200 kt is an assumed generous sanity ceiling — adjust if needed.
# Negative values are flagged because HURDAT2 reportedly encodes missing winds
# with a negative sentinel; confirm clean_hurdat2_data removes them.
MAX_PLAUSIBLE_WIND_KT = 200
wind_data = df_clean['max_wind'].dropna()
if len(wind_data) > 0:
    max_wind = wind_data.max()
    min_wind = wind_data.min()
    if min_wind >= 0 and max_wind <= MAX_PLAUSIBLE_WIND_KT:
        print(f"✅ Wind speeds: {min_wind}-{max_wind} kt (reasonable range)")
    else:
        validation_ok = False
        print(f"⚠️ Wind speeds: {min_wind}-{max_wind} kt (outside expected 0-{MAX_PLAUSIBLE_WIND_KT} kt range)")
else:
    print("⚠️ No wind speed data available")

# Test 2: Coordinates are valid
# NaN bounds (e.g. an empty frame) fail these comparisons and are flagged too.
lat_range = (df_clean['lat'].min(), df_clean['lat'].max())
lon_range = (df_clean['lon'].min(), df_clean['lon'].max())
lat_ok = lat_range[0] >= -90 and lat_range[1] <= 90
lon_ok = lon_range[0] >= -180 and lon_range[1] <= 180
if lat_ok:
    print(f"✅ Latitude range: {lat_range[0]:.1f} to {lat_range[1]:.1f}°")
else:
    validation_ok = False
    print(f"⚠️ Latitude range: {lat_range[0]:.1f} to {lat_range[1]:.1f}° (outside -90 to 90)")
if lon_ok:
    print(f"✅ Longitude range: {lon_range[0]:.1f} to {lon_range[1]:.1f}°")
else:
    validation_ok = False
    print(f"⚠️ Longitude range: {lon_range[0]:.1f} to {lon_range[1]:.1f}° (outside -180 to 180)")

# Test 3: Data continuity - simplified approach (informational only)
storm_counts = df_clean['storm_id'].value_counts()
avg_points_per_storm = storm_counts.mean()
max_points_per_storm = storm_counts.max()
print(f"✅ Average observations per storm: {avg_points_per_storm:.1f}")
print(f"✅ Maximum observations per storm: {max_points_per_storm}")

# Test 4: Storm categories (informational only)
cat_counts = df_clean['category'].value_counts().sort_index()
print(f"✅ Storm categories distribution:")
for cat, count in cat_counts.items():
    print(f"   {cat}: {count:,} records")

# Test 5: Temporal coverage (informational only)
year_range = (df_clean['year'].min(), df_clean['year'].max())
print(f"✅ Year coverage: {year_range[0]} to {year_range[1]} ({year_range[1] - year_range[0] + 1} years)")

print(f"\n=== READY FOR ANALYSIS ===")
if validation_ok:
    print("✅ Data successfully parsed, cleaned, and validated")
else:
    print("⚠️ Data parsed and cleaned, but some validation checks need review (see above)")
print(f"Final dataset: {len(df_clean):,} records from {df_clean['storm_id'].nunique():,} storms")