In [48]:
import geopandas as gpd
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [49]:
HEADER_COLUMNS = ['basin_cycloneid', 'name', 'entries']

In [50]:
def parse_header_line(line):
    """Parse a header line from the HURDAT2 dataset."""
    header_data = {
        "Basin": line[0:2].strip(),               # Spaces 1-2
        "CycloneID": line[2:4].strip(),           # Spaces 3-4
        "Year": line[4:8].strip(),                # Spaces 5-8
        "Name": line[18:28].strip(),              # Spaces 19-28
        "Entries": line[33:36].strip()            # Spaces 34-36
    }
    return header_data

In [51]:
def parse_data_line(line):
    # Extract the fixed-width fields based on the provided format
    parsed_data = {
        "Year": line[0:4].strip(),
        "Month": line[4:6].strip(),
        "Day": line[6:8].strip(),
        "Hours": line[10:12].strip(),
        "Minutes": line[12:14].strip(),
        "RecordID": line[16:17].strip(),
        "Status": line[19:21].strip(),
        "Latitude": line[23:27].strip(),
        "LatHemisphere": line[27:28].strip(),
        "Longitude": line[30:35].strip(),
        "LonHemisphere": line[35:36].strip(),
        "MaxWind": line[38:41].strip(),
        "MinPressure": line[43:47].strip(),
        "34NE": line[49:53].strip(),
        "34SE": line[55:59].strip(),
        "34SW": line[61:65].strip(),
        "34NW": line[67:71].strip(),
        "50NE": line[73:77].strip(),
        "50SE": line[79:83].strip(),
        "50SW": line[85:89].strip(),
        "50NW": line[91:95].strip(),
        "64NE": line[97:101].strip(),
        "64SE": line[103:107].strip(),
        "64SW": line[109:113].strip(),
        "64NW": line[115:119].strip(),
        "RadiusMaxWind": line[121:125].strip()
    }
    return parsed_data

In [52]:
def convert_lat_lon(lat, lat_hem, lon, lon_hem):
    # Convert latitude and longitude to decimal degrees
    lat = float(lat) * (1 if lat_hem == 'N' else -1)
    lon = float(lon) * (1 if lon_hem == 'E' else -1)
    return lat, lon

In [53]:
def parse_hurdat2(file_path):
    data = []
    with open(file_path, 'r') as file:
        lines = file.readlines()
        
    i = 0
    while i < len(lines):
        line = lines[i].strip()
        
        # Identify header lines
        if line.startswith('AL'):
            # Parse the header line
            header = parse_header_line(line)
            header['Entries'] = int(header['Entries'])  # Convert Entries to an integer
            i += 1  # Move to the next line
            
            # Process data lines associated with this header
            for _ in range(header['Entries']):
                data_line = lines[i].strip()
                parsed_data = parse_data_line(data_line)
                
                # Convert latitude and longitude to decimal degrees
                lat, lon = convert_lat_lon(parsed_data['Latitude'], parsed_data['LatHemisphere'], 
                                           parsed_data['Longitude'], parsed_data['LonHemisphere'])
                
                parsed_data['Latitude'] = lat
                parsed_data['Longitude'] = lon
                
                # Combine header, parsed data, and geometry
                combined_dict = {**header, **parsed_data}
                data.append(combined_dict)
                i += 1
        else:
            i += 1  # Skip lines that don't start with header identifiers
            
    # Convert to GeoDataFrame
    gdf = gpd.GeoDataFrame(data)
    
    return gdf

In [54]:
def test_entries_count(df):
    """
    Test to check that the number of data entries for each cyclone matches the number in the Entries column.

    Parameters:
    df (pandas.DataFrame): The DataFrame containing the parsed HURDAT2 data.

    Returns:
    bool: True if all entries match, False otherwise.
    """

    # Group the DataFrame by Basin_CycloneID and count the number of entries
    grouped = df.groupby(['Basin', 'CycloneID', 'Year', 'Name']).size().reset_index(name='n_rows')

    # Merge the count with the Entries column
    merged = pd.merge(grouped, df[['Basin', 'CycloneID', 'Entries', 'Year', 'Name']].drop_duplicates(), 
                      on=['Basin', 'CycloneID', 'Year', 'Name'], how='left')

    # Check if the counts match the Entries column
    mismatch = merged[merged['n_rows'] != merged['Entries']]

    if not mismatch.empty:
        print("Mismatch found:")
        print(mismatch)
        return False
    else:
        print("All entry counts match the Entries column.")
        return True


In [55]:
df = parse_hurdat2('data/hurdat2-1851-2023-051124.txt')

In [56]:
test_entries_count(df)

Mismatch found:
     Basin CycloneID  Year   Name  n_rows  Entries
1768    AL        16  1954  ALICE       7       29
1769    AL        16  1955  ALICE      22       29
1972    AL        31  2005   ZETA       8       36
1973    AL        31  2006   ZETA      28       36


False

grouping by 'Basin', 'CycloneID', 'Year', and 'Name' reveals two pairs of hurricanes that are listed as the same hurricane but appear in subsequent years. These are the only two atlantic hurricanes to span two calendar years, so there is actually no issue here in our parse even though we get the warning from the test_entries_count() function.

Find columns with no variance, evaluate if they're useful columns, and if not remove.

In [57]:
df.loc[:, df.nunique() == 1]

Unnamed: 0,Basin,LatHemisphere
0,AL,N
1,AL,N
2,AL,N
3,AL,N
4,AL,N
...,...,...
54744,AL,N
54745,AL,N
54746,AL,N
54747,AL,N


all hurricanes are in the atlantic basin and above the equator. Let's remove these columns since they're not informative.

In [58]:
df = df.loc[:, df.nunique() > 1]

In [59]:
df.select_dtypes('object').columns

Index(['CycloneID', 'Year', 'Name', 'Month', 'Day', 'Hours', 'Minutes',
       'RecordID', 'Status', 'LonHemisphere', 'MaxWind', 'MinPressure', '34NE',
       '34SE', '34SW', '34NW', '50NE', '50SE', '50SW', '50NW', '64NE', '64SE',
       '64SW', '64NW', 'RadiusMaxWind'],
      dtype='object')

In [60]:
# type cast columns
df['CycloneID'] = df['CycloneID'].astype('string')
df['Year'] = df['Year'].astype(int)
df['Name'] = df['Name'].astype('string')
df['Month'] = df['Month'].astype(int)
df['Day'] = df['Day'].astype(int)
df['Hours'] = df['Hours'].astype(int)
df['Minutes'] = df['Minutes'].astype(int)
df['RecordID'] = df['RecordID'].astype('string')
df['Status'] = df['Status'].astype('string')
df['LonHemisphere'] = df['LonHemisphere'].astype('string')
df['MaxWind'] = pd.to_numeric(df['MaxWind'], errors='coerce')
df['MinPressure'] = pd.to_numeric(df['MinPressure'], errors='coerce')
df[['34NE', '34SE', '34SW', '34NW', 
    '50NE', '50SE', '50SW', '50NW', 
    '64NE', '64SE', '64SW', '64NW', 
    'RadiusMaxWind']] = df[['34NE', '34SE', '34SW', '34NW', 
                            '50NE', '50SE', '50SW', '50NW', 
                            '64NE', '64SE', '64SW', '64NW', 
                            'RadiusMaxWind']].apply(pd.to_numeric, errors='coerce')

In [61]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54749 entries, 0 to 54748
Data columns (total 28 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   CycloneID      54749 non-null  string 
 1   Year           54749 non-null  int32  
 2   Name           54749 non-null  string 
 3   Entries        54749 non-null  int64  
 4   Month          54749 non-null  int32  
 5   Day            54749 non-null  int32  
 6   Hours          54749 non-null  int32  
 7   Minutes        54749 non-null  int32  
 8   RecordID       54749 non-null  string 
 9   Status         54749 non-null  string 
 10  Latitude       54749 non-null  float64
 11  Longitude      54749 non-null  float64
 12  LonHemisphere  54749 non-null  string 
 13  MaxWind        54749 non-null  int64  
 14  MinPressure    54749 non-null  int64  
 15  34NE           54749 non-null  int64  
 16  34SE           54749 non-null  int64  
 17  34SW           54749 non-null  int64  
 18  34NW  

In [62]:
df['Status'].unique()

<StringArray>
['HU', 'TS', 'EX', 'TD', 'LO', 'DB', 'SS', 'SD', 'WV']
Length: 9, dtype: string

Status of system. Options are:

1. TD – Tropical cyclone of tropical depression intensity (< 34 knots)
2. TS – Tropical cyclone of tropical storm intensity (34-63 knots)
3. HU – Tropical cyclone of hurricane intensity (> 64 knots)
4. EX – Extratropical cyclone (of any intensity)
5. SD – Subtropical cyclone of subtropical depression intensity (< 34 knots)
6. SS – Subtropical cyclone of subtropical storm intensity (> 34 knots)
7. LO – A low that is neither a tropical cyclone, a subtropical cyclone, nor an extratropical cyclone (of any intensity)
8. WV – Tropical Wave (of any intensity)
9. DB – Disturbance (of any intensity)

combine dates/times into datetimes

In [63]:
df['Datetime'] = pd.to_datetime(df[['Year', 'Month', 'Day', 'Hours', 'Minutes']])

In [64]:
df['Datetime'].head()

0   1851-06-25 00:00:00
1   1851-06-25 06:00:00
2   1851-06-25 12:00:00
3   1851-06-25 18:00:00
4   1851-06-25 21:00:00
Name: Datetime, dtype: datetime64[ns]

save to parquet

In [65]:
df.Name.unique()

<StringArray>
[   'UNNAMED',       'ABLE',      'BAKER',    'CHARLIE',        'DOG',
       'EASY',        'FOX',     'GEORGE',        'HOW',       'ITEM',
 ...
     'TERESA',     'VICTOR',      'WANDA',     'TWELVE',     'MARTIN',
     'HAROLD',     'IDALIA',     'MARGOT',      'NIGEL', 'TWENTY-ONE']
Length: 319, dtype: string

In [66]:
df.to_parquet('data/hurdat2.parquet')