In [7]:
import datetime
import numpy as np
import os
import pandas as pd
import requests
import time
import xarray as xr

### SHIPS documentation
Reference: https://rammb-data.cira.colostate.edu/ships/data/ships_predictor_file.pdf


In [8]:
def adjust_SHIPS(TMP: pd.DataFrame) -> pd.DataFrame:

    ''' 
    Modify raw SHIPS data for datetime formatting and units. 

    Arguments:
    - TMP: raw SHIPS records with columns 'time' ('YYMMDD' strings), 'hour', 'max_wind',
           'center_lat', 'center_lon', 'min_slp' and 'SHRD'. 

    Returns:
    - A new DataFrame (the input is not modified) in which:
        * 'time' holds full datetimes (date + hour) and the 'hour' column is dropped, 
        * 'max_wind' and 'SHRD' are scaled from knots to meters per second, 
        * 'season' holds the calendar year of each timestamp.
    '''
    
    # Set data types to custom fields (`astype` returns a copy, so the caller's DataFrame is left untouched)
    TMP = TMP.astype({'hour': 'int32', 'max_wind': 'int32', 'center_lat': 'float', 'center_lon': 'float', 'min_slp': 'int32'})
    # Convert the 'YYMMDD' strings in 'time' to full datetimes by adding the hour of day.
    # Vectorized parsing replaces the per-element `strptime` call.
    TMP['time'] = pd.to_datetime(TMP['time'], format='%y%m%d') + pd.to_timedelta(TMP['hour'], unit='h')
    TMP = TMP.drop(columns=['hour'])
    # Convert wind from knots to meters per second
    for field_name in ['max_wind', 'SHRD']:
        TMP[field_name] = TMP[field_name] * 0.514444
    # Add season to DataFrame for compatibility with IBTrACS. IBTrACS defines it as:
    # "Season (year) that the storm began. Due to how the Southern Hemisphere defines a season, 
    # storms after July 1 are classified with the following year."
    # The SHIPS file used in this notebook is the Atlantic ('AL') one, so the Southern Hemisphere shift
    # never applies and the season is simply the calendar year. 
    # (The previous month-based conditional returned the year in both branches.)
    TMP['season'] = TMP['time'].dt.year.astype('int64')

    return TMP

In [9]:
def load_SHIPS(pathname: str='/scratch/gpfs/GEOCLIM/gr7610/tiger3/reference/datasets/SHIPS/lsdiaga_1982_2022_sat_ts_5day.txt') -> pd.DataFrame:

    ''' 
    Method to load SHIPS data from local text file. 
    
    Text file obtained from: https://rammb-data.cira.colostate.edu/ships/data/AL/lsdiaga_1982_2022_sat_ts_5day.txt.

    The file is read line by line. The last whitespace-delimited token of every line is its label:
    - 'HEAD' lines carry the case metadata (date, hour, maximum wind, center latitude/longitude, minimum SLP, storm ID), 
    - 'TIME' lines list the lead times (in hours) of the columns of the case, 
    - any other label names the variable stored on that line. 

    Arguments:
    - pathname: path to the SHIPS text file (defaults to the local copy of the Atlantic file). 

    Returns:
    - DataFrame with one row per SHIPS case and the columns 'time', 'hour', 'max_wind', 'center_lat', 
      'center_lon', 'min_slp', 'storm_id' (all strings, as read from the header) and 'SHRD' (float). 
      The DataFrame is empty (no columns) if no matching line is found. 

    Raises:
    - ValueError if a 'HEAD' line does not carry exactly 7 fields between the first token and the label, 
      or if the zero-hour value is not an integer. 

    NOTE(review): the zero-hour column is looked up in each case's 'TIME' row. The file is believed to start
    at a lead time of -12 h, in which case the zeroth column is NOT the zero-hour value - confirm against the file. 
    If a case has no 'TIME' row ahead of the variable, the zeroth column is used, as before. 
    NOTE(review): missing values are presumably flagged as 9999 (i.e., 999.9 after scaling) and are passed 
    through unchanged here - confirm against the SHIPS documentation and mask downstream if needed. 
    '''

    # List of variable names of interest (only including SHRD - or vertical wind shear from 850 to 200 hPa - for now)
    varnames = ['SHRD']
    # Container for SHIPS data pertaining to `varnames`
    container = []
    
    # Placeholder for iterand case metadata, is None before a 'HEAD' line is found
    header = None
    # Column index of the zero-hour value for the iterand case
    zero_hour_index = 0
    with open(pathname, 'r') as file:
        # Iterate over all lines in SHIPS text file
        for line in file:
            # Strip the data
            line_entry = line.strip().split()
            # Skip blank lines, which have no label to inspect
            if not line_entry:
                continue
            label = line_entry[-1]
            
            # Capture the storm characteristics if 'HEAD' is the last index
            if label == 'HEAD': 
                date, hour, max_wind, lat, lon, min_SLP, storm_ID = line_entry[1:-1]
                header = {'time': date, 
                          'hour': hour, 
                          'max_wind': max_wind, 
                          'center_lat': lat, 
                          'center_lon': lon, 
                          'min_slp': min_SLP, 
                          'storm_id': storm_ID}
                # Every case locates its own zero-hour column
                zero_hour_index = 0
            # Locate the zero-hour column from the lead times of the iterand case
            elif label == 'TIME':
                lead_times = line_entry[:-1]
                zero_hour_index = lead_times.index('0') if '0' in lead_times else 0
            # Only grab line if the last entry/column is in the list of specified varnames
            # (lines found before the first header cannot be attributed to a storm and are skipped)
            elif label in varnames and header is not None:
                # Scrape the zero-hour value (hence the "ZH" designator, and is an approximation of observations)
                # Divide by 10 to convert to proper value (reference: https://rammb-data.cira.colostate.edu/ships/data/ships_predictor_file.pdf)
                property_ZH = int(line_entry[zero_hour_index]) / 10
                # Generate entry-specific dictionary to append to container for future concatenation
                container.append({**header, label: property_ZH})

    # Load into Pandas DataFrame
    data = pd.DataFrame(container) 

    return data

In [10]:
def merge_IBTrACS_SHIPS_TC(SHIPS_TC: pd.DataFrame,
                           dataset_IBTrACS: xr.Dataset,
                           diagnostic: bool=True):

    ''' 
    Function to merge IBTrACS and SHIPS data for a given TC. 

    Every IBTrACS storm from the season of the SHIPS TC is a candidate. A candidate is kept when:
    1. its (hourly-rounded) time window contains the full SHIPS time window, and
    2. the number of IBTrACS timestamps that exactly match a SHIPS timestamp equals the number of SHIPS records. 
    For kept candidates, the SHIPS shear ('SHRD') is written to a new 'shear-200_850' variable at the 
    matching timestamps (nan elsewhere) and the `date_time` axis is padded back to its original length. 

    Arguments:
    - SHIPS_TC: SHIPS records for a single storm, with columns 'time' (datetimes), 'season' and 'SHRD'. 
    - dataset_IBTrACS: IBTrACS dataset with dimensions `storm` and `date_time`, and the variables 
                       'season', 'time' and 'wmo_wind'. 
    - diagnostic: if True, print information on every candidate storm. 

    Returns:
    - List of matching IBTrACS storms (xArray Datasets). The list is empty if nothing matches. 

    NOTE(review): 'shear-200_850' is created as a copy of 'wmo_wind', so it presumably carries the 
    attributes and encoding of 'wmo_wind' - confirm this is intended before relying on its metadata. 
    '''

    IBTrACS_TCs = [] # container list for the output TC

    # Nothing can be matched against an empty SHIPS record
    if len(SHIPS_TC) == 0:
        return IBTrACS_TCs

    # SHIPS quantities that are identical for all candidate storms
    SHIPS_times = SHIPS_TC['time'].to_numpy()
    SHIPS_time_min, SHIPS_time_max = SHIPS_TC['time'].min(), SHIPS_TC['time'].max()
    SHIPS_shear = SHIPS_TC['SHRD'].to_numpy()

    # Filter the IBTrACS dataset by the iterand season to minimize data processing
    dataset_IBTrACS = dataset_IBTrACS.where(dataset_IBTrACS['season'] == SHIPS_TC['season'].min(), drop=True)

    # Iterate over all TCs that are left
    for storm_index in range(len(dataset_IBTrACS.storm)):
        # Get the iterand TC
        IBTrACS_TC = dataset_IBTrACS.isel(storm=storm_index)
        original_axis_length = len(IBTrACS_TC['date_time'])
        # Get indices with valid timestamps
        indices = np.where(~np.isnan(IBTrACS_TC['time']))[0]
        # A storm without any valid timestamp cannot be matched
        if len(indices) == 0:
            continue
        IBTrACS_TC = IBTrACS_TC.isel(date_time=indices)
        
        # Round to nearest hour
        IBTrACS_TC['time'] = IBTrACS_TC['time'].dt.round('h')

        if diagnostic:
            season_min = IBTrACS_TC['season'].min()
            time_extent = (IBTrACS_TC.time.min().dt.round('h'), IBTrACS_TC.time.max().dt.round('h'))
            print(f'Iterand season: {season_min}, IBTrACS timestamp extent: {time_extent}')
            print(f"{'#'*128}")

        # Filter 1: time check
        # The iterand IBTrACS time window must contain the time window of the referenced SHIPS TC
        if not (IBTrACS_TC.time.min() <= SHIPS_time_min and IBTrACS_TC.time.max() >= SHIPS_time_max):
            continue

        # Filter 2: timestamp check
        # For every SHIPS timestamp, get the indices of the IBTrACS timestamps that match it exactly. 
        # These indices will be used to assign shear values to the iterand timestamps in IBTrACS
        IBTrACS_times = IBTrACS_TC['time'].values
        matches = [np.flatnonzero(IBTrACS_times == SHIPS_time) for SHIPS_time in SHIPS_times]
        matches = [match for match in matches if match.size > 0]
        # Skip the storm if no timestamp matches (concatenating an empty list would raise an error)
        if len(matches) == 0:
            continue
        # Flatten indices into a single array
        IBTrACS_TC_indices = np.concatenate(matches).ravel()

        # Ensure that SHIPS shear values and number of matching indices are the same
        if len(IBTrACS_TC_indices) == len(SHIPS_shear):
            # Shear is nan everywhere but at the matching timestamps
            arr_TMP = np.full(len(IBTrACS_TC['wmo_wind']), np.nan)
            arr_TMP[IBTrACS_TC_indices] = SHIPS_shear
            
            # Use 'wmo_wind' as a template for the new variable
            IBTrACS_TC['shear-200_850'] = IBTrACS_TC['wmo_wind']
            if diagnostic: print(f"Length of IBTrACS data: {len(IBTrACS_TC['shear-200_850'].data)}")
            # Assign the data to the dataset
            IBTrACS_TC['shear-200_850'].data = arr_TMP

            # Apply padding with nans along the `date_time` axis to maintain congruence for concatenation
            IBTrACS_TC = IBTrACS_TC.pad(date_time=(0, original_axis_length - len(IBTrACS_TC['date_time'])))
            
            IBTrACS_TCs.append(IBTrACS_TC)
        else:
            print('Length of SHIPS shear values and matching IBTrACS-SHIPS coordinates is not equal.')
            continue

    return IBTrACS_TCs

#### Step 1. load SHIPS data, perform corrections, and load IBTrACS data into memory

In [11]:
# Read the raw SHIPS records and apply the datetime and unit corrections in one pass
data = adjust_SHIPS(load_SHIPS())
# Read the full IBTrACS dataset eagerly, so that the storm-by-storm processing operates on in-memory data
IBTrACS_pathname = '/scratch/gpfs/GEOCLIM/gr7610/tiger3/reference/datasets/IBTrACS/IBTrACS.since1980.v04r01.nc'
TRACS = xr.open_dataset(IBTrACS_pathname).load()

#### Step 2. process SHIPS data by integrating it with IBTrACS data

In [12]:
# Container for the merged dataset of every matched storm
TMP_TCs = []

# Process the SHIPS storms one at a time, in order of first appearance
for storm_number, storm_id in enumerate(data['storm_id'].unique()):
    
    # Time the record selection and the merge together
    start_time = time.time()
    SHIPS_storm = data.loc[data['storm_id'] == storm_id]
    TMP_TC = merge_IBTrACS_SHIPS_TC(SHIPS_TC=SHIPS_storm, dataset_IBTrACS=TRACS, diagnostic=False)
    elapsed_time = time.time() - start_time
    print(f'Elapsed time for storm {storm_id}: {elapsed_time:.2f} s')

    # Keep the first match only, and only if there is one
    if TMP_TC: 
        TMP_TCs.append(TMP_TC[0])

# Count the processed TCs that hold at least one data variable
unique_TCs = sum(1 for item in TMP_TCs if len(item) > 0)

Elapsed time for storm AL011982: 1.12 s
Elapsed time for storm AL031982: 0.91 s
Elapsed time for storm AL051982: 0.91 s
Elapsed time for storm AL061982: 0.88 s
Elapsed time for storm AL081982: 0.90 s
Elapsed time for storm AL031983: 0.76 s
Elapsed time for storm AL041983: 0.77 s
Elapsed time for storm AL051983: 0.78 s
Elapsed time for storm AL061983: 0.79 s
Elapsed time for storm AL061984: 0.90 s
Elapsed time for storm AL071984: 0.91 s
Elapsed time for storm AL081984: 0.93 s
Elapsed time for storm AL101984: 0.90 s
Elapsed time for storm AL111984: 0.91 s
Elapsed time for storm AL121984: 0.93 s
Elapsed time for storm AL131984: 0.90 s
Elapsed time for storm AL141984: 0.89 s
Elapsed time for storm AL151984: 0.90 s
Elapsed time for storm AL161984: 0.90 s
Elapsed time for storm AL181984: 0.90 s
Elapsed time for storm AL201984: 0.91 s
Elapsed time for storm AL011985: 0.94 s
Elapsed time for storm AL021985: 1.01 s
Elapsed time for storm AL031985: 0.94 s
Elapsed time for storm AL041985: 0.93 s


#### Step 3. concatenate and save data

In [13]:
# Stack the per-storm datasets along the `storm` dimension
TMP_TCs = xr.concat(TMP_TCs, dim='storm')

# Write the merged dataset to the directory holding the source IBTrACS file
storage_dirname = '/scratch/gpfs/GEOCLIM/gr7610/tiger3/reference/datasets/IBTrACS'
storage_filename = 'IBTrACS.since1980.v04r01.SHIPS.basin-NA.nc'
storage_pathname = os.path.join(storage_dirname, storage_filename)
TMP_TCs.to_netcdf(storage_pathname)