# 05_event_dataset.ipynb

### Creates a Pandas DataFrame of extreme event attributes and saves a CSV file for each labeled event.


In [1]:
import xarray as xr
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from skimage.measure import regionprops 
# from dask_image.ndmeasure import label as label_dask
from skimage.measure import label as label_np
from datetime import date
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')
import dask
dask.config.set({"array.slicing.split_large_chunks": False});

#### Import Ocetrac labels and SST data


In [2]:
file_path_list = ('/burg/abernathey/users/hillary/oisst_v21/ocetrac_notrend.nc',
                  '/burg/abernathey/users/hillary/oisst_v21/positive_extremes_OISSTv21.nc')

ds = xr.open_mfdataset(file_path_list)
ssta = ds.mhw_ssta_notrend
labels = ds.ocetrac_labels
num_events = labels.attrs['final objects tracked']
print(f"There are {num_events} total events tracked.")

ds

There are 770 total events tracked.


Unnamed: 0,Array,Chunk
Bytes,3.69 GiB,3.69 GiB
Shape,"(478, 720, 1440)","(478, 720, 1440)"
Count,2 Tasks,1 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 3.69 GiB 3.69 GiB Shape (478, 720, 1440) (478, 720, 1440) Count 2 Tasks 1 Chunks Type float64 numpy.ndarray",1440  720  478,

Unnamed: 0,Array,Chunk
Bytes,3.69 GiB,3.69 GiB
Shape,"(478, 720, 1440)","(478, 720, 1440)"
Count,2 Tasks,1 Chunks
Type,float64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,3.69 GiB,3.69 GiB
Shape,"(478, 720, 1440)","(478, 720, 1440)"
Count,2 Tasks,1 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 3.69 GiB 3.69 GiB Shape (478, 720, 1440) (478, 720, 1440) Count 2 Tasks 1 Chunks Type float64 numpy.ndarray",1440  720  478,

Unnamed: 0,Array,Chunk
Bytes,3.69 GiB,3.69 GiB
Shape,"(478, 720, 1440)","(478, 720, 1440)"
Count,2 Tasks,1 Chunks
Type,float64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,7.91 MiB,7.91 MiB
Shape,"(720, 1440)","(720, 1440)"
Count,2 Tasks,1 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 7.91 MiB 7.91 MiB Shape (720, 1440) (720, 1440) Count 2 Tasks 1 Chunks Type float64 numpy.ndarray",1440  720,

Unnamed: 0,Array,Chunk
Bytes,7.91 MiB,7.91 MiB
Shape,"(720, 1440)","(720, 1440)"
Count,2 Tasks,1 Chunks
Type,float64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,7.91 MiB,7.91 MiB
Shape,"(720, 1440)","(720, 1440)"
Count,2 Tasks,1 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 7.91 MiB 7.91 MiB Shape (720, 1440) (720, 1440) Count 2 Tasks 1 Chunks Type float64 numpy.ndarray",1440  720,

Unnamed: 0,Array,Chunk
Bytes,7.91 MiB,7.91 MiB
Shape,"(720, 1440)","(720, 1440)"
Count,2 Tasks,1 Chunks
Type,float64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,3.69 GiB,3.69 GiB
Shape,"(478, 720, 1440)","(478, 720, 1440)"
Count,2 Tasks,1 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 3.69 GiB 3.69 GiB Shape (478, 720, 1440) (478, 720, 1440) Count 2 Tasks 1 Chunks Type float64 numpy.ndarray",1440  720  478,

Unnamed: 0,Array,Chunk
Bytes,3.69 GiB,3.69 GiB
Shape,"(478, 720, 1440)","(478, 720, 1440)"
Count,2 Tasks,1 Chunks
Type,float64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,3.69 GiB,3.69 GiB
Shape,"(478, 720, 1440)","(478, 720, 1440)"
Count,2 Tasks,1 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 3.69 GiB 3.69 GiB Shape (478, 720, 1440) (478, 720, 1440) Count 2 Tasks 1 Chunks Type float64 numpy.ndarray",1440  720  478,

Unnamed: 0,Array,Chunk
Bytes,3.69 GiB,3.69 GiB
Shape,"(478, 720, 1440)","(478, 720, 1440)"
Count,2 Tasks,1 Chunks
Type,float64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,3.69 GiB,3.69 GiB
Shape,"(478, 720, 1440)","(478, 720, 1440)"
Count,2 Tasks,1 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 3.69 GiB 3.69 GiB Shape (478, 720, 1440) (478, 720, 1440) Count 2 Tasks 1 Chunks Type float64 numpy.ndarray",1440  720  478,

Unnamed: 0,Array,Chunk
Bytes,3.69 GiB,3.69 GiB
Shape,"(478, 720, 1440)","(478, 720, 1440)"
Count,2 Tasks,1 Chunks
Type,float64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,3.69 GiB,3.69 GiB
Shape,"(478, 720, 1440)","(478, 720, 1440)"
Count,2 Tasks,1 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 3.69 GiB 3.69 GiB Shape (478, 720, 1440) (478, 720, 1440) Count 2 Tasks 1 Chunks Type float64 numpy.ndarray",1440  720  478,

Unnamed: 0,Array,Chunk
Bytes,3.69 GiB,3.69 GiB
Shape,"(478, 720, 1440)","(478, 720, 1440)"
Count,2 Tasks,1 Chunks
Type,float64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,3.69 GiB,3.69 GiB
Shape,"(478, 720, 1440)","(478, 720, 1440)"
Count,2 Tasks,1 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 3.69 GiB 3.69 GiB Shape (478, 720, 1440) (478, 720, 1440) Count 2 Tasks 1 Chunks Type float64 numpy.ndarray",1440  720  478,

Unnamed: 0,Array,Chunk
Bytes,3.69 GiB,3.69 GiB
Shape,"(478, 720, 1440)","(478, 720, 1440)"
Count,2 Tasks,1 Chunks
Type,float64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,3.69 GiB,3.69 GiB
Shape,"(478, 720, 1440)","(478, 720, 1440)"
Count,2 Tasks,1 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 3.69 GiB 3.69 GiB Shape (478, 720, 1440) (478, 720, 1440) Count 2 Tasks 1 Chunks Type float64 numpy.ndarray",1440  720  478,

Unnamed: 0,Array,Chunk
Bytes,3.69 GiB,3.69 GiB
Shape,"(478, 720, 1440)","(478, 720, 1440)"
Count,2 Tasks,1 Chunks
Type,float64,numpy.ndarray


In [3]:
%%time
labels.load();

CPU times: user 0 ns, sys: 2.95 s, total: 2.95 s
Wall time: 2.99 s


#### Event Attributes
        
- **id** : Unique label given to the MHW [int]
- **date** : Dates corresponding to the event [datetime format]
- **coords** : Latitude and longitude of all points contained in the event [(lat,lon)]
- **centroid** : Center of each object contained in the event [(lat,lon)]
- **duration** : Duration of event [months]
- **intensity_max** : Maximum intensity at each time interval [degC]
- **intensity_mean** : Mean intensity at each time interval [degC]
- **intensity_min** : Minimum intensity at each time interval [degC]
- **intensity_cumulative** : Cumulated intensity over the entire event [degC months]
- **area** : Area of the event at each time interval [km2]

TO ADD?
- number of centroids
- maximum distance between centroids
- date peak [datetime format]
- rate onset [degC / month]
- rate decline [degC / month]

In [4]:
def _wrap(labels):
    ''' Impose periodic boundary and wrap labels, then reorder the labels'''
    first_column = labels[..., 0]
    last_column = labels[..., -1]

    stacked = first_column.stack(z=['time','lat'])
    unique_first = np.unique(stacked[stacked.notnull()])

    # This loop iterates over the unique values in the first column, finds the location of those values in 
    # the first columnm and then uses that index to replace the values in the last column with the first column value
    for i in enumerate(unique_first):
        first = np.where(first_column == i[1])
        last = last_column[first[0], first[1]]
        stacked = last.stack(z=['time','lat'])
        bad_labels = np.unique(stacked[stacked.notnull()])
        replace = np.isin(labels, bad_labels)
        labels = labels.where(replace==False, other=i[1])
    
    labels = labels.fillna(0)
    labels_wrapped = np.unique(labels, return_inverse=True)[1].reshape(labels.shape)
    labels_wrapped = xr.DataArray(labels_wrapped, dims=labels.dims, coords=labels.coords)

    return labels_wrapped

def _get_labels(binary_images):
    '''function used to label binary images at each time step using skimage.measure.label'''
    blobs_labels = label_np(binary_images, background=0)
    return blobs_labels
    
def _get_centroids(sub_labels):
    '''This function uses skimage.measure.regionprops to find the centroids of objects assigned 
    to each unique label'''
    props = regionprops(sub_labels.astype('int'))
    centroids = [(float(sub_labels.lat[round(p.centroid[0])].values),
                  float(sub_labels.lon[round(p.centroid[1])].values)) for p in props]
    for i in range(0,len(centroids)):
        if centroids[i][1] >= 359.875:
            centroids[i] = (centroids[i][0], list(centroids[i])[1] - 359.875)
    
    return centroids

def _get_intensity_area(event, ssta, mhw):
    '''Calculates event intensities and area at each time interval using anomaly data and coordinates 
    cooresponding to the event.'''
    
    event_ssta = ssta.where(event>0, drop=True)        
    mhw['intensity_mean'].append(event_ssta.mean(('lat','lon')).values)
    mhw['intensity_max'].append(event_ssta.max(('lat','lon')).values) 
    mhw['intensity_min'].append(event_ssta.min(('lat','lon')).values)
    mhw['intensity_cumulative'].append(np.nansum(event_ssta))
    coords = event.stack(z=('lat','lon'))
    coord_pairs = [(coords.isel(time=t[0]).dropna(dim='z', how='any').z.lat.values, 
                      coords.isel(time=t[0]).dropna(dim='z', how='any').z.lon.values) for t in enumerate(event.time)]

    mhw['coords'].append(coord_pairs)

    # Calculate weighted cell area assuming 0.25º resolution data and 111 km per degree of latitude
    y, x = zip(*coord_pairs)
    dlon = [np.cos(y[c]*np.pi/180)*(111*.25) for c in np.arange(0, len(coord_pairs))]
    dlat = (111*.25) * np.ones(len(dlon))
    cell_area = [np.sum(dlon[c]*dlat[c]) for c in np.arange(0, len(coord_pairs))]
    mhw['area'].append(cell_area)

    return mhw
    
def to_dataframe(event, ssta):
    '''
    Creates a Pandas DataFrame of event attributes.
    
    Parameters
    ----------
      event : xarray.DataArray   
              Image set containing only objects corresponding to the event of interest. 
              Dimensions should be ('time', 'lat', 'lon')
              
      ssta  : xarray.DataArray
              Temperature vector [1D numpy array of length T]
    
    Returns
    -------
    
    mhw : pandas.DataFrame
          Marine heat wave event attributes. The keys listed below are 
          are contained within the dataset.
 
        'id'                     Unique label given to the MHW [int]
        'date'                   Dates corresponding to the event [datetime format]
        'coords'                 Latitude and longitude of all points contained in the event [tuple(lat,lon)]
        'centroid'               Center of each object contained in the event [tuple(lat,lon)]
        'duration'               Duration of event [months]
        'intensity_max'          Maximum intensity at each time interval [degC]
        'intensity_mean'         Mean intensity at each time interval [degC]
        'intensity_min'          Minimum intensity at each time interval [degC]
        'intensity_cumulative'   Cumulated intensity over the entire event [degC months]
        'area'                   Area of the event at each time interval [km2]
        
    '''
    
    # Initialize dictionary 
    mhw = {}
    mhw['id'] = [] # event label
    mhw['date'] = [] # datetime format
    mhw['coords'] = [] # (lat, lon)
    mhw['centroid'] = []  # (lat, lon)
    mhw['duration'] = [] # [months]
    mhw['intensity_max'] = [] # [deg C]
    mhw['intensity_mean'] = [] # [deg C]
    mhw['intensity_min'] = [] # [deg C]
    mhw['intensity_cumulative'] = [] # [deg C]
    mhw['area'] = [] # [km2]

    # TO ADD:
    # mhw['rate_onset'] = [] # [deg C / month]
    # mhw['rate_decline'] = [] # [deg C / month]

    mhw['id'].append(int(np.nanmedian(event.values)))
    mhw['date'].append(event.time.values.astype('datetime64[M]'))
    mhw['duration'].append(event.time.shape[0])

    # Turn images into binary
    binary_event = event.where(event>=0, other=0)
    binary_event = binary_event.where(binary_event==0, other=1)
      
    sub_labels = xr.apply_ufunc(_get_labels, binary_event,
                                input_core_dims=[['lat', 'lon']],
                                output_core_dims=[['lat', 'lon']],
                                output_dtypes=[binary_event.dtype],
                                vectorize=True,
                                dask='parallelized')
    
    # Turn background to NaNs
    sub_labels = xr.DataArray(sub_labels, dims=binary_event.dims, coords=binary_event.coords)
    sub_labels = sub_labels.where(sub_labels>0, drop=False, other=np.nan) 

    # The labels are repeated each time step, therefore we relabel them to be consecutive
    for p in range(1, sub_labels.shape[0]):
        sub_labels[p,:,:] = sub_labels[p,:,:].values + sub_labels[p-1,:,:].max().values
    
    sub_labels_wrapped = _wrap(sub_labels)
    
    mhw = _get_intensity_area(event, ssta, mhw)
    
    centroid = []
    for s in np.arange(0, sub_labels_wrapped.shape[0]):
        lx = sub_labels_wrapped.isel(time=s)
        east = lx.where(lx.lon < 180, drop=True)
        east['lon'] = np.arange(360.125, 540.125, .25)
        append_east = xr.concat([lx.where(lx.lon >= 180, drop=True), east], dim="lon")
        centroid.append(_get_centroids(append_east))
    mhw['centroid'].append(centroid)
    
    mhw = pd.DataFrame(dict([(name, pd.Series(data)) for name,data in mhw.items()]))

    return mhw


#  🚀 Lift-off!

In [None]:
for i in tqdm(np.arange(613, num_events+1)):
    
    '''Loop through all unique MHW labels and a save a CSV file 
    containing event attributes for each event'''
    
    event = labels.where(labels==i, drop=True).reindex({"lon": labels.lon.values})
    mhw = to_dataframe(event, ssta)
    
    path = f'/burg/abernathey/users/hillary/ocetrac/mhw_{str(i)}.csv'
    mhw.to_csv(path, index=False)
    
print(f'The keys in this dataset are {list(mhw)}.')

In [None]:
mhw

In [None]:
event = labels.where(labels==i, drop=True).reindex({"lon": labels.lon.values})
event.shape

In [None]:
# Initialize dictionary 
mhw = {}
mhw['id'] = [] # event label
mhw['date'] = [] # datetime format
mhw['coords'] = [] # (lat, lon)
mhw['centroid'] = []  # (lat, lon)
mhw['duration'] = [] # [months]
mhw['intensity_max'] = [] # [deg C]
mhw['intensity_mean'] = [] # [deg C]
mhw['intensity_min'] = [] # [deg C]
mhw['intensity_cumulative'] = [] # [deg C]
mhw['area'] = [] # [km2]

# TO ADD:
# mhw['rate_onset'] = [] # [deg C / month]
# mhw['rate_decline'] = [] # [deg C / month]

mhw['id'].append(int(np.nanmedian(event.values)))
mhw['date'].append(event.time.values.astype('datetime64[M]'))
mhw['duration'].append(event.time.shape[0])

# Turn images into binary
binary_event = event.where(event>=0, other=0)
binary_event = binary_event.where(binary_event==0, other=1)

sub_labels = xr.apply_ufunc(_get_labels, binary_event,
                            input_core_dims=[['lat', 'lon']],
                            output_core_dims=[['lat', 'lon']],
                            output_dtypes=[binary_event.dtype],
                            vectorize=True,
                            dask='parallelized')

# Turn background to NaNs
sub_labels = xr.DataArray(sub_labels, dims=binary_event.dims, coords=binary_event.coords)
sub_labels = sub_labels.where(sub_labels>0, drop=False, other=np.nan) 

# The labels are repeated each time step, therefore we relabel them to be consecutive
for p in range(1, sub_labels.shape[0]):
    sub_labels[p,:,:] = sub_labels[p,:,:].values + sub_labels[p-1,:,:].max().values

sub_labels_wrapped = _wrap(sub_labels)

mhw = _get_intensity_area(event, ssta, mhw)

In [None]:
centroid = []
for s in np.arange(0, sub_labels_wrapped.shape[0]):
    lx = sub_labels_wrapped.isel(time=s)
    east = lx.where(lx.lon < 180, drop=True)
    east['lon'] = np.arange(360.125, 540.125, .25)
    append_east = xr.concat([lx.where(lx.lon >= 180, drop=True), east], dim="lon")
    centroid.append(_get_centroids(append_east))
mhw['centroid'].append(centroid)

## Let's walk through an example for a single event.

In [5]:
# Event label 
i = 651 #699 
event = labels.where(labels==i, drop=True).reindex({"lon": labels.lon.values})

print(f'The event is labeled {i}')
print(f'This event lasted {event.time.shape[0]} months.')
print('The dates for this event were', event.time[0].values.astype('datetime64[M]'),
      '–', event.time[-1].values.astype('datetime64[M]'))


The event is labeled 651
This event lasted 61 months.
The dates for this event were 2013-10 – 2018-10


In [6]:
%%time
mhw = to_dataframe(event, ssta)
mhw

CPU times: user 12.3 s, sys: 1min 46s, total: 1min 58s
Wall time: 2min


Unnamed: 0,id,date,coords,centroid,duration,intensity_max,intensity_mean,intensity_min,intensity_cumulative,area
0,651,"[2013-10, 2013-11, 2013-12, 2014-01, 2014-02, ...","[([38.875, 38.875, 38.875, 38.875, 38.875, 38....","[[(42.875, 307.125)], [(43.875, 198.625)], [(4...",61,"[2.3478753786274638, 2.4350872232164757, 4.934...","[1.6885408260441834, 1.6941107161660418, 2.004...","[1.2617499174393885, 1.1898635315648622, 1.203...",1209554.0,"[1615863.6858287915, 1901884.0255846302, 83243..."


In [31]:
mhw['date'].tolist()[0][0]

numpy.datetime64('2013-10')

In [9]:
ds = mhw.to_xarray()

In [47]:
ds['date'] = ds.date.values.tolist()

MissingDimensionsError: cannot set variable 'date' with 2-dimensional data without explicit dimension names. Pass a tuple of (dims, data) instead.

In [39]:
ds.date[0].values.tolist()[0]

numpy.datetime64('2013-10')

In [27]:
test = ds.date[0].values.tolist()
test[0]

numpy.datetime64('2013-10')

In [None]:
event.plot(col='time', col_wrap=event.shape[0], robust=True);

In [None]:
import sys
print(f'size of mhw is {sys.getsizeof(mhw)/1e6 *700} mb')

In [None]:
y, x = zip(*mhw['coords'][0])
plt.scatter(x[1], y[1], s=1, c='k')

In [None]:
for c in np.arange(0,mhw['duration'][0]):
    plt.figure()
    y, x = zip(*mhw['centroid'][0][c])
    event[c,:,:].plot()
    plt.scatter(x, y, c='k')
    plt.show()
    print('mean intensity (ºC)', round(mhw['intensity_mean'][0][c],2))
    print('max intensity (ºC)', round(mhw['intensity_max'][0][c],2))
    print('min intensity (ºC)', round(mhw['intensity_min'][0][c],2))
    print('area (km2)', round(mhw['area'][0][c],2))

#### To CSV

In [41]:
path = '/burg/abernathey/users/hillary/ocetrac/test.nc'
ds.to_netcdf(path)

ValueError: Could not convert object to NumPy datetime

#### Read CSV to Pandas DataFrame

In [None]:
df = pd.read_csv(path)
df