This notebook prepares the training and test target-variable datasets (selected from the contingency and PPH datasets in data/raw_data) and saves them to data/processed_data.

In [1]:
import numpy as np
import xarray as xr
import pandas as pd
import matplotlib.pyplot as plt
import os

In [2]:
# Configuration flag: when True, SLGT and ENH days are included alongside
# MDT and HIGH, and every output filename gets a '_slgt' suffix.
slgt = False

# Filename suffix and the list of categories derived from the flag above
slgt_str = '_slgt' if slgt else ''
categories = ['SLGT', 'ENH', 'MDT', 'HIGH'] if slgt else ['MDT', 'HIGH']

In [3]:
# Read the two raw input datasets
contingencies = xr.load_dataset('data/raw_data/contingency_regions.nc')
pph = xr.load_dataset('data/raw_data/labelled_pph.nc')

# Hazard labels; the two key lists below are written in this same order
hazards = ['All Hazard', 'Wind', 'Hail', 'Tornado']

# Names of the eastward / northward shift variables in pph, one per hazard
east_keys = ['E_SH_NUM', 'E_SH_W_NUM', 'E_SH_H_NUM', 'E_SH_T_NUM']
north_keys = ['N_SH_NUM', 'N_SH_W_NUM', 'N_SH_H_NUM', 'N_SH_T_NUM']

# Time stamps that must be excluded from the selection
missing_dates = [
    '200204250000',
    '200208300000',
    '200304150000',
    '200304160000',
    '200306250000',
    '200307270000',
    '200307280000',
    '200312280000',
    '200404140000',
    '200408090000',
    '200905280000',
    '201105210000',
    '202005240000',
    '200510240000',
]

# Filter 1: keep times whose MAX_CAT is one of the configured categories
in_selected_category = pph['MAX_CAT'].isin(categories)
dates_of_interest = pph['time'].where(in_selected_category, drop=True)

# Filter 2: keep times strictly after the cutoff stamp
after_cutoff = dates_of_interest > '200203310000'
dates_of_interest = dates_of_interest.where(after_cutoff, drop=True)

# Filter 3: drop the explicitly excluded time stamps
not_excluded = ~dates_of_interest.isin(missing_dates)
dates_of_interest = dates_of_interest.where(not_excluded, drop=True)


# Compute bias per hazard
def compute_bias(hazard):
    """
    Return the difference bias for one hazard over the CONUS region.

    The value is ``b - c`` from the module-level ``contingencies`` dataset,
    restricted to the module-level ``dates_of_interest``.
    NOTE(review): ``b`` and ``c`` are presumably the false-alarm and miss
    counts of a contingency table - confirm against the raw data.

    Parameters
    ----------
    hazard : str
        Label along the ``hazard`` coordinate of ``contingencies``.

    Returns
    -------
    xarray.DataArray
        ``b - c`` at the selected times.
    """
    conus_table = contingencies.sel(hazard=hazard, region='CONUS')
    # Ratio-bias alternative (not used): (a + b) / (a + c)
    difference_bias = conus_table['b'] - conus_table['c']
    return difference_bias.sel(time=dates_of_interest)


# Bias: one series per hazard, stacked along a labelled 'hazard' dimension
per_hazard_bias = [compute_bias(name) for name in hazards]
bias = xr.concat(per_hazard_bias, dim='hazard').assign_coords(hazard=hazards)


def _stack_shift(keys):
    """Select each pph shift variable at the dates of interest and stack them by hazard."""
    selected = [pph[key].sel(time=dates_of_interest) for key in keys]
    return xr.concat(selected, dim='hazard').assign_coords(hazard=hazards)


# Shifts: same stacking for the eastward and northward components
east_shift = _stack_shift(east_keys)
north_shift = _stack_shift(north_keys)

# Bundle the three targets into a single Dataset
target_ds = xr.Dataset({
    'bias': bias,
    'east_shift': east_shift,
    'north_shift': north_shift
})

# Keep only the 'time' and 'hazard' coordinates
extra_coords = [name for name in target_ds.coords if name not in ('time', 'hazard')]
target_ds = target_ds.drop_vars(extra_coords)

# Parse the YYYYMMDDHHMM time stamps into datetimes
target_ds['time'] = pd.to_datetime(target_ds['time'].values, format='%Y%m%d%H%M')

In [4]:
# All-zero dataset with the same variables, shape and coordinates as target_ds.
# Later cells split, detrend and standardize it with the same parameters as the
# real targets.
# NOTE(review): presumably this tracks where a raw value of zero ends up in the
# transformed target space - confirm intent.
target_ds_zero = xr.zeros_like(target_ds)

View full data

In [5]:
def plot_metrics_with_rolling_avg(ds, title='Hazard Metrics', save_path=None, rolling_days=365):
    """
    Plot scatter, line of best fit, and rolling average for each variable and hazard in the dataset.

    One row of panels is drawn per variable ('bias', 'east_shift',
    'north_shift') and one column per entry of ``ds.hazard``. Panels whose
    series is empty after dropping NaNs are left blank apart from their title.

    Parameters
    ----------
    ds : xarray.Dataset
        Dataset with coordinates 'time' and 'hazard', and variables 'bias',
        'east_shift' and 'north_shift'. The time-offset rolling window
        requires 'time' to be a datetime coordinate.
    title : str
        Title for the figure.
    save_path : str or None
        Path to save the figure. If None (or empty), the plot is shown instead.
        Missing parent directories are created.
    rolling_days : int
        Number of days to use for the rolling average window.
    """
    variables = ['bias', 'east_shift', 'north_shift']
    hazards = ds.hazard.values
    last_row = len(variables) - 1

    # Legend label matches the historical text for the default window and
    # reflects the actual window otherwise.
    avg_label = '1-Year Avg' if rolling_days == 365 else f'{rolling_days}-Day Avg'

    # squeeze=False keeps `axes` two-dimensional for any number of hazards;
    # the width scales so that four hazards give the original (20, 10) size.
    fig, axes = plt.subplots(
        nrows=len(variables),
        ncols=len(hazards),
        figsize=(5 * len(hazards), 10),
        sharex=True,
        squeeze=False,
    )

    # label -> handle, gathered from every panel so the global legend does not
    # depend on what happened to be drawn in the last panel.
    legend_entries = {}

    for row, var in enumerate(variables):
        for col, hazard in enumerate(hazards):
            ax = axes[row, col]

            # Convert to pandas for rolling and fitting; the time-offset
            # rolling window needs a monotonic index, hence the sort.
            series = ds[var].sel(hazard=hazard).to_pandas().dropna().sort_index()

            if not series.empty:
                rolling_mean = series.rolling(f'{rolling_days}D').mean()

                # Time conversion for regression (float days since start)
                x = (series.index - series.index[0]).total_seconds() / (24 * 3600)
                y = series.values

                if len(x) >= 2:
                    # Line of best fit (1st-degree polynomial)
                    coeffs = np.polyfit(x, y, deg=1)
                    trend = np.poly1d(coeffs)
                    ax.plot(series.index, trend(x), color='red', linestyle='--', label='Linear Fit')

                # Raw data and rolling average
                ax.scatter(series.index, y, s=10, alpha=0.6, label='Raw')
                ax.plot(series.index, rolling_mean.values, color='black', label=avg_label)

            ax.set_title(f'{var.replace("_", " ").title()} - {hazard}')
            if row == last_row:
                ax.set_xlabel('Date')
            if col == 0:
                ax.set_ylabel(var.replace("_", " ").title())

            for handle, label in zip(*ax.get_legend_handles_labels()):
                legend_entries.setdefault(label, handle)

    fig.suptitle(title, fontsize=16)
    fig.tight_layout(rect=[0, 0, 1, 0.97])

    # Global legend
    if legend_entries:
        fig.legend(list(legend_entries.values()), list(legend_entries.keys()), loc='upper right')

    if save_path:
        # dirname is '' for a bare filename, and os.makedirs('') raises
        out_dir = os.path.dirname(save_path)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)
        fig.savefig(save_path, dpi=300)
        plt.close(fig)
    else:
        plt.show()

In [6]:
# Overview figure of the full (unsplit, untransformed) targets, written to figs/
plot_metrics_with_rolling_avg(
    target_ds,
    title='Raw Targets with 1-Year Rolling Average',
    save_path='figs/raw_full_targets' + slgt_str + '.png',
    rolling_days=365,
)

Split into train and test datasets

In [7]:
# Label-based date ranges for the two splits
train_period = slice('2002-01-01', '2019-12-31')
test_period = slice('2020-01-01', '2024-12-31')

# Real targets
target_train_ds = target_ds.sel(time=train_period)
target_test_ds = target_ds.sel(time=test_period)

# All-zero counterparts, split the same way
target_train_ds_zero = target_ds_zero.sel(time=train_period)
target_test_ds_zero = target_ds_zero.sel(time=test_period)

In [8]:
# One figure per split: (dataset, figure title, output path stem)
for split_ds, split_title, split_stem in [
    (target_train_ds, 'Training Targets with 1-Year Rolling Average', 'figs/raw_training_targets'),
    (target_test_ds, 'Test Targets with 1-Year Rolling Average', 'figs/raw_test_targets'),
]:
    plot_metrics_with_rolling_avg(
        split_ds,
        title=split_title,
        save_path=split_stem + slgt_str + '.png',
        rolling_days=365,
    )

Detrend training data

In [9]:
# Deep copies are required here: Dataset.copy() is shallow by default, so the
# in-place .loc assignments at the bottom of the loop would otherwise write
# through to target_train_ds / target_train_ds_zero (and to the datasets they
# were sliced from), leaving the "raw" variables silently detrended.
# NOTE(review): the assignments cast to the existing dtype of each variable;
# this assumes the targets are stored as floats - confirm for 'bias'.
detrended_train_ds = target_train_ds.copy(deep=True)
detrended_train_ds_zero = target_train_ds_zero.copy(deep=True)

for var in ['bias', 'east_shift', 'north_shift']:
    for hazard in target_train_ds.hazard.values:
        da = target_train_ds[var].sel(hazard=hazard)
        da_zero = target_train_ds_zero[var].sel(hazard=hazard)
        times = da['time']

        # Elapsed time as float days since the first training sample
        t_numeric = ((times - times[0]) / np.timedelta64(1, 'D')).values.astype(float)

        y = da.values

        # Fit linear trend: y = m * t + b
        m, b = np.polyfit(t_numeric, y, 1)
        trend = m * t_numeric + b

        # Value of the trend at the last training time
        offset = m * t_numeric[-1] + b

        # Subtract the trend and add the offset so the series is anchored at
        # the end of the training period. The all-zero series receives exactly
        # the same transformation as the real data.
        adjusted_da = da.copy(data=y - trend + offset)
        adjusted_da_zero = da_zero.copy(data=da_zero.values - trend + offset)

        # Write the adjusted series into the detrended datasets
        detrended_train_ds[var].loc[dict(hazard=hazard)] = adjusted_da
        detrended_train_ds_zero[var].loc[dict(hazard=hazard)] = adjusted_da_zero


In [10]:
# One figure each for the detrended targets and the detrended all-zero series:
# (dataset, figure title, output path stem)
for detrended_ds, detrended_title, detrended_stem in [
    (detrended_train_ds, 'Detrended Training Targets with 1-Year Rolling Average', 'figs/detrended_training_targets'),
    (detrended_train_ds_zero, 'Detrended Zeros with 1-Year Rolling Average', 'figs/detrended_training_targets_zero'),
]:
    plot_metrics_with_rolling_avg(
        detrended_ds,
        title=detrended_title,
        save_path=detrended_stem + slgt_str + '.png',
        rolling_days=365,
    )

Standardize all datasets by the training mean and std

In [11]:
# Deep copies are required here: Dataset.copy() is shallow by default, so the
# in-place .loc assignments below would otherwise overwrite the detrended and
# raw test datasets. Re-running this cell would then compute the "training"
# statistics from already-standardized data (mean ~0, std ~1).
standardized_train_ds = detrended_train_ds.copy(deep=True)
standardized_test_ds = target_test_ds.copy(deep=True)

standardized_train_ds_zero = detrended_train_ds_zero.copy(deep=True)
standardized_test_ds_zero = target_test_ds_zero.copy(deep=True)

# (source, destination) pairs; every pair is scaled with the training stats
standardize_pairs = [
    (detrended_train_ds, standardized_train_ds),
    (target_test_ds, standardized_test_ds),
    (detrended_train_ds_zero, standardized_train_ds_zero),
    (target_test_ds_zero, standardized_test_ds_zero),
]

# train_stats[var][hazard] -> {'mean': ..., 'std': ...}
train_stats = {}

for var in ['bias', 'east_shift', 'north_shift']:
    train_stats[var] = {}

    for hazard in detrended_train_ds.hazard.values:
        # Statistics come from the detrended training data only
        train_values = detrended_train_ds[var].sel(hazard=hazard).values
        mean = np.mean(train_values)
        std = np.std(train_values)

        # Save stats for later, keyed by the plain-Python hazard label
        train_stats[var][hazard.item()] = {'mean': mean, 'std': std}

        # Apply the same (x - mean) / std transform to all four datasets
        for source_ds, dest_ds in standardize_pairs:
            dest_ds[var].loc[dict(hazard=hazard)] = (
                source_ds[var].sel(hazard=hazard) - mean
            ) / std

In [12]:
def _train_stat_array(stat_name):
    """Build a (variable, hazard) DataArray of one statistic taken from ``train_stats``."""
    hazard_labels = standardized_train_ds.hazard.values
    variable_names = list(train_stats.keys())
    table = [
        [train_stats[name][label][stat_name] for label in hazard_labels]
        for name in variable_names
    ]
    return xr.DataArray(
        table,
        coords=[variable_names, hazard_labels],
        dims=["variable", "hazard"]
    )


means = _train_stat_array('mean')
stds = _train_stat_array('std')

# Attach the training statistics to every output dataset
for stats_target in (
    standardized_train_ds,
    standardized_test_ds,
    standardized_train_ds_zero,
    standardized_test_ds_zero,
):
    stats_target["train_mean"] = means
    stats_target["train_std"] = stds

In [13]:
# One figure per standardized dataset: (dataset, figure title, output path stem)
for standardized_ds, standardized_title, standardized_stem in [
    (standardized_train_ds, 'Standardized Training Targets with 1-Year Rolling Average', 'figs/standardized_training_targets'),
    (standardized_test_ds, 'Standardized Test Targets with 1-Year Rolling Average', 'figs/standardized_test_targets'),
    (standardized_train_ds_zero, 'Standardized Training Zeros with 1-Year Rolling Average', 'figs/standardized_training_targets_zero'),
    (standardized_test_ds_zero, 'Standardized Test Zeros with 1-Year Rolling Average', 'figs/standardized_test_targets_zero'),
]:
    plot_metrics_with_rolling_avg(
        standardized_ds,
        title=standardized_title,
        save_path=standardized_stem + slgt_str + '.png',
        rolling_days=365,
    )

In [14]:
# Display the standardized training dataset (targets plus the attached
# train_mean / train_std variables) for a visual check before saving.
standardized_train_ds

Save

In [15]:
# to_netcdf does not create missing directories, so make sure the output
# folder exists first (the figure paths get the same treatment inside
# plot_metrics_with_rolling_avg).
os.makedirs('data/processed_data', exist_ok=True)

# Standardized real targets
standardized_train_ds.to_netcdf('data/processed_data/train_targets' + slgt_str + '.nc')
standardized_test_ds.to_netcdf('data/processed_data/test_targets' + slgt_str + '.nc')

# Standardized all-zero counterparts
standardized_train_ds_zero.to_netcdf('data/processed_data/train_targets_zero' + slgt_str + '.nc')
standardized_test_ds_zero.to_netcdf('data/processed_data/test_targets_zero' + slgt_str + '.nc')