This note book is to create new validation figures for windspeed from IFS with the madis mesonet QC final dataset. 

In [1]:
import os
import xarray as xr
import glob
import numpy as np
import sys
import importlib
import pandas as pd
import warnings

In [2]:
sys.path.insert(0, "/home/kylabazlen/")
processing = importlib.import_module("2026_nvhackathon.validation.processing")
subset_grid_to_point_xy = processing.subset_grid_to_point_xy
forecast_obs_merge = processing.forecast_obs_merge
convert_1d_to_2d_latlon = processing.convert_1d_to_2d_latlon
metrics = importlib.import_module("2026_nvhackathon.validation.metrics")
add_gof_stats = metrics.add_gof_stats

plotting = importlib.import_module("2026_nvhackathon.validation.plotting")
confusion_matrix = plotting.confusion_matrix
plot_confusion_matrix = plotting.plot_confusion_matrix

configure_style = plotting.configure_style
COLORS = plotting.COLORS
MARKERS = plotting.MARKERS

In [3]:
observations = '/project/cowy-nvhackathon/cowy-wildfire/data/observations/cowy_madis_metar_mesonet_2024.nc'
# ifs_paths = sorted(glob.glob("/gscratch/kylabazlen/nwp/ifs/yearly_datasets/*"))
ifs_paths = sorted(glob.glob("/project/cowy-nvhackathon/cowy-wildfire/data/nwp/ifs_yearly/*"))

In [4]:
# Open the observations file
obs_ds = xr.open_dataset(observations) #space, time diminsions

# Open the first IFS file
ifs_ds = xr.open_dataset(ifs_paths[0]) #time, latitude, longitude diminsions


To continue decoding into a timedelta64 dtype, either set `decode_timedelta=True` when opening this dataset, or add the attribute `dtype='timedelta64[ns]'` to this variable on disk.
To opt-in to future behavior, set `decode_timedelta=False`.
  ifs_ds = xr.open_dataset(ifs_paths[0]) #time, latitude, longitude diminsions


In [None]:
#drop stations with all NA windspeed data
has_data_mask = ~obs_ds['windspeed_10m'].isnull().all(dim='time')

obs_ds_clean = obs_ds.sel(space=has_data_mask)

print(f"Original stations: {obs_ds.dims['space']}")
print(f"Stations with data: {obs_ds_clean.dims['space']}")

In [None]:
import pandas as pd
from pathlib import Path
import re 

results = []
mbe_records = []

for fp in ifs_paths:
    ifs_ds = xr.open_dataset(fp)

    ds = convert_1d_to_2d_latlon(ds=ifs_ds)
    ifs_subset = subset_grid_to_point_xy(ds=ds, point_ds=obs_ds_clean)
    all_data_ifs = forecast_obs_merge(ds1=ifs_subset, ds2=obs_ds_clean, ds1_timevar="valid_time", ds2_timevar="time")
    all_w_gof = add_gof_stats(ds=all_data_ifs, var1="ws_10", var2="obs_windspeed_10m")
    results.append(all_w_gof)

    # Extract filename and forecast hour
    filename = Path(fp).stem
    forecast_hour = int(re.search(r'_f(\d+)', filename).group(1))
    
    
    # Calculate mean GOF across stations
    mean_mbe = all_w_gof['ws_10_vs_obs_windspeed_10m_MBE'].mean(dim='space').item()  # adjust 'station' to your actual dimension name
    mean_rmse = all_w_gof['ws_10_vs_obs_windspeed_10m_RMSE'].mean(dim='space').item()  # adjust 'station' to your actual dimension name

    mbe_records.append({
        'file': Path(fp).stem,
        'forecast_hr': forecast_hour,
        'mean_mbe': mean_mbe,
        'mean_rmse': mean_rmse

    })
    
    ifs_ds.close()

# Create DataFrame
mbe_df = pd.DataFrame(mbe_records)
print(mbe_df)

In [None]:
import matplotlib.pyplot as plt

plt.figure(figsize=(10, 6))
plt.plot(mbe_df['forecast_hr'], mbe_df['mean_mbe'], marker='o', linewidth=2, markersize=6)
plt.xlabel('Forecast Hour')
plt.ylabel('Mean MBE')
plt.title('Mean Bias Error by Forecast Hour')
plt.axhline(y=0, color='k', linestyle='--', alpha=0.3)
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

In [None]:
import pandas as pd
from pathlib import Path
import re
import numpy as np

results = []
mbe_records = []

# Define wind speed bins (0-5, 5-10, 10-15, etc.)
ws_bins = np.arange(0, 35, 5)  # Adjust upper limit as needed
bin_labels = [f"{ws_bins[i]}-{ws_bins[i+1]}" for i in range(len(ws_bins)-1)]

for fp in ifs_paths:
    ifs_ds = xr.open_dataset(fp)
    ds = convert_1d_to_2d_latlon(ds=ifs_ds)
    ifs_subset = subset_grid_to_point_xy(ds=ds, point_ds=obs_ds)
    all_data_ifs = forecast_obs_merge(ds1=ifs_subset, ds2=obs_ds, ds1_timevar="valid_time", ds2_timevar="time")
    all_w_gof = add_gof_stats(ds=all_data_ifs, var1="ws_10", var2="obs_windspeed_10m")
    results.append(all_w_gof)

    # Extract filename and forecast hour
    filename = Path(fp).stem
    forecast_hour = int(re.search(r'_f(\d+)', filename).group(1))

    # Calculate mean GOF across stations (total)
    mean_mbe = all_w_gof['ws_10_vs_obs_windspeed_10m_MBE'].mean(dim='space').item()
    mean_rmse = all_w_gof['ws_10_vs_obs_windspeed_10m_RMSE'].mean(dim='space').item()

    # Initialize record with total stats
    record = {
        'file': filename,
        'forecast_hr': forecast_hour,
        'mean_mbe_total': mean_mbe,
        'mean_rmse_total': mean_rmse
    }

    # Get forecast and observed wind speed arrays
    forecast_ws = all_data_ifs['ws_10'].values.flatten()
    obs_ws = all_data_ifs['obs_windspeed_10m'].values.flatten()

    # Remove NaN pairs
    valid_mask = ~np.isnan(forecast_ws) & ~np.isnan(obs_ws)
    forecast_ws_valid = forecast_ws[valid_mask]
    obs_ws_valid = obs_ws[valid_mask]

    # Calculate error
    error = forecast_ws_valid - obs_ws_valid

    # Bin by observed wind speed and calculate MBE/RMSE for each bin
    bin_indices = np.digitize(obs_ws_valid, ws_bins) - 1  # -1 to make 0-indexed

    for i, label in enumerate(bin_labels):
        bin_mask = bin_indices == i
        
        if np.sum(bin_mask) > 0:
            bin_errors = error[bin_mask]
            bin_mbe = np.mean(bin_errors)
            bin_rmse = np.sqrt(np.mean(bin_errors**2))
            bin_count = np.sum(bin_mask)
        else:
            bin_mbe = np.nan
            bin_rmse = np.nan
            bin_count = 0

        record[f'mbe_{label}'] = bin_mbe
        record[f'rmse_{label}'] = bin_rmse
        record[f'count_{label}'] = bin_count

    mbe_records.append(record)
    ifs_ds.close()

# Create DataFrame
mbe_df = pd.DataFrame(mbe_records)
print(mbe_df)

# Optional: Display column info
print("\nColumns:", mbe_df.columns.tolist())

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib import cm

# Apply the style configuration
configure_style()

# Define the bin columns and labels
bin_labels = ['0-5', '5-10', '10-15', '15-20', '20-25']
mbe_columns = [f'rmse_{label}' for label in bin_labels]

# Create reversed PuBuGn colormap (high to low)
# Sample from 0.1-0.65 range to stay in saturated colors (avoid the light purple end)
cmap = cm.PuBuGn_r
colors = [cmap(0.1 + 0.55 * i / (len(bin_labels) - 1)) for i in range(len(bin_labels))]

# Create figure
fig, ax = plt.subplots(figsize=(10, 6))

# Plot each wind speed bin
for i, (col, label) in enumerate(zip(mbe_columns, bin_labels)):
    ax.plot(
        mbe_df['forecast_hr'], 
        mbe_df[col], 
        marker=MARKERS[i], 
        color=colors[i], 
        label=f'{label} m/s',
        linewidth=2,
        markersize=5
    )

# Plot total MBE as a dashed black line
ax.plot(
    mbe_df['forecast_hr'], 
    mbe_df['mean_rmse_total'], 
    marker=MARKERS[5], 
    color='black', 
    label='All',
    linewidth=2.5,
    markersize=6,
    linestyle='--'
)

# Add horizontal line at zero
ax.axhline(y=0, color='black', linestyle='-', linewidth=1, alpha=0.5)

# Labels and title
ax.set_xlabel('Forecast Hour')
ax.set_ylabel('RMSE (m/s)')
ax.set_title('IFS Wind Speed RMSE')

# Set x-ticks to match forecast hours
ax.set_xticks(mbe_df['forecast_hr'])

# Legend on the right outside the plot
ax.legend(
    title='Observed Wind Speed', 
    loc='center left', 
    bbox_to_anchor=(1.02, 0.5),
    framealpha=0.9
)

# Adjust layout to make room for legend
fig.subplots_adjust(right=0.82)

plt.show()

In [None]:
import numpy as np
import pandas as pd

# Collect all observations and predictions across all forecast hours
all_obs = []
all_pred = []

for result in results:
    obs = result['obs_windspeed_10m'].values.ravel()
    pred = result['ws_10'].values.ravel()
    all_obs.append(obs)
    all_pred.append(pred)

# Concatenate into single arrays
all_obs = np.concatenate(all_obs)
all_pred = np.concatenate(all_pred)

# Remove NaNs if present
mask = ~np.isnan(all_obs) & ~np.isnan(all_pred)
all_obs = all_obs[mask]
all_pred = all_pred[mask]

# Compute single confusion matrix
threshold = 15
cm_all = confusion_matrix(all_obs, all_pred, threshold=threshold)
print(cm_all)

# Plot it
plot_confusion_matrix(cm_all, title=f"Confusion Matrix (All Forecast Hours, threshold={threshold})")