# Experiment 1: No-fire December set (PWWB + AirNow)

This notebook focuses on running Experiment 1, which uses data from December (no-fire period) combining PWWB and AirNow datasets.

# Data parameters

In [1]:
# define bounding box
lat_bottom, lat_top = 33.5, 34.5
lon_bottom, lon_top = -118.75, -117.0
extent = (lon_bottom, lon_top, lat_bottom, lat_top)

# input data shape
dim = 200
frames_per_sample = 5

# date range of data - two weeks of December for this experiment 2024-12-08-00", "2024-12-21-00"
dec_start_date, dec_end_date = "2024-12-01-00", "2025-01-01-00"

# Data ingestion and preprocessing

In [2]:
# python nonsense that allows you to import from sibling directories
import sys
sys.path.append("..")

import numpy as np
import os
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime
from dotenv import load_dotenv

# Import the new PWWB implementation and dataset manager
from libs.pwwb import PWWBData
from libs.pwwb.utils.dataset_manager import create_dataset_manager

# Import the AirNow data class
from libs.airnowdata import AirNowData

# Load environment variables (API keys, credentials)
load_dotenv()

# split data
def train_test_split(X, train_size=0.75):
    split_idx = int(X.shape[0] * train_size)
    X_train, X_test = X[:split_idx], X[split_idx:]
    
    return X_train, X_test

# scale training data, then scale test data based on training data stats
from sklearn.preprocessing import StandardScaler
def std_scale(X_train, X_test):
    scaler = StandardScaler()
    scaled_train = scaler.fit_transform(X_train.reshape(-1, 1)).reshape(X_train.shape)
    scaled_test = scaler.transform(X_test.reshape(-1, 1)).reshape(X_test.shape)

    return scaled_train, scaled_test

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Create output directory for results
output_dir = "experiment_output"
os.makedirs(output_dir, exist_ok=True)

# Create dataset manager
manager = create_dataset_manager(
    registry_file="experiment1_registry.json",
    cache_dir="data/pwwb_cache/"
)

# List existing datasets
print("Existing datasets:")
try:
    display(manager.list_datasets())
except:
    print("No existing datasets found.")

Existing datasets:


Unnamed: 0,name,created,description,start_date,end_date,channels
0,dec2024_MAIC_TROPOMI_N02_METAR_WIND,2025-05-20T17:58:39.575630,"December 2024 - month with MAIAC, TROPOMI NO2,...",2024-12-01-00,2024-12-3-00,"maiac, tropomi, metar"
1,dec2024_MAIC_TROPOMI_N02_METAR_WIND_UV_ONLY,2025-05-20T18:34:46.342993,"December 2024 - month with MAIAC, TROPOMI NO2,...",2024-12-01-00,2024-12-3-00,"maiac, tropomi, metar"


In [4]:
# Adjust end date for AirNow
dec_end_date_adj = pd.to_datetime(dec_end_date) - pd.Timedelta(hours=1)

# Dataset name and description
dataset_name = "dec2024_MAIC_TROPOMI_N02_METAR_WIND_UV_ONLY"
dataset_desc = "December 2024 - two weeks with MAIAC, TROPOMI NO2, METAR Wind U/V components only"

# ========== 1. Load December PWWB Data ==========
print("\nLoading December PWWB data...")

# Check if dataset already exists in the registry
if manager.get_dataset_info(dataset_name) is not None:
    print(f"Dataset '{dataset_name}' already exists, loading from cache...")
    dec_pwwb = manager.load_dataset(dataset_name, PWWBData)
else:
    print(f"Dataset '{dataset_name}' not found, creating new one...")
    # Create the dataset with the specified channels
    dec_pwwb = manager.create_dataset(
        name=dataset_name,
        description=dataset_desc,
        PWWBData_class=PWWBData,
        start_date=dec_start_date,
        end_date=dec_end_date,
        extent=extent,
        frames_per_sample=frames_per_sample,
        dim=dim,
        include_channels={
            'maiac': True,                     # Include MAIAC AOD
            'tropomi': ['TROPOMI_NO2'],        # Only include NO2 from TROPOMI
            'metar': ['METAR_Wind_U', 'METAR_Wind_V'],  # Only wind components from METAR
            'modis_fire': False,               # Exclude MODIS fire data
            'merra2': False                    # Exclude MERRA2 data
        },
        verbose=True,
        output_dir=output_dir
    )
    # Save the dataset
    dec_pwwb.save_data()

# Get the data and channel info
X_dec_pwwb = dec_pwwb.data
channel_info = dec_pwwb.get_channel_info()
print(f"✓ December PWWB data shape: {X_dec_pwwb.shape}")
print(f"  Channels: {channel_info['channel_names']}")

# ========== 2. Load December AirNow Data ==========
print("\nLoading December AirNow data...")
dec_airnow = AirNowData(
    start_date=dec_start_date,
    end_date=dec_end_date_adj,
    extent=extent,
    airnow_api_key=os.getenv('AIRNOW_API_KEY'),
    frames_per_sample=frames_per_sample,
    dim=dim,
    elevation_path="../libs/inputs/elevation.npy",
    mask_path="../libs/inputs/mask.npy",
    force_reprocess=False
)
X_dec_airnow = dec_airnow.data
Y_dec = dec_airnow.target_stations
print(f"✓ December AirNow data shape: {X_dec_airnow.shape}")
if Y_dec is not None:
    print(f"  December target stations shape: {Y_dec.shape}")
else:
    print("  No December target stations available")


Loading December PWWB data...
Dataset 'dec2024_MAIC_TROPOMI_N02_METAR_WIND_UV_ONLY' already exists, loading from cache...
Using cache prefix: dec2024_MAIC_TROPOMI_N02_METAR_WIND_UV_ONLY_
Initialized PWWBData with 48 hourly timestamps
Date range: 2024-12-01 00:00:00 to 2024-12-02 23:00:00
Channels included: ['maiac', 'tropomi', 'metar']
TROPOMI channels: ['TROPOMI_NO2']
METAR channels: ['METAR_Wind_U', 'METAR_Wind_V']
Processing MAIAC AOD data...
Fetching MAIAC AOD data for 2 unique dates


  self.timestamps = pd.date_range(self.start_date, self.end_date, freq='H')


AOD data shape before resize: (5, 1200, 1200)
AOD data type: int16
AOD data min/max: -28672/1180
Averaged AOD data to shape: (1200, 1200)
Resized AOD data to shape: (200, 200)
Successfully processed AOD data for 2024-12-01
AOD data shape before resize: (4, 1200, 1200)
AOD data type: int16
AOD data min/max: -28672/582
Averaged AOD data to shape: (1200, 1200)
Resized AOD data to shape: (200, 200)
Successfully processed AOD data for 2024-12-02
Created MAIAC AOD data with shape (48, 200, 200, 1)
Processing TROPOMI data...
Including TROPOMI channels: ['TROPOMI_NO2']
Fetching TROPOMI data for 2 unique dates
Processing TROPOMI products: ['NO2']
Processing TROPOMI data for date: 2024-12-01
Successfully processed NO2 data
Processing TROPOMI data for date: 2024-12-02
Successfully processed NO2 data
Created TROPOMI data with shape (48, 200, 200, 1)
Processing METAR meteorological data...
Initialized MetarDataSource with 2 channels: ['METAR_Wind_U', 'METAR_Wind_V']
Will fetch these raw variables: 

  full_date_range = pd.date_range(start=start_date_with_margin, end=end_date_with_margin, freq='H')
  station_df['timestep'] = station_df['valid'].dt.ceil('H')
  station_df['timestep'] = station_df['valid'].dt.ceil('H')
  station_df['timestep'] = station_df['valid'].dt.ceil('H')
  full_range = pd.date_range(start=self.timestamps[0], end=self.timestamps[-1], freq='H')


Created METAR data with shape (48, 200, 200, 2)
Final channels: ['METAR_Wind_U', 'METAR_Wind_V']
METAR_Wind_U sample stats: min=5.75, max=6.48, mean=6.09
METAR_Wind_V sample stats: min=0.00, max=2.36, mean=0.06
Saving data to cache: data/pwwb_cache/dec2024_MAIC_TROPOMI_N02_METAR_WIND_UV_ONLY_metar_u_v_data.npy
Final data shape: (44, 5, 200, 200, 4)

Channel Statistics:

Channel 0: MAIAC_AOD
  Min: -28672.000000000007
  Max: -11408.21815610717
  Mean: -22674.333462043887
  Std: 5129.642805682386
  Data coverage: 100.00% (40000/40000 non-zero pixels)

Channel 1: TROPOMI_NO2
  Min: 0.00012060817200200961
  Max: 0.0005489974885201102
  Mean: 0.00029664643713431493
  Std: 8.174618747433876e-05
  Data coverage: 100.00% (40000/40000 non-zero pixels)

Channel 2: METAR_Wind_U
  Min: 5.75
  Max: 6.483879083422768
  Mean: 6.086833897749474
  Std: 0.0759926512972382
  Data coverage: 100.00% (40000/40000 non-zero pixels)

Channel 3: METAR_Wind_V
  Min: 0.0
  Max: 2.359938988947114
  Mean: 0.0628026

In [5]:
# ========== 3. Create Experiment 1 dataset ==========
print("\nCreating Experiment 1 dataset...")

# Experiment 1: No-fire December set (PWWB + AirNow)
print("  Experiment 1: No-fire December set (PWWB + AirNow)")
X_exp1 = np.concatenate([X_dec_pwwb, X_dec_airnow], axis=-1)
print(f"    Combined shape: {X_exp1.shape}")

# Display the number of channels from each source
pwwb_channels = X_dec_pwwb.shape[4]
airnow_channels = X_dec_airnow.shape[4]
print(f"    PWWB channels: {pwwb_channels}, AirNow channels: {airnow_channels}, Total: {X_exp1.shape[4]}")


Creating Experiment 1 dataset...
  Experiment 1: No-fire December set (PWWB + AirNow)


ValueError: all the input array dimensions except for the concatenation axis must match exactly, but along dimension 0, the array at index 0 has size 44 and the array at index 1 has size 740

In [None]:
# ========== 4. Train/Test Split for experiment ==========
print("\nCreating train/test splits for experiment...")
# Experiment 1 splits
X_exp1_train, X_exp1_test = train_test_split(X_exp1, train_size=0.75)
Y_dec_train, Y_dec_test = train_test_split(Y_dec, train_size=0.75)
print(f"  Experiment 1: Train={X_exp1_train.shape}, Test={X_exp1_test.shape}")

# ========== 5. Standardize data ==========
print("\nStandardizing data...")

# Experiment 1 standardization
X_exp1_train_scaled, X_exp1_test_scaled = std_scale(X_exp1_train, X_exp1_test)
print(f"  Experiment 1: Scaled train={X_exp1_train_scaled.shape}, test={X_exp1_test_scaled.shape}")

In [None]:
# ========== 6. Save prepared datasets ==========
print("\nSaving prepared dataset...")

# Create directory for experiment
exp_dir = os.path.join(output_dir, "experiment1")
os.makedirs(exp_dir, exist_ok=True)

# Save Experiment 1 data
np.save(os.path.join(exp_dir, "X_train.npy"), X_exp1_train_scaled)
np.save(os.path.join(exp_dir, "X_test.npy"), X_exp1_test_scaled)
np.save(os.path.join(exp_dir, "y_train.npy"), Y_dec_train)
np.save(os.path.join(exp_dir, "y_test.npy"), Y_dec_test)

# Save metadata
metadata = {
    "date_range": f"{dec_start_date} to {dec_end_date}",
    "extent": extent,
    "dim": dim,
    "frames_per_sample": frames_per_sample,
    "pwwb_channels": channel_info['channel_names'],
    "airnow_channels": ["AirNow_PM25"]
}

with open(os.path.join(exp_dir, "metadata.json"), "w") as f:
    import json
    json.dump(metadata, f, indent=2)

print("\n✓ Dataset prepared and saved!")

# Data visualization

In [None]:
# Function to visualize data from experiment
def visualize_experiment_data(X, y, channel_names=None, sample_idx=None):
    """Visualize data from the experiment"""
    # Get a random sample if none provided
    if sample_idx is None:
        np.random.seed(42)
        sample_idx = np.random.choice(range(len(X)), size=1)[0]
    
    # Get channel information
    n_channels = X.shape[4]
    n_frames = X.shape[1]
    
    # Use provided channel names or create default ones
    if channel_names is None or len(channel_names) != n_channels:
        channel_names = [f"Channel {i}" for i in range(n_channels)]
    
    # Create figure
    fig, axes = plt.subplots(n_channels, n_frames, figsize=(3*n_frames, 2*n_channels))
    if n_channels == 1:
        axes = axes.reshape(1, -1)
    
    # Plot each channel and frame
    for c in range(n_channels):
        for f in range(n_frames):
            ax = axes[c, f]
            im = ax.imshow(X[sample_idx, f, :, :, c])
            plt.colorbar(im, ax=ax, fraction=0.046, pad=0.04)
            if f == 0:
                ax.set_ylabel(channel_names[c])
            ax.set_title(f"Frame {f+1}")
    
    # Set title
    plt.suptitle(f"Experiment 1: No-fire December set (PWWB + AirNow)\nSample {sample_idx}")
    plt.tight_layout()
    plt.show()
    
    # Print target values
    if y is not None:
        print(f"Target values: {y[sample_idx]}")

# Create combined channel names list
all_channel_names = channel_info['channel_names'] + ["AirNow_PM25"]

# Visualize a sample from the experiment
print("Visualizing data...")
visualize_experiment_data(X_exp1_train_scaled, Y_dec_train, channel_names=all_channel_names)

# Model

In [None]:
import tensorflow as tf
import keras
from keras.models import Sequential
from keras.models import Model
from keras.layers import Conv3D
from keras.layers import ConvLSTM2D
from keras.layers import BatchNormalization
from keras.layers import Convolution2D, MaxPooling3D, Flatten, Reshape
from keras.layers import TimeDistributed
from keras.layers import Dropout
from keras.layers import Dense
from keras.layers import InputLayer

tf.keras.backend.set_image_data_format('channels_last')

In [None]:
# Run Experiment 1: No-fire December set (PWWB + AirNow)
print("\n==== Running Experiment 1: No-fire December set (PWWB + AirNow) ====")
print(f"Training data shape: {X_exp1_train_scaled.shape}")
print(f"Target data shape: {Y_dec_train.shape}")

# Build model
seq = Sequential()

seq.add(
    InputLayer(shape=X_exp1_train_scaled.shape[1:])
)

seq.add(
    ConvLSTM2D(
            filters=15, 
            kernel_size=(3, 3),
            padding='same', 
            return_sequences=True
    )
)

seq.add(
    ConvLSTM2D(
        filters=30, 
        kernel_size=(3, 3),
        padding='same', 
        return_sequences=True
    )
)

seq.add(
    Conv3D(
        filters=15, 
        kernel_size=(3, 3, 3),
        activation='relu',
        padding='same'    
    )
)

seq.add(
    Conv3D(
        filters=1, 
        kernel_size=(3, 3, 3),
        activation='relu',
        padding='same'
    )
)

seq.add(Flatten())
seq.add(Dense(Y_dec_train.shape[1], activation='relu'))

# Compile model
seq.compile(loss='mean_absolute_error', optimizer='adam')

# Print model summary
seq.summary()

# Train model
print(f"\nTraining model...")
epochs = 100  # Reduced epochs for faster testing
batch_size = 4
history = seq.fit(
    X_exp1_train_scaled, Y_dec_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.2
)

# Evaluate model
print(f"\nEvaluating model...")
test_loss = seq.evaluate(X_exp1_test_scaled, Y_dec_test, verbose=0)
print(f"Test MAE: {test_loss:.4f}")

# Make predictions
y_pred = seq.predict(X_exp1_test_scaled, verbose=0)

# Calculate metrics
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

mae = mean_absolute_error(Y_dec_test, y_pred)
rmse = np.sqrt(mean_squared_error(Y_dec_test, y_pred))
r2 = r2_score(Y_dec_test, y_pred)

print(f"Mean Absolute Error: {mae:.4f}")
print(f"Root Mean Squared Error: {rmse:.4f}")
print(f"R² Score: {r2:.4f}")

# Plot training history
plt.figure(figsize=(10, 6))
plt.plot(history.history['loss'], label='Training Loss')
if 'val_loss' in history.history:
    plt.plot(history.history['val_loss'], label='Validation Loss')
    plt.legend()
plt.title('Experiment 1: No-fire December set (PWWB + AirNow)\nTraining Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss (MAE)')
plt.grid(True, alpha=0.3)
plt.show()

# Save results
results_dir = os.path.join(output_dir, "experiment1", "results")
os.makedirs(results_dir, exist_ok=True)

np.save(os.path.join(results_dir, "y_pred.npy"), y_pred)
seq.save(os.path.join(results_dir, "model.h5"))

exp1_results = {
    'model': seq,
    'history': history,
    'loss': test_loss,
    'mae': mae,
    'rmse': rmse,
    'r2': r2,
    'y_pred': y_pred,
    'y_test': Y_dec_test
}

# Evaluate

In [None]:
print(f"\nDetailed analysis for Experiment 1:")
X_test = X_exp1_test_scaled
y_test = Y_dec_test
y_pred = exp1_results['y_pred']
model = exp1_results['model']
description = "No-fire December set (PWWB + AirNow), two weeks"

print(f"Analyzing Experiment 1: {description}")

In [None]:
from libs.plotting import (
    plot_prediction_comparison,
    plot_scatter_comparison,
    plot_error_by_sensor,
    plot_time_series_comparison,
    plot_input_frames,
    print_metrics
)

# Sensor names (use AirNow sensor names if available)
if hasattr(dec_airnow, 'sensor_names') and dec_airnow.sensor_names is not None:
    sensor_names = dec_airnow.sensor_names
else:
    sensor_names = [
        "North Holywood", 
        "Los Angeles - N. Main Street", 
        "Compton",
        "Crestline - Lake Gregory",
        "Fontana - Arrow Highway",
        "Glendora - Laurel",
        "Lake Elsinore - W. Flint Street",
        "Long Beach Signal Hill",
        "Mira Loma - Van Buren",
        "Reseda",
        "Riverside - Rubidoux",
        "Santa Clarita",
        "Simi Valley - Cochran Street",
        "Temecula (Lake Skinner)"
    ]

print("\n1. Plotting prediction comparison...")
plot_prediction_comparison(y_pred, y_test, sensor_names, sample_idx=8)

print("\n2. Plotting scatter comparison...")
plot_scatter_comparison(y_pred, y_test)

print("\n3. Plotting error by sensor...")
plot_error_by_sensor(y_pred, y_test, sensor_names)

print("\n4. Plotting time series comparison...")
plot_time_series_comparison(y_pred, y_test, sensor_names)
    
print("\n5. Plotting time series with shifted predictions...")
plot_time_series_comparison(y_pred, y_test, sensor_names, shift_pred=1)

print("\n6. Printing metrics...")
print_metrics(y_pred, y_test, sensor_names)

In [None]:
# Save experiment comparison
with open(os.path.join(output_dir, 'experiment1_results.txt'), 'w') as f:
    f.write("==== Experiment 1 Results ====\n")
    f.write(f"Experiment 1 (No-fire December, two weeks): MAE = {exp1_results['mae']:.4f}, RMSE = {exp1_results['rmse']:.4f}, R² = {exp1_results['r2']:.4f}\n")
    f.write(f"\nAnalysis completed on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

print("\nExperiment 1 complete!")