# Chlorophyll-a Concentration Processing for Frontend Visualization

This notebook processes PACE OCI chlorophyll-a data and exports it as JSON for the React frontend.

In [None]:
import os
import glob
import numpy as np
import xarray as xr
import json
from datetime import datetime

# Resolve the data folders relative to the parent of the current working directory
project_root = os.path.dirname(os.getcwd())
dataset_dir = project_root + "/datasets"
chloro_dir = dataset_dir + "/realtime_chlorophyll_concentration"

print("Chlorophyll data directory: " + chloro_dir)

In [None]:
# Collect every file in chloro_dir that matches the PACE OCI chlorophyll
# naming pattern, ordered by file name
chloro_pattern = f"{chloro_dir}/PACE_OCI.*.L3m.*.CHL.*.nc"
chloro_files = glob.glob(chloro_pattern)
chloro_files.sort()
print(f"Found {len(chloro_files)} chlorophyll data files\n")

# Preview the tail of the sorted list (at most five entries)
print("Most recent files:")
for recent_path in chloro_files[-5:]:
    print("  " + os.path.basename(recent_path))

In [None]:
# Load the most recent file, preferring the 0.1 degree product for a balance
# between detail and performance
if not chloro_files:
    # Fail early with an actionable message instead of an IndexError below
    raise FileNotFoundError(
        f"No PACE OCI chlorophyll files found in {chloro_dir}"
    )

degree_files = [f for f in chloro_files if '.0p1deg.' in f]

# chloro_files is sorted by name, so the last entry is treated as the latest;
# fall back to the last file of any resolution when no 0.1 degree file exists
latest_file = degree_files[-1] if degree_files else chloro_files[-1]

print(f"Loading: {os.path.basename(latest_file)}")

# Open the dataset (left open on purpose: later cells read from `ds`)
ds = xr.open_dataset(latest_file)
print("\nDataset info:")
print(f"  Variables: {list(ds.data_vars)}")
# `sizes` is the stable dimension-name -> length mapping on a Dataset
print(f"  Dimensions: {dict(ds.sizes)}")
print(f"  Coordinates: {list(ds.coords)}")

# List every data variable with its `long_name` attribute when present
print("\nChlorophyll variables:")
for var in ds.data_vars:
    description = ds[var].attrs.get('long_name', '(no description)')
    print(f"  {var}: {description}")

In [None]:
# Pick the chlorophyll-a variable: try the known names in priority order
# ('chlor_a' first), then fall back to the first variable that is not one
# of the listed auxiliary fields
primary_var = None
for candidate in ('chlor_a', 'chl', 'chlorophyll'):
    if candidate in ds.data_vars:
        primary_var = candidate
        break
else:
    excluded_vars = ['palette', 'qual', 'quality', 'l2_flags']
    available_vars = [v for v in ds.data_vars if v not in excluded_vars]
    if available_vars:
        primary_var = available_vars[0]

if not primary_var:
    print("No suitable chlorophyll variables found")
else:
    print(f"Using primary variable: {primary_var}")
    chloro_data = ds[primary_var]

    # Coordinates are read under their short names when present,
    # otherwise under the long names
    lon_name = 'lon' if 'lon' in ds.coords else 'longitude'
    lat_name = 'lat' if 'lat' in ds.coords else 'latitude'
    lons = ds[lon_name].values
    lats = ds[lat_name].values

    print(f"Data shape: {chloro_data.shape}")
    print(f"Longitude range: [{lons.min():.2f}, {lons.max():.2f}]")
    print(f"Latitude range: [{lats.min():.2f}, {lats.max():.2f}]")

In [None]:
# Summarise the non-NaN chlorophyll values of the selected variable
if primary_var:
    chloro_values = chloro_data.values
    nan_mask = np.isnan(chloro_values)
    valid_chloro = chloro_values[~nan_mask]

    if valid_chloro.size > 0:
        print("\nChlorophyll-a concentration statistics:")
        summary_rows = (
            ("Min", valid_chloro.min()),
            ("Max", valid_chloro.max()),
            ("Mean", valid_chloro.mean()),
            ("Median", np.median(valid_chloro)),
        )
        for label, value in summary_rows:
            print(f"  {label}: {value:.6f}")
        print(f"  Valid points: {valid_chloro.size:,} / {chloro_values.size:,}")

        # Report the units attribute when the variable carries one
        if hasattr(chloro_data, 'units'):
            print(f"  Units: {chloro_data.units}")

In [None]:
# Process chlorophyll data for heatmap visualization
# Export actual concentration values as [lat, lng, concentration] rows

# Sub-sampling stride keyed by the resolution token found in the file path
SAMPLE_FACTORS = {
    '.0p1deg.': 2,   # 0.1 degree grid - keep every 2nd point
    '.4km.': 20,     # 4 km grid - keep every 20th point
}
DEFAULT_SAMPLE_FACTOR = 5


def build_heat_points(values, lats, lons, step):
    """Sub-sample a (lat, lon) grid into [lat, lon, value] rows.

    Args:
        values: 2-D array indexed as [lat_index, lon_index].
        lats: 1-D latitude coordinate, one entry per row of ``values``.
        lons: 1-D longitude coordinate, one entry per column of ``values``.
        step: Keep every ``step``-th point along both axes.

    Returns:
        A list of ``[lat, lon, value]`` lists of Python floats. Only cells
        whose value is strictly positive (NaN excluded) are kept, longitudes
        above 180 are shifted by -360, and rows are ordered longitude-major
        (all latitudes of the first kept longitude, then the next, ...).

    Raises:
        ValueError: If ``values`` is not shaped ``(len(lats), len(lons))``.
    """
    values = np.asarray(values)
    lats = np.asarray(lats)
    lons = np.asarray(lons)
    if values.shape != (lats.size, lons.size):
        raise ValueError(
            f"Expected data shaped (lat, lon) = ({lats.size}, {lons.size}), "
            f"got {values.shape}"
        )

    # Transpose so the mask is walked longitude-major, the same order a
    # "for lon: for lat:" nested loop would produce
    sampled = values[::step, ::step].T
    sampled_lats = lats[::step].astype(float)
    sampled_lons = lons[::step].astype(float)

    # Convert lon from 0-360 to -180-180 if needed
    sampled_lons = np.where(sampled_lons > 180, sampled_lons - 360, sampled_lons)

    # NaN compares False against 0, so this also drops missing cells
    with np.errstate(invalid="ignore"):
        keep = sampled > 0
    lon_idx, lat_idx = np.nonzero(keep)

    rows = np.column_stack([
        sampled_lats[lat_idx],
        sampled_lons[lon_idx],
        sampled[keep].astype(float),
    ])
    return rows.tolist()


# Prepare heatmap data in [lat, lng, concentration] format
heat_data = []
concentrations_in_data = []

if primary_var:
    # Determine sampling factor based on file resolution
    sample_factor = next(
        (factor for token, factor in SAMPLE_FACTORS.items() if token in latest_file),
        DEFAULT_SAMPLE_FACTOR,
    )

    # range() yields ceil(n / step) indices, so count them exactly
    n_lon_points = len(range(0, len(lons), sample_factor))
    n_lat_points = len(range(0, len(lats), sample_factor))
    print(f"Processing chlorophyll data with sampling factor: {sample_factor}")
    print(f"This will create a grid with approximately {n_lon_points} x {n_lat_points} points")

    # Store actual concentration values (units as reported by the dataset)
    heat_data = build_heat_points(chloro_values, lats, lons, sample_factor)

    print(f"\nGenerated {len(heat_data):,} chlorophyll data points")

    # Derive the native grid spacing from the latitude coordinate itself
    # rather than assuming a value from the file name
    if len(lats) > 1:
        native_resolution = float(np.median(np.abs(np.diff(lats))))
    else:
        native_resolution = float('nan')
    grid_resolution = sample_factor * native_resolution

    print(f"Grid resolution: approximately {grid_resolution:.2f} degrees (~{grid_resolution * 111:.0f} km at equator)")
else:
    print("No chlorophyll variable selected; heatmap data is empty")

# Show concentration range in the exported data
if heat_data:
    concentrations_in_data = [point[2] for point in heat_data]
    print("\nConcentration range in exported data:")
    print(f"  Min: {min(concentrations_in_data):.6f} mg/m³")
    print(f"  Max: {max(concentrations_in_data):.6f} mg/m³")
    print(f"  Mean: {np.mean(concentrations_in_data):.6f} mg/m³")

    # Check regional variations by latitude band
    equator_conc = [p[2] for p in heat_data if -10 < p[0] < 10]
    north_conc = [p[2] for p in heat_data if p[0] > 30]
    south_conc = [p[2] for p in heat_data if p[0] < -30]

    if equator_conc:
        print(f"\nEquatorial region: {min(equator_conc):.6f} to {max(equator_conc):.6f} mg/m³")
    if north_conc:
        print(f"Northern region (>30°N): {min(north_conc):.6f} to {max(north_conc):.6f} mg/m³")
    if south_conc:
        print(f"Southern region (<30°S): {min(south_conc):.6f} to {max(south_conc):.6f} mg/m³")

In [None]:
# Save chlorophyll data to JSON for frontend
output_path = '../frontend/src/pages/landing/chlorophyll_data.json'

# Recompute the summary inputs from heat_data itself so this cell does not
# rely on a helper list left behind by an earlier cell
exported_concentrations = [point[2] for point in heat_data]

# chloro_data only exists when a chlorophyll variable was selected, so only
# consult it in that case; otherwise use the default unit label
units = getattr(chloro_data, 'units', "mg/m³") if primary_var else "mg/m³"

# Create metadata to help frontend understand the data
metadata = {
    "variable": primary_var if primary_var else "unknown",
    "units": units,
    "date": os.path.basename(latest_file).split('.')[1],  # Second dot-separated field of the file name
    "min_value": float(min(exported_concentrations)) if heat_data else 0,
    "max_value": float(max(exported_concentrations)) if heat_data else 1,
    "mean_value": float(np.mean(exported_concentrations)) if heat_data else 0.5
}

# Combine data and metadata
output_data = {
    "metadata": metadata,
    "data": heat_data
}

# Compact separators drop the padding spaces json.dump adds by default,
# which noticeably shrinks a file holding many [lat, lng, value] rows
with open(output_path, 'w', encoding='utf-8') as f:
    json.dump(output_data, f, separators=(',', ':'))

print(f"\nSaved chlorophyll data to {output_path}")
print(f"File size: {os.path.getsize(output_path) / 1024 / 1024:.2f} MB")
print("\nMetadata saved:")
for key, value in metadata.items():
    print(f"  {key}: {value}")

In [None]:
# Optional: Create a visualization to verify the data
# matplotlib is imported in this cell so the export cells above run without it
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors

# Both panels use a log scale, so only strictly positive values can be drawn;
# the histogram and its marker lines are computed from this same subset
positive_chloro = valid_chloro[valid_chloro > 0] if primary_var else np.empty(0)

if positive_chloro.size > 0:
    units_label = getattr(chloro_data, 'units', "mg/m³")
    date_label = os.path.basename(latest_file).split(".")[1]

    # Create a simple two-panel figure to verify the chlorophyll distribution
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))

    # Plot 1: Concentration distribution histogram (log scale)
    ax1.hist(np.log10(positive_chloro), bins=50, edgecolor='black', alpha=0.7, color='seagreen')
    marker_lines = (
        ("Min", positive_chloro.min(), 'blue'),
        ("Mean", positive_chloro.mean(), 'green'),
        ("Max", positive_chloro.max(), 'red'),
    )
    for label, value, color in marker_lines:
        ax1.axvline(np.log10(value), color=color, linestyle='--', label=f'{label}: {value:.3f}')

    ax1.set_xlabel('Log10(Concentration mg/m³)')
    ax1.set_ylabel('Frequency')
    ax1.set_title('Chlorophyll-a Concentration Distribution')
    ax1.legend(loc='upper right', fontsize=9)
    ax1.grid(True, alpha=0.3)

    # Plot 2: Global chlorophyll map with log colour normalization
    cmap = plt.cm.viridis
    # Floor vmin at 0.001 and never let vmax fall below vmin (LogNorm rejects that)
    vmin = max(float(positive_chloro.min()), 0.001)
    vmax = max(float(positive_chloro.max()), vmin)
    norm = mcolors.LogNorm(vmin=vmin, vmax=vmax)

    # The axis is limited to [-180, 180], so wrap 0-360 longitudes the same
    # way the export does and re-sort the columns to keep the grid monotonic
    plot_lons = np.where(lons > 180, lons - 360, lons)
    plot_values = chloro_values
    if np.any(lons > 180):
        lon_order = np.argsort(plot_lons, kind='stable')
        plot_lons = plot_lons[lon_order]
        plot_values = chloro_values[:, lon_order]  # assumes (lat, lon) layout, as the export cell does

    # Create meshgrid for plotting
    lon_grid, lat_grid = np.meshgrid(plot_lons, lats)

    # Plot the chlorophyll data
    im = ax2.pcolormesh(lon_grid, lat_grid, plot_values, cmap=cmap, norm=norm, shading='auto')
    ax2.set_xlabel('Longitude')
    ax2.set_ylabel('Latitude')
    ax2.set_title(f'Chlorophyll-a Concentration - {date_label}')
    ax2.set_xlim(-180, 180)
    ax2.set_ylim(-90, 90)

    # Add colorbar
    fig.colorbar(im, ax=ax2, label=f'Chlorophyll-a ({units_label})', shrink=0.8)

    plt.tight_layout()
    plt.show()

    print("\nVisualization complete.")
else:
    print("No data available for visualization")