In [18]:
import pandas as pd
import numpy as np
import xarray as xr
from datetime import datetime
import earthaccess
from tqdm import tqdm

# === Authenticate ===
auth = earthaccess.login()
# Raise explicitly instead of using `assert`: assertions are stripped when
# Python runs with -O, which would let an unauthenticated session continue
# and fail later with a far less obvious error.
if not auth.authenticated:
    raise RuntimeError("Earthaccess login failed.")

# === Load simplified catch data ===
# The code below reads the TOWDATETIME_EST, LAT and LON columns of this table.
df = pd.read_csv("simplified_catch_summary.csv")

# === Helper: convert date to 8-day PACE period ===
def get_8day_range(date_str):
    """Return the 8-day period containing *date_str* as two date strings.

    Periods are counted in 8-day steps from 1 January of the date's own
    year, and the last period of a year is cut off at 31 December instead
    of spilling into the following year.

    NOTE(review): this assumes the satellite product's 8-day bins restart
    every 1 January and are truncated at year end — confirm against the
    product's file naming.

    Args:
        date_str: Anything ``pd.to_datetime`` can parse to a single
            timestamp (a time-of-day component is ignored).

    Returns:
        ``(start, end)`` formatted as ``"YYYY-MM-DD"``; ``end`` is inclusive.

    Raises:
        ValueError: If the input parses to a missing value.
    """
    dt = pd.to_datetime(date_str)
    if dt is None or dt is pd.NaT:
        raise ValueError(f"Cannot derive an 8-day period from {date_str!r}")

    # Anchor on the date's own year instead of a hard-coded 2024 so that
    # tows from other years fall into the correct period.
    year_start = pd.Timestamp(year=dt.year, month=1, day=1)
    year_end = pd.Timestamp(year=dt.year, month=12, day=31)

    # `.days` floors the difference, so the time of day does not matter.
    doy = (dt - year_start).days
    start_day = 8 * (doy // 8)
    start_date = year_start + pd.Timedelta(days=start_day)
    # Eight inclusive days, but never past the end of the year.
    end_date = min(start_date + pd.Timedelta(days=7), year_end)
    return start_date.strftime("%Y-%m-%d"), end_date.strftime("%Y-%m-%d")

# === Caching logic ===
# One cache per product; keys are (short_name, start_date, product_type).
rrs_cache = {}
avw_cache = {}

def load_cached_dataset(short_name, start_date, end_date, cache, product_type="8D"):
    """Find and open the first matching 4km granule, memoising per period.

    Args:
        short_name: Collection short name passed to ``earthaccess.search_data``.
        start_date: Start of the temporal search window; also part of the
            cache key.
        end_date: End of the temporal search window (not part of the key).
        cache: Dict used as the memo table; it is updated in place.
        product_type: Token substituted into the granule-name pattern
            ``*.<product_type>.*.4km.*`` (e.g. ``"8D"`` or ``"MO"``).

    Returns:
        The dataset opened with ``xr.open_dataset``, or ``None`` when the
        search returns nothing or the granule cannot be opened.
    """
    key = (short_name, start_date, product_type)
    if key in cache:
        return cache[key]

    granule_pattern = f"*.{product_type}.*.4km.*"

    results = earthaccess.search_data(
        short_name=short_name,
        temporal=(start_date, end_date),
        granule_name=granule_pattern
    )
    if not results:
        # Remember the miss: many tows share a period, and repeating the
        # same empty remote search for each of them is pure overhead.
        cache[key] = None
        return None

    try:
        # Only the first granule is used, so only that one is opened.
        ds = xr.open_dataset(earthaccess.open(results[:1])[0])
    except Exception as e:
        # Open failures are not cached so a later call can try again.
        print(f"Failed to open dataset for {short_name} on {start_date} ({product_type}): {e}")
        return None

    cache[key] = ds
    return ds

def try_extract(ds_rrs, ds_avw, lat, lon):
    """Average Rrs and AVW over a small grid window around a position.

    The grid cell nearest to (*lat*, *lon*) is located on the Rrs dataset's
    ``lat``/``lon`` coordinates, and a window of up to 5x5 cells (clipped at
    the grid edges) is averaged. The same index window is applied to the AVW
    dataset.

    NOTE(review): reusing the Rrs indices for AVW assumes both datasets share
    one grid — confirm.

    Args:
        ds_rrs: Dataset exposing ``lat``, ``lon``, ``wavelength`` and ``Rrs``,
            or ``None``.
        ds_avw: Dataset exposing ``avw``, or ``None``.
        lat: Latitude of the point of interest.
        lon: Longitude of the point of interest.

    Returns:
        ``(rrs_vals, avw_val, wavelengths)``, or ``(None, None, None)`` when
        either dataset is missing or the extraction raises.
    """
    if ds_rrs is None or ds_avw is None:
        return None, None, None

    try:
        lat_idx = np.abs(ds_rrs["lat"].values - lat).argmin()
        lon_idx = np.abs(ds_rrs["lon"].values - lon).argmin()

        # Two cells either side of the nearest one, clipped to the grid.
        lat_slice = slice(max(lat_idx - 2, 0), min(lat_idx + 3, ds_rrs.sizes["lat"]))
        lon_slice = slice(max(lon_idx - 2, 0), min(lon_idx + 3, ds_rrs.sizes["lon"]))

        # Select by dimension name rather than by position so the result does
        # not silently depend on the order in which the file stores its axes.
        rrs_window = ds_rrs["Rrs"].isel(lat=lat_slice, lon=lon_slice)
        avw_window = ds_avw["avw"].isel(lat=lat_slice, lon=lon_slice)

        rrs_vals = rrs_window.mean(dim=("lat", "lon")).values
        avw_val = avw_window.mean().item()
        wavelengths = ds_rrs["wavelength"].values

        return rrs_vals, avw_val, wavelengths

    except Exception as e:
        # Best effort: report the problem and let the caller fall back.
        print(f"Error extracting Rrs/AVW: {e}")
        return None, None, None

# === Main loop ===
# One record is collected per tow that yields at least one non-NaN Rrs value.
rrs_data = []

def _no_valid_rrs(values):
    """Return True when extraction failed or produced only NaNs."""
    return values is None or np.all(np.isnan(values))

for _, row in tqdm(df.iterrows(), total=len(df)):
    tow_date = row["TOWDATETIME_EST"]
    lat = row["LAT"]
    lon = row["LON"]
    start_date, end_date = get_8day_range(tow_date)

    # First choice: the 8-day products covering the tow date.
    ds_rrs = load_cached_dataset("PACE_OCI_L3M_RRS", start_date, end_date, rrs_cache, product_type="8D")
    ds_avw = load_cached_dataset("PACE_OCI_L3M_AVW", start_date, end_date, avw_cache, product_type="8D")
    rrs_vals, avw_val, wavelengths = try_extract(ds_rrs, ds_avw, lat, lon)

    if _no_valid_rrs(rrs_vals):
        # Second choice: the monthly products for the tow's calendar month.
        month_first = pd.to_datetime(tow_date).replace(day=1)
        monthly_start = month_first.strftime("%Y-%m-%d")
        monthly_end = (month_first + pd.offsets.MonthEnd(0)).strftime("%Y-%m-%d")

        ds_rrs = load_cached_dataset("PACE_OCI_L3M_RRS", monthly_start, monthly_end, rrs_cache, product_type="MO")
        ds_avw = load_cached_dataset("PACE_OCI_L3M_AVW", monthly_start, monthly_end, avw_cache, product_type="MO")
        rrs_vals, avw_val, wavelengths = try_extract(ds_rrs, ds_avw, lat, lon)

        if _no_valid_rrs(rrs_vals):
            # Nothing usable at either resolution: leave this tow out.
            print(f"❌ No valid Rrs data even in monthly fallback for {tow_date}")
            continue
        print(f"⚠️  Used monthly fallback for {tow_date}")

    # One column per wavelength, named after its integer part.
    result = {"TOWDATETIME_EST": tow_date, "AVW": avw_val}
    result.update({f"Rrs_{int(wl)}": val for wl, val in zip(wavelengths, rrs_vals)})
    rrs_data.append(result)

# === Merge with fisheries data and save ===
rrs_df = pd.DataFrame(rrs_data)
if rrs_df.empty:
    # No tow produced a record, so there is no key column to merge on;
    # carry the catch table through unchanged instead of raising KeyError.
    merged = df.copy()
else:
    # Keep one record per tow timestamp. With repeated timestamps on both
    # sides, the merge would otherwise emit every pairing and multiply rows.
    # NOTE(review): assumes rows sharing a timestamp are the same tow — confirm.
    rrs_df = rrs_df.drop_duplicates(subset="TOWDATETIME_EST")
    merged = pd.merge(df, rrs_df, on="TOWDATETIME_EST", how="left")
# Checkpoint the merged table before the derived column is computed.
merged.to_csv("fisheries_with_pace_rrs_avw2.csv", index=False)
print("✅ Saved merged file: fisheries_with_pace_rrs_avw2.csv")

# === Compute brightness from Rrs ===
# Brightness is the trapezoidal integral of Rrs over 400-700 (the numbers
# taken from the Rrs_<n> column names), one value per row.
rrs_cols = [col for col in merged.columns if col.startswith("Rrs_")]
rrs_wavelengths = [int(col.split("_")[1]) for col in rrs_cols]
rrs_filtered = sorted(
    (wl, col) for wl, col in zip(rrs_wavelengths, rrs_cols) if 400 <= wl <= 700
)

if rrs_filtered:
    wavelengths, rrs_ordered_cols = zip(*rrs_filtered)
    rrs_values = merged[list(rrs_ordered_cols)].values
    # np.trapz is deprecated in NumPy 2.x in favour of np.trapezoid; use the
    # new name when it exists and fall back on older NumPy releases.
    _trapezoid = getattr(np, "trapezoid", None) or np.trapz
    rrs_brightness = _trapezoid(rrs_values, x=wavelengths, axis=1)
else:
    # No Rrs columns in range: nothing to integrate.
    rrs_brightness = np.nan
merged["Rrs_brightness"] = rrs_brightness

merged.to_csv("fisheries_with_pace_rrs_avw2.csv", index=False)
print("✅ Added Rrs_brightness and saved updated CSV.")




  0%|          | 0/368 [00:00<?, ?it/s]

QUEUEING TASKS | :   0%|          | 0/1 [00:00<?, ?it/s]

PROCESSING TASKS | :   0%|          | 0/1 [00:00<?, ?it/s]

COLLECTING RESULTS | :   0%|          | 0/1 [00:00<?, ?it/s]

QUEUEING TASKS | :   0%|          | 0/1 [00:00<?, ?it/s]

PROCESSING TASKS | :   0%|          | 0/1 [00:00<?, ?it/s]

COLLECTING RESULTS | :   0%|          | 0/1 [00:00<?, ?it/s]

  1%|▏         | 5/368 [01:40<1:21:05, 13.40s/it]

QUEUEING TASKS | :   0%|          | 0/1 [00:00<?, ?it/s]

PROCESSING TASKS | :   0%|          | 0/1 [00:00<?, ?it/s]

COLLECTING RESULTS | :   0%|          | 0/1 [00:00<?, ?it/s]

QUEUEING TASKS | :   0%|          | 0/1 [00:00<?, ?it/s]

PROCESSING TASKS | :   0%|          | 0/1 [00:00<?, ?it/s]

COLLECTING RESULTS | :   0%|          | 0/1 [00:00<?, ?it/s]

  2%|▏         | 8/368 [02:09<1:10:28, 11.75s/it]

⚠️  Used monthly fallback for 2024-03-08 00:32:00
⚠️  Used monthly fallback for 2024-03-08 02:24:00
⚠️  Used monthly fallback for 2024-03-08 04:49:00
⚠️  Used monthly fallback for 2024-03-08 08:10:00


  4%|▎         | 13/368 [02:10<26:16,  4.44s/it] 

⚠️  Used monthly fallback for 2024-03-08 10:07:00
⚠️  Used monthly fallback for 2024-03-08 12:47:00
⚠️  Used monthly fallback for 2024-03-08 16:50:00
⚠️  Used monthly fallback for 2024-03-08 19:20:00


  4%|▍         | 15/368 [02:10<18:20,  3.12s/it]

⚠️  Used monthly fallback for 2024-03-08 22:20:00
⚠️  Used monthly fallback for 2024-03-09 00:52:00
⚠️  Used monthly fallback for 2024-03-09 03:27:00


  5%|▌         | 19/368 [02:10<09:01,  1.55s/it]

⚠️  Used monthly fallback for 2024-03-09 05:35:00
⚠️  Used monthly fallback for 2024-03-09 08:41:00
⚠️  Used monthly fallback for 2024-03-09 11:53:00
⚠️  Used monthly fallback for 2024-03-09 14:29:00


  6%|▌         | 21/368 [02:11<07:22,  1.28s/it]

⚠️  Used monthly fallback for 2024-03-09 16:49:00


  8%|▊         | 29/368 [02:14<03:21,  1.69it/s]

⚠️  Used monthly fallback for 2024-03-10 09:15:00


 11%|█▏        | 42/368 [02:30<01:30,  3.60it/s]

QUEUEING TASKS | :   0%|          | 0/1 [00:00<?, ?it/s]

PROCESSING TASKS | :   0%|          | 0/1 [00:00<?, ?it/s]

COLLECTING RESULTS | :   0%|          | 0/1 [00:00<?, ?it/s]

QUEUEING TASKS | :   0%|          | 0/1 [00:00<?, ?it/s]

PROCESSING TASKS | :   0%|          | 0/1 [00:00<?, ?it/s]

COLLECTING RESULTS | :   0%|          | 0/1 [00:00<?, ?it/s]

 29%|██▉       | 107/368 [03:20<00:22, 11.66it/s]

QUEUEING TASKS | :   0%|          | 0/1 [00:00<?, ?it/s]

PROCESSING TASKS | :   0%|          | 0/1 [00:00<?, ?it/s]

COLLECTING RESULTS | :   0%|          | 0/1 [00:00<?, ?it/s]

QUEUEING TASKS | :   0%|          | 0/1 [00:00<?, ?it/s]

PROCESSING TASKS | :   0%|          | 0/1 [00:00<?, ?it/s]

COLLECTING RESULTS | :   0%|          | 0/1 [00:00<?, ?it/s]

 33%|███▎      | 120/368 [04:00<05:33,  1.35s/it]

QUEUEING TASKS | :   0%|          | 0/1 [00:00<?, ?it/s]

PROCESSING TASKS | :   0%|          | 0/1 [00:00<?, ?it/s]

COLLECTING RESULTS | :   0%|          | 0/1 [00:00<?, ?it/s]

QUEUEING TASKS | :   0%|          | 0/1 [00:00<?, ?it/s]

PROCESSING TASKS | :   0%|          | 0/1 [00:00<?, ?it/s]

COLLECTING RESULTS | :   0%|          | 0/1 [00:00<?, ?it/s]

 35%|███▍      | 128/368 [04:30<08:44,  2.18s/it]

QUEUEING TASKS | :   0%|          | 0/1 [00:00<?, ?it/s]

PROCESSING TASKS | :   0%|          | 0/1 [00:00<?, ?it/s]

COLLECTING RESULTS | :   0%|          | 0/1 [00:00<?, ?it/s]

QUEUEING TASKS | :   0%|          | 0/1 [00:00<?, ?it/s]

PROCESSING TASKS | :   0%|          | 0/1 [00:00<?, ?it/s]

COLLECTING RESULTS | :   0%|          | 0/1 [00:00<?, ?it/s]

 36%|███▌      | 131/368 [04:49<16:50,  4.26s/it]

⚠️  Used monthly fallback for 2024-04-02 21:50:00


 38%|███▊      | 138/368 [04:51<07:39,  2.00s/it]

⚠️  Used monthly fallback for 2024-04-04 11:58:00


 39%|███▉      | 145/368 [04:51<03:21,  1.11it/s]

⚠️  Used monthly fallback for 2024-04-05 16:54:00


QUEUEING TASKS | :   0%|          | 0/1 [00:00<?, ?it/s]

PROCESSING TASKS | :   0%|          | 0/1 [00:00<?, ?it/s]

COLLECTING RESULTS | :   0%|          | 0/1 [00:00<?, ?it/s]

 39%|███▉      | 145/368 [05:10<03:21,  1.11it/s]

QUEUEING TASKS | :   0%|          | 0/1 [00:00<?, ?it/s]

PROCESSING TASKS | :   0%|          | 0/1 [00:00<?, ?it/s]

COLLECTING RESULTS | :   0%|          | 0/1 [00:00<?, ?it/s]

 57%|█████▋    | 210/368 [06:30<00:14, 10.87it/s]

QUEUEING TASKS | :   0%|          | 0/1 [00:00<?, ?it/s]

PROCESSING TASKS | :   0%|          | 0/1 [00:00<?, ?it/s]

COLLECTING RESULTS | :   0%|          | 0/1 [00:00<?, ?it/s]

QUEUEING TASKS | :   0%|          | 0/1 [00:00<?, ?it/s]

PROCESSING TASKS | :   0%|          | 0/1 [00:00<?, ?it/s]

COLLECTING RESULTS | :   0%|          | 0/1 [00:00<?, ?it/s]

 67%|██████▋   | 246/368 [08:00<00:32,  3.74it/s]

QUEUEING TASKS | :   0%|          | 0/1 [00:00<?, ?it/s]

PROCESSING TASKS | :   0%|          | 0/1 [00:00<?, ?it/s]

COLLECTING RESULTS | :   0%|          | 0/1 [00:00<?, ?it/s]

QUEUEING TASKS | :   0%|          | 0/1 [00:00<?, ?it/s]

PROCESSING TASKS | :   0%|          | 0/1 [00:00<?, ?it/s]

COLLECTING RESULTS | :   0%|          | 0/1 [00:00<?, ?it/s]

 76%|███████▌  | 280/368 [08:50<00:23,  3.74it/s]

QUEUEING TASKS | :   0%|          | 0/1 [00:00<?, ?it/s]

PROCESSING TASKS | :   0%|          | 0/1 [00:00<?, ?it/s]

COLLECTING RESULTS | :   0%|          | 0/1 [00:00<?, ?it/s]

QUEUEING TASKS | :   0%|          | 0/1 [00:00<?, ?it/s]

PROCESSING TASKS | :   0%|          | 0/1 [00:00<?, ?it/s]

COLLECTING RESULTS | :   0%|          | 0/1 [00:00<?, ?it/s]

 78%|███████▊  | 286/368 [09:30<03:19,  2.43s/it]

QUEUEING TASKS | :   0%|          | 0/1 [00:00<?, ?it/s]

PROCESSING TASKS | :   0%|          | 0/1 [00:00<?, ?it/s]

COLLECTING RESULTS | :   0%|          | 0/1 [00:00<?, ?it/s]

QUEUEING TASKS | :   0%|          | 0/1 [00:00<?, ?it/s]

PROCESSING TASKS | :   0%|          | 0/1 [00:00<?, ?it/s]

COLLECTING RESULTS | :   0%|          | 0/1 [00:00<?, ?it/s]

 79%|███████▉  | 290/368 [09:57<06:30,  5.00s/it]

⚠️  Used monthly fallback for 2024-05-01 00:24:00
⚠️  Used monthly fallback for 2024-05-01 03:15:00
⚠️  Used monthly fallback for 2024-05-01 05:46:00
⚠️  Used monthly fallback for 2024-05-01 09:14:00


 81%|████████  | 298/368 [09:58<01:55,  1.64s/it]

⚠️  Used monthly fallback for 2024-05-02 03:13:00
⚠️  Used monthly fallback for 2024-05-02 05:17:00
⚠️  Used monthly fallback for 2024-05-02 07:42:00


 82%|████████▏ | 303/368 [09:58<00:53,  1.21it/s]

⚠️  Used monthly fallback for 2024-05-02 11:13:00
⚠️  Used monthly fallback for 2024-05-03 01:10:00


 83%|████████▎ | 307/368 [09:58<00:29,  2.07it/s]

⚠️  Used monthly fallback for 2024-05-03 04:01:00
⚠️  Used monthly fallback for 2024-05-03 07:47:00
⚠️  Used monthly fallback for 2024-05-03 10:35:00
⚠️  Used monthly fallback for 2024-05-03 13:53:00


 85%|████████▌ | 313/368 [10:00<00:19,  2.86it/s]

⚠️  Used monthly fallback for 2024-05-03 17:03:00


 92%|█████████▏| 337/368 [10:20<00:02, 10.61it/s]

QUEUEING TASKS | :   0%|          | 0/1 [00:00<?, ?it/s]

PROCESSING TASKS | :   0%|          | 0/1 [00:00<?, ?it/s]

COLLECTING RESULTS | :   0%|          | 0/1 [00:00<?, ?it/s]

QUEUEING TASKS | :   0%|          | 0/1 [00:00<?, ?it/s]

PROCESSING TASKS | :   0%|          | 0/1 [00:00<?, ?it/s]

COLLECTING RESULTS | :   0%|          | 0/1 [00:00<?, ?it/s]

100%|██████████| 368/368 [10:58<00:00,  1.79s/it]
  rrs_brightness = np.trapz(rrs_values, x=wavelengths, axis=1)


✅ Saved merged file: fisheries_with_pace_rrs_avw2.csv
✅ Added Rrs_brightness and saved updated CSV.


In [None]:
import pandas as pd
import numpy as np
import xarray as xr
from datetime import datetime
import earthaccess
from tqdm import tqdm

# === Authenticate ===
auth = earthaccess.login()
# Raise explicitly instead of using `assert`: assertions are stripped when
# Python runs with -O, which would let an unauthenticated session continue
# and fail later with a far less obvious error.
if not auth.authenticated:
    raise RuntimeError("Earthaccess login failed.")

# === Load simplified catch data ===
# The code below reads the TOWDATETIME_EST, LAT and LON columns of this table.
df = pd.read_csv("simplified_catch_summary.csv")

# === Helper: convert date to 8-day PACE period ===
def get_8day_range(date_str):
    """Return the 8-day period containing *date_str* as two date strings.

    Periods are counted in 8-day steps from 1 January of the date's own
    year, and the last period of a year is cut off at 31 December instead
    of spilling into the following year.

    NOTE(review): this assumes the satellite product's 8-day bins restart
    every 1 January and are truncated at year end — confirm against the
    product's file naming.

    Args:
        date_str: Anything ``pd.to_datetime`` can parse to a single
            timestamp (a time-of-day component is ignored).

    Returns:
        ``(start, end)`` formatted as ``"YYYY-MM-DD"``; ``end`` is inclusive.

    Raises:
        ValueError: If the input parses to a missing value.
    """
    dt = pd.to_datetime(date_str)
    if dt is None or dt is pd.NaT:
        raise ValueError(f"Cannot derive an 8-day period from {date_str!r}")

    # Anchor on the date's own year instead of a hard-coded 2024 so that
    # tows from other years fall into the correct period.
    year_start = pd.Timestamp(year=dt.year, month=1, day=1)
    year_end = pd.Timestamp(year=dt.year, month=12, day=31)

    # `.days` floors the difference, so the time of day does not matter.
    doy = (dt - year_start).days
    start_day = 8 * (doy // 8)
    start_date = year_start + pd.Timedelta(days=start_day)
    # Eight inclusive days, but never past the end of the year.
    end_date = min(start_date + pd.Timedelta(days=7), year_end)
    return start_date.strftime("%Y-%m-%d"), end_date.strftime("%Y-%m-%d")

# === Caching logic ===
# Memo table for Kd datasets; keys are (short_name, start_date).
kd_cache = {}

def load_cached_dataset(short_name, start_date, end_date, cache):
    """Find and open the first matching 8-day 4km granule, memoising per period.

    Args:
        short_name: Collection short name passed to ``earthaccess.search_data``.
        start_date: Start of the temporal search window; also part of the
            cache key.
        end_date: End of the temporal search window (not part of the key).
        cache: Dict used as the memo table; it is updated in place.

    Returns:
        The dataset opened with ``xr.open_dataset``, or ``None`` when the
        search returns nothing or the granule cannot be opened.
    """
    key = (short_name, start_date)
    if key in cache:
        return cache[key]

    results = earthaccess.search_data(
        short_name=short_name,
        temporal=(start_date, end_date),
        granule_name="*.8D.*.4km.*"
    )
    if not results:
        # Remember the miss: many tows share a period, and repeating the
        # same empty remote search for each of them is pure overhead.
        cache[key] = None
        return None

    try:
        # Only the first granule is used, so only that one is opened.
        ds = xr.open_dataset(earthaccess.open(results[:1])[0])
    except Exception as e:
        # Open failures are not cached so a later call can try again.
        print(f"Failed to open dataset for {short_name} on {start_date}: {e}")
        return None

    cache[key] = ds
    return ds

# === Main loop ===
# One record is collected per tow for which some Kd value is not NaN.
kd_data = []

for _, row in tqdm(df.iterrows(), total=len(df)):
    tow_date, lat, lon = row["TOWDATETIME_EST"], row["LAT"], row["LON"]
    start_date, end_date = get_8day_range(tow_date)

    # Datasets are shared between tows of the same period through kd_cache.
    ds_kd = load_cached_dataset("PACE_OCI_L3M_KD", start_date, end_date, kd_cache)
    if ds_kd is None:
        print(f"Skipping {tow_date} due to missing Kd data.")
        continue

    try:
        # Index of the grid cell closest to the tow position.
        lat_idx = np.argmin(np.abs(ds_kd["lat"].values - lat))
        lon_idx = np.argmin(np.abs(ds_kd["lon"].values - lon))

        # Two cells either side of it, clipped at the grid edges.
        n_lat, n_lon = ds_kd.sizes["lat"], ds_kd.sizes["lon"]
        lat_slice = slice(max(lat_idx - 2, 0), min(lat_idx + 3, n_lat))
        lon_slice = slice(max(lon_idx - 2, 0), min(lon_idx + 3, n_lon))

        # Spatial mean of the window, leaving one value per wavelength.
        kd_window = ds_kd["Kd"][lat_slice, lon_slice, :]
        kd_vals = kd_window.mean(dim=("lat", "lon")).values
        wavelengths = ds_kd["wavelength"].values

        if not np.all(np.isnan(kd_vals)):
            # One column per wavelength, named after its integer part.
            result = {"TOWDATETIME_EST": tow_date}
            result.update({f"Kd_{int(wl)}": val for wl, val in zip(wavelengths, kd_vals)})
            kd_data.append(result)
        else:
            print(f"All Kd NaN for {tow_date}")

    except Exception as e:
        # Best effort: report the tow and move on to the next one.
        print(f"Skipping {tow_date} at {lat:.2f}, {lon:.2f} due to error: {e}")

# === Merge with fisheries data and save ===
kd_df = pd.DataFrame(kd_data)
if kd_df.empty:
    # No tow produced a record, so there is no key column to merge on;
    # carry the catch table through unchanged instead of raising KeyError.
    merged = df.copy()
else:
    # Keep one record per tow timestamp. With repeated timestamps on both
    # sides, the merge would otherwise emit every pairing and multiply rows.
    # NOTE(review): assumes rows sharing a timestamp are the same tow — confirm.
    kd_df = kd_df.drop_duplicates(subset="TOWDATETIME_EST")
    merged = pd.merge(df, kd_df, on="TOWDATETIME_EST", how="left")
merged.to_csv("fisheries_with_pace_kd.csv", index=False)

print("✅ Done! Output saved to: fisheries_with_pace_kd.csv")