<span style="color:hotpink; font-size:40px; font-weight:bold;">Packages and imports</span>

In [3]:
### standard imports ###

import numpy as np
import pandas as pd
import xarray as xr
import gcsfs

### Python file with supporting functions ###

import residual_utils as prerun

### set up for getting files from leap bucket ###

# Shared Google Cloud Storage filesystem handle. Later cells use it to list
# bucket directories (fs.ls) and to test whether an output already exists
# (fs.exists), so this cell must be run before any of them.
fs = gcsfs.GCSFileSystem()

2025-09-18 20:03:21.907357: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-09-18 20:03:21.931277: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-09-18 20:03:21.939280: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-09-18 20:03:21.957443: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


<span style="color:hotpink; font-size:40px; font-weight:bold;">Setting date range</span>

In [3]:
### Setting the date range to unify the date type ###

# First and last day of the testbed period.
date_range_start = '1982-02-01T00:00:00.000000000'
date_range_end = '2023-12-31T00:00:00.000000000'

# Monthly date vector. freq='MS' ("month start") yields one timestamp per
# month, stamped on the first day of that month (1982-02-01 ... 2023-12-01).
# No mid-month (14-day) offset is applied to the start or the end.
dates = pd.date_range(date_range_start, date_range_end, freq='MS')

<span style="color:hotpink; font-size:40px; font-weight:bold;">Setting paths</span>

In [4]:
### set paths ###

### inputs (read only) ###
# regridded ESM members produced by notebook 00
ensemble_dir_abby = "gs://leap-persistent/abbysh/pco2_all_members_1982-2023/00_regridded_members"
# directory holding the reference zarr stores
zarr_dir = "gs://leap-persistent/abbysh/zarr_files_"

# atmospheric xCO2
xco2_path = zarr_dir + "/xco2_cmip6_183501-224912_monthstart.zarr"

# SOCAT sampling mask.
# The mask reproduces the real-world SOCAT coverage, so the ML reconstruction is
# trained only where actual pCO2 observations exist. The full ESM field (a pCO2
# value in every 1°x1° monthly cell) is the "testbed truth" the reconstruction
# is validated against. With real SOCAT data alone, validation would not be
# possible in sparsely sampled regions such as the Southern Ocean.
socat_path = zarr_dir + "/socat_mask_feb1982-dec2023.zarr"

# bathymetry (GEBCO) and land-sea mask
topo_path = zarr_dir + "/GEBCO_2014_1x1_global.zarr"
lsmask_path = zarr_dir + "/lsmask.zarr"

#############################################

### outputs ###
your_username = "mauriekeppens"
# where the ML inputs made by this (01) notebook are saved -- use your own bucket!
xgb_data_dir = "gs://leap-persistent/" + your_username + "/Ensemble_Testbed/01_ML_inputs"

<span style="color:hotpink; font-size:40px; font-weight:bold;">Calculating chlorophyll climatology</span>

This section calculates the chlorophyll climatology. The climatology is computed from model output from 01-1998 through the end of the time period (here, 12-2023) by averaging chlorophyll for each calendar month over that whole period. It is saved as a separate file in each individual regridded member folder. This file is currently used to set feature variables for ML in notebook 02, but those feature variables are not *actually used* for the ML. 

You do not need to run this code, because chlorophyll climatology is already saved in the regridded members directory that you're loading from.

In [36]:
### chl_clim code here ###

# Computes a monthly chlorophyll climatology for each regridded member and
# saves it next to the member's regridded output as `chlclim_<zarr name>`.

# Root of *your own* regridded members (output of notebook 00). The climatology
# is written back into this directory tree, so it must exist and be writable.
ensemble_dir = f"gs://leap-persistent/{your_username}/Ensemble_Testbed/00_regridded_members"

# Period over which the monthly climatology is averaged.
chl_clim_start = '1998-01-01'
chl_clim_end = '2023-12-30'

ens_paths = fs.ls(ensemble_dir)
for ens_path in ens_paths[0:1]:  # NOTE: only the first listed entry is processed
    ens = ens_path.split('/')[-1]
    for mem in fs.ls(ens_path):
        memo = mem.split('/')[-1]
        for z in fs.ls(mem):
            zarr_name = z.split('/')[-1]

            # Skip climatology stores written by an earlier run of this cell:
            # they hold `chl_clim`, not `chl`, and must not be re-processed.
            if zarr_name.startswith('chlclim_'):
                continue

            d = xr.open_mfdataset('gs://'+z, engine='zarr')
            chl = d.chl

            # mean of each calendar month over the climatology period
            chl_clim = (chl.sel(time=slice(chl_clim_start, chl_clim_end))
                           .groupby("time.month")
                           .mean("time"))

            # keep only the lev_partial == 1 level when a level dimension is present
            if 'lev_partial' in chl_clim.dims:
                chl_clim = chl_clim.sel({'lev_partial': 1}, drop=True)

            # Rebuild the dataset on plain lat/lon names.
            # Assumes chl_clim is ordered (month, ylat, xlon) -- TODO confirm.
            # The month numbers are stored in a coordinate called `time` on the
            # `month` dimension; kept as in the original layout because existing
            # readers of these files presumably expect it -- verify before renaming.
            chl_out_new = xr.Dataset({'chl_clim': (["month", "lat", "lon"], chl_clim.data)},
                                     coords={'time': (['month'], chl_clim.month.data),
                                             'lon': (['lon'], chl_clim.xlon.data),
                                             'lat': (['lat'], chl_clim.ylat.data)})

            # one climatology store per source zarr, in the same member folder
            out_path = f'{ensemble_dir}/{ens}/{memo}/chlclim_{zarr_name}'
            chl_out_new.to_zarr(out_path, mode="w", zarr_format=2)
            print(f'finished with {ens}:{memo}')

gs://leap-persistent/mauriekeppens/Ensemble_Testbed/00_regridded_members/ACCESS-ESM1-5/member_r10i1p1f1/chlclim_ACCESS-ESM1-5.r10i1p1f1.Omon.zarr <xarray.Dataset> Size: 3MB
Dimensions:   (month: 12, lat: 180, lon: 360)
Coordinates:
    time      (month) int64 96B 1 2 3 4 5 6 7 8 9 10 11 12
  * lon       (lon) float64 3kB -179.5 -178.5 -177.5 ... 177.5 178.5 179.5
  * lat       (lat) float64 1kB -89.5 -88.5 -87.5 -86.5 ... 86.5 87.5 88.5 89.5
Dimensions without coordinates: month
Data variables:
    chl_clim  (month, lat, lon) float32 3MB dask.array<chunksize=(12, 180, 360), meta=np.ndarray>
finished with ACCESS-ESM1-5:member_r10i1p1f1
gs://leap-persistent/mauriekeppens/Ensemble_Testbed/00_regridded_members/ACCESS-ESM1-5/member_r15i1p1f1/chlclim_ACCESS-ESM1-5.r15i1p1f1.Omon.zarr <xarray.Dataset> Size: 3MB
Dimensions:   (month: 12, lat: 180, lon: 360)
Coordinates:
    time      (month) int64 96B 1 2 3 4 5 6 7 8 9 10 11 12
  * lon       (lon) float64 3kB -179.5 -178.5 -177.5 ... 177.5 178

In [73]:
# Check whether the ESM members already have a chlorophyll climatology store
# (every member of the selected ESM is checked)

esms = fs.ls(ensemble_dir_abby)

# esms[1:2] selects only the second entry of the listing
# (ACCESS-ESM1-5 in the recorded output below)
for ens_path in esms[1:2]:
    ens = ens_path.split('/')[-1]
    members = fs.ls(ens_path)

    for member_path in members:
        memo = member_path.split('/')[-1]
        # entries of this member folder whose path contains 'chlclim'
        chlclim_paths = list(filter(lambda entry: 'chlclim' in entry, fs.ls(member_path)))

        status = f"chlclim exists: {chlclim_paths}" if chlclim_paths else "chlclim does NOT exist"
        print(f"{ens}/{memo} -> {status}")

ACCESS-ESM1-5/member_r10i1p1f1 -> chlclim exists: ['leap-persistent/abbysh/pco2_all_members_1982-2023/00_regridded_members/ACCESS-ESM1-5/member_r10i1p1f1/chlclim_ACCESS-ESM1-5.r10i1p1f1.Omon.zarr']
ACCESS-ESM1-5/member_r15i1p1f1 -> chlclim exists: ['leap-persistent/abbysh/pco2_all_members_1982-2023/00_regridded_members/ACCESS-ESM1-5/member_r15i1p1f1/chlclim_ACCESS-ESM1-5.r15i1p1f1.Omon.zarr']
ACCESS-ESM1-5/member_r17i1p1f1 -> chlclim exists: ['leap-persistent/abbysh/pco2_all_members_1982-2023/00_regridded_members/ACCESS-ESM1-5/member_r17i1p1f1/chlclim_ACCESS-ESM1-5.r17i1p1f1.Omon.zarr']
ACCESS-ESM1-5/member_r1i1p1f1 -> chlclim exists: ['leap-persistent/abbysh/pco2_all_members_1982-2023/00_regridded_members/ACCESS-ESM1-5/member_r1i1p1f1/chlclim_ACCESS-ESM1-5.r1i1p1f1.Omon.zarr']
ACCESS-ESM1-5/member_r22i1p1f1 -> chlclim exists: ['leap-persistent/abbysh/pco2_all_members_1982-2023/00_regridded_members/ACCESS-ESM1-5/member_r22i1p1f1/chlclim_ACCESS-ESM1-5.r22i1p1f1.Omon.zarr']
ACCESS-ESM1-5

<span style="color:hotpink; font-size:40px; font-weight:bold;">Loading list of ESMs and members in testbed</span>

<span style="color:lightblue; font-size:30px; font-weight:bold;">For all members for each ESM</span>

In [65]:
### loads list of Earth System Models ("ensembles") and members for the full testbed ###

# ensembles : list of ESM names, in listing order
# mems_dict : {ESM name: [member folder names]}
ensembles = []
mems_dict = dict()

# The first entry of the listing is skipped: it is not one of the ESM folders
# (presumably the directory placeholder itself -- TODO confirm).
for path in fs.ls(ensemble_dir_abby)[1:]:
    ens_name = path.split('/')[-1]

    # Guard against duplicates. This must test the name derived from *this*
    # path, not a variable left over from another cell.
    if ens_name in ensembles:
        continue
    ensembles.append(ens_name)

    # member folder names of this ensemble
    mems_dict[ens_name] = [mem_path.split('/')[-1] for mem_path in fs.ls(path)]

# Print results
print("Ensembles found:")
print(ensembles)
print("\nMembers for each ensemble:")
for ens_name, member_names in mems_dict.items():
    print(f"{ens_name}: {member_names}")

Ensembles found:
['ACCESS-ESM1-5', 'CESM2', 'CESM2-WACCM', 'CMCC-ESM2', 'CanESM5', 'CanESM5-CanOE', 'GFDL-ESM4', 'MPI-ESM1-2-LR', 'UKESM1-0-LL']

Members for each ensemble:
ACCESS-ESM1-5: ['member_r10i1p1f1', 'member_r15i1p1f1', 'member_r17i1p1f1', 'member_r1i1p1f1', 'member_r22i1p1f1', 'member_r26i1p1f1', 'member_r27i1p1f1', 'member_r2i1p1f1', 'member_r31i1p1f1', 'member_r32i1p1f1', 'member_r33i1p1f1', 'member_r34i1p1f1', 'member_r35i1p1f1', 'member_r36i1p1f1', 'member_r37i1p1f1', 'member_r38i1p1f1', 'member_r39i1p1f1', 'member_r3i1p1f1', 'member_r40i1p1f1', 'member_r4i1p1f1', 'member_r5i1p1f1', 'member_r7i1p1f1', 'member_r8i1p1f1']
CESM2: ['member_r10i1p1f1', 'member_r11i1p1f1', 'member_r4i1p1f1']
CESM2-WACCM: ['member_r1i1p1f1', 'member_r2i1p1f1', 'member_r3i1p1f1']
CMCC-ESM2: ['member_r1i1p1f1']
CanESM5: ['member_r10i1p1f1', 'member_r10i1p2f1', 'member_r1i1p1f1', 'member_r1i1p2f1', 'member_r2i1p1f1', 'member_r2i1p2f1', 'member_r3i1p1f1', 'member_r3i1p2f1', 'member_r4i1p1f1', 'membe

In [66]:
# member count for every ensemble, in the order of mems_dict
members_per_ens = {name: len(mem_names) for name, mem_names in mems_dict.items()}

# total number of members across all ensembles
total_members = sum(members_per_ens.values())
print(f"Total number of members: {total_members}")

# number of members per ensemble
for name, n_members in members_per_ens.items():
    print(f"{name}: {n_members} members")

Total number of members: 105
ACCESS-ESM1-5: 23 members
CESM2: 3 members
CESM2-WACCM: 3 members
CMCC-ESM2: 1 members
CanESM5: 20 members
CanESM5-CanOE: 3 members
GFDL-ESM4: 1 members
MPI-ESM1-2-LR: 46 members
UKESM1-0-LL: 5 members


<span style="color:hotpink; font-size:40px; font-weight:bold;">Running code to make ML inputs</span>

In [67]:
### creating pandas dataframes out of "raw" data to prep for ML ###
import os

# Builds one ML-input dataframe per ESM member and saves it under xgb_data_dir.
# Members whose output file already exists are skipped, so the cell can be
# re-run after an interruption and will continue where it stopped.

N_time = len(dates)
member_counter = 0    # number of dataframes created and saved in this run
failed_members = []   # (ens, member, error message) for members that could not be built

# First / last month of the period as YYYYMM. These are the same for every
# member, so they are computed once instead of inside the loop.
init_date = dates[0].strftime('%Y%m')
fin_date = dates[-1].strftime('%Y%m')

### loop through each ESM
for ens, mem_list in mems_dict.items():

    ### loop through each member of that ESM
    for member in mem_list:

        output_dir = f"{xgb_data_dir}/{ens}/{member}"
        # Expected output file; presumably the same name prerun.save_clean_data
        # writes -- verify against residual_utils.
        save_file = f"{output_dir}/MLinput_{ens}_{member.split('_')[-1]}_mon_1x1_{init_date}_{fin_date}.pkl"

        if fs.exists(save_file):
            print(f"Skipping {ens}, {member} (already exists)")
            continue

        print(f"Making dataframe {member_counter}: {ens}, {member}")

        try:
            ### uses utility function file to make data into dataframe for ML use
            df = prerun.create_inputs(ensemble_dir_abby, ens, member, dates, N_time,
                                      xco2_path,
                                      socat_path,
                                      topo_path,
                                      lsmask_path)
        except Exception as e:
            # Best effort: one broken member must not stop the whole run, but
            # remember it so it is reported in the summary below.
            print(f"Failed to create MLinput for {ens}, {member}: {e}")
            failed_members.append((ens, member, str(e)))
            continue

        ### Save the pandas dataframe to workspace
        prerun.save_clean_data(df, xgb_data_dir, ens, member, dates)
        member_counter += 1

# Summary, so failures are not lost in the per-member log above.
print(f"Created {member_counter} new dataframe(s); {len(failed_members)} failure(s)")
for failed_ens, failed_member, failed_msg in failed_members:
    print(f"  FAILED {failed_ens}, {failed_member}: {failed_msg}")

Making dataframe 0: ACCESS-ESM1-5, member_r10i1p1f1
Starting data saving process
member_r10i1p1f1 save complete
Making dataframe 1: ACCESS-ESM1-5, member_r15i1p1f1
Starting data saving process
member_r15i1p1f1 save complete
Making dataframe 2: ACCESS-ESM1-5, member_r17i1p1f1
Starting data saving process


KeyboardInterrupt: 

In [76]:
# Check whether we have all members saved

# Uses the shared `fs` handle and the output directory defined in the paths cell
# instead of re-creating them here. The result names are distinct from
# `ensembles` / `total_members`, so running this cell does not overwrite the
# testbed listing built earlier in the notebook.
base_path = xgb_data_dir

# Directory listings are cached on the filesystem object; drop the cache so
# that recently written files are included in the listing.
fs.invalidate_cache()

saved_ens_paths = fs.ls(base_path)
print("Ensembles found:", saved_ens_paths)

saved_member_count = 0
for saved_ens_path in saved_ens_paths:
    saved_member_paths = fs.ls(saved_ens_path)
    print(f"{saved_ens_path} members:", [m.split('/')[-1] for m in saved_member_paths])
    saved_member_count += len(saved_member_paths)

print("Total members:", saved_member_count)

Ensembles found: ['leap-persistent/mauriekeppens/Ensemble_Testbed/01_ML_inputs/ACCESS-ESM1-5']
leap-persistent/mauriekeppens/Ensemble_Testbed/01_ML_inputs/ACCESS-ESM1-5 members: ['member_r10i1p1f1', 'member_r15i1p1f1', 'member_r17i1p1f1']
Total members: 3


In [1]:
# HOW MANY MEMBERS DATAFRAME INPUTS IN ABBY'S FOLDER
# (needs the `fs` handle created in the imports cell)
root = "gs://leap-persistent/abbysh/pco2_all_members_1982-2023/post01_xgb_inputs"

# one entry per item found inside each model folder under root
all_members = [member_path
               for model_folder in fs.ls(root)
               for member_path in fs.ls(model_folder)]

print("Number of saved ESM members:", len(all_members))

NameError: name 'fs' is not defined