<span style="color:hotpink; font-size:40px; font-weight:bold;">Packages and imports</span>

In [1]:
### standard imports ###

import numpy as np
import pandas as pd
import xarray as xr
import gcsfs

### Python file with supporting functions ###

import residual_utils as prerun

### set up for getting files from leap bucket ###

# Google Cloud Storage filesystem handle; later cells list bucket directories through it with fs.ls(...)
# NOTE(review): no credentials are passed here, so this presumably relies on the environment's default GCS auth — confirm
fs = gcsfs.GCSFileSystem()

2025-01-22 21:13:10.165009: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-01-22 21:13:10.176353: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-01-22 21:13:10.190782: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-01-22 21:13:10.194988: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-01-22 21:13:10.205098: I tensorflow/core/platform/cpu_feature_guar

<span style="color:hotpink; font-size:40px; font-weight:bold;">Setting date range</span>

In [2]:
### Build the single monthly time axis that is handed to the input-building and saving helpers ###

# First and last timestamps of the analysis window
date_range_start = '1982-02-01T00:00:00.000000000'
date_range_end = '2023-12-31T00:00:00.000000000'

# One timestamp per month, placed on the first day of each month ('MS' = month start).
# No offset is applied to either end: the axis runs from 1982-02-01 through 2023-12-01.
dates = pd.date_range(date_range_start, date_range_end, freq='MS')

<span style="color:hotpink; font-size:40px; font-weight:bold;">Setting paths</span>

In [None]:
### set paths ###

### paths for loading: ###

# root directory that holds the reference zarr stores listed below
# NOTE(review): the directory name ends in an underscore, so every path below contains "zarr_files_/" — confirm this matches the bucket layout
zarr_dir = 'gs://leap-persistent/abbysh/zarr_files_'

# regridded ensemble members produced by notebook 00
ensemble_dir = "gs://leap-persistent/abbysh/pco2_residual_1982-2023/00_regridded_members" # path to regridded data

# land-sea mask and topography
lsmask_path = f"{zarr_dir}/lsmask.zarr"
topo_path = f"{zarr_dir}/GEBCO_2014_1x1_global.zarr"

# SOCAT data file
socat_path = f"{zarr_dir}/socat_mask_feb1982-dec2023.zarr"

# atmospheric xCO2 file
xco2_path = f"{zarr_dir}/xco2_cmip6_183501-224912_monthstart.zarr"

#############################################

### paths for saving: ###

# REQUIRED: put your own bucket username between the quotes before running this cell,
# e.g. your_username = "jdoe"
your_username = ""

# Stop with a readable message (rather than a SyntaxError, or a path that points at
# nobody's bucket) when the placeholder above has not been filled in.
if not your_username.strip():
    raise ValueError(
        "Set `your_username` to your own bucket username before running this cell; "
        "it is used to build the output path `xgb_data_dir`."
    )

# directory of where to save ML inputs from this (01) notebook (save it in your bucket!!)
xgb_data_dir = f"gs://leap-persistent/{your_username}/pco2_residual/post01_xgb_inputs" # where to store cleaned data from 01 (this notebook), for use in 02 notebook, for xgboost

<span style="color:hotpink; font-size:40px; font-weight:bold;">Calculating chlorophyll climatology</span>

Here is where the chlorophyll climatology is calculated. The climatology is computed from model output from 01-1998 through the end of the time period (here, 12-2023) by averaging chlorophyll for each calendar month across all of those years, giving one mean field per month. This climatology is saved in a separate file in the individual regridded member folders. This file is currently used to set feature variables for ML in notebook 02, but those feature variables are not *actually used* for the ML.

You do not need to run this code, because chlorophyll climatology is already saved in the regridded members directory that you're loading from.

In [None]:
### chl_clim code here ###

# NOTE: everything below is intentionally commented out and kept for reference only; the
# markdown above states the climatology is already saved in the regridded members directory.
# What the disabled code does for each zarr store of each member: open it, take `chl`,
# select 1998-01-01..2023-12-30, average by calendar month, select lev_partial=1 (dropping
# that dimension) when it is present, and write the result as `chlclim_<zarr name>` in the
# same member folder.
# NOTE(review): `a[1:]` skips the first entry of the directory listing, and the month values
# are stored under a coordinate named 'time' (on the `month` dimension) — confirm both are
# intended before re-enabling this code.

# a = fs.ls(ensemble_dir)
# for ens_path in a[1:]:
#     ens = ens_path.split('/')[-1]
#     mems = fs.ls(ens_path)
#     for mem in mems:
#         memo = mem.split('/')[-1]
#         zarrs = fs.ls(mem)
#         for z in zarrs:
#             zarr_name = z.split('/')[-1]
#             d = xr.open_mfdataset('gs://'+z,engine='zarr')
#             chl = d.chl

#             ### year selection ### remove hard code ###
#             chl_clim = chl.sel(time=slice('1998-01-01', '2023-12-30')).groupby("time.month").mean("time")

#             if 'lev_partial' in chl_clim.dims:
#                 chl_clim = chl_clim.sel({'lev_partial':1},drop=True)
            
#             chl_out_new = xr.Dataset({'chl_clim':(["month","lat","lon"],chl_clim.data)},
#                     coords={'time': (['month'],chl_clim.month.data),
#                             'lon':(['lon'],chl_clim.xlon.data),
#                             'lat': (['lat'],chl_clim.ylat.data)})
            
#             # print(chl_out_new)
#             out_path = f'{ensemble_dir}/{ens}/{memo}/chlclim_{zarr_name}'
#             print(out_path,chl_out_new)
#             chl_out_new.to_zarr(out_path)
#             print(f'finished with {ens}:{memo}')

<span style="color:hotpink; font-size:40px; font-weight:bold;">Loading list of ESMs and members in testbed</span>

<span style="color:lightblue; font-size:30px; font-weight:bold;">For all members for each ESM</span>

In [None]:
### loads list of Earth System Models ("ensembles") and members for the full testbed ###

# List the ensemble directory once and reuse the result for both structures below
ensemble_paths = fs.ls(ensemble_dir)

# Unique ESM names in listing order; anything after the first '.' in an entry name is dropped.
# dict.fromkeys keeps first-seen order while removing duplicates.
ensembles = list(dict.fromkeys(path.split('/')[-1].split('.')[0] for path in ensemble_paths))

# Map each entry name under ensemble_dir -> list of member names found directly beneath it.
# An entry whose listing is empty gets no key.
# NOTE(review): keys here keep the full entry name, whereas `ensembles` truncates at the
# first '.' — confirm the two are meant to agree.
mems_dict = {}
for ens_path in ensemble_paths:
    ens = ens_path.split('/')[-1]
    for mem_path in fs.ls(ens_path):
        mems_dict.setdefault(ens, []).append(mem_path.split('/')[-1])

In [None]:
# show the ESM -> member-list mapping so the testbed contents can be checked before building inputs
mems_dict

<span style="color:hotpink; font-size:40px; font-weight:bold;">Running code to make ML inputs</span>

In [None]:
### build one pandas dataframe per (ESM, member) from the "raw" data and save it for ML ###

N_time = len(dates)

# Flatten the ESM -> members mapping into (ens, member) pairs, keeping the original order
ens_member_pairs = [
    (ens, member)
    for ens, mem_list in mems_dict.items()
    for member in mem_list
]

member_counter = 0

for ens, member in ens_member_pairs:

    print(f'making dataframe {member_counter}: {ens,member}')

    # the supporting-functions module assembles this member's data into a dataframe
    df = prerun.create_inputs(
        ensemble_dir, ens, member, dates, N_time,
        xco2_path,
        socat_path,
        topo_path,
        lsmask_path,
    )

    # hand the dataframe to the saving helper, targeting xgb_data_dir
    prerun.save_clean_data(df, xgb_data_dir, ens, member, dates)

    member_counter += 1