<span style="color:hotpink; font-size:40px; font-weight:bold;">Packages and imports</span>

In [2]:
### standard imports ###

import numpy as np
import pandas as pd
import xarray as xr
import gcsfs

### Python file with supporting functions ###

import residual_utils as prerun

### set up for getting files from leap bucket ###

fs = gcsfs.GCSFileSystem()

<span style="color:hotpink; font-size:40px; font-weight:bold;">Setting date range</span>

In [3]:
### Setting the date range to unify the date type ###

# Define date range
date_range_start = '1982-02-01T00:00:00.000000000'
date_range_end = '2023-12-31T00:00:00.000000000'

# create date vector, adds 14 days to start & end
dates = pd.date_range(start=date_range_start, 
                      end=date_range_end,freq='MS')

<span style="color:hotpink; font-size:40px; font-weight:bold;">Setting paths</span>

In [25]:
### set paths ###

### paths for loading: ###

# directory of regridded members from notebook 00
ensemble_dir = "gs://leap-persistent/abbysh/pco2_all_members_1982-2023/00_regridded_members" # path to regridded data

# directory of reference zarr files
zarr_dir = 'gs://leap-persistent/abbysh/zarr_files_'

# atmospheric xco2 file
xco2_path = f"{zarr_dir}/xco2_cmip6_183501-224912_monthstart.zarr"

# socat data file
socat_path = f"{zarr_dir}/socat_mask_feb1982-dec2023.zarr"

# topo and land-sea masks
topo_path = f"{zarr_dir}/GEBCO_2014_1x1_global.zarr"
lsmask_path = f"{zarr_dir}/lsmask.zarr"

#############################################

### paths for loading: ###

# directory of regridded members from notebook 00
ensemble_dir = "gs://leap-persistent/abbysh/pco2_all_members_1982-2023/00_regridded_members" # path to regridded data

# directory of reference zarr files
zarr_dir = 'gs://leap-persistent/abbysh/zarr_files_'

# atmospheric xco2 file
xco2_path = f"{zarr_dir}/xco2_cmip6_183501-224912_monthstart.zarr"

# socat data file
socat_path = f"{zarr_dir}/socat_mask_feb1982-dec2023.zarr"

# topo and land-sea masks
topo_path = f"{zarr_dir}/GEBCO_2014_1x1_global.zarr"
lsmask_path = f"{zarr_dir}/lsmask.zarr"

#############################################

### paths for saving: ###

your_username = "mauriekeppens"

# directory of where to save ML inputs from this (01) notebook (save it in your bucket!!)
xgb_data_dir = f"gs://leap-scratch/{your_username}"
# directory of where to save ML inputs from this (01) notebook (save it in your bucket!!)
#xgb_data_dir = f"gs://leap-persistent/{your_username}/pco2_residual/post01_xgb_inputs" # where to store cleaned data from 01 (this notebook), for use in 02 notebook, for xgboost

<span style="color:hotpink; font-size:40px; font-weight:bold;">Calculating chlorophyll climatology</span>

Here is where clorophyll climatology is calculated. Climatology is calculated from model output 01-1998 through the end of the time period (here, 12-2023), by calculating the monthly average chlorophyll of the whole time period. This climatology is saved in a separate file in the individual regridded member folders. This file is currently used to set feature variables for ML in notebook 02, but those feature variables are not *actually used* for the ML. 

You do not need to run this code, because chlorophyll climatology is already saved in the regridded members directory that you're loading from.

In [27]:
### chl_clim code here ###

# a = fs.ls(ensemble_dir)
# for ens_path in a[1:]:
#     ens = ens_path.split('/')[-1]
#     mems = fs.ls(ens_path)
#     for mem in mems:
#         memo = mem.split('/')[-1]
#         zarrs = fs.ls(mem)
#         for z in zarrs:
#             zarr_name = z.split('/')[-1]
#             d = xr.open_mfdataset('gs://'+z,engine='zarr')
#             chl = d.chl

#             ### year selection ### remove hard code ###
#             chl_clim = chl.sel(time=slice('1998-01-01', '2023-12-30')).groupby("time.month").mean("time")

#             if 'lev_partial' in chl_clim.dims:
#                 chl_clim = chl_clim.sel({'lev_partial':1},drop=True)
            
#             chl_out_new = xr.Dataset({'chl_clim':(["month","lat","lon"],chl_clim.data)},
#                     coords={'time': (['month'],chl_clim.month.data),
#                             'lon':(['lon'],chl_clim.xlon.data),
#                             'lat': (['lat'],chl_clim.ylat.data)})
            
#             # print(chl_out_new)
#             out_path = f'{ensemble_dir}/{ens}/{memo}/chlclim_{zarr_name}'
#             print(out_path,chl_out_new)
#             chl_out_new.to_zarr(out_path)
#             print(f'finished with {ens}:{memo}')

<span style="color:hotpink; font-size:40px; font-weight:bold;">Loading list of ESMs and members in testbed</span>

<span style="color:lightblue; font-size:30px; font-weight:bold;">For all members for each ESM</span>

In [28]:
### loads list of Earth System Models ("ensembles") and members for the full testbed ###

ensembles = []
for path in fs.ls(ensemble_dir):
    ens = path.split('/')[-1].split('.')[0]
    if ens not in ensembles:
        ensembles.append(ens)

mems_dict = dict()
a = fs.ls(ensemble_dir)
for ens_path in a:
    ens = ens_path.split('/')[-1]
    mems = fs.ls(ens_path)
    for mem in mems:
        memo = mem.split('/')[-1]
        
        if ens not in mems_dict:
            mems_dict[ens] = [memo]

        elif ens in mems_dict:
            mems_dict[ens].append(memo)

In [29]:
mems_dict

{'00_regridded_members': ['00_regridded_members',
  'ACCESS-ESM1-5',
  'CESM2',
  'CESM2-WACCM',
  'CMCC-ESM2',
  'CanESM5',
  'CanESM5-CanOE',
  'GFDL-ESM4',
  'MPI-ESM1-2-LR',
  'UKESM1-0-LL'],
 'ACCESS-ESM1-5': ['member_r10i1p1f1',
  'member_r15i1p1f1',
  'member_r17i1p1f1',
  'member_r1i1p1f1',
  'member_r22i1p1f1',
  'member_r26i1p1f1',
  'member_r27i1p1f1',
  'member_r2i1p1f1',
  'member_r31i1p1f1',
  'member_r32i1p1f1',
  'member_r33i1p1f1',
  'member_r34i1p1f1',
  'member_r35i1p1f1',
  'member_r36i1p1f1',
  'member_r37i1p1f1',
  'member_r38i1p1f1',
  'member_r39i1p1f1',
  'member_r3i1p1f1',
  'member_r40i1p1f1',
  'member_r4i1p1f1',
  'member_r5i1p1f1',
  'member_r7i1p1f1',
  'member_r8i1p1f1'],
 'CESM2': ['member_r10i1p1f1', 'member_r11i1p1f1', 'member_r4i1p1f1'],
 'CESM2-WACCM': ['member_r1i1p1f1', 'member_r2i1p1f1', 'member_r3i1p1f1'],
 'CMCC-ESM2': ['member_r1i1p1f1'],
 'CanESM5': ['member_r10i1p1f1',
  'member_r10i1p2f1',
  'member_r1i1p1f1',
  'member_r1i1p2f1',
  'member

<span style="color:hotpink; font-size:40px; font-weight:bold;">Running code to make ML inputs</span>

In [40]:
import gcsfs
fs = gcsfs.GCSFileSystem()

ensemble_dir = "gs://leap-persistent/abbysh/pco2_all_members_1982-2023/00_regridded_members"
ens = "CESM2"   # example
member = "member_r10i1p1f1"

files = fs.glob(f"{ensemble_dir}/{ens}/{member}/*")
print("Files found:", files)


Files found: ['leap-persistent/abbysh/pco2_all_members_1982-2023/00_regridded_members/CESM2/member_r10i1p1f1/CESM2.r10i1p1f1.Omon.zarr', 'leap-persistent/abbysh/pco2_all_members_1982-2023/00_regridded_members/CESM2/member_r10i1p1f1/chlclim_CESM2.r10i1p1f1.Omon.zarr']


In [37]:
### creating pandas dataframes out of "raw" data to prep for ML ###

N_time = len(dates)
member_counter = 0 

### loop through each ESM
for ens, mem_list in mems_dict.items(): 

    ### loop through each member of that ESM
    for member in mem_list:
        
        print(f'making dataframe {member_counter}: {ens,member}')

        ### uses utility function file to make data into dataframe for ML use
        df = prerun.create_inputs(ensemble_dir, ens, member, dates, N_time,
                                  xco2_path,
                                  socat_path,
                                  topo_path,
                                  lsmask_path)

        ### Save the pandas dataframe to workspace
        prerun.save_clean_data(df, xgb_data_dir, ens, member, dates)
        member_counter += 1

making dataframe 0: ('00_regridded_members', '00_regridded_members')


IndexError: list index out of range