### Import relevant Python modules

In [1]:
import numpy as np
import xarray as xr
import os
import sys
import glob


from os.path import join,expanduser,exists,split
user_home_dir = expanduser('~')

sys.path.insert(0,join(user_home_dir))
import ecco_access as ea

# indicate mode of access
# options are:

# 'download': direct download from internet to your local machine
# 'download_ifspace': like download, but only proceeds 
#                     if your machine have sufficient storage
# 's3_open': access datasets in-cloud from an AWS instance
# 's3_open_fsspec': use jsons generated with fsspec and 
#                   kerchunk libraries to speed up in-cloud access
# 's3_get': direct download from S3 in-cloud to an AWS instance
# 's3_get_ifspace': like s3_get, but only proceeds if your instance 
#                   has sufficient storage
access_mode = 'query'

In [2]:
# Suppress warning messages for a cleaner presentation
import warnings
warnings.filterwarnings('ignore')

In [3]:
import psutil

# # setting up a dask LocalCluster (only if number cores available >= 4 and available memory/core >= 2 GB)
# distributed_cores_min = 4
# distributed_mem_per_core_min = 2*(10**9)
# mem_per_core = psutil.virtual_memory().available/os.cpu_count()
# if ((os.cpu_count() >= distributed_cores_min) and \
#   (mem_per_core >= distributed_mem_per_core_min)):
#     from dask.distributed import Client
#     from dask.distributed import LocalCluster
#     cluster = LocalCluster()
#     client = Client(cluster)

In [4]:
from dask.distributed import Client

#  connec to existing LocalCluster
# the port number will be different!
client = Client("tcp://127.0.0.1:38121")
client.ncores
client.restart()

In [5]:
# Plotting
import matplotlib.pyplot as plt
%matplotlib inline

In [6]:
# Use this for the netcdf files stored on an s3 bucket
def get_credentials(use_earthdata=False):
    """
    This routine automatically pulls your EDL crediential from .netrc file and use it to obtain an AWS S3 credential 
    through a PO.DAAC service accessible at https://archive.podaac.earthdata.nasa.gov/s3credentials.
    From the PO.DAAC Github (https://podaac.github.io/tutorials/external/July_2022_Earthdata_Webinar.html).
    
    Returns:
    =======
    
    credentials: a dictionary with AWS secret_key, access_key, and token
    """
    # NASA EarthData hosts ECCO V4r4 fileds
    if use_earthdata == False:
        session = boto3.Session()
        credentials_b3 = session.get_credentials()
        creds_b3 = credentials_b3.get_frozen_credentials()
        
        credentials = dict()
        credentials['secretAccessKey'] = credentials_b3.secret_key
        credentials['accessKeyId'] = credentials_b3.access_key
        credentials['sessionToken'] = credentials_b3.token

    # A 'public' AWS s3 bucket hosts V4r5 fields (they will eventually move to PO.DAAC)
    else:
        credentials = requests.get('https://archive.podaac.earthdata.nasa.gov/s3credentials').json()
    
    return credentials
    

In [7]:
def init_S3FileSystem(use_earthdata=False, requester_pays=True):
    """
    This routine automatically creates an 's3 file system' object and credentials dictionary.
    The s3 file system needs to be initialized with the special aws credentials.
    
    Returns:
    =======
    
    s3: an AWS S3 filesystem, 
    credentials: a dictionary with AWS secret_key, access_key, and token

    """
    credentials = get_credentials(use_earthdata=use_earthdata)

    if use_earthdata:
        requester_pays = False
        
    s3 = s3fs.S3FileSystem(requester_pays=requester_pays,
                           anon=False,
                           key=credentials['accessKeyId'],
                           secret=credentials['secretAccessKey'], 
                           token=credentials['sessionToken'])
    
    return s3, credentials

In [8]:
import boto3
import s3fs
from pathlib import Path
import requests

s3, credentials = init_S3FileSystem(use_earthdata=True, requester_pays=False);

In [9]:

mzz_local_directory = Path('efs_ecco/mzz-jsons/MZZ_LLC0090GRID_SNAPSHOT')
# different prefixes on s3
# MZZ_day_mean_latlon
# MZZ_day_mean_native
# MZZ_mon_mean_latlon
# MZZ_mon_mean_native
# MZZ_snap_native

In [10]:
# show they are on the local disk 
np.sort(list(mzz_local_directory.glob('*json')))

array([], dtype=float64)

In [11]:
# MZZ files have been downloaded. 

# select the temperature/salinity dataset
mzz_local_file= mzz_local_directory / 'ECCO_L4_TEMP_SALINITY_LLC0090GRID_SNAPSHOT_V4R4.json'
print(mzz_local_file)

efs_ecco/mzz-jsons/MZZ_LLC0090GRID_SNAPSHOT/ECCO_L4_TEMP_SALINITY_LLC0090GRID_SNAPSHOT_V4R4.json


In [12]:
import fsspec
import zarr

In [13]:
fs = fsspec.filesystem("reference",     
                       fo=str(mzz_local_file),
                       remote_protocol="s3",
                       remote_options={"anon":False, 
                                      "key": credentials['accessKeyId'],
                                      "secret" : credentials['secretAccessKey'], 
                                      "token" : credentials['sessionToken']},
                                      skip_instance_cache=True)

fs.asynchronous = True
store = zarr.storage.FsspecStore(fs)
ds = xr.open_dataset(store, engine='zarr',
                     consolidated=False, chunks={'time':4, 'Z':50})

ds

FileNotFoundError: Unable to find group: <FsspecStore(ReferenceFileSystem, /)>

In [14]:
store

<FsspecStore(ReferenceFileSystem, /)>