# Access to data in the cloud (GCS)

In [None]:
!pip install --upgrade dask distributed dask-ml xarray zarr gcsfs cftime nc-time-axis intake intake-xarray scikit-learn matplotlib==3.0.2 seaborn

In [1]:
import sys
import gcsfs
import xarray as xr
import intake

## Read data from Google Cloud Storage (gcsfs)

### Access and listing

In [2]:
# Anonymous, read-only access point to the cloud file system:
fs = gcsfs.GCSFileSystem(
    project='ds2class-2021',
    token='anon',
    access='read_only',
)

# Show what the 'ds2data' bucket contains:
fs.ls('ds2data')

['ds2data/EN.4.2.1.f.analysis.g10.zarr',
 'ds2data/GLOBAL_ARGO_SDL1000.zarr',
 'ds2data/ISAS15_TEMP_NATL.zarr',
 'ds2data/dt_global_allsat_phy_l4_mm.zarr']

But data access with ``gcsfs`` depends critically on how the GCS bucket is set up. For instance, the following project does not allow anonymous users to list the bucket content:

In [3]:
# Same anonymous, read-only access, but on a project whose bucket cannot be listed anonymously.
fs2 = gcsfs.GCSFileSystem(project='alert-ground-261008', token='anon', access='read_only')
try:
    fs2.ls('data_bdo2020')
except Exception as err:
    # Catch Exception (not a bare `except:`) so that KeyboardInterrupt and SystemExit
    # still propagate; only the class of the error is shown.
    print(type(err))

_call non-retriable exception: Anonymous caller does not have storage.objects.list access to the Google Cloud Storage bucket.
Traceback (most recent call last):
  File "/Users/gmaze/anaconda/envs/bluecloud/lib/python3.6/site-packages/gcsfs/core.py", line 507, in _call
    self.validate_response(status, contents, json, path, headers)
  File "/Users/gmaze/anaconda/envs/bluecloud/lib/python3.6/site-packages/gcsfs/core.py", line 1228, in validate_response
    raise HttpError(error)
gcsfs.utils.HttpError: Anonymous caller does not have storage.objects.list access to the Google Cloud Storage bucket.


<class 'gcsfs.utils.HttpError'>


On the other hand, some datasets may not be free and use a requester-pays model.
In this case, you have to properly manage authentication:

In [4]:
# Anonymous access to a bucket that is not freely readable.
fs3 = gcsfs.GCSFileSystem(project='poised-honor-358', token='anon')
try:
    fs3.ls('somovar-02')
except (ValueError, FileNotFoundError) as e:
    # The recorded run of this cell ended with an uncaught FileNotFoundError, so catching
    # ValueError alone is not enough.
    # NOTE(review): which exception gcsfs raises here presumably depends on its version - confirm.
    # The exception class is printed too, because the message itself can be empty.
    print(f"{type(e).__name__}: {e}")

FileNotFoundError: 

### Load data

In [5]:
# Build a mapper onto the EN4 zarr store of the bucket, then open it with xarray:
gcsmap = fs.get_mapper("ds2data/EN.4.2.1.f.analysis.g10.zarr")
ds = xr.open_zarr(gcsmap)

# Report the dataset size (nbytes scaled by 1e9) and its content:
print(f"Size of the dataset: {ds.nbytes/1e9} Gb")
print(ds)

GroupNotFoundError: group not found at path ''

In [None]:
# Same procedure for a second dataset of the bucket:
gcsmap = fs.get_mapper('ds2data/GLOBAL_ARGO_SDL1000.zarr')
ds = xr.open_zarr(gcsmap)

# Report the dataset size (nbytes scaled by 1e9) and its content:
print(f"Size of the dataset: {ds.nbytes/1e9} Gb")
print(ds)

## Use intake catalog of data

The catalog also uses the gcsfs entry point, but with intake it's transparent to the user:

### Access and listing of the catalog

In [None]:
from intake import open_catalog

In [None]:
# Location of the YAML catalog describing the course datasets:
catalog_url = 'https://raw.githubusercontent.com/obidam/ds2-2020/ds2-2021/ds2_data_catalog.yml'

# Open the catalog and list the names of its entries:
cat = open_catalog(catalog_url)
[entry_name for entry_name in cat]

### Load data

In [None]:
# Read the 'en4' catalog entry in chunked mode:
ds = cat.en4.read_chunked()

# Report the dataset size (nbytes scaled by 1e9), then display the dataset:
print(f"Size of the dataset: {ds.nbytes/1e9} Gb")
ds

In [None]:
# Catalog entries can also be selected by key; to_dask() is used here to open the entry:
ds = cat["sea_surface_height"].to_dask()

# Report the dataset size (nbytes scaled by 1e9), then display the dataset:
print(f"Size of the dataset: {ds.nbytes/1e9} Gb")
ds

# Pangeo data

https://github.com/pangeo-data/pangeo-datastore

https://catalog.pangeo.io/

## Explore catalog

In [None]:
from intake import open_catalog

# Open the master Pangeo catalog and list the names of its top-level entries:
pangeo_cat = open_catalog("https://raw.githubusercontent.com/pangeo-data/pangeo-datastore/master/intake-catalogs/master.yaml")
[entry_name for entry_name in pangeo_cat]

In [None]:
# Names of the entries found under the 'ocean' sub-catalog:
[entry_name for entry_name in pangeo_cat.ocean]
# Other ways to explore the catalog (uncomment to try):
# print(list(pangeo_cat.atmosphere))
# print(list(pangeo_cat.hydro))
# pangeo_cat.walk(depth=5)

## Access data

In [None]:
# Alternative catalog entries, kept commented out:
# ds = pangeo_cat.ocean.SOSE.read_chunked()
# ds = pangeo_cat.atmosphere.gmet_v1.read_chunked()
# Read the sea surface height entry of the 'ocean' sub-catalog in chunked mode;
# the bare `ds` on the last line displays it as the cell output.
ds = pangeo_cat.ocean.sea_surface_height.read_chunked()
ds

In [None]:
# Open the SOSE entry of the 'ocean' sub-catalog:
ds = pangeo_cat.ocean.SOSE.to_dask()

# Report the dataset size (nbytes scaled by 1e9) and its content:
print(f"Size of the dataset: {ds.nbytes/1e9} Gb")
print(ds)

In [None]:
# Open the 'NATL60_SSH' entry nested under the MEOM_NEMO catalog:
ds = pangeo_cat.ocean.MEOM_NEMO['NATL60_SSH'].to_dask()

# Report the dataset size (nbytes scaled by 1e9) and its content:
print(f"Size of the dataset: {ds.nbytes/1e9} Gb")
print(ds)

## Access to data subset

In [None]:
# Re-open the sea surface height entry: `ds` was rebound to other datasets in the cells above.
ds = pangeo_cat.ocean.sea_surface_height.read_chunked()

In [None]:
# Load and plot a map
%matplotlib inline
%time sla = ds['adt'].sel(time='2009-02-12', method='nearest')
print(sla)
sla.plot()

In [None]:
# Load and plot a time series

# This slicing is much longer !

# So we recommend you insert here the connection to a dask cluster (See Tuto 01)

%time sla = ds['adt'].sel(latitude=30, method='nearest').sel(longitude=360-55, method='nearest')
print(sla)
sla.plot()