# Access to data in the cloud (GCS)

In [None]:
import sys
import gcsfs
import xarray as xr
import intake

## Read data from Google Cloud Storage (gcsfs)

### Access and listing

In [None]:
# Anonymous, read-only handle on the cloud file system of the 'argo-france' project.
# `fs` is reused by the data-loading cells further down.
fs = gcsfs.GCSFileSystem(
    project='argo-france',
    token='anon',
    access='read_only',
)

# Show what the 'argodata' bucket contains (last expression -> cell output)
fs.ls('argodata')

But data access with ``gcsfs`` is critically dependent on how the GCS project is set up. For instance, the following project does not allow listing the bucket content:

In [None]:
# Anonymous, read-only access to a second project, used to demonstrate a failing listing
fs2 = gcsfs.GCSFileSystem(project='alert-ground-261008', token='anon', access='read_only')
try:
    fs2.ls('data_bdo2020')
except Exception as err:
    # The failure is the point of this cell: show which error class was raised.
    # Catching Exception (instead of a bare `except:`) lets KeyboardInterrupt
    # and SystemExit propagate, so the cell can still be interrupted.
    print(type(err))

On the other hand, some datasets may not be free and may use a requester-pays model.
In this case, you have to properly manage authentication:

In [None]:
# Anonymous handle on the 'poised-honor-358' project (no explicit access mode here)
fs3 = gcsfs.GCSFileSystem(token='anon', project='poised-honor-358')
try:
    fs3.ls('somovar-02')
except ValueError as err:
    # Only ValueError is handled: print its message rather than a full traceback
    print(err)

### Load data

In [None]:
# Expose the zarr store of the bucket as a mapping that xarray can open
gcsmap = fs.get_mapper("argodata/gridded/ISAS15_TEMP_NATL.zarr")
ds = xr.open_zarr(gcsmap)

# ds.nbytes is a byte count, so dividing by 1e9 gives gigabytes:
# the unit label is "GB" ("Gb" would read as gigabits).
print(f"Size of the dataset: {ds.nbytes / 1e9} GB")

# Last expression of the cell, so the notebook shows the dataset's rich repr
ds

In [None]:
# Load another dataset from the same bucket (this rebinds `gcsmap` and `ds`).
# Alternative store, kept for reference:
# gcsmap = fs.get_mapper('argodata/sdl/GLOBAL_ARGO_SDL2000')
gcsmap = fs.get_mapper('argodata/sdl/GLOB_HOMOGENEOUS_variables.zarr')
ds = xr.open_zarr(gcsmap)

# ds.nbytes is a byte count, so dividing by 1e9 gives gigabytes:
# the unit label is "GB" ("Gb" would read as gigabits).
print(f"Size of the dataset: {ds.nbytes / 1e9} GB")

# Last expression of the cell, so the notebook shows the dataset's rich repr
ds

## Use intake catalog of data

The catalog also relies on gcsfs to reach the data, but with intake this is transparent to the user.

### Access and listing of the catalog

In [None]:
# Catalog file (YAML) hosted on GitHub
catalog_url = 'https://raw.githubusercontent.com/obidam/ds2-2020/master/ds2_data_catalog.yml'

# intake.open_catalog is intake's documented entry point for loading a catalog
# from a path or URL.
# NOTE(review): calling the intake.Catalog class directly with a URL is not
# expected to load the file on every intake release - confirm against the
# installed version.
cat = intake.open_catalog(catalog_url)

# Names of the entries declared in the catalog
list(cat)

### Load data

In [None]:
# Open the 'en4' catalog entry through its read_chunked() method,
# then display the result as the cell output
ds = cat["en4"].read_chunked()
ds

# Pangeo data

[Note that Pangeo is currently re-organising its datastore](https://discourse.pangeo.io/t/cleaning-out-the-pangeo-data-google-cloud-storage-bucket/353/22).

Some of the dataset access points used below may therefore be obsolete.

## Direct access to GCS

In [None]:
# Anonymous, read-only handle on the 'pangeo-181919' project.
# NOTE: this rebinds `fs`, which pointed to the 'argo-france' project until now.
fs = gcsfs.GCSFileSystem(
    project='pangeo-181919',
    token='anon',
    access='read_only',
)

# Show what the 'pangeo-data' bucket contains
fs.ls('pangeo-data')

In [None]:
# Load a dataset. Stores are mapped with fs.get_mapper(), the same call used for
# the 'argodata' stores earlier in this notebook, instead of building a
# gcsfs.mapping.GCSMap by hand.
# Alternative stores, kept for reference:
# gcsmap = fs.get_mapper('pangeo-data/dataset-duacs-rep-global-merged-allsat-phy-l4-v3-alt') # AVISO altimetry
# gcsmap = fs.get_mapper('pangeo-data/eNATL60-BLBT02X-ssh') # High Resolution North Atlantic Ocean simulation
gcsmap = fs.get_mapper('pangeo-era5/reanalysis/spatial-analysis') # Hourly Atmospheric fields (1979/2018)
ds = xr.open_zarr(gcsmap)

# ds.nbytes is a byte count, so dividing by 1e9 gives gigabytes:
# the unit label is "GB" ("Gb" would read as gigabits).
print(f"Size of the dataset: {ds.nbytes / 1e9} GB")

# Last expression of the cell, so the notebook shows the dataset's rich repr
ds

## Intake Pangeo catalog

In [None]:
# Master catalog of the Pangeo datastore (YAML file hosted on GitHub).
# NOTE: this rebinds `cat`, which pointed to the previous catalog until now.
catalog_url = 'https://raw.githubusercontent.com/pangeo-data/pangeo-datastore/master/intake-catalogs/master.yaml'

# intake.open_catalog is intake's documented entry point for loading a catalog
# from a path or URL.
# NOTE(review): calling the intake.Catalog class directly with a URL is not
# expected to load the file on every intake release - confirm against the
# installed version.
cat = intake.open_catalog(catalog_url)

# Entries of the master catalog, then of three of its sub-catalogs
print(list(cat))
print(list(cat.ocean))
print(list(cat.atmosphere))
print(list(cat.hydro))
# cat.walk(depth=5)

In [None]:
# Open one entry of the Pangeo catalog through read_chunked() and display it.
# Alternative entry, kept for reference:
# ds = cat.ocean.SOSE.read_chunked()
ds = (
    cat.atmosphere
    .gmet_v1
    .read_chunked()
)
ds

## Access to data subset

In [None]:
# Load and plot a map
%matplotlib inline
%time sla = ds['adt'].sel(time='2009-02-12', method='nearest')
print(sla)
sla.plot()

In [None]:
# Load and plot a time series
# (this slicing is much longer !)
%time sla = ds['adt'].sel(latitude=30, method='nearest').sel(longitude=360-55, method='nearest')
print(sla)
sla.plot()