## Accessing data with an http or opendap URL is very easy

General notes:
- netCDF over http: files need to be opened individually, with `#mode=bytes` appended to the url
- zarr over http: can be read directly with xr.open_zarr()
- OPeNDAP urls: can be read directly with xr.open_dataset()
- ftp urls: can be read as binary files (into memory) without saving them to disk first

Cloud data:
- zarr datasets in GCS (gs) or AWS-S3 (s3): just use fsspec.get_mapper()
- AWS-S3 (s3) netCDF files need to be 'opened' using fs_s3.open(), where fs_s3 = fsspec.filesystem('s3', anon=True)

In [None]:
import xarray as xr
import fsspec
# Show the installed fsspec version as the cell output
fsspec.__version__

In [None]:
# Open a netCDF file straight from the kage web server.
# Browse http://kage.ldeo.columbia.edu/data/pdsi-spei/ to see what is available.

# '#mode=bytes' is appended because this is a netCDF file served over plain http
url = 'http://kage.ldeo.columbia.edu/data/pdsi-spei/pdsi/GFDL_ESM2G/pdsi_all.nc#mode=bytes'

ds = xr.open_dataset(
    url,
    decode_times=False,  # skip xarray's decoding of the time values
)
print(ds)

# optional quick look:
# ds.pdsi_all.mean('T').plot()

In [None]:
# Open a sample zarr store served over http from kage

# NOTE(review): the short host name 'kage' (no domain) presumably resolves
# only on the local network -- confirm before sharing this url
url = 'http://kage/CMIP6-zarr/CFMIP/MOHC/HadGEM3-GC31-LL/abrupt-2xCO2/r1i1p1f3/Amon/tas/gn/v20200829/'

xr.open_zarr(
    url,
    consolidated=True,  # use the store's consolidated metadata
)

In [None]:
# Open a single CMIP6 netCDF file served over http from mary

# Adjacent string literals are joined into one url; '#mode=bytes' is the
# suffix used for netCDF files read over plain http
url = (
    'http://mary.ldeo.columbia.edu/CMIP6/CMIP/BCC/BCC-ESM1/historical/r1i1p1f1/Amon/clt/gn/v20181214/'
    'clt_Amon_BCC-ESM1_historical_r1i1p1f1_gn_185001-201412.nc#mode=bytes'
)

xr.open_dataset(url)

In [None]:
# An OPeNDAP url (here a THREDDS 'dodsC' endpoint) is passed to
# xr.open_dataset() as-is, without any suffix

url = (
    'https://esgf-data1.llnl.gov/thredds/dodsC/css03_data/CMIP6/ScenarioMIP/AS-RCEC/TaiESM1/ssp370/'
    'r1i1p1f1/day/tasmax/gn/v20210323/tasmax_day_TaiESM1_ssp370_r1i1p1f1_gn_20150101-20241231.nc'
)
xr.open_dataset(url)

In [None]:
# Globus urls do NOT work this way: endpoints etc. have to be set up first.
# The url is only kept as an example; the open call stays disabled.

url = (
    'globus:415a6320-e49c-11e5-9798-22000b9da45e/css03_data/CMIP6/ScenarioMIP/IPSL/IPSL-CM6A-LR/ssp370/r6i1p1f1/day'
    '/tasmax/gr/v20190614/tasmax_day_IPSL-CM6A-LR_ssp370_r6i1p1f1_gr_20150101-21001231.nc'
)
# xr.open_dataset(url)

In [None]:
# Pangeo CMIP6 zarr collection on Google Cloud Storage:
# two ways to open the same path inside the 'cmip6' bucket

path = 'CMIP6/ScenarioMIP/MPI-M/MPI-ESM1-2-LR/ssp585/r2i1p1f1/day/va/gn/v20190710'

# Method 1: https url, handed to xarray directly
url = f'https://cmip6.storage.googleapis.com/{path}'
ds = xr.open_zarr(url, consolidated=True)
print(url, '\n size of dataset:', ds.nbytes / 1e9, 'G \n')

# Method 2: gs:// url, wrapped in an fsspec mapper first
url = f'gs://cmip6/{path}'
store = fsspec.get_mapper(url)
ds = xr.open_zarr(store, consolidated=True)
print(url, '\n size of dataset:', ds.nbytes / 1e9, 'G \n')

In [None]:
# Pangeo CMIP6 zarr collection on Amazon S3 (mirror of the GCS collection):
# two ways to open the same path inside the 'cmip6-pds' bucket

path = 'CMIP6/ScenarioMIP/MPI-M/MPI-ESM1-2-LR/ssp585/r2i1p1f1/day/va/gn/v20190710'

# Method 1: https url, handed to xarray directly
url = f'https://cmip6-pds.s3.amazonaws.com/{path}'
ds = xr.open_zarr(url, consolidated=True)
var = list(ds.data_vars)[0]  # name of the first data variable
print(f'{url} \n {var} dataset: {ds.nbytes/1e9} G \n')

# Method 2: s3:// url, wrapped in an fsspec mapper first
url = f's3://cmip6-pds/{path}'
store = fsspec.get_mapper(url)
ds = xr.open_zarr(store, consolidated=True)
var = list(ds.data_vars)[0]
print(f'{url} \n {var} dataset: {ds.nbytes/1e9} G \n')

In [None]:
# Amazon S3, GFDL netcdf collection
#
# A single file can be opened through its https url (with '#mode=bytes'),
# but reading from the S3 file system directly is the better method -- see
# the S3 (netcdf) examples further down. The open call is left disabled.

url = (
    'https://esgf-world.s3.amazonaws.com/CMIP6/CMIP/IPSL/IPSL-CM6A-LR/abrupt-4xCO2/r1i1p1f1/Lmon/cLeaf/gr/v20190118/'
    'cLeaf_Lmon_IPSL-CM6A-LR_abrupt-4xCO2_r1i1p1f1_gr_185001-214912.nc#mode=bytes'
)
# ds = xr.open_dataset(url, decode_coords=False)
# print(url, '\n size of dataset:', ds.nbytes/1e9, 'G \n')

## We can also instantiate a file system

In [None]:
# List files under http://kage.ldeo.columbia.edu/data/*
# (visit the url in a browser to find the data)

fs_http = fsspec.filesystem('http')

# wildcard pattern for the files to list
pattern = 'http://kage.ldeo.columbia.edu/data/ERA5/monthly/single_level/*/2m_*.nc'
files = fs_http.glob(pattern)
files

In [None]:
# Append '#mode=bytes' to every file url found above, then open them all
# together as one dataset
urls = [f'{name}#mode=bytes' for name in files]
ds = xr.open_mfdataset(urls)
ds

In [None]:
# Same recipe in one cell: glob the http file system, append '#mode=bytes'
# to each match, and open the matches together as one dataset
fs_http = fsspec.filesystem('http')

pattern = 'http://kage.ldeo.columbia.edu/data/ERA5/monthly/single_level/*/vertical_integral_of_divergence*.nc'
files = fs_http.glob(pattern)

urls = [f'{name}#mode=bytes' for name in files]
ds = xr.open_mfdataset(urls)
ds

In [None]:
# Glob CMIP6 netCDF files on blanton and open them together as one dataset
fs_http = fsspec.filesystem('http')

# NOTE(review): the short host name 'blanton' (no domain) presumably resolves
# only on the local network -- confirm
pattern = 'http://blanton/CMIP6/ScenarioMIP/ssp370/ACCESS-ESM1-5/day/r1i1p1f1/ua/gn/v20191115/*.nc'
files = fs_http.glob(pattern)
print(files)

# the file names are sorted before '#mode=bytes' is appended
urls = [f'{name}#mode=bytes' for name in sorted(files)]
ds = xr.open_mfdataset(urls)
ds

In [None]:
# ftp sites can be browsed through an fsspec file system as well
fs_ftp = fsspec.filesystem(
    'ftp',
    host='ftp.cdc.noaa.gov',
    # further options if needed: port=port, username=user, password=pw
)
fs_ftp.glob('/Projects/Datasets/*')

In [None]:
# Read a single ftp file WITHOUT saving it to disk first: the whole file is
# fetched into memory and handed to xarray as an in-memory binary buffer.
import io
# `import urllib` alone does not load the `request` submodule; it only worked
# when some other package had already imported it, so import it explicitly.
import urllib.request

url = 'ftp://ftp.cdc.noaa.gov/Projects/Datasets/ncep.reanalysis.derived/surface/air.sig995.mon.mean.nc'
req = urllib.request.Request(url)

# resp.read() pulls the complete file; the connection is closed on leaving
# the `with` block while the bytes stay available through the BytesIO buffer
with urllib.request.urlopen(req) as resp:
    ds = xr.open_dataset(io.BytesIO(resp.read()))

ds

In [None]:
# Google Cloud Store (zarr), Example

# gcsfs selects anonymous access with token='anon'; `anon=True` is the s3fs
# spelling and is not a gcsfs option, so it did not force anonymous access.
fs_gs = fsspec.filesystem('gs', token='anon')
fs_gs.ls('cmip6')  # just put the bucket name here

In [None]:
# Google Cloud Store (zarr), Example (continued):
# open one zarr store from the 'cmip6' bucket through an fsspec mapper

dataset = 'gs://cmip6/CMIP6/CMIP/NOAA-GFDL/GFDL-ESM4/historical/r1i1p1f1/Amon/tas/gr1/v20190726/'

xr.open_zarr(
    fsspec.get_mapper(dataset),
    consolidated=True,
)

In [None]:
# Amazon S3 (zarr), Example: anonymous S3 file system, reused by the cells below
fs_s3 = fsspec.filesystem('s3', anon=True)

# list a bucket, here with a prefix: bucket 'cmip6-pds', prefix 'CMIP6'
fs_s3.ls('cmip6-pds/CMIP6')

In [None]:
# Amazon S3 (zarr), Example (continued):
# open one zarr store from the 'cmip6-pds' bucket through an fsspec mapper

dataset = 's3://cmip6-pds/CMIP6/CMIP/NOAA-GFDL/GFDL-ESM4/historical/r1i1p1f1/Amon/tas/gr1/v20190726/'

xr.open_zarr(
    fsspec.get_mapper(dataset),
    consolidated=True,
)

In [None]:
# Amazon S3 (netcdf), Example 1 - opening a single netcdf file
# (fs_s3 is the anonymous S3 file system created in an earlier cell)

url = (
    's3://esgf-world/CMIP6/CMIP/NOAA-GFDL/GFDL-ESM4/historical/r1i1p1f1/Amon/tas/'
    'gr1/v20190726/tas_Amon_GFDL-ESM4_historical_r1i1p1f1_gr1_185001-194912.nc'
)

# netcdf files on S3 are opened through the file system first; the resulting
# file object is what gets passed to xarray
xr.open_dataset(fs_s3.open(url))

In [None]:
# Amazon S3 (netcdf), Example 2 (dataset may consist of multiple netcdf files):
# list everything stored under the dataset's directory

dataset = 'CMIP6/CMIP/NOAA-GFDL/GFDL-ESM4/historical/r1i1p1f1/Amon/tas/gr1/v20190726'
x = f's3://esgf-world/{dataset}'
urls = fs_s3.glob(f'{x}/*')
urls

In [None]:
# Amazon S3 (netcdf), Example 2 (continued):
# open every listed file through the S3 file system and combine them

fobj = list(map(fs_s3.open, urls))

ds = xr.open_mfdataset(
    fobj,
    data_vars='minimal',
    use_cftime=True,
    join='exact',
    combine='nested',   # combine the files in the order given ...
    concat_dim='time',  # ... by concatenating along 'time'
)
ds