## Accessing data with an HTTP or OPeNDAP URL is very easy

In [None]:
import xarray as xr

In [None]:
# Open a netCDF file served by kage directly from its http URL.
# Tip: point your browser at http://kage.ldeo.columbia.edu/data/pdsi-spei/
# to explore what else is available there.

url = 'http://kage.ldeo.columbia.edu/data/pdsi-spei/pdsi/GFDL_ESM2G/pdsi_all.nc#mode=bytes'

# Time decoding is switched off for this file.
ds = xr.open_dataset(url, decode_times=False)
print(ds)
#ds.pdsi_all.mean('T').plot()

In [None]:
# Open a CMIP6 netCDF file served by mary directly from its http URL.

# Adjacent string literals are joined by Python into one URL.
url = (
    'http://mary.ldeo.columbia.edu/CMIP6/CMIP/BCC/BCC-ESM1/historical/r1i1p1f1/Amon/clt/gn/v20181214/'
    'clt_Amon_BCC-ESM1_historical_r1i1p1f1_gn_185001-201412.nc#mode=bytes'
)

# Last expression of the cell, so the notebook displays the dataset.
xr.open_dataset(url)

In [None]:
# An OPeNDAP URL can be passed to xarray in exactly the same way.

# Adjacent string literals are joined by Python into one URL.
url = (
    'https://esgf-data1.llnl.gov/thredds/dodsC/css03_data/CMIP6/ScenarioMIP/AS-RCEC/TaiESM1/ssp370/r1i1p1f1/day/tasmax/gn/v20210323/'
    'tasmax_day_TaiESM1_ssp370_r1i1p1f1_gn_20150101-20241231.nc'
)
xr.open_dataset(url)

In [None]:
# Globus - NOPE: endpoints etc. must be set up first, so the open call
# below is left commented out.

# Adjacent string literals are joined by Python into one location string.
url = (
    'globus:415a6320-e49c-11e5-9798-22000b9da45e/css03_data/CMIP6/ScenarioMIP/IPSL/IPSL-CM6A-LR/ssp370/r6i1p1f1/day'
    '/tasmax/gr/v20190614/tasmax_day_IPSL-CM6A-LR_ssp370_r6i1p1f1_gr_20150101-21001231.nc'
)
# xr.open_dataset(url)

In [None]:
# Pangeo zarr collection on Google Cloud Storage, opened through its https URL.

# Location of the zarr store inside the bucket.
path = 'CMIP6/ScenarioMIP/MPI-M/MPI-ESM1-2-LR/ssp585/r2i1p1f1/day/va/gn/v20190710'
url = f'https://cmip6.storage.googleapis.com/{path}'

ds = xr.open_zarr(url, consolidated=True)

# ds.nbytes divided by 1e9, i.e. the dataset size in gigabytes.
print('size of dataset:', ds.nbytes / 1e9, 'G')
ds

In [None]:
# Pangeo zarr collection on Amazon S3 (mirror of GCS), opened through its https URL.

# Location of the zarr store inside the bucket.
path = 'CMIP6/ScenarioMIP/MPI-M/MPI-ESM1-2-LR/ssp585/r2i1p1f1/day/va/gn/v20190710'
url = f'https://cmip6-pds.s3.amazonaws.com/{path}'

ds = xr.open_zarr(url, consolidated=True)

# ds.nbytes divided by 1e9, i.e. the dataset size in gigabytes.
print('size of dataset:', ds.nbytes / 1e9, 'G')
ds

In [None]:
# Amazon S3, GFDL netcdf collection

# A single file can be opened like this, but reading from the S3 file system directly (see below) is the better method.

#url = 'https://esgf-world.s3.amazonaws.com/CMIP6/CMIP/IPSL/IPSL-CM6A-LR/abrupt-4xCO2/r1i1p1f1/Lmon/cLeaf/gr/v20190118/cLeaf_Lmon_IPSL-CM6A-LR_abrupt-4xCO2_r1i1p1f1_gr_185001-214912.nc#mode=bytes'
#ds = xr.open_dataset(url, decode_coords=False)
#ds

## We can also instantiate a file system

In [None]:
import fsspec
import xarray as xr

# Instantiate an http file system and list the remote files that match a
# wildcard pattern.
fs_http = fsspec.filesystem('http')
files = fs_http.glob('http://kage.ldeo.columbia.edu/data/ERA5/monthly/single_level/*/2m_*.nc')

# Last expression of the cell, so the notebook displays the list.
files

In [None]:
# Append '#mode=bytes' to every file URL, then open them all as one dataset.
urls = [remote_file + '#mode=bytes' for remote_file in files]
ds = xr.open_mfdataset(urls)
ds

In [None]:
# ftp sites can be browsed through an fsspec file system as well.
# Only the host is given here; further options would be:
#   port=port, username=user, password=pw
fs_ftp = fsspec.filesystem('ftp', host='ftp.cdc.noaa.gov')
fs_ftp.glob('/Projects/Datasets/*')

In [None]:
# Read a single ftp file straight into memory WITHOUT saving it to disk first.
# Note: the complete file is still transferred, because resp.read() pulls all
# of its bytes into an in-memory buffer.
import io
# `import urllib` alone does not load the `request` submodule, so import it
# explicitly instead of relying on another package having done so.
import urllib.request

url = 'ftp://ftp.cdc.noaa.gov/Projects/Datasets/ncep.reanalysis.derived/surface/air.sig995.mon.mean.nc'
req = urllib.request.Request(url)

# The connection is closed when the `with` block exits; by then the bytes
# have already been copied into the BytesIO buffer that xarray reads from.
with urllib.request.urlopen(req) as resp:
    ds = xr.open_dataset(io.BytesIO(resp.read()))

ds

In [None]:
# Google Cloud Storage example: list the top level of a bucket.
import gcsfs

# token='anon' with access='read_only', as passed below.
fs_GCS = gcsfs.GCSFileSystem(token='anon', access='read_only')

# The argument is just the bucket name.
fs_GCS.ls('cmip6')

In [None]:
# Amazon S3 example: list the top level of a bucket.
import s3fs

# anon=True, as passed below.
fs_S3 = s3fs.S3FileSystem(anon=True)

# The argument is just the bucket name.
fs_S3.ls('cmip6-pds')

In [None]:
# Amazon S3, GFDL netcdf collection (a dataset may consist of multiple netcdf files).

fs_GFDL = s3fs.S3FileSystem(anon=True)

path = 'CMIP6/ScenarioMIP/MPI-M/MPI-ESM1-2-LR/ssp585/r2i1p1f1/day/va/gn/v20190710'

# Wildcard pattern matching every .nc file under that path in the bucket.
s3path = f's3://esgf-world/{path}/*.nc'
fs_GFDL.glob(s3path)

In [None]:
# Open every matching file on S3 as a file object, then combine them into a
# single dataset by concatenating along 'time'.
fobj = [fs_GFDL.open(nc_file) for nc_file in fs_GFDL.glob(s3path)]
ds = xr.open_mfdataset(
    fobj,
    data_vars='minimal',
    use_cftime=True,
    join='exact',
    combine='nested',
    concat_dim='time',
)
ds