# Obtain ERA5 data from the cloud (AWS)
### created by Marisol, 4 June 2022

In [1]:
import warnings
warnings.simplefilter('ignore') 

import numpy as np
import pandas as pd
import xarray as xr
from calendar import month_abbr # function that gives you the abbreviated name of a month
from calendar import monthrange # gives the number of day in a month
import matplotlib.pyplot as plt 
import fsspec
import s3fs
import dask
from dask.distributed import performance_report, Client, progress
import os # library to interact with the operating system

***
## For this example we select a region, and also a specific month and a range of years to analyze

In [2]:
# Select region by defining latitude and longitude range. 
# ERA-5 data has a 1/4 degree resolution. 
latr = [17,20] # Latitude range. Make sure lat1 < lat2 since no test is done below to simplify the code. resolution 0.25 degrees
lonr = [-89,-86.5] # lon1 < lon2. and use the range -180 : 180
# time selection
iyr = 1979 # initial year. by default, we set it to the start year of ERA5 dataset
fyr = 2022 # final year. by default, we set it to the end year of ERA5 dataset


***
## Acquire data from the AWS cloud

In this case, files are stored in a different format than SST.  ERA5 data is stored in monthly files (of daily data) organized in yearly folders. Then, monhtly files have to be accessed individually.

In [12]:
#avars = ['air_temperature_at_2_metres','eastward_wind_at_10_metres','northward_wind_at_10_metres','significant_height_of_wind_and_swell_waves','integral_wrt_time_of_surface_direct_downwelling_shortwave_flux_in_air_1hour_Accumulation','precipitation_amount_1hour_Accumulation']
#flns = ['AT2m','ZW10m','MW10m','Waves','Rad','Rain']
# for precipitation, radiation change time0 to time1 in line 26... after the select
# no wave data
avars = ['significant_height_of_wind_and_swell_waves']
flns = ['Waves']
# get variables
d=0
for k in avars:
    print(k)
    for iy, y in enumerate(range(iyr, fyr+1)): # for loop over the selected years
        if y==2022:
            lm = 5
        else:
            lm = 13
        for jx in range(1,lm):
            file_location = 'https://era5-pds.s3.us-east-1.amazonaws.com/zarr/'+str(y)+'/'+str(jx).zfill(2)+'/data/'+k+'.zarr'
            # filename includes: bucket name: era5-pds, year: y (transformed to string type), month: mon, and the name of the variable with extenssion zarr
            print(file_location)
            ds = xr.open_zarr(file_location,consolidated=True) # open access to data

            # generate time frame to obtain the whole month data (first to last day of selected month)
            dte1 = str(y)+'-'+str(jx).zfill(2)+'-01'
            dte2 = str(y)+'-'+str(jx).zfill(2)+'-'+str(monthrange(y, jx)[1]) #monthrange provides the lenght of the month
            # select data region and time - meridional wind
            vds = ds[k].sel(time1 = slice(dte1,dte2),
                                    lat  = slice(latr[1],latr[0],), 
                                    lon  = slice(lonr[0]+360,lonr[1]+360)
                                    ).load() # calculae mean before downloading it
            fn = flns[d]+'_'+str(y)+'_'+str(jx).zfill(2)+'.nc'
            vds.to_netcdf('../data/ERA5_'+flns[d]+'/'+fn)
    d += 1


significant_height_of_wind_and_swell_waves
https://era5-pds.s3.us-east-1.amazonaws.com/zarr/1979/01/data/significant_height_of_wind_and_swell_waves.zarr


KeyError: '.zmetadata'

In [8]:
ds

Unnamed: 0,Array,Chunk
Bytes,2.85 GiB,31.93 MiB
Shape,"(737, 721, 1440)","(372, 150, 150)"
Count,101 Tasks,100 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 2.85 GiB 31.93 MiB Shape (737, 721, 1440) (372, 150, 150) Count 101 Tasks 100 Chunks Type float32 numpy.ndarray",1440  721  737,

Unnamed: 0,Array,Chunk
Bytes,2.85 GiB,31.93 MiB
Shape,"(737, 721, 1440)","(372, 150, 150)"
Count,101 Tasks,100 Chunks
Type,float32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,11.52 kiB,11.52 kiB
Shape,"(737, 2)","(737, 2)"
Count,2 Tasks,1 Chunks
Type,datetime64[ns],numpy.ndarray
"Array Chunk Bytes 11.52 kiB 11.52 kiB Shape (737, 2) (737, 2) Count 2 Tasks 1 Chunks Type datetime64[ns] numpy.ndarray",2  737,

Unnamed: 0,Array,Chunk
Bytes,11.52 kiB,11.52 kiB
Shape,"(737, 2)","(737, 2)"
Count,2 Tasks,1 Chunks
Type,datetime64[ns],numpy.ndarray


***
# Resources
**Data**
- AWS [ERA-5 (ECMWF)](https://registry.opendata.aws/ecmwf-era5/) reanalysis data.
This page also has links to other tutorials that use other libraries.
- [List of data available](https://github.com/planet-os/notebooks/blob/master/aws/era5-pds.md) on ERA5 and details on how the files are organized.
- Google Earth Engine ERA-5 data. [[Monthly]](https://developers.google.com/earth-engine/datasets/catalog/ECMWF_ERA5_MONTHLY#bands) [[Daily]](https://developers.google.com/earth-engine/datasets/catalog/ECMWF_ERA5_DAILY).

**More on the libraries:**
- [xarray apply](https://www.programcreek.com/python/example/123575/xarray.apply_ufunc) Examples on how to apply a function to an xarray structure
- [sckit-learn (sklearn)](https://scikit-learn.org/stable/) a library for machine learning functions
- [statsmodels](https://www.statsmodels.org/stable/user-guide.html) a library to calculalte statistical models.


