# Examine MERRA2 on s3

Paul Montesano  
July 2024

**Goal:**  
Access MERRA2 data on s3. Find the same data used for ABoVE TTE sibbork runs

**Motivation:**  
Provide a starting point for MERRA2 use for various studies

**Resources:**  
- https://lb.gesdisc.eosdis.nasa.gov/meditor/notebookviewer/?notebookUrl=https%3A%2F%2Fgithub.com%2Fnasa%2Fgesdisc-tutorials%2Fblob%2Fmain%2Fcloud-tutorials%2Fnotebooks%2FHow_to_Directly_Access_MERRA-2_Data_from_an_S3_Bucket.ipynb  
- https://disc.gsfc.nasa.gov/datasets/M2T1NXFLX_5.12.4/summary?keywords=%22merra-2%22  


In [36]:
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import geopandas as gpd

import cartopy.crs as ccrs
import cartopy.feature as cfeature
from shapely.geometry import Point

import s3fs
import xarray as xr

In [86]:
def subset_merra2_ds(ds, EXTENT = (0, 180, 55, 75)):
    
    min_lon, max_lon, min_lat, max_lat = EXTENT
    subset_ds = ds.sel(lat=slice(min_lat,max_lat), lon=slice(min_lon,max_lon))
    
    return subset_ds
    
def process_merra2_ds_s3(s3_fn, s3_session):

    ds = xr.open_dataset(s3_session.open(s3_fn))

    # [1] Subset to extent
    ds = subset_merra2_ds(ds)
   
    return ds

### s3 access to GESDISC (Goddard Earth Sciences Data and Information Services Center)

**click link and check `I agree`**

In [14]:
s3_cred_endpoint = {
    'podaac':'https://archive.podaac.earthdata.nasa.gov/s3credentials',
    'gesdisc': 'https://data.gesdisc.earthdata.nasa.gov/s3credentials',
    'lpdaac':'https://data.lpdaac.earthdatacloud.nasa.gov/s3credentials',
    'ornldaac': 'https://data.ornldaac.earthdata.nasa.gov/s3credentials',
    'ghrcdaac': 'https://data.ghrc.earthdata.nasa.gov/s3credentials'
}
s3_cred_endpoint

{'podaac': 'https://archive.podaac.earthdata.nasa.gov/s3credentials',
 'gesdisc': 'https://data.gesdisc.earthdata.nasa.gov/s3credentials',
 'lpdaac': 'https://data.lpdaac.earthdatacloud.nasa.gov/s3credentials',
 'ornldaac': 'https://data.ornldaac.earthdata.nasa.gov/s3credentials',
 'ghrcdaac': 'https://data.ghrc.earthdata.nasa.gov/s3credentials'}

In [74]:
# Define Function 
def get_temp_creds(provider):
    return requests.get(s3_cred_endpoint[provider]).json()

In [75]:
# Get Credentials
temp_creds_req = get_temp_creds('gesdisc')

# Pass Authentication to s3fs
s3_gesdisc = s3fs.S3FileSystem(anon=False, key=temp_creds_req['accessKeyId'], secret=temp_creds_req['secretAccessKey'], token=temp_creds_req['sessionToken'])

In [76]:
s3_fn_list = s3_gesdisc.glob('s3://gesdisc-cumulus-prod-protected/MERRA2/M2T1NXFLX.5.12.4/1980/01/*') #M2T1NXFLX
s3_fn_list[0:3]

['gesdisc-cumulus-prod-protected/MERRA2/M2T1NXFLX.5.12.4/1980/01/MERRA2_100.tavg1_2d_flx_Nx.19800101.nc4',
 'gesdisc-cumulus-prod-protected/MERRA2/M2T1NXFLX.5.12.4/1980/01/MERRA2_100.tavg1_2d_flx_Nx.19800102.nc4',
 'gesdisc-cumulus-prod-protected/MERRA2/M2T1NXFLX.5.12.4/1980/01/MERRA2_100.tavg1_2d_flx_Nx.19800103.nc4']

works

In [87]:
ds_list = []
for s3_fn in s3_fn_list[0:5]:
    #with s3_gesdisc.open(s3_fn) as fileObj:
    ds_list.append(process_merra2_ds_s3(s3_fn, s3_gesdisc)) 

In [88]:
ds = xr.concat(ds_list, dim='day')

In [68]:
ds

In [48]:
# Iterate through remote_files to create a fileset
fileset = []
for file in s3_fn:
    fileset.append(with s3.open(file) as fileObj) 

# This works
ds = xr.open_mfdataset(fileset, concat_dim='time', combine='nested')

SyntaxError: invalid syntax (1928305932.py, line 4)

In [None]:
ds = xr.open_mfdataset(s3_fn_list, concat_dim='time', combine='nested')

In [None]:
create_sibbork_files_from_MERRA2(lat, lon, datadir='/css/merra2/MERRA2_all', years=np.r_[args.start_year:args.end_year], fields=['PRECTOTCORR', 'TLML'], reduce_how=['sum', 'mean'], filename=filename)

In [34]:
def convert_to_cftime(date_str):
    import cftime
    # Parse the string assuming it is in the format 'YYYY-MM'
    year, month = map(int, date_str.split('-'))
    # Create a CFTimeIndex object
    cftime_date = cftime.DatetimeGregorian(year, month, 1)
    return cftime_date

In [39]:
def plot_merra2(ds, VAR, FOCAL_LAT, FOCAL_LON, YR_RANGE=('1980','2024'), DIM_NAME='source_id', HIGHLIGHT_DATE=None):
    
    import matplotlib.dates as mdates
    fig, ax = plt.subplots(1,1, figsize=(15,2))
    ax.grid(True)

    y1, y2 = YR_RANGE

    # Specify the month and year for the vertical line
    specific_date = convert_to_cftime(HIGHLIGHT_DATE)

    # Add the vertical dotted line
    ax.axvline(x=specific_date, color='r', linestyle='--', label=f'Map date: {specific_date.strftime("%Y-%m")}')

    # Line plot of all models (bands) for a single lat lon
    ds[VAR].sel(y=FOCAL_LAT, x=FOCAL_LON, method='nearest').sel(time=slice(y1,y2)).plot.line(ax=ax, x="time", color='orange', alpha=0.9)

    # Formatting the plot
    #ax.xaxis.set_major_locator(mdates.YearLocator(1))  # Major ticks every 10 years
    ax.xaxis.set_minor_locator(mdates.YearLocator(1))  # Minor ticks every year
    #ax.xaxis.set_major_locator(plt.MaxNLocator(20))  # Control the number of grid lines
    #ax.tick_params(axis='x', rotation=45)
    ax.set_title(f"Daily {VAR} ({y1}-{y2}) from MERRA2 at {FOCAL_LAT}, {FOCAL_LON}")
    ax.set_xlabel('Date')
    #ax.set_ylabel('Temperature (°K)')
    ax.legend()
    
    plt.show()

In [None]:
def create_sibbork_files_from_MERRA2(lat, lon, datadir='/css/merra2/MERRA2_all', years=np.r_[1980:1990], fields=['PRECTOTCORR', 'TLML'], reduce_how=['sum', 'mean'], filename=None):
    import matplotlib as mpl
    if filename is None:
        filename='DPTarr_MERRA2_' + coordinate_to_string(lat, lon,factor=1) + '.npy'   
    def _get_year(y):
      input_dir=os.path.join(datadir, str(y))
      out_file=filename[:-4]+'_'+str(y)+'.nc'
      ds=get_xr_dataset(input_dir, pattern='Y'+str(y)+'/M??/MERRA2.tavg1_2d_flx_Nx.*.nc4', multiple_nc=True).sel(lon=lon,lat=lat, method='nearest')
      get_daily_values(ds, fields, reduce_how, export_nc=True, out_pth='.', out_file=out_file)
      return None
        
    # except:
    for y in tqdm(years):
      if os.path.isfile(filename[:-4]+'_'+str(y)+'.nc'):
        print('Skipping generation of existing file: {}'.format(filename[:-4]+'_'+str(y)+'.nc'))
      else:
        ds=get_xr_dataset(datadir, pattern='Y'+str(y)+'/M??/MERRA2.tavg1_2d_flx_Nx.*.nc4', multiple_nc=True).sel(lon=lon,lat=lat, method='nearest')
        get_daily_values(ds, fields, reduce_how, export_nc=True, out_pth='.', out_file=filename[:-4]+'_'+str(y)+'.nc')
    
    tt=get_xr_dataset('.', multiple_nc=True, pattern=filename[:-4]+"*.nc")

    DPTarr=np.zeros((tt.dims['time'], 3))
    for k,(d,p,t) in enumerate(zip(tt['time'].values,tt['PRECTOTCORR'].values, tt['TLML'].values)):
        DPTarr[k, 0]=dates.date2num(datetime.datetime.utcfromtimestamp(d.astype('O')/1e9)) #https://stackoverflow.com/questions/29753060/how-to-convert-numpy-datetime64-into-datetime#29755657
        DPTarr[k, 1]=p*86400/10 # MERRA units in kg/m^2/s also  km/m^2 = mm (e.g. conver to daily rain div by 10 for cm)
        DPTarr[k, 2]=t-273.15   #convert to C
    np.save(filename, DPTarr)    

In [None]:
def get_xr_dataset(datadir, fname=None, multiple_nc=False, pattern='MERRA2/Y????/M??/MERRA2.tavg1_2d_flx_Nx.*.nc4'):
    """
    Returns a "cleaned" xarray dataset for LIS data

    :param datadir: path to data ex. '/Users/lsetiawan/Downloads/200101/' or r'C:\work\datadrive\LIS\'
    :param fname: file name if using to open only one netCDF file
    :param multiple_nc: True if using to read multiple netCDF Files
    :return: xarray dataset
    """
    if multiple_nc is False:
        try:
            ds = xr.open_dataset(os.path.join(datadir, fname))
        except:
            print("Please provide filename!")
            sys.exit("Exiting...")
    else:
        file_list=np.sort(glob.glob(os.path.join(datadir,pattern)))
        # Added tp get rid of some garbage files that match the pattern and rarely appear
        file_list = [i for i in file_list if not ('ave' in i or 'diurnal' in i or 'monthly' in i)]
        #file_list=np.sort(reduce(lambda x, y: x+y, list(map(lambda r,_,x:[r+'/'+f for f in [f for f in x if fnmatch.fnmatch(f,pattern)]], os.walk(datadir)))))
	#above line stopped working after upgrade to py3.7
        #ds = xr.open_mfdataset(os.path.join(datadir, pattern))
        ds = xr.open_mfdataset(file_list, concat_dim='time', combine='nested') # decode_cf=False may be faster but returns errors

    return ds


def get_monthly_avg(ds, des_vars, export_nc=False, out_pth=None):
    new_ds = None
    for idx, var in enumerate(des_vars):
        with ProgressBar():
            da = ds[var].resample('MS', 'time', how = 'sum')
        if idx == 0:
            new_ds = da.to_dataset()
        else:
            new_ds[var] = da

    if export_nc:
        try:
            new_ds.to_netcdf(os.path.join(out_pth, 'LISMonthly.nc'))
        except IOError:
            print('Folder not found.')

    return new_ds

def get_daily_values(ds, des_vars, reduce_how, export_nc=False, out_pth=None, out_file='LISMonthly.nc' ):
    new_ds = None
    for idx, var in enumerate(des_vars):
        #da = ds[var].resample('D', 'time', how=reduce_how[idx])
        if 'mean' in reduce_how[idx].lower():
            da=ds[var].resample(time='1D').mean() #20191120 Batu. Above line is apparently  deprecated. 
        elif 'sum' in reduce_how[idx].lower():
            da=ds[var].resample(time='1D').sum()
        if idx == 0:
            new_ds = da.to_dataset()
        else:
            new_ds[var] = da

    if export_nc:
        try:
            new_ds.to_netcdf(os.path.join(out_pth, out_file)) #Adding load() to speed up did not help  https://github.com/pydata/xarray/issues/2912
        except IOError:
            print('Folder not found.')

    return new_ds