In [1]:
import datetime
import json
import warnings

import certifi
import urllib3
from urllib.parse import urlencode

from pprint import pprint

In [2]:
# -----------------------------------------------------------------------------
# class CmrProcess
#
# @author: Caleb Spradlin, caleb.s.spradlin@nasa.gov
# @version: 12.30.2021
#
# https://cmr.earthdata.nasa.gov/search/
# https://cmr.earthdata.nasa.gov/search/site/docs/search/api.html
# -----------------------------------------------------------------------------
class CmrProcess(object):

    CMR_BASE_URL = 'https://cmr.earthdata.nasa.gov' +\
        '/search/granules.umm_json_v1_4?'

    # Range for valid lon/lat
    LATITUDE_RANGE = (-90, 90)
    LONGITUDE_RANGE = (-180, 180)

    # -------------------------------------------------------------------------
    # __init__
    # -------------------------------------------------------------------------
    def __init__(self,
                 mission,
                 dateTime,
                 lonLat=None,
                 error=False,
                 dayNightFlag='',
                 pageSize=150,
                 maxPages=50):

        self._error = error
        self._dateTime = dateTime
        self._mission = mission
        self._pageSize = pageSize
        self._maxPages = maxPages
        
        self._lonLat = lonLat
        self._dayNightFlag = dayNightFlag

    # -------------------------------------------------------------------------
    # run()
    #
    # Given a set of parameters on init (time, location, mission), search for
    # the most relevant file. This uses CMR to search metadata for
    # relevant matches.
    # -------------------------------------------------------------------------
    def run(self):
        print('Starting query')
        outout = set()
        for i in range(self._maxPages):
            
            d, e = self._cmrQuery(pageNum=i+1)
            
            if e and i > 1:
                return sorted(list(outout))
            
            if not e:
                print('Results found on page: {}'.format(i+1))
                out = [r['file_url'] for r in d.values()]
                outout.update(out)
                
        outout = sorted(list(outout))
        return outout
        

    # -------------------------------------------------------------------------
    # cmrQuery()
    #
    # Search the Common Metadata Repository(CMR) for a file that
    # is a temporal and spatial match.
    # -------------------------------------------------------------------------
    def _cmrQuery(self, pageNum=1):

        requestDictionary = self._buildRequest(pageNum=pageNum)
        totalHits, resultDictionary = self._sendRequest(requestDictionary)

        if self._error:
            return None, self._error

        if totalHits <= 0:
            print('No hits on page number: {}, ending search.'.format(pageNum))
            #warnings.warn(msg)
            return None, True

        resultDictionaryProcessed = self._processRequest(resultDictionary)
        return resultDictionaryProcessed, self._error

    # -------------------------------------------------------------------------
    # buildRequest()
    #
    # Build a dictionary based off of parameters given on init.
    # This dictionary will be used to encode the http request to search
    # CMR.
    # -------------------------------------------------------------------------
    def _buildRequest(self, pageNum=1):
        requestDict = dict()
        requestDict['page_num'] = pageNum
        requestDict['page_size'] = self._pageSize
        requestDict['concept_id'] = self._mission
        requestDict['bounding_box'] = self._lonLat
        requestDict['day_night_flag'] = self._dayNightFlag
        requestDict['temporal'] = self._dateTime
        return requestDict

    # -------------------------------------------------------------------------
    # _sendRequest
    #
    # Send an http request to the CMR server.
    # Decode data and count number of hits from request.
    # -------------------------------------------------------------------------
    def _sendRequest(self, requestDictionary):
        with urllib3.PoolManager(cert_reqs='CERT_REQUIRED',
                                 ca_certs=certifi.where()) as httpPoolManager:
            encodedParameters = urlencode(requestDictionary, doseq=True)
            requestUrl = self.CMR_BASE_URL + encodedParameters
            try:
                requestResultPackage = httpPoolManager.request('GET',
                                                               requestUrl)
            except urllib3.exceptions.MaxRetryError:
                self._error = True
                return 0, None

            requestResultData = json.loads(
                requestResultPackage.data.decode('utf-8'))
            status = int(requestResultPackage.status)

            if not status == 400:
                totalHits = len(requestResultData['items'])
                return totalHits, requestResultData

            else:
                msg = 'CMR Query: Client or server error: ' + \
                    'Status: {}, Request URL: {}, Params: {}'.format(
                        str(status), requestUrl, encodedParameters)
                warnings.warn(msg)
                return 0, None

    # -------------------------------------------------------------------------
    # _processRequest
    #
    # For each result in the CMR query, unpackage relevant information to
    # a dictionary. While doing so set flags if data is not desirable (too
    # close to edge of dataset).
    #
    #  REVIEW: Make the hard-coded names class constants? There are a lot...
    # -------------------------------------------------------------------------
    def _processRequest(self, resultDict):

        resultDictProcessed = dict()

        for hit in resultDict['items']:

            fileName = hit['umm']['RelatedUrls'][0]['URL'].split(
                '/')[-1]

            # ---
            # These are hardcoded here because the only time these names will
            # ever change is if we changed which format of metadata we wanted
            # the CMR results back in.
            #
            # These could be placed as class constants in the future.
            # ---
            fileUrl = hit['umm']['RelatedUrls'][0]['URL']
            temporalRange = hit['umm']['TemporalExtent']['RangeDateTime']
            dayNight = hit['umm']['DataGranule']['DayNightFlag']

 
            spatialExtent = hit['umm']['SpatialExten' +
                                          't']['HorizontalSpatialDom' +
                                               'ain']

            key = fileName

            resultDictProcessed[key] = {
                'file_name': fileName,
                'file_url': fileUrl,
                'temporal_range': temporalRange,
                'spatial_extent': spatialExtent,
                'day_night_flag': dayNight}

        return resultDictProcessed

#### Boreal NA

In [62]:
search_dict_borealna = {
    'site_name': 'Boreal North America',
    'bbox': [-165,50,-45,71], 
    'minmonth': "06",
    'maxmonth': "09",
    'years_list': [2018,2019,2020,2021,2022]
}

#### Senegal

In [61]:
search_dict_senegal = {
    'site_name': 'Senegal',
    'bbox': [-18,12,-11,17], 
    'minmonth': "01",
    'maxmonth': "12",
    'years_list': [2018,2019,2020,2021,2022]
}

In [3]:
search_dict_howland = {
    'site_name': 'Howland',
    'bbox': [-69,44,-68,46], 
    'minmonth': "06",
    'maxmonth': "09",
    'years_list': [2018,2019,2020,2021,2022]
}

In [13]:
search_dict_serc = {
    'site_name': 'SERC',
    'bbox': [-76.6,38.8,-76.5,38.9], 
    'minmonth': "06",
    'maxmonth': "09",
    'years_list': [2018,2019,2020,2021,2022]
}

#### Bhasan Char

In [44]:
search_dict_bhasan = {
    'site_name': 'Bhasan Char',
    'bbox': [91.36,22.35,91.43,22.392], 
    'minmonth': "01",
    'maxmonth': "12",
    'years_list': [2018,2019,2020,2021,2022]
}

'91.36,22.35,91.43,22.392'

In [14]:
# Choose a site
search_dict = search_dict_serc

#### Build search: seasonal search across list of years for ATL08 in a bbox

In [15]:
# Find this at https://search.earthdata.nasa.gov/
COLLECTID_ATL08_V3 = "C2003772626-NSIDC_ECS"
COLLECTID_ATL08_V5 = "C2144424132-NSIDC_ECS"
# Page size: 150, number of results returned by page.
PAGESIZE = 150 
# Max page, number of pages to return before ending query.
MAXPAGE = 60
# Total max results will be PAGESIZE * MAXPAG

cmrP_list = []
for YEAR in search_dict['years_list']:
    cmrP = CmrProcess(mission = COLLECTID_ATL08_V5, 
                      dateTime=f"{YEAR}-{search_dict['minmonth']}-01T00:00:00Z,{YEAR}-{search_dict['maxmonth']}-30T23:59:59Z", 
                      lonLat = ','.join(str(e) for e in search_dict['bbox']),
                      pageSize=PAGESIZE,
                      maxPages=MAXPAGE)
    cmrP_list.append(cmrP)

#### Run search

In [16]:
resultList = [cmrP.run() for cmrP in cmrP_list]

Starting query
No hits on page number: 1, ending search.
No hits on page number: 2, ending search.
No hits on page number: 3, ending search.
Starting query
Results found on page: 1
No hits on page number: 2, ending search.
No hits on page number: 3, ending search.
Starting query
Results found on page: 1
No hits on page number: 2, ending search.
No hits on page number: 3, ending search.
Starting query
Results found on page: 1
No hits on page number: 2, ending search.
No hits on page number: 3, ending search.
Starting query
No hits on page number: 1, ending search.
No hits on page number: 2, ending search.
No hits on page number: 3, ending search.


In [17]:
len(resultList[0])
atl08_granule_list = [item for sublist in resultList for item in sublist]
print(f"{len(atl08_granule_list)} granules in search results list")

# Get list of just granule names without paths
[g.split('/')[-1] for g in atl08_granule_list]

15 granules in search results list


['ATL08_20190601214044_09870306_005_01.h5',
 'ATL08_20190804062838_05680402_005_01.h5',
 'ATL08_20190831172034_09870406_005_01.h5',
 'ATL08_20190929155639_00420506_005_01.h5',
 'ATL08_20200628025559_00420806_005_01.h5',
 'ATL08_20200703143148_01260802_005_01.h5',
 'ATL08_20200801130752_05680802_005_01.h5',
 'ATL08_20200828235944_09870806_005_01.h5',
 'ATL08_20200926223548_00420906_005_01.h5',
 'ATL08_20210626093521_00421206_005_01.h5',
 'ATL08_20210701211112_01261202_005_01.h5',
 'ATL08_20210730194718_05681202_005_01.h5',
 'ATL08_20210827063913_09871206_005_01.h5',
 'ATL08_20210925051519_00421306_005_01.h5',
 'ATL08_20210930165108_01261302_005_01.h5']

#### Here is a dataframe of the granule paths

In [12]:
import pandas as pd

atl08_h5_fn_df = pd.DataFrame(atl08_granule_list, columns = ['path'])
atl08_h5_fn_df['site_name'] = search_dict['site_name']

atl08_h5_fn_df


Unnamed: 0,path,site_name
0,https://n5eil01u.ecs.nsidc.org/DP7/ATLAS/ATL08...,Howland
1,https://n5eil01u.ecs.nsidc.org/DP7/ATLAS/ATL08...,Howland
2,https://n5eil01u.ecs.nsidc.org/DP7/ATLAS/ATL08...,Howland
3,https://n5eil01u.ecs.nsidc.org/DP7/ATLAS/ATL08...,Howland
4,https://n5eil01u.ecs.nsidc.org/DP7/ATLAS/ATL08...,Howland
5,https://n5eil01u.ecs.nsidc.org/DP7/ATLAS/ATL08...,Howland
6,https://n5eil01u.ecs.nsidc.org/DP7/ATLAS/ATL08...,Howland
7,https://n5eil01u.ecs.nsidc.org/DP7/ATLAS/ATL08...,Howland
8,https://n5eil01u.ecs.nsidc.org/DP7/ATLAS/ATL08...,Howland
9,https://n5eil01u.ecs.nsidc.org/DP7/ATLAS/ATL08...,Howland


Notes:
1. Can we get these as s3 paths? ---> probably not yet on AWS
2. Should we download to our AWS s3 buckets and store? or just transfer from ADAPT?