# Search EarthData's Common Metadata Repository by a bbox to get local ADAPT paths of ATL08 granules 
To find the ATL08 data you need to run through extraction and filtering, use the list returned from this noteboook 

Paul Montesano, Caleb Spadlin  
September 2023

### To run `do_extract_atl08_v005.py` like this:
```[pmontesa@adaptlogin101 ~]$ pdsh -g forest do_extract_filter_atl08.sh \"2018 2019 2020 2021 2022 2023\" /explore/nobackup/people/pmontesa/userfs02/data/icesat2/atl08.006/senegal_20m/list_atl08.006_senegal senegal_20m /explore/nobackup/people/pmontesa/userfs02/data/icesat2/atl08.006```


In [228]:
import datetime
import json
import warnings

import certifi
import urllib3
from urllib.parse import urlencode

from pprint import pprint

import pandas as pd
import numpy as np

import sys
sys.path.append('/home/pmontesa/code/geoscitools')
import atl08lib
import maplib
import pandas as pd
import geopandas as gpd
import csv


import os
os.environ['USE_PYGEOS'] = '0'
import geopandas

In a future release, GeoPandas will switch to using Shapely by default. If you are using PyGEOS directly (calling PyGEOS functions on geometries from GeoPandas), this will then stop working and you are encouraged to migrate from PyGEOS to Shapely 2.0 (https://shapely.readthedocs.io/en/latest/migration_pygeos.html).
  import geopandas as gpd


In [2]:
# -----------------------------------------------------------------------------
# class CmrProcess
#
# @author: Caleb Spradlin, caleb.s.spradlin@nasa.gov
# @version: 12.30.2021
#
# https://cmr.earthdata.nasa.gov/search/
# https://cmr.earthdata.nasa.gov/search/site/docs/search/api.html
# -----------------------------------------------------------------------------
class CmrProcess(object):

    CMR_BASE_URL = 'https://cmr.earthdata.nasa.gov' +\
        '/search/granules.umm_json_v1_4?'

    # Range for valid lon/lat
    LATITUDE_RANGE = (-90, 90)
    LONGITUDE_RANGE = (-180, 180)

    # -------------------------------------------------------------------------
    # __init__
    # -------------------------------------------------------------------------
    def __init__(self,
                 mission,
                 dateTime,
                 lonLat=None,
                 error=False,
                 dayNightFlag='',
                 pageSize=150,
                 maxPages=50):

        self._error = error
        self._dateTime = dateTime
        self._mission = mission
        self._pageSize = pageSize
        self._maxPages = maxPages
        
        self._lonLat = lonLat
        self._dayNightFlag = dayNightFlag

    # -------------------------------------------------------------------------
    # run()
    #
    # Given a set of parameters on init (time, location, mission), search for
    # the most relevant file. This uses CMR to search metadata for
    # relevant matches.
    # -------------------------------------------------------------------------
    def run(self):
        print('Starting query')
        outout = set()
        for i in range(self._maxPages):
            
            d, e = self._cmrQuery(pageNum=i+1)
            
            if e and i > 1:
                return sorted(list(outout))
            
            if not e:
                print('Results found on page: {}'.format(i+1))
                out = [r['file_url'] for r in d.values()]
                outout.update(out)
                
        outout = sorted(list(outout))
        return outout
        

    # -------------------------------------------------------------------------
    # cmrQuery()
    #
    # Search the Common Metadata Repository(CMR) for a file that
    # is a temporal and spatial match.
    # -------------------------------------------------------------------------
    def _cmrQuery(self, pageNum=1):

        requestDictionary = self._buildRequest(pageNum=pageNum)
        totalHits, resultDictionary = self._sendRequest(requestDictionary)

        if self._error:
            return None, self._error

        if totalHits <= 0:
            print('No hits on page number: {}, ending search.'.format(pageNum))
            #warnings.warn(msg)
            return None, True

        resultDictionaryProcessed = self._processRequest(resultDictionary)
        return resultDictionaryProcessed, self._error

    # -------------------------------------------------------------------------
    # buildRequest()
    #
    # Build a dictionary based off of parameters given on init.
    # This dictionary will be used to encode the http request to search
    # CMR.
    # -------------------------------------------------------------------------
    def _buildRequest(self, pageNum=1):
        requestDict = dict()
        requestDict['page_num'] = pageNum
        requestDict['page_size'] = self._pageSize
        requestDict['concept_id'] = self._mission
        requestDict['bounding_box'] = self._lonLat
        requestDict['day_night_flag'] = self._dayNightFlag
        requestDict['temporal'] = self._dateTime
        return requestDict

    # -------------------------------------------------------------------------
    # _sendRequest
    #
    # Send an http request to the CMR server.
    # Decode data and count number of hits from request.
    # -------------------------------------------------------------------------
    def _sendRequest(self, requestDictionary):
        with urllib3.PoolManager(cert_reqs='CERT_REQUIRED',
                                 ca_certs=certifi.where()) as httpPoolManager:
            encodedParameters = urlencode(requestDictionary, doseq=True)
            requestUrl = self.CMR_BASE_URL + encodedParameters
            try:
                requestResultPackage = httpPoolManager.request('GET',
                                                               requestUrl)
            except urllib3.exceptions.MaxRetryError:
                self._error = True
                return 0, None

            requestResultData = json.loads(
                requestResultPackage.data.decode('utf-8'))
            status = int(requestResultPackage.status)

            if not status == 400:
                totalHits = len(requestResultData['items'])
                return totalHits, requestResultData

            else:
                msg = 'CMR Query: Client or server error: ' + \
                    'Status: {}, Request URL: {}, Params: {}'.format(
                        str(status), requestUrl, encodedParameters)
                warnings.warn(msg)
                return 0, None

    # -------------------------------------------------------------------------
    # _processRequest
    #
    # For each result in the CMR query, unpackage relevant information to
    # a dictionary. While doing so set flags if data is not desirable (too
    # close to edge of dataset).
    #
    #  REVIEW: Make the hard-coded names class constants? There are a lot...
    # -------------------------------------------------------------------------
    def _processRequest(self, resultDict):

        resultDictProcessed = dict()

        for hit in resultDict['items']:

            fileName = hit['umm']['RelatedUrls'][0]['URL'].split(
                '/')[-1]

            # ---
            # These are hardcoded here because the only time these names will
            # ever change is if we changed which format of metadata we wanted
            # the CMR results back in.
            #
            # These could be placed as class constants in the future.
            # ---
            fileUrl = hit['umm']['RelatedUrls'][0]['URL']
            temporalRange = hit['umm']['TemporalExtent']['RangeDateTime']
            dayNight = hit['umm']['DataGranule']['DayNightFlag']

 
            spatialExtent = hit['umm']['SpatialExten' +
                                          't']['HorizontalSpatialDom' +
                                               'ain']

            key = fileName

            resultDictProcessed[key] = {
                'file_name': fileName,
                'file_url': fileUrl,
                'temporal_range': temporalRange,
                'spatial_extent': spatialExtent,
                'day_night_flag': dayNight}

        return resultDictProcessed

#### Boreal NA

In [3]:
search_dict_borealna = {
    'site_name': 'Boreal North America',
    'bbox': [-165,50,-45,71], 
    'minmonth': "06",
    'maxmonth': "09",
    'years_list': [2018,2019,2020,2021,2022]
}

#### Senegal

In [180]:
search_dict_senegal = {
    'site_name': 'Senegal',
    'bbox': [-18,12,-11,17], 
    'minmonth': "01",
    'maxmonth': "12",
    'years_list': [2018,2019,2020,2021,2022,2023,2024]
}

In [181]:
search_dict_howland = {
    'site_name': 'Howland',
    'bbox': [-69,44,-68,46], 
    'minmonth': "06",
    'maxmonth': "09",
    'years_list': [2018,2019,2020,2021,2022]
}

In [182]:
search_dict_serc = {
    'site_name': 'SERC',
    'bbox': [-76.6,38.8,-76.5,38.9], 
    'minmonth': "06",
    'maxmonth': "09",
    'years_list': [2018,2019,2020,2021,2022]
}

#### Bhasan Char

In [183]:
search_dict_bhasan = {
    'site_name': 'Bhasan Char',
    'bbox': [91.36,22.35,91.43,22.392], 
    'minmonth': "01",
    'maxmonth': "12",
    'years_list': [2018,2019,2020,2021,2022]
}

## Choose a site and create subdir

In [185]:
# This provides the ATLAS data product and version - also used for subdir in ADAPT/EXPLORE
ATL08_VERSION = 'ATL08.006'

In [186]:
# Choose a site
search_dict = search_dict_senegal

# Choose a data output subdir
SUBDIR_NAME = 'senegal_20m'
OUTPUT_DIR = f'/explore/nobackup/people/pmontesa/userfs02/data/icesat2/{ATL08_VERSION.lower()}/{SUBDIR_NAME}'
!mkdir -p $OUTPUT_DIR
OUTPUT_DIR

'/explore/nobackup/people/pmontesa/userfs02/data/icesat2/atl08.006/senegal_20m'

#### Build search: seasonal search across list of years for ATL08 in a bbox

In [187]:
# ADAPT/EXPLORE main dir for all ATLAS data
DIR_ATLAS = '/css/icesat-2/ATLAS'

In [188]:
# Format the ATL08 data subdir needed to make ADAPT/EXPLORE specific granule path lists for 'do_extract_atl08' scripts
PATH_ATL08 = os.path.join(DIR_ATLAS, ATL08_VERSION)
PATH_ATL08

'/css/icesat-2/ATLAS/ATL08.006'

In [189]:
# Find this at https://search.earthdata.nasa.gov/
COLLECTIONCONCEPTID_DICT = {
                        'ATL08.003': "C2003772626-NSIDC_ECS",
                        'ATL08.005': "C2144424132-NSIDC_ECS",
                        'ATL08.006': "C2565090645-NSIDC_ECS",
}

In [190]:
# Page size: 150, number of results returned by page.
PAGESIZE = 150 
# Max page, number of pages to return before ending query.
MAXPAGE = 60
# Total max results will be PAGESIZE * MAXPAG

cmrP_list = []
for YEAR in search_dict['years_list']:
    
    cmrP = CmrProcess(mission = COLLECTIONCONCEPTID_DICT[ATL08_VERSION], 
                      dateTime=f"{YEAR}-{search_dict['minmonth']}-01T00:00:00Z,{YEAR}-{search_dict['maxmonth']}-30T23:59:59Z", 
                      lonLat = ','.join(str(e) for e in search_dict['bbox']),
                      pageSize=PAGESIZE,
                      maxPages=MAXPAGE)
    cmrP_list.append(cmrP)

#### Run search

In [191]:
resultList_year = [cmrP.run() for cmrP in cmrP_list]

Starting query
Results found on page: 1
No hits on page number: 2, ending search.
No hits on page number: 3, ending search.
Starting query
Results found on page: 1
Results found on page: 2
No hits on page number: 3, ending search.
Starting query
Results found on page: 1
Results found on page: 2
No hits on page number: 3, ending search.
Starting query
Results found on page: 1
Results found on page: 2
No hits on page number: 3, ending search.
Starting query
Results found on page: 1
Results found on page: 2
No hits on page number: 3, ending search.
Starting query
Results found on page: 1
No hits on page number: 2, ending search.
No hits on page number: 3, ending search.
Starting query
No hits on page number: 1, ending search.
No hits on page number: 2, ending search.
No hits on page number: 3, ending search.


In [218]:
dict_results = dict(zip(search_dict['years_list'], resultList_year))

In [227]:
print(f'# {ATL08_VERSION} granules by year:')
[print(f'{YEAR}: {len(dict_results[YEAR])}') for YEAR in search_dict['years_list']]

# Get one large list with all years
atl08_granule_list = [item for sublist in resultList_year for item in sublist]
print(f"\n{len(atl08_granule_list)} total granules in search results list")

print('\nNote: # of filtered ATL08 CSV files will be a subset of this total.')

# ATL08.006 granules by year:
2018: 38
2019: 206
2020: 206
2021: 197
2022: 199
2023: 98
2024: 0

944 total granules in search results list

Note: # of filtered ATL08 CSV files will be a subset of this total.


In [220]:
if False:
    # Get list of just granule names without paths
    [g.split('/')[-1] for g in atl08_granule_list]

#### Here is a dataframe of the granule paths

In [221]:
atl08_h5_fn_df = pd.DataFrame(atl08_granule_list, columns = ['path_cmr'])
atl08_h5_fn_df['site_name'] = search_dict['site_name']
atl08_h5_fn_df.head()

Unnamed: 0,path_cmr,site_name
0,https://n5eil01u.ecs.nsidc.org/DP5/ATLAS/ATL08...,Senegal
1,https://n5eil01u.ecs.nsidc.org/DP5/ATLAS/ATL08...,Senegal
2,https://n5eil01u.ecs.nsidc.org/DP5/ATLAS/ATL08...,Senegal
3,https://n5eil01u.ecs.nsidc.org/DP5/ATLAS/ATL08...,Senegal
4,https://n5eil01u.ecs.nsidc.org/DP5/ATLAS/ATL08...,Senegal


### Get full local paths of the ATL08 data for granule lists needed for `do_extract_atl08`

In [222]:
atl08_h5_fn_df["path_local"] = PATH_ATL08
atl08_h5_fn_df["file"] = atl08_h5_fn_df["path_cmr"].apply(lambda x: os.path.basename(x))
atl08_h5_fn_df["date_subdir"] = pd.to_datetime(atl08_h5_fn_df["file"].str.split('_', expand=True)[1].str[:8] , format="%Y%m%d").dt.strftime('%Y.%m.%d')
atl08_h5_fn_df["year"] = atl08_h5_fn_df["file"].str.split('_', expand=True)[1].str[:4]
atl08_h5_fn_df["path_local"] = atl08_h5_fn_df[['path_local', 'date_subdir', 'file']].apply(lambda row: os.path.join(*row), axis=1)

atl08_h5_fn_df.path_local.to_list()[0]

'/css/icesat-2/ATLAS/ATL08.006/2018.10.16/ATL08_20181016043523_02680107_006_02.h5'

In [223]:
atl08_h5_fn_df.head()

Unnamed: 0,path_cmr,site_name,path_local,file,date_subdir,year
0,https://n5eil01u.ecs.nsidc.org/DP5/ATLAS/ATL08...,Senegal,/css/icesat-2/ATLAS/ATL08.006/2018.10.16/ATL08...,ATL08_20181016043523_02680107_006_02.h5,2018.10.16,2018
1,https://n5eil01u.ecs.nsidc.org/DP5/ATLAS/ATL08...,Senegal,/css/icesat-2/ATLAS/ATL08.006/2018.10.16/ATL08...,ATL08_20181016162940_02760101_006_02.h5,2018.10.16,2018
2,https://n5eil01u.ecs.nsidc.org/DP5/ATLAS/ATL08...,Senegal,/css/icesat-2/ATLAS/ATL08.006/2018.10.17/ATL08...,ATL08_20181017040943_02830107_006_02.h5,2018.10.17,2018
3,https://n5eil01u.ecs.nsidc.org/DP5/ATLAS/ATL08...,Senegal,/css/icesat-2/ATLAS/ATL08.006/2018.10.21/ATL08...,ATL08_20181021040123_03440107_006_02.h5,2018.10.21,2018
4,https://n5eil01u.ecs.nsidc.org/DP5/ATLAS/ATL08...,Senegal,/css/icesat-2/ATLAS/ATL08.006/2018.10.21/ATL08...,ATL08_20181021155541_03520101_006_02.h5,2018.10.21,2018


Notes:
1. Can we get these as s3 paths? ---> probably not yet on AWS
2. Should we download to our AWS s3 buckets and store? or just transfer from ADAPT?

### Write a text file of the list of graules with full local path
this will be the main list that can be split up by VM to parallelize for `do_extract_filter`

In [224]:
list_nodes_fn = f'{OUTPUT_DIR}/nodelist'
LIST_VM = ['forest201', 'forest202', 'forest203', 'forest204', #'forest206',
           'forest206','forest207','forest208','forest209','forest210']
np.savetxt(list_nodes_fn, LIST_VM, delimiter="\n", fmt="%s")

## Generate sublists (chunks) by YEAR

In [225]:
for YEAR in search_dict['years_list']:
    print(YEAR)
    list_main_fn = f"{OUTPUT_DIR}/list_{ATL08_VERSION.lower()}_{search_dict['site_name'].replace(' ', '').lower()}_{YEAR}"
    df_sub = atl08_h5_fn_df[atl08_h5_fn_df.year == str(YEAR)]
    np.savetxt(list_main_fn, df_sub.path_local, delimiter="\n", fmt="%s")
    
    # Generate sublists (chunks) from YEAR mainlist
    !/home/pmontesa/code/HRSI/gen_chunks.py $list_main_fn $list_nodes_fn

2018
forest201
forest202
forest203
forest204
forest206
forest207
forest208
forest209
forest210
# lines: 38, # VMs: 9
Chunksize = 4
2019
forest201
forest202
forest203
forest204
forest206
forest207
forest208
forest209
forest210
# lines: 206, # VMs: 9
Chunksize = 22
2020
forest201
forest202
forest203
forest204
forest206
forest207
forest208
forest209
forest210
# lines: 206, # VMs: 9
Chunksize = 22
2021
forest201
forest202
forest203
forest204
forest206
forest207
forest208
forest209
forest210
# lines: 197, # VMs: 9
Chunksize = 21
2022
forest201
forest202
forest203
forest204
forest206
forest207
forest208
forest209
forest210
# lines: 199, # VMs: 9
Chunksize = 22
2023
forest201
forest202
forest203
forest204
forest206
forest207
forest208
forest209
forest210
# lines: 98, # VMs: 9
Chunksize = 10
2024
forest201
forest202
forest203
forest204
forest206
forest207
forest208
forest209
forest210
# lines: 0, # VMs: 9
Chunksize = 0


## Build GeoDataFrame

In [237]:
search_dict['years_list']
OUTPUT_DIR
os.path.join(OUTPUT_DIR, filename + '.gpkg')

'/explore/nobackup/people/pmontesa/userfs02/data/icesat2/atl08.006/senegal_20m/senegal_20m.gpkg'

In [None]:

atl08_gdf = pd.concat([atl08lib.atl08_io(OUTPUT_DIR, str(YEAR), DO_PICKLE=False, LENGTH_SEG=20) for YEAR in search_dict['years_list'] ])
filename = os.path.basename(OUTPUT_DIR)
atl08_gdf.to_file(os.path.join(OUTPUT_DIR, filename + '.gpkg'), driver='GPKG')
atl08_gdf.info()

Building list of ATL08 csvs...
12
Creating pandas data frame...
Creating a gdf for 2018 @ 20m...
Building list of ATL08 csvs...
57
Creating pandas data frame...
Creating a gdf for 2019 @ 20m...
Building list of ATL08 csvs...
51
Creating pandas data frame...
Creating a gdf for 2020 @ 20m...
Building list of ATL08 csvs...
44
Creating pandas data frame...
Creating a gdf for 2021 @ 20m...
Building list of ATL08 csvs...
57
Creating pandas data frame...
Creating a gdf for 2022 @ 20m...
Building list of ATL08 csvs...
30
Creating pandas data frame...
Creating a gdf for 2023 @ 20m...
Building list of ATL08 csvs...
0
No csvs for a gdf for 2024 @ 20m.


## Map

In [None]:
maplib.MAP_ATL08_FOLIUM(atl08_gdf.sample(frac=0.1), MAP_COL='h_can', GROUP_COL='y', DO_NIGHT=False, RADIUS=3)

Mapping 125896 day/night ATL08 observations of h_can
Mapping unique groups in y: [2018, 2019, 2020, 2021, 2022, 2023]
