# Earthdata Crawler

Check the following for every collection:
- Presence of a cloud OPeNDAP URL
- Presence of a DMR++ URL
- Whether the DMR++ is openable by VirtualiZarr's DMR++ parser
- Whether the DMR++ is downloadable
- (not yet implemented) Quality of the DMR++ (is it flattened?)

Note: Borrowed `get_info` from https://github.com/opengeos/NASA-Earth-Data/blob/main/nasa_earth_data.py

In [1]:
import logging
import requests

import earthaccess
import pandas as pd
import virtualizarr as vz
from dask.distributed import LocalCluster, Client
from tqdm.notebook import tqdm


# Handy text formatting codes (ANSI escape sequences for terminal-style output).
# NOTE(review): neither name is referenced in the cells visible here.
bold_start = '\033[1m'  # switches bold text on
bold_end = '\033[0m'  # resets all text attributes, which ends the bold run

## Retrieve all collections.

In [2]:
# Log in through earthaccess; the return value is bound to `_` so the
# notebook does not echo it as the cell's output.
_ = earthaccess.login()

In [3]:
%%time
# Search for datasets (collections) with the wildcard keyword "*".
collections = earthaccess.search_datasets(keyword="*")
# Bare expression: the notebook displays how many collections came back.
len(collections)

CPU times: user 2.14 s, sys: 407 ms, total: 2.55 s
Wall time: 2min 28s


9903

## Set up session, cluster, and logging.

In [4]:
# An fsspec session will be used for VirtualiZarr parsing: its
# `storage_options` are handed to `vz.open_virtual_dataset()` as reader
# options inside `check_dmrpp_from_url`.
fs = earthaccess.get_fsspec_https_session()

In [5]:
# Start a Dask cluster to be used for processing Earthdata collections in parallel.
# The `locals()` guard skips the set-up when `dask_client` already exists, so
# re-running this cell does not start a second cluster.
if "dask_client" not in locals():
    # Alternative configuration, left disabled:
    # cluster = LocalCluster(threads_per_worker=1)
    cluster = LocalCluster(processes=True)
    dask_client = Client(cluster)

# Bare expression: the notebook displays the client summary.
dask_client

0,1
Connection method: Cluster object,Cluster type: distributed.LocalCluster
Dashboard: /user/danielfromearth/proxy/8787/status,

0,1
Dashboard: /user/danielfromearth/proxy/8787/status,Workers: 4
Total threads: 4,Total memory: 3.71 GiB
Status: running,Using processes: True

0,1
Comm: tcp://127.0.0.1:38631,Workers: 4
Dashboard: /user/danielfromearth/proxy/8787/status,Total threads: 4
Started: Just now,Total memory: 3.71 GiB

0,1
Comm: tcp://127.0.0.1:36117,Total threads: 1
Dashboard: /user/danielfromearth/proxy/39835/status,Memory: 0.93 GiB
Nanny: tcp://127.0.0.1:38379,
Local directory: /tmp/dask-scratch-space/worker-s9t4ryyn,Local directory: /tmp/dask-scratch-space/worker-s9t4ryyn

0,1
Comm: tcp://127.0.0.1:46395,Total threads: 1
Dashboard: /user/danielfromearth/proxy/42979/status,Memory: 0.93 GiB
Nanny: tcp://127.0.0.1:46265,
Local directory: /tmp/dask-scratch-space/worker-6kqegoqh,Local directory: /tmp/dask-scratch-space/worker-6kqegoqh

0,1
Comm: tcp://127.0.0.1:43133,Total threads: 1
Dashboard: /user/danielfromearth/proxy/35213/status,Memory: 0.93 GiB
Nanny: tcp://127.0.0.1:42189,
Local directory: /tmp/dask-scratch-space/worker-ivrg20st,Local directory: /tmp/dask-scratch-space/worker-ivrg20st

0,1
Comm: tcp://127.0.0.1:33723,Total threads: 1
Dashboard: /user/danielfromearth/proxy/37759/status,Memory: 0.93 GiB
Nanny: tcp://127.0.0.1:33453,
Local directory: /tmp/dask-scratch-space/worker-c8qd82zh,Local directory: /tmp/dask-scratch-space/worker-c8qd82zh


In [6]:
# Retrieve logger so we can log messages during dask jobs.
# The functions below that are mapped over the cluster write their messages
# through this "distributed.worker" logger.
# NOTE(review): presumably chosen so the messages show up in the Dask worker
# logs (cf. the commented-out `dask_client.get_worker_logs()` cell) - confirm.
logger = logging.getLogger("distributed.worker")

In [7]:
# There will be many HTTP 404 "Not Found" Errors so let's not print them.
class MyFilter(logging.Filter):
    """Logging filter that drops "Error while downloading the file" records."""

    def filter(self, record):
        """Return False for download-error records and True for everything else."""
        message_text = record.getMessage()
        is_download_error = "Error while downloading the file" in message_text
        return not is_download_error


# Attach the filter to the "earthaccess.store" logger, which is the one that
# reports the failed downloads.
logging.getLogger("earthaccess.store").addFilter(MyFilter())

## Main logic is in these functions.

In [8]:
def get_collection_info(one_collection):
    """Pull the identifying fields out of a single collection record.

    Args:
        one_collection: Mapping with a "umm" section that holds "ShortName"
            and a "meta" section that holds "concept-id" and "provider-id".

    Returns:
        A dict with the keys "ShortName", "concept-id" and "provider-id".

    Raises:
        KeyError: If any of the sections or fields above is missing.
    """
    return {
        "ShortName": one_collection["umm"]["ShortName"],
        "concept-id": one_collection["meta"]["concept-id"],
        "provider-id": one_collection["meta"]["provider-id"],
    }

In [9]:
def get_single_granule_from_collection(collection_concept_id: str) -> tuple[earthaccess.DataGranule | None, list[str]]:
    """Return a single granule of a collection and that granule's data links.

    Args:
        collection_concept_id: Concept ID of the collection to take a granule from.

    Returns:
        A tuple ``(granule, links)``. ``granule`` is the first result of the
        granule search and ``links`` is what
        ``granule.data_links(access="indirect")`` returns for it.
        ``(None, [])`` is returned, after logging a warning, when the search
        raises ``RuntimeError`` or comes back empty.
    """
    try:
        granules = earthaccess.search_data(
            concept_id=collection_concept_id,
            count=1,
        )
    except RuntimeError as err:
        logger.warning(f"For [{collection_concept_id}]: {err}")
        return None, []

    # Test for an empty result explicitly instead of catching IndexError, so
    # the log states the actual cause and an unrelated IndexError raised while
    # collecting the links is not mistaken for "no granules".
    if not granules:
        logger.warning(f"For [{collection_concept_id}]: no granules were found.")
        return None, []

    first_granule = granules[0]
    return first_granule, first_granule.data_links(access="indirect")

In [10]:
def check_dmrpp_from_url(a_url: str) -> tuple[bool, bool]:
    """Try opening, and if not openable, then at least try downloading, a DMR++ file.

    Args:
        a_url: URL of a data file; ".dmrpp" is appended to it to form the
            DMR++ URL that is checked.

    Returns:
        A tuple containing a bool (whether the DMR++ was able to be downloaded)
        and a bool (whether the DMR++ was able to be opened using
        `virtualizarr.open_virtual_dataset()`). A DMR++ that opens is counted
        as downloadable too. Both are False when the DMR++ is not found.
    """
    downloadable = False
    openable = False

    dmrpp_url = a_url + ".dmrpp"
    logger.info(f"check_dmrpp_from_url for {dmrpp_url}")

    # TEST 1 - Try opening the DMR++ with VirtualiZarr
    try:
        vz.open_virtual_dataset(
            dmrpp_url,
            filetype="dmrpp",
            indexes={},
            reader_options={"storage_options": fs.storage_options},
        )
    except FileNotFoundError:
        # No DMR++ at that URL, so there is nothing to download either.
        logger.info("DMR++ not found.")
    except Exception as open_err:
        # TEST 2 - Try downloading the DMR++
        # If opening didn't work, let's just try downloading.
        logger.info(f"Exception occurred when opening DMR++: {open_err}. Trying download..")
        try:
            earthaccess.download(dmrpp_url, "./dmrpp/", pqdm_kwargs={"disable": True})
        except Exception as download_err:
            # Best effort: every download failure is logged and reported as
            # "not downloadable". RuntimeError is a subclass of Exception, so a
            # separate RuntimeError handler placed after this one can never run.
            logger.warning(f"DMR++ exists but exception {download_err} occurred when attempting download of {dmrpp_url}.")
        else:
            downloadable = True
            logger.info("Downloaded DMR++.")
    else:
        downloadable = True
        openable = True
        logger.info("Opened DMR++ with VirtualiZarr.")

    # TEST 3 - Traverse all groups.
    # TODO - walk groups inside the DMR++, and pass each group to virtualizarr.open_virtual_dataset().

    return downloadable, openable

In [11]:
def inspect_collection(a_collection: earthaccess.DataCollection) -> tuple[dict, dict]:
    """Determine the accessibility of DMR++ related attributes of a Collection.

    One granule of the collection is fetched and each of its related URLs is
    examined.

    Args:
        a_collection: The collection to inspect.

    Returns:
        A tuple ``(collection_info, url_results)``. ``collection_info`` comes
        from `get_collection_info`. ``url_results`` holds the concept and
        provider IDs, the cloud OPeNDAP URL and the ".dmrpp" URL found among
        the related URLs (None when absent), and two flags telling whether a
        DMR++ could be downloaded or opened for any of the granule's data URLs.
    """
    collection_info = get_collection_info(a_collection)
    concept_id = collection_info['concept-id']

    # Defaults describe a collection for which nothing was found.
    url_results = dict(
        concept_id=concept_id,
        provider_id=collection_info['provider-id'],
        cloud_opendap_url=None,
        dmrpp_url_in_cmr=None,
        downloadable_dmrpp=False,
        openable_dmrpp=False,
    )

    sample_granule, data_urls = get_single_granule_from_collection(concept_id)
    if sample_granule is None:
        return collection_info, url_results

    try:
        related_urls = sample_granule['umm']['RelatedUrls']
    except KeyError:
        logger.warning(f"For {concept_id}: Keys ['umm']['RelatedUrls'] were not found.")
        return collection_info, url_results

    for related_entry in related_urls:
        url = related_entry['URL']
        logger.info(f"url-{url}")

        # Record a cloud OPeNDAP URL or a DMR++ URL listed in the metadata.
        points_at_cloud_opendap = url.startswith("https://opendap.earthdata.nasa.gov/")
        if points_at_cloud_opendap:
            url_results["cloud_opendap_url"] = url
        elif url.endswith(".dmrpp"):
            url_results["dmrpp_url_in_cmr"] = url

        # Only the granule's data URLs are probed for a sibling ".dmrpp" file.
        if url not in data_urls:
            continue

        downloaded, opened = check_dmrpp_from_url(url)
        # A flag that is already True stays True for the rest of the URLs.
        if downloaded:
            url_results["downloadable_dmrpp"] = True
        if opened:
            url_results["openable_dmrpp"] = True

    return collection_info, url_results

## Cycle through collections.

In [12]:
# Every retrieved collection is inspected; bind a subset here for a trial run.
collections_to_check = collections
# Used as the "datasets" denominator when the summary percentages are computed.
num_collections = len(collections_to_check)

In [13]:
%%time
# Submit one `inspect_collection` call per collection to the Dask cluster.
futures = dask_client.map(inspect_collection, collections_to_check) # Map the function to a sequence of inputs
# Each gathered item is the (collection_info, url_results) tuple returned by
# `inspect_collection`.
inspection_results = dask_client.gather(futures) # Collect the results

This may cause some slowdown.
Consider loading the data with Dask directly
 or using futures or delayed objects to embed the data into the graph without repetition.
See also https://docs.dask.org/en/stable/best-practices.html#load-data-with-dask for more information.
ERROR:earthaccess.store:Error while downloading the file G2011091001641.L2_COMS_OC.nc.dmrpp
Traceback (most recent call last):
  File "/srv/conda/envs/notebook/lib/python3.11/site-packages/earthaccess/store.py", line 678, in _download_file
    r.raise_for_status()
  File "/srv/conda/envs/notebook/lib/python3.11/site-packages/requests/models.py", line 1024, in raise_for_status
    raise HTTPError(http_error_msg, response=self)
requests.exceptions.HTTPError: 404 Client Error: Not Found for url: https://oceandata.sci.gsfc.nasa.gov/getfile/G2011091001641.L2_COMS_OC.nc.dmrpp
ERROR:earthaccess.store:Error while downloading the file coherence.dmrpp
Traceback (most recent call last):
  File "/srv/conda/envs/notebook/lib/python3.11

CPU times: user 12min 36s, sys: 1min 27s, total: 14min 4s
Wall time: 2h 52min 40s


In [14]:
# dask_client.get_worker_logs()

## Wrangle the results into a summary

In [15]:
# Summary-table column name -> key of the `url_results` dict that feeds it.
# Only "concept_id" is renamed (to "collection_id"); the rest keep their names.
result_columns = {
    "collection_id": "concept_id",
    "provider_id": "provider_id",
    "cloud_opendap_url": "cloud_opendap_url",
    "dmrpp_url_in_cmr": "dmrpp_url_in_cmr",
    "downloadable_dmrpp": "downloadable_dmrpp",
    "openable_dmrpp": "openable_dmrpp",
}

# Each inspection result is a (collection_info, url_results) pair; only the
# second element is needed for the table.
expanded_results = [
    tuple(url_results_dict[key] for key in result_columns.values())
    for _collection_info_dict, url_results_dict in inspection_results
]

# Naming the columns in the constructor keeps them present even when there
# are no results, so the column look-ups further down still work.
results_df = pd.DataFrame(expanded_results, columns=list(result_columns))

In [16]:
# Preview the first rows of the summary table.
results_df.head()

Unnamed: 0,collection_id,provider_id,cloud_opendap_url,dmrpp_url_in_cmr,downloadable_dmrpp,openable_dmrpp
0,C2763266360-LPCLOUD,LPCLOUD,,,False,False
1,C1964798938-LAADS,LAADS,,,False,False
2,C2343115666-LPCLOUD,LPCLOUD,,,False,False
3,C2408750690-LPCLOUD,LPCLOUD,https://opendap.earthdata.nasa.gov/collections...,s3://lp-prod-public/EMITL2ARFL.001/EMIT_L2A_RF...,True,True
4,C1996881146-POCLOUD,POCLOUD,https://opendap.earthdata.nasa.gov/providers/P...,,True,True


In [17]:
# Show only the collections for which a cloud OPeNDAP URL was recorded.
results_df[results_df["cloud_opendap_url"].notnull()]

Unnamed: 0,collection_id,provider_id,cloud_opendap_url,dmrpp_url_in_cmr,downloadable_dmrpp,openable_dmrpp
3,C2408750690-LPCLOUD,LPCLOUD,https://opendap.earthdata.nasa.gov/collections...,s3://lp-prod-public/EMITL2ARFL.001/EMIT_L2A_RF...,True,True
4,C1996881146-POCLOUD,POCLOUD,https://opendap.earthdata.nasa.gov/providers/P...,,True,True
11,C2408009906-LPCLOUD,LPCLOUD,https://opendap.earthdata.nasa.gov/collections...,s3://lp-prod-public/EMITL1BRAD.001/EMIT_L1B_RA...,True,True
33,C2075141684-POCLOUD,POCLOUD,https://opendap.earthdata.nasa.gov/collections...,,True,True
40,C2076114664-LPCLOUD,LPCLOUD,https://opendap.earthdata.nasa.gov/collections...,,True,True
...,...,...,...,...,...,...
9501,C2013583732-POCLOUD,POCLOUD,https://opendap.earthdata.nasa.gov/providers/P...,,True,True
9502,C2013557893-POCLOUD,POCLOUD,https://opendap.earthdata.nasa.gov/providers/P...,,True,True
9503,C2013584708-POCLOUD,POCLOUD,https://opendap.earthdata.nasa.gov/providers/P...,,True,True
9504,C2013583906-POCLOUD,POCLOUD,https://opendap.earthdata.nasa.gov/providers/P...,,True,True


In [18]:
# List the ".dmrpp" URLs that were found among the granules' related URLs.
results_df[results_df["dmrpp_url_in_cmr"].notnull()]["dmrpp_url_in_cmr"].to_list()

['s3://lp-prod-public/EMITL2ARFL.001/EMIT_L2A_RFL_001_20220810T034103_2222203_001/EMIT_L2A_MASK_001_20220810T034103_2222203_001.nc.dmrpp',
 's3://lp-prod-public/EMITL1BRAD.001/EMIT_L1B_RAD_001_20220810T034103_2222203_001/EMIT_L1B_OBS_001_20220810T034103_2222203_001.nc.dmrpp',
 's3://lp-prod-protected/VNP09GA.002/VNP09GA.A2012019.h23v01.002.2023122181913/VNP09GA.A2012019.h23v01.002.2023122181913.h5.dmrpp',
 's3://lp-prod-protected/VNP14A1.002/VNP14A1.A2012019.h15v00.002.2023121192419/VNP14A1.A2012019.h15v00.002.2023121192419.h5.dmrpp',
 's3://lp-prod-protected/VNP21A1D.002/VNP21A1D.A2012019.h10v07.002.2023131141704/VNP21A1D.A2012019.h10v07.002.2023131141704.h5.dmrpp',
 's3://lp-prod-protected/VNP21A1N.002/VNP21A1N.A2012019.h08v09.002.2023131141643/VNP21A1N.A2012019.h08v09.002.2023131141643.h5.dmrpp',
 's3://lp-prod-protected/VJ114.002/VJ114.A2018005.0000.002.2022252033022/VJ114.A2018005.0000.002.2022252033022.nc.dmrpp',
 's3://lp-prod-protected/VNP14.002/VNP14.A2012023.1006.002.20231212

In [19]:
def _percent(part, whole):
    """Return `part` as a percentage of `whole`, or 0.0 when `whole` is zero."""
    return part / whole * 100 if whole else 0.0


# NOTE: `results_df` has one row per inspected collection, so this is a row
# count of the results table rather than a count of individual URLs.
num_url_results = results_df.shape[0]

# NOTE(review): never populated in the cells visible here.
summary_stats = {}

num_cloud_opendap_urls = results_df["cloud_opendap_url"].notnull().sum()
# Despite its name, this counts the collections with a ".dmrpp" URL in CMR.
num_non_cloud_opendap_urls = results_df["dmrpp_url_in_cmr"].notnull().sum()
num_downloadable_dmrpp = results_df["downloadable_dmrpp"].sum()
num_openable_dmrpp = results_df["openable_dmrpp"].sum()

# These hold percentages (0-100); `_percent` avoids dividing by zero when
# nothing was inspected.
fraction_downloadable_collections = _percent(num_downloadable_dmrpp, num_collections)
fraction_openable_collections = _percent(num_openable_dmrpp, num_collections)
fraction_downloadable_urls = _percent(num_downloadable_dmrpp, num_url_results)
fraction_openable_urls = _percent(num_openable_dmrpp, num_url_results)

print(f"Number cloud opendap URLs: {num_cloud_opendap_urls}")
print(f"Number dmr++ URLs in CMR: {num_non_cloud_opendap_urls}")
# The first part of each message ends with a space so that "datasets" and the
# opening parenthesis are not run together in the output.
print(f"Fraction of downloadable DMR++ files (by appending '.dmrpp') found: {num_downloadable_dmrpp} out of {num_collections} datasets "
      f"({fraction_downloadable_collections:0.0f} %) and {num_url_results} URLs ({fraction_downloadable_urls:0.0f} %).")
print(f"Fraction of openable DMR++ files (by appending '.dmrpp') found: {num_openable_dmrpp} out of {num_collections} datasets "
      f"({fraction_openable_collections:0.0f} %) and {num_url_results} URLs ({fraction_openable_urls:0.0f} %).")

Number cloud opendap URLs: 1272
Number dmr++ URLs in CMR: 349
Fraction of downloadable DMR++ files (by appending '.dmrpp') found: 2101 out of 9903 datasets(21 %) and 9903 URLs (21 %).
Fraction of openable DMR++ files (by appending '.dmrpp') found: 1499 out of 9903 datasets(15 %) and 9903 URLs (15 %).
