# Earthdata Crawler

Check the following for every collection:
- Presence of cloud opendap URL
- Presence of a DMR++ URL
- Whether DMR++ is openable by VirtualiZarr's DMR++ parser
- Whether DMR++ is downloadable
- (not yet) Quality of dmrpp (are they flattened?)

Note: Borrowed `get_info` from https://github.com/opengeos/NASA-Earth-Data/blob/main/nasa_earth_data.py

In [1]:
import logging
import requests

import earthaccess
import pandas as pd
import virtualizarr as vz
from dask.distributed import LocalCluster, Client
from tqdm.notebook import tqdm


# Handy text formatting codes.
bold_start = '\033[1m'
bold_end = '\033[0m'

In [2]:
_ = earthaccess.login()

In [3]:
%%time
collections = earthaccess.search_datasets(keyword="*")
len(collections)

CPU times: user 2.18 s, sys: 293 ms, total: 2.47 s
Wall time: 26.6 s


9905

In [4]:
def get_info(dataset):
    info = {}
    info["ShortName"] = dataset["umm"]["ShortName"]
    info["EntryTitle"] = dataset["umm"]["EntryTitle"]
    try:
        info["DOI"] = dataset["umm"]["DOI"]["DOI"]
    except:
        info["DOI"] = ""
    info["concept-id"] = dataset["meta"]["concept-id"]
    info["provider-id"] = dataset["meta"]["provider-id"]

    try:
        info["s3-links"] = dataset["meta"]["s3-links"]
    except:
        info["s3-links"] = ""
    # info["granule-count"] = dataset["meta"]["granule-count"]

    try:
        start_time = dataset["umm"]["TemporalExtents"][0]["RangeDateTimes"][0][
            "BeginningDateTime"
        ]
        info["start-time"] = start_time
    except:
        info["start-time"] = ""
        
    try:
        start_time = dataset["umm"]["TemporalExtents"][0]["RangeDateTimes"][0][
            "BeginningDateTime"
        ]
        info["start-time"] = start_time
    except:
        info["start-time"] = ""
        
    try:
        end_time = dataset["umm"]["TemporalExtents"][0]["RangeDateTimes"][0][
            "EndingDateTime"
        ]
        info["end-time"] = end_time
    except:
        info["end-time"] = ""

    if info["DOI"] != "":
        info["Linkage"] = "https://doi.org/" + info["DOI"]
    else:
        info["Linkage"] = ""
    

    return info

## Cycle through collections, checking a single granule for each.

### Set up session, cluster, logging.

In [5]:
fs = earthaccess.get_fsspec_https_session()

In [6]:
if "dask_client" not in locals():
    # cluster = LocalCluster(threads_per_worker=1)
    cluster = LocalCluster(processes=True)
    dask_client = Client(cluster)

dask_client

0,1
Connection method: Cluster object,Cluster type: distributed.LocalCluster
Dashboard: /user/danielfromearth/proxy/8787/status,

0,1
Dashboard: /user/danielfromearth/proxy/8787/status,Workers: 4
Total threads: 4,Total memory: 3.71 GiB
Status: running,Using processes: True

0,1
Comm: tcp://127.0.0.1:40009,Workers: 4
Dashboard: /user/danielfromearth/proxy/8787/status,Total threads: 4
Started: Just now,Total memory: 3.71 GiB

0,1
Comm: tcp://127.0.0.1:44383,Total threads: 1
Dashboard: /user/danielfromearth/proxy/34399/status,Memory: 0.93 GiB
Nanny: tcp://127.0.0.1:42763,
Local directory: /tmp/dask-scratch-space/worker-qsymnx8m,Local directory: /tmp/dask-scratch-space/worker-qsymnx8m

0,1
Comm: tcp://127.0.0.1:43737,Total threads: 1
Dashboard: /user/danielfromearth/proxy/44257/status,Memory: 0.93 GiB
Nanny: tcp://127.0.0.1:36867,
Local directory: /tmp/dask-scratch-space/worker-7v_4411k,Local directory: /tmp/dask-scratch-space/worker-7v_4411k

0,1
Comm: tcp://127.0.0.1:41515,Total threads: 1
Dashboard: /user/danielfromearth/proxy/35613/status,Memory: 0.93 GiB
Nanny: tcp://127.0.0.1:46585,
Local directory: /tmp/dask-scratch-space/worker-rwoye09y,Local directory: /tmp/dask-scratch-space/worker-rwoye09y

0,1
Comm: tcp://127.0.0.1:43959,Total threads: 1
Dashboard: /user/danielfromearth/proxy/40781/status,Memory: 0.93 GiB
Nanny: tcp://127.0.0.1:41961,
Local directory: /tmp/dask-scratch-space/worker-5cvw1aq3,Local directory: /tmp/dask-scratch-space/worker-5cvw1aq3


In [7]:
logger = logging.getLogger("distributed.worker")

In [8]:
# dask_client.shutdown()

In [10]:
# There will be many HTTP 404 "Not Found" Errors so let's not print them.
class MyFilter(logging.Filter):
    def filter(self, record):
        # Example: Filter out messages containing "debug"
        return "Error while downloading the file" not in record.getMessage()

logging.getLogger("earthaccess.store").addFilter(MyFilter())

### Main logic is in these functions.

In [None]:
def get_single_granule_from_collection(collection_concept_id: str):
    try:
        a_granule = earthaccess.search_data(
            concept_id=collection_concept_id,
            count=1
        )
    except RuntimeError as err:
        logger.warning(err)
        return None, []
        

    try:
        links = a_granule[0].data_links(access="indirect")
    except IndexError as err:
        logger.warning(err)
        return None, []
    
    return a_granule[0], links

In [28]:
def open_dmrpp_from_url(a_url: str):
    """Try opening, and if not openable, then at least try downloading, a DMR++ file."""
    downloadable = False
    openable = False
    
    dmrpp_url = a_url + ".dmrpp"
    logger.info(f"check_dmrpp_for_a_url for {dmrpp_url}")

    # TEST 1 - Try opening the DMR++ with VirtualiZarr
    try:
        opened_dmrpp = vz.open_virtual_dataset(
            dmrpp_url, 
            filetype="dmrpp", 
            indexes={}, 
            reader_options={"storage_options": fs.storage_options}
        )
        downloadable = True
        openable = True
        logger.info("opened_dmrpp")
        
    except FileNotFoundError as err:
        logger.info("Not downloadable")

    # TEST 2 - Try downloading the DMR++
    except Exception as err:
        logging.info(f"Unexpected error when opening dmrpp: {err}. Trying download.")
        # If opening didn't work, let's just try downloading
        try:
            files = earthaccess.download(dmrpp_url, "./dmrpp/", pqdm_kwargs={"disable": True})
            downloadable = True
            logger.info("downloaded dmrpp")
        except Exception as errh:
            logger.info(f"exception when downloading dmrpp: {errh}.")
        except RuntimeError as err:
            logger.info("Unexpected Failure")
            raise err

    # TEST 3 - Traverse all groups.
    # TODO - walk groups inside the DMR++, and pass each group to virtualizarr.open_virtual_dataset().
            
    return downloadable, openable

In [57]:
def inspect_collection(a_dataset):
    collection_info = get_info(a_dataset)
    concept_id = collection_info['concept-id']
    a_granule, the_get_data_urls = get_single_granule_from_collection(concept_id)
    
    # Check if a cloud opendap or .dmrpp URL exists.
    url_results = {
        "concept_id": concept_id,
        "provider_id": collection_info['provider-id'],
        "cloud_opendap_url": None,
        "dmrpp_url_in_cmr": None,
        "downloadable_dmrpp": False,
        "openable_dmrpp": False
    }
    for url_field in a_granule['umm']['RelatedUrls']:  # Iterate over RelatedUrls in each request step
        url = url_field['URL']
        logger.info(f"url-{url}")
        
        if url.startswith("https://opendap.earthdata.nasa.gov/"):
            url_results["cloud_opendap_url"] = url
            
        elif url.endswith(".dmrpp"):
            url_results["dmrpp_url_in_cmr"] = url

        if url in the_get_data_urls:
            downloaded, opened = open_dmrpp_from_url(url)
            if downloaded:
                url_results["downloadable_dmrpp"] = True
            if opened:
                url_results["openable_dmrpp"] = True
                
    return collection_info, url_results

### Execution of functions happens here.

In [58]:
collections_to_check = collections[0:100]
num_collections = len(collections_to_check)

In [59]:
futures = dask_client.map(inspect_collection, collections_to_check) # Map the function to a sequence of inputs
inspection_results = dask_client.gather(futures) # Collect the results

In [60]:
# dask_client.get_worker_logs()

In [61]:
expanded_results = []
for result in inspection_results:
    collection_info_dict = result[0]
    url_results_dict = result[1]
    
    expanded_results.append((
        url_results_dict["concept_id"], 
        url_results_dict["provider_id"], 
        url_results_dict["cloud_opendap_url"], 
        url_results_dict["dmrpp_url_in_cmr"], 
        url_results_dict["downloadable_dmrpp"], 
        url_results_dict["openable_dmrpp"]
    ))

results_df = pd.DataFrame.from_dict(expanded_results)
results_df = results_df.rename(columns={
    0: "collection_id", 
    1: "provider_id", 
    2: "cloud_opendap_url", 
    3: "dmrpp_url_in_cmr",
    4: "downloadable_dmrpp", 
    5: "openable_dmrpp"}
)

In [63]:
results_df

Unnamed: 0,collection_id,provider_id,cloud_opendap_url,dmrpp_url_in_cmr,downloadable_dmrpp,openable_dmrpp
0,C2763266360-LPCLOUD,LPCLOUD,,,False,False
1,C1964798938-LAADS,LAADS,,,False,False
2,C2343115666-LPCLOUD,LPCLOUD,,,False,False
3,C2408750690-LPCLOUD,LPCLOUD,https://opendap.earthdata.nasa.gov/collections...,s3://lp-prod-public/EMITL2ARFL.001/EMIT_L2A_RF...,True,True
4,C1996881146-POCLOUD,POCLOUD,https://opendap.earthdata.nasa.gov/providers/P...,,True,True
...,...,...,...,...,...,...
95,C2484078896-LPCLOUD,LPCLOUD,,,False,False
96,C2565788876-LPCLOUD,LPCLOUD,,,False,False
97,C2524245159-LPCLOUD,LPCLOUD,,,False,False
98,C2565788888-LPCLOUD,LPCLOUD,,,False,False


In [64]:
results_df[results_df["cloud_opendap_url"].notnull()]

Unnamed: 0,collection_id,provider_id,cloud_opendap_url,dmrpp_url_in_cmr,downloadable_dmrpp,openable_dmrpp
3,C2408750690-LPCLOUD,LPCLOUD,https://opendap.earthdata.nasa.gov/collections...,s3://lp-prod-public/EMITL2ARFL.001/EMIT_L2A_RF...,True,True
4,C1996881146-POCLOUD,POCLOUD,https://opendap.earthdata.nasa.gov/providers/P...,,True,True
11,C2408009906-LPCLOUD,LPCLOUD,https://opendap.earthdata.nasa.gov/collections...,s3://lp-prod-public/EMITL1BRAD.001/EMIT_L1B_RA...,True,True
33,C2075141605-POCLOUD,POCLOUD,https://opendap.earthdata.nasa.gov/collections...,,True,True
34,C2075141684-POCLOUD,POCLOUD,https://opendap.earthdata.nasa.gov/collections...,,True,True
41,C2076114664-LPCLOUD,LPCLOUD,https://opendap.earthdata.nasa.gov/collections...,,True,True
42,C2076087338-LPCLOUD,LPCLOUD,https://opendap.earthdata.nasa.gov/collections...,,True,True
45,C2102958977-POCLOUD,POCLOUD,https://opendap.earthdata.nasa.gov/collections...,,True,True
46,C2036877535-POCLOUD,POCLOUD,https://opendap.earthdata.nasa.gov/collections...,,True,True
47,C2036877612-POCLOUD,POCLOUD,https://opendap.earthdata.nasa.gov/collections...,,True,False


In [68]:
results_df[results_df["dmrpp_url_in_cmr"].notnull()]["dmrpp_url_in_cmr"].to_list()

['s3://lp-prod-public/EMITL2ARFL.001/EMIT_L2A_RFL_001_20220810T034103_2222203_001/EMIT_L2A_MASK_001_20220810T034103_2222203_001.nc.dmrpp',
 's3://lp-prod-public/EMITL1BRAD.001/EMIT_L1B_RAD_001_20220810T034103_2222203_001/EMIT_L1B_OBS_001_20220810T034103_2222203_001.nc.dmrpp',
 's3://lp-prod-protected/VNP09GA.002/VNP09GA.A2012019.h23v01.002.2023122181913/VNP09GA.A2012019.h23v01.002.2023122181913.h5.dmrpp',
 's3://lp-prod-protected/VNP14A1.002/VNP14A1.A2012019.h15v00.002.2023121192419/VNP14A1.A2012019.h15v00.002.2023121192419.h5.dmrpp',
 's3://lp-prod-protected/VNP21A1D.002/VNP21A1D.A2012019.h10v07.002.2023131141704/VNP21A1D.A2012019.h10v07.002.2023131141704.h5.dmrpp',
 's3://lp-prod-protected/VNP21A1N.002/VNP21A1N.A2012019.h08v09.002.2023131141643/VNP21A1N.A2012019.h08v09.002.2023131141643.h5.dmrpp',
 's3://lp-prod-protected/VJ114.002/VJ114.A2018005.0000.002.2022252033022/VJ114.A2018005.0000.002.2022252033022.nc.dmrpp',
 's3://lp-prod-protected/VNP14.002/VNP14.A2012023.1006.002.20231212

In [20]:
num_url_results = results_df.shape[0]

summary_stats = {}

num_cloud_opendap_urls = results_df["cloud_opendap_url"].sum()
num_non_cloud_opendap_urls = results_df["non_cloud_opendap_link"].sum()
num_downloadable_dmrpp = results_df["downloadable_dmrpp"].sum()
num_openable_dmrpp = results_df["openable_dmrpp"].sum()

fraction_downloadable_collections = num_downloadable_dmrpp / num_collections * 100
fraction_openable_collections = num_openable_dmrpp / num_collections * 100
fraction_downloadable_urls = num_downloadable_dmrpp / num_url_results * 100
fraction_openable_urls = num_openable_dmrpp / num_url_results * 100

print(f"Number cloud opendap URLs: {num_cloud_opendap_urls}")
print(f"Number non-cloud opendap URLs: {num_non_cloud_opendap_urls}")
print(f"Fraction of downloadable DMR++ files found: {num_downloadable_dmrpp} out of {num_collections} datasets" 
      f"({fraction_downloadable_collections:0.0f} %) and {num_url_results} URLs ({fraction_downloadable_urls:0.0f} %).")
print(f"Fraction of openable DMR++ files found: {num_openable_dmrpp} out of {num_collections} datasets" 
      f"({fraction_openable_collections:0.0f} %) and {num_url_results} URLs ({fraction_openable_urls:0.0f} %).")

Number cloud opendap URLs: 324
Number non-cloud opendap URLs: 94
Fraction of downloadable DMR++ files found: 563 out of 200 datasets(282 %) and 1780 URLs (32 %).
Fraction of openable DMR++ files found: 455 out of 200 datasets(228 %) and 1780 URLs (26 %).
