# Top 50/75 Service Checker
A 22.4 hackfest project by some manager types who still like to play

Goal:
* Dashboard the level of services available for our Top 50/75 datasets (limited to those discoverable via CMR)

Stretch Goals:
* quantify (and visualize) the problem of “I have to do something different for different datasets every time I access/read/plot”
* if we have x services on x datasets - can we search/access/read/viz them all the same way across those different endpoints?

In [None]:
# Install prerequisite packages
# `sys` is only needed here to find the interpreter that is executing this notebook.
import sys

# Note you usually need to install gdal outside of Python / pip first. On OSX, brew install gdal
# Running pip as `{sys.executable} -m pip` installs into the environment of the interpreter
# that is executing this cell, rather than whichever `pip` happens to be first on PATH.
!{sys.executable} -m pip install rasterio s3fs OWSLib GDAL matplotlib netCDF4 numpy xarray h5netcdf hvplot plotly seaborn

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
from urllib import request, parse
from http.cookiejar import CookieJar
import getpass
import netrc
import os
import requests
import json
import s3fs
import netrc
import json
import pandas as pd
from osgeo import gdal
import xarray as xr
import hvplot.xarray
import plotly.graph_objects as go
import plotly.express as px
import pandas as pd
import math
import seaborn as sns
from difflib import SequenceMatcher
#import holoviews as hv

In [None]:
# Top 50 / Stretch 75 collection short names.
#
# Stored as a set, so membership tests are cheap but iteration order is not
# guaranteed to follow the order written here.
top_datasets = {
    "SENTINEL-1A_SLC",
    "GPM_3IMERGHH",
    "NLDAS_FORA0125_H",
    "M2T1NXSLV",
    "GLDAS_NOAH025_3H",
    "GPM_3IMERGHHL",
    "GPM_3IMERGHHE",
    "NLDAS_NOAH0125_H",
    "M2I3NPASM",
    "M2I1NXASM",
    "GPM_3IMERGDF",
    "M2T1NXFLX",
    "M2T1NXRAD",
    "NLDAS_MOS0125_H",
    "GPM_MERGIR",
    "NLDAS_VIC0125_H",
    "GLDAS_NOAH10_3H",
    "S5P_L2__NO2___",
    "S5P_L2__CO____",
    "S5P_L2__CH4___",
    "S5P_L2__SO2___",
    "MOD04_L2",
    "MYD04_L2",
    "VNP46A2",
    "MYD03",
    "MYD021KM",
    "MYD06_L2",
    "MYD35_L2",
    "MOD03",
    "MOD021KM",
    "MOD06_L2",
    "VNP46A1",
    "MOD07_L2",
    "MOD05_L2",
    "MOD02HKM",
    "MCD06COSP_D3_MODIS",
    "MYD04_3K",
    "MYD07_L2",
    "MOD04_3K",
    "MOD35_L2",
    "MYD05_L2",
    "MOD11A1",
    "MYD11A1",
    "MOD11A2",
    "MYD11A2",
    "MCD19A2",
    "MYD09GA",
    "MOD15A2H",
    "MOD09GA",
    "MCD43A4",
    "MOD14",
    "MCD15A2H",
    "MYD14",
    "MCD43A3",
    "MOD13Q1",
    "MYD13Q1",
    "MOD13A3",
    "MYD13A3",
    "MYD09GQ",
    "MCD15A3H",
    "ASTGTM",
    "ASTGTM_NC",
    "MOD09GQ",
    "MOD09A1",
    "MOD16A2",
    "MOD09Q1",
    "MYD09Q1",
    "MYD09A1",
    "MOD11_L2",
    "MYD11_L2",
    "DAYMET_DAILY_V4_1840",
    "PODAAC-GHMDA-2PJ19",
    "PODAAC-OSCAR-03D01",
    "PODAAC-GHGMR-4FJ04",
    "VIIRS_NPP-STAR-L3U-v2.80",
}

# CMR provider ids queried when scanning all cloud-hosted collections.
# Despite the name this is a set, so iteration order is not guaranteed.
provider_list = {
    "ASF",
    "GES_DISC",
    "LAADS",
    "LPCLOUD",
    "POCLOUD",
    "GHRC_DAAC",
    "ORNL_CLOUD",
    "NSIDC_CPRD",
}

In [None]:
# Scorer method which looks at a dataset and, based on its characteristics, gives it both an
# 'ideal' and an 'actual' score to assess how compliant it is with the Levels of Service.
# This is overly simplistic!

# Background on the has<...> fields is here: https://cmr.earthdata.nasa.gov/search/site/docs/search/api.html


def scorer (df_to_score):
    """Return a copy of a one-row collection frame with score columns added.

    Parameters
    ----------
    df_to_score : pandas.DataFrame
        A frame holding exactly one collection (one row). The first row is read
        by position, so the index label does not have to be 0. It must contain
        the columns ``nativeDataFormats``, ``hasSpatialSubsetting``,
        ``hasTemporalSubsetting``, ``hasFormats`` and ``hasTransforms``.

    Returns
    -------
    pandas.DataFrame
        ``df_to_score`` plus two columns:

        * ``idealScore`` -- 4 when any native data format name contains "DF"
          (something like HDF/NetCDF), otherwise 0. The 4 points stand for
          reformatting, spatial subsetting, temporal subsetting and 'other'
          (which could be reprojection; variable subsetting cannot be checked).
        * ``actualScore`` -- one point for each ``has<...>`` flag that is True.
    """
    # Normalise the formats cell: it may be a list, None, NaN (field absent) or
    # a bare string. Anything that is not a usable collection counts as "no formats".
    native_formats = df_to_score['nativeDataFormats'].iloc[0]
    if isinstance(native_formats, str):
        native_formats = [native_formats]
    elif not isinstance(native_formats, (list, tuple)):
        native_formats = []

    # calculate ideal score
    ideal_score = 0
    if any(isinstance(fmt, str) and "DF" in fmt for fmt in native_formats):
        ideal_score = 4

    # calculate actual score
    actual_score = 0
    for flag_column in ('hasSpatialSubsetting', 'hasTemporalSubsetting',
                        'hasFormats', 'hasTransforms'):
        # `== True` is deliberate: the cell may be a Python bool, a numpy bool,
        # None or NaN, and only a real True may score (NaN is truthy on its own).
        if df_to_score[flag_column].iloc[0] == True:  # noqa: E712
            actual_score += 1

    return df_to_score.assign(idealScore=[ideal_score], actualScore=[actual_score])

In [None]:
# We can't use pandas well with the native data formats since they are a list, so this yanks the
# first one out into its own column.  Not perfect, but better than nothing.
def dataFormatCleaner(df_to_clean):
    """Copy the first native data format of a one-row frame into ``nativeDataFormat1``.

    Parameters
    ----------
    df_to_clean : pandas.DataFrame
        A frame holding one collection. The first row is read by position, so
        the index label does not have to be 0. It must contain the
        ``nativeDataFormats`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame with a ``nativeDataFormat1`` column holding the first entry
        of ``nativeDataFormats``. When that cell is empty, None, NaN or otherwise
        not a list, the input frame is returned unchanged (no column is added).
    """
    native_formats = df_to_clean['nativeDataFormats'].iloc[0]

    # A bare string is one format, not a sequence of characters.
    if isinstance(native_formats, str):
        native_formats = [native_formats]
    elif not isinstance(native_formats, (list, tuple)):
        # None / NaN / anything unexpected: treat as "no formats".
        native_formats = []

    # Only the first format is kept; extra formats are ignored for now.
    if native_formats:
        df_to_clean = df_to_clean.assign(nativeDataFormat1=[native_formats[0]])

    return df_to_clean


In [None]:
# query graphQL

def queryGraphQL (filter_str):
    """Query the Earthdata GraphQL API for cloud-hosted collections.

    Parameters
    ----------
    filter_str : str
        Extra GraphQL arguments spliced verbatim into the ``collections(...)``
        argument list, e.g. ``'shortName: "MOD14"'``. It is not escaped, so only
        pass trusted text.

    Returns
    -------
    pandas.DataFrame
        One row per returned collection (nested fields flattened by
        ``pd.json_normalize``, e.g. ``services.count``). An empty frame is
        returned when the request fails, the body is not valid JSON, or the
        response carries no collection items; the problem is printed first.

    Raises
    ------
    requests.exceptions.RequestException
        If the HTTP request itself cannot be completed (including a timeout).
    """
    graphql_rooturl = "https://graphql.earthdata.nasa.gov/api"
    query = """
    query{
    collections (cloudHosted: true, limit: 2000, %s ){
        count
        items {
          provider
          shortName
          conceptId
          nativeDataFormats
          processingLevel
          hasFormats
          hasSpatialSubsetting
          hasTemporalSubsetting
          hasTransforms
          services {
            count
            items {
              type
            }
          }
          granules {
            count
          }
        }
      }
    }
    """ % (filter_str)

    # A timeout keeps a stalled connection from hanging the notebook forever.
    response = requests.post(url=graphql_rooturl, json={"query": query}, timeout=120)

    if response.status_code != 200:
        print("Error with ", filter_str)
        print("response status code: ", response.status_code)
        print("response : ", response.content)
        # Do not try to parse an error body as a result set.
        return pd.DataFrame()

    try:
        json_data = json.loads(response.text)
    except ValueError:
        print("Error with ", filter_str)
        print("response was not valid JSON: ", response.content)
        return pd.DataFrame()

    if not isinstance(json_data, dict):
        print("Error with ", filter_str)
        print("unexpected response : ", response.content)
        return pd.DataFrame()

    # GraphQL reports query problems in an "errors" member, often with HTTP 200.
    if json_data.get('errors'):
        print("Error with ", filter_str)
        print("graphql errors: ", json_data['errors'])

    collections = (json_data.get('data') or {}).get('collections') or {}
    items = collections.get('items') or []

    # The query caps results at 2000 items; say so instead of silently truncating.
    total = collections.get('count')
    if isinstance(total, int) and total > len(items):
        print("Warning: ", filter_str, " matched ", total, " collections but only ", len(items), " were returned")

    # Shove it into a dataframe
    return pd.json_normalize(items)


In [None]:
# Loop through the Top 50/75 datasets; query, score and collect each into one dataframe.
#
# Scored one-row frames are gathered in a list and concatenated once at the end:
# concatenating onto a growing frame inside the loop copies the whole frame on every
# iteration and relies on concatenating with an empty frame.
scored_frames = []

for shortname in top_datasets:
    filter_str = "shortName: \"" + shortname + "\""
    new_df = queryGraphQL(filter_str)

    if len(new_df) == 0:
        print(shortname + " has no matching data")
        continue

    # Several collections share this short name; the scorer handles one row at a time,
    # so skip the dataset rather than guess which entry to use.
    if len(new_df) > 1:
        print(shortname + " has more than 1 entry, bail for now")
        continue

    new_df = scorer(new_df)
    new_df = dataFormatCleaner(new_df)
    scored_frames.append(new_df)

# pd.concat raises on an empty list, so fall back to an empty frame.
topdata_df = pd.concat(scored_frames, ignore_index=True) if scored_frames else pd.DataFrame()

display(topdata_df)


In [None]:
# Total the actual and ideal scores of the Top 50/75 collections per processing level
# and chart the two totals side by side.
top_by_level = topdata_df.groupby(['processingLevel.id'])
actual_score = top_by_level['actualScore'].sum()
ideal_score = top_by_level['idealScore'].sum()

# Column order (actual first, then ideal) fixes the bar order in the chart.
top_merged = pd.DataFrame().assign(actualScore=actual_score, idealScore=ideal_score)

top_merged.plot(
    y=["actualScore", "idealScore"],
    use_index=True,
    kind="bar",
    title="Top 50/75 - Levels of Service",
)

# Try all the cloud hosted data

In [None]:
# Query every provider's cloud-hosted collections, score each collection and gather
# them all into one table.
#
# Scored one-row frames are collected in a list and concatenated once at the end;
# concatenating onto the full table for every collection copies it each time.
all_scored_frames = []

for provider in provider_list:
    print(provider)
    filter_str = "provider: \"" + provider + "\""
    new_df = queryGraphQL(filter_str)

    if len(new_df) == 0:
        print(provider + " has no matching data")
        continue

    for position in range(len(new_df)):
        # Slice out a one-row frame relabelled to index 0, which is what the scoring
        # helpers read. Slicing (rather than transposing a row Series) keeps the
        # column dtypes and avoids adding a leftover "index" column.
        single_row = new_df.iloc[[position]].reset_index(drop=True)
        scored_df = scorer(single_row)
        scored_df = dataFormatCleaner(scored_df)
        all_scored_frames.append(scored_df)

# pd.concat raises on an empty list, so fall back to an empty frame.
if all_scored_frames:
    all_the_data_df = pd.concat(all_scored_frames, ignore_index=True)
else:
    all_the_data_df = pd.DataFrame()

display(all_the_data_df)

In [None]:
# Total the actual and ideal scores of all queried cloud collections per processing
# level and chart the two totals side by side.
all_by_level = all_the_data_df.groupby(['processingLevel.id'])
actual_score = all_by_level['actualScore'].sum()
ideal_score = all_by_level['idealScore'].sum()

# Column order (actual first, then ideal) fixes the bar order in the chart.
merged = pd.DataFrame().assign(actualScore=actual_score, idealScore=ideal_score)

merged.plot(
    y=["actualScore", "idealScore"],
    use_index=True,
    kind="bar",
    title="All Cloud Data - Levels of Service",
)

# Identify areas of opportunity

In [None]:
# Looking for the biggest areas to focus on: count collections for every
# (processing level, first native data format) pair and draw the counts as a heatmap.
# NOTE(review): the earlier comment said "within the Top 50/75 data", but this cell
# reads all_the_data_df, not topdata_df -- confirm which one was intended.
pair_counts = all_the_data_df.groupby(['processingLevel.id', 'nativeDataFormat1']).size()

# Rows: processing level, columns: data format ...
opportunities = pair_counts.unstack()
# ... then flipped so that formats run down the (tall) figure.
opportunities_t = opportunities.transpose()

plt.figure(figsize=(10, 40))
sns.heatmap(opportunities_t, vmin=10, vmax=200, cmap="coolwarm", annot=True)


# Service Gurus - phone a friend

In [None]:
# Looking for gurus to talk to: keep the collections that have at least one service
# wired up, then count them per provider (DAAC).
has_a_service = all_the_data_df['services.count'] >= 1
service_gurus = all_the_data_df[has_a_service]

gurus_per_provider = service_gurus.groupby(['provider']).size()
display(gurus_per_provider)

# While at it, make our other metric generation super easy too

In [None]:
# Per-provider metrics: how many collections were returned and how many granules
# those collections report in total.
rows_by_provider = all_the_data_df.groupby(['provider'])
collectionCounts = rows_by_provider.size()
granuleCounts = rows_by_provider['granules.count'].sum()

merged = pd.DataFrame().assign(granuleCount=granuleCounts, collectionCount=collectionCounts)

display(merged)

# Only the granule totals are charted; collection counts appear in the table above.
merged.plot(
    y=["granuleCount"],
    use_index=True,
    kind="bar",
    title="All Cloud Data - Granules per Provider",
)

Things we learned:
* 5 of the top 75 are no longer available?
* 2 of the top 75 have 2 entries with the same shortname?
* Our metadata is still _really_ inconsistent.  Data formats entirely missing, processing levels not standardized
* It'd be helpful to easily query on 'hasOPeNDAP' (Amy's working this PR)
* It'd be helpful to easily query on 'hasVariableSubsetting' and 'hasReprojection'
* Pandas makes a lot of things really, really easy.  But, the graph nature of our metadata means that we hit limitations with just using pandas.

# For future playing

In [None]:
#Start with services first -> Collections
def queryGraphQLforServices ():
    """Query the Earthdata GraphQL API for services and their associated collections.

    Returns
    -------
    pandas.DataFrame
        One row per returned service (nested fields flattened by
        ``pd.json_normalize``, e.g. ``collections.count``). An empty frame is
        returned when the request fails, the body is not valid JSON, or the
        response carries no service items; the problem is printed first.

    Raises
    ------
    requests.exceptions.RequestException
        If the HTTP request itself cannot be completed (including a timeout).
    """
    graphql_rooturl = "https://graphql.earthdata.nasa.gov/api"

    # `collections` is requested without parentheses: GraphQL does not allow an
    # empty argument list, so `collections()` is rejected as a syntax error.
    services_query = """
        query{
            services(limit: 2000) {
              items {
                type
                conceptId
                name
                serviceOptions
                supportedInputProjections
                supportedOutputProjections
                supportedReformattings
                collections {
                  count
                  items {
                    conceptId
                    cloudHosted
                  }
                }

                }
              }
            }
    """

    # A timeout keeps a stalled connection from hanging the notebook forever.
    response = requests.post(url=graphql_rooturl, json={"query": services_query}, timeout=120)

    if response.status_code != 200:
        print("response status code: ", response.status_code)
        print("response : ", response.content)
        print("request: ", response.request)
        # Do not try to parse an error body as a result set.
        return pd.DataFrame()

    try:
        json_data = json.loads(response.text)
    except ValueError:
        print("response was not valid JSON: ", response.content)
        return pd.DataFrame()

    if not isinstance(json_data, dict):
        print("unexpected response : ", response.content)
        return pd.DataFrame()

    # GraphQL reports query problems in an "errors" member, often with HTTP 200.
    if json_data.get('errors'):
        print("graphql errors: ", json_data['errors'])

    services = (json_data.get('data') or {}).get('services') or {}
    items = services.get('items') or []

    # Shove it into a dataframe
    return pd.json_normalize(items)
    

# Run the services query when this cell executes (this issues the HTTP request)
# and show the resulting table.
services_df = queryGraphQLforServices()
display(services_df)
