In [None]:
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

In [None]:
#This Notebook enables insight into Podaacpy usage and download metrics.
#PyPI does not display download statistics because they are difficult 
#to collect and display accurately.


#There are numerous reasons for [download counts] removal/deprecation 
#via Pypisome of which are:
#  * Technically hard to make work with the new CDN
#    * The CDN is being donated to the PSF, and the donated tier does not offer any form of log access
#    * The work around for not having log access would greatly reduce the utility of the CDN
#  * Highly inaccurate
#    * A number of things prevent the download counts from being inaccurate, some of which include:
#      * pip download cache
#      * Internal or unofficial mirrors
#      * Packages not hosted on PyPI (for comparisons sake)
#      * Mirrors or unofficial grab scripts causing inflated counts 
#          (Last I looked 25% of the downloads were from a known mirroring script).
#  * Not particularly useful
#    * Just because a project has been downloaded a lot doesn’t mean it’s good
#    * Similarly just because a project hasn’t been downloaded a lot doesn’t mean it’s bad
#    * In short because it’s value is low for various reasons, and the tradeoffs 
#          required to make it work are high It has been not an effective use of resources.

#As an alternative, the Linehaul project [0] streams download logs to Google BigQuery [1]. 
#Linehaul writes an entry in a the-psf.pypi.downloadsYYYYMMDD table for each download. 
#The table contains information about what file was downloaded and how it was downloaded.

#More information can be found at 
#  https://packaging.python.org/guides/analyzing-pypi-package-downloads/,
#specifically see 
#  https://packaging.python.org/guides/analyzing-pypi-package-downloads/#setting-up

# [0] https://github.com/pypa/linehaul
# [1] https://cloud.google.com/bigquery

In [None]:
#Metrics for all time
#standardSQL
SELECT COUNT(*) AS total_num_downloads
FROM `the-psf.pypi.downloads*`
WHERE file.project = 'podaacpy'

In [None]:
#Metrics for last 30 days
#standardSQL
SELECT 
    country_code, 
    COUNT(*) AS num_downloads_last_30_days
FROM `the-psf.pypi.downloads*`
WHERE file.project = 'podaacpy'
  -- Only query the last 30 days of history
  AND _TABLE_SUFFIX
    BETWEEN FORMAT_DATE(
      '%Y%m%d', DATE_SUB(CURRENT_DATE(), INTERVAL 30 DAY))
    AND FORMAT_DATE('%Y%m%d', CURRENT_DATE())
GROUP BY
  country_code
ORDER BY
  num_downloads_last_30_days DESC
LIMIT
  100

In [None]:
#Metrics for last 7 days
#standardSQL
SELECT 
    country_code, 
    COUNT(*) AS num_downloads_last_7_days
FROM `the-psf.pypi.downloads*`
WHERE file.project = 'podaacpy'
  -- Only query the last 7 days of history
  AND _TABLE_SUFFIX
    BETWEEN FORMAT_DATE(
      '%Y%m%d', DATE_SUB(CURRENT_DATE(), INTERVAL 7 DAY))
    AND FORMAT_DATE('%Y%m%d', CURRENT_DATE())
GROUP BY
  country_code
ORDER BY
  num_downloads_last_7_days DESC
LIMIT
  100

In [None]:
# pypinfo --limit 100000 --days 365 podaacpy project version file pyversion percent3 percent2 impl impl-version openssl date month year country installer installer-version setuptools-version system system-release distro distro-version cpu

In [None]:
#Give full details about individual podaacpy downloads
SELECT
  country_code,
  COUNT(*) as downloads,
  timestamp,
  url,
  file.filename,
  file.project,
  file.version,
  file.type,
  details.installer.name,
  details.installer.version,
  details.python,
  details.implementation.name,
  details.implementation.version,
  details.distro.name,
  details.distro.version,
  details.distro.id,
  details.distro.libc.lib,
  details.distro.libc.version,
  details.system.name,
  details.system.release,
  details.cpu,
  details.openssl_version,
  tls_protocol,
  tls_cipher
FROM
  TABLE_DATE_RANGE(
    [the-psf:pypi.downloads],
    DATE_ADD(CURRENT_TIMESTAMP(), -7, "day"),
    DATE_ADD(CURRENT_TIMESTAMP(), -1, "day")
  )
WHERE
  file.project = 'podaacpy'
GROUP BY
  country_code, timestamp, url, file.filename,
  file.project,
  file.version,
  file.type,
  details.installer.name,
  details.installer.version,
  details.python,
  details.implementation.name,
  details.implementation.version,
  details.distro.name,
  details.distro.version,
  details.distro.id,
  details.distro.libc.lib,
  details.distro.libc.version,
  details.system.name,
  details.system.release,
  details.cpu,
  details.openssl_version,
  tls_protocol,
  tls_cipher
ORDER BY
  downloads ASC
LIMIT 100

In [None]:
#Now lets provide some insight into PO.DAAC impact metrics 
#in Elseviers SCOPUS service

In [2]:
#First lets import the libraries we require
from pprint import pprint
import podaac.podaac as podaac
import podaac.podaac_utils as utils

In [3]:
#Then we can create instances of the classes we will use
p = podaac.Podaac()
u = utils.PodaacUtils()

In [6]:
#Print a list of CYGNSS dataset short names
print('\nHeres list_all_available_granule_search_dataset_short_names()')
result = u.list_all_available_granule_search_dataset_short_names()
dsetShortName = [i for i in result]
print(dsetShortName)
print(len(dsetId))


Heres list_all_available_granule_search_dataset_short_names()
['AQUARIUS_L3_ANCILLARY_SST_SMI_MONTHLY_V5', 'AQUARIUS_L3_WIND_SPEED_SMI_DAILY_V5', 'REMSS-L2P_GRIDDED_25-WSAT', 'QSCAT_BYU_L3_OW_SIGMA0_ANTARCTICA_POLAR-STEREOGRAPHIC_BROWSE_MAPS_LTOD', 'AQUARIUS_L3_DENSITY_SMI_3MONTH_V5', 'NAVO-L2P-AVHRR17_L', 'AQUARIUS_L3_WIND_SPEED_SMIA_ANNUAL_V5', 'AVHRR_PATHFINDER_L3_SST_8DAY_NIGHTTIME_V51', 'UCLA_DEALIASED_SASS_L3', 'TMI-REMSS-L2P-v4', 'AQUARIUS_L3_SPICINESS_SMIA_SEASONAL-CLIMATOLOGY_V5', 'AQUARIUS_L3_SSS-RainFlagged_SMIA_28DAY-RUNNINGMEAN_V5', 'VIIRS_NPP-JPL-L2P-v2016.0', 'RSCAT_L1B_V1.3', 'AQUARIUS_L3_WIND_SPEED_CAP_7DAY_V5', 'AMSRE-REMSS-L3U-v7a', 'GRACE_GAD_L2_GRAV_CSR_RL05', 'TELLUS_PGR_TXT', 'SEAWINDS_BYU_L3_OW_SIGMA0_ARCTIC_POLAR-STEREOGRAPHIC_BROWSE_IMAGES_LTOD', 'AQUARIUS_L3_SSS_SMID_MONTHLY_V5', 'PRESWOT_HYDRO_L2_GREALM_LAKE_HEIGHT_V1', 'AVISO_L4_DYN_TOPO_1DEG_1MO', 'MODIS_TERRA_L3_SST_THERMAL_ANNUAL_9KM_NIGHTTIME_V2014.0', 'AVHRR19_G-NAVO-L2P-v1.0', 'AVHRR_PATHFINDER_L3_SS

In [7]:
dataset_landing_pages = []
#Perform a search on each dataset, note this can take some time as we are querying >600 times
for ds in dsetShortName:
    result = p.dataset_search(short_name=ds)
    #Cache the dataset landing page URL
    searchStr = 'http://podaac.jpl.nasa.gov/dataset/'
    dataset_landing_pages = [ str(i) for i in result.strip().split() if searchStr in i ][0]
print(dataset_landing_pages)


Heres p.dataset_search()
href="http://podaac.jpl.nasa.gov/dataset/GRACE_GAB_L2_GRAV_GFZ_RL05"


In [None]:
#Using Elsevier's Scopus Search, lets see if we can 
#retreieve any information from the above dataset landing page
for ds_entry in dataset_landing_pages:
    url = 'https://api.elsevier.com/content/search/scopus?query=ALL:' + dataset_landing_page + '&APIKey=715b412c00f0b95e918a3e7abe6e6ee4'
    import requests
    try:
        metadata = requests.get(url)
        status_codes = [404, 400, 503, 408]
        if metadata.status_code in status_codes:
            metadata.raise_for_status()
    except requests.exceptions.HTTPError as error:
        print(error)
        raise

    pprint(metadata.text)

In [8]:
#Again, using Elsevier's Scopus Search, lets see other podaac.jpl.nasa.gov resource we can retreive.
url = 'https://api.elsevier.com/content/search/scopus?query=ALL:podaac.jpl.nasa.gov&APIKey=715b412c00f0b95e918a3e7abe6e6ee4'
import requests
try:
    metadata = requests.get(url)
    status_codes = [404, 400, 503, 408]
    if metadata.status_code in status_codes:
        metadata.raise_for_status()
except requests.exceptions.HTTPError as error:
    print(error)
    raise

pprint(metadata.text)
#There are 460 hits...

('{"search-results":{"opensearch:totalResults":"460","opensearch:startIndex":"0","opensearch:itemsPerPage":"25","opensearch:Query":{"@role": '
 '"request", "@searchTerms": "ALL:podaac.jpl.nasa.gov", "@startPage": '
 '"0"},"link": [{"@_fa": "true", "@ref": "self", "@href": '
 '"https://api.elsevier.com/content/search/scopus?start=0&count=25&query=ALL%3Apodaac.jpl.nasa.gov&apiKey=715b412c00f0b95e918a3e7abe6e6ee4", '
 '"@type": "application/json"},{"@_fa": "true", "@ref": "first", "@href": '
 '"https://api.elsevier.com/content/search/scopus?start=0&count=25&query=ALL%3Apodaac.jpl.nasa.gov&apiKey=715b412c00f0b95e918a3e7abe6e6ee4", '
 '"@type": "application/json"},{"@_fa": "true", "@ref": "next", "@href": '
 '"https://api.elsevier.com/content/search/scopus?start=25&count=25&query=ALL%3Apodaac.jpl.nasa.gov&apiKey=715b412c00f0b95e918a3e7abe6e6ee4", '
 '"@type": "application/json"},{"@_fa": "true", "@ref": "last", "@href": '
 '"https://api.elsevier.com/content/search/scopus?start=435&count=25&

In [13]:
publications = []
#Which journals mention podaac.jpl.nasa.gov
import json
entry = json.loads(metadata.text)
for entry in entry['search-results']['entry']:
    publications.append(entry['prism:publicationName'])
pprint(publications)

['Journal of Geodesy',
 'Sustainability (Switzerland)',
 'Canadian Historical Review',
 'ISPRS International Journal of Geo-Information',
 '15th Specialist Meeting on Microwave Radiometry and Remote Sensing of the '
 'Environment, MicroRad 2018 - Proceedings',
 '15th Specialist Meeting on Microwave Radiometry and Remote Sensing of the '
 'Environment, MicroRad 2018 - Proceedings',
 'Natural Hazards',
 'Advances in Space Research',
 'Marine Ecology Progress Series',
 'Remote Sensing',
 'Journal of Geophysical Research: Solid Earth',
 'GPS Solutions',
 'Ocean Science',
 'Remote Sensing',
 'Monthly Weather Review',
 'Ocean Modelling',
 'Ocean Science',
 'Geophysical Journal International',
 'IEEE Transactions on Geoscience and Remote Sensing',
 'IEEE Journal of Selected Topics in Applied Earth Observations and Remote '
 'Sensing',
 '2018 Aviation Technology, Integration, and Operations Conference',
 'Monthly Weather Review',
 'Remote Sensing',
 'Marine Ecology Progress Series',
 'OCEANS 2