In this notebook we will attempt to sample randomly from collections or granules. This will give us a starting database to work with and will also reveal the complications of working with different data types.

In [9]:
# --- Paths -------------------------------------------------------------------
# Credentials directory: make sure this is never checked in to version control.
CREDENTIALS_DIR = '../../credentials'
# Directory that receives one JSON file per mined collection.
DATA_DIR = '../../data/nasa/'
# Pickle file holding the set of collection names that were already mined.
MINING_LOG = 'mining_log.p'

# --- Date range to mine (timestamps carry a trailing 'Z') --------------------
DATE_START = '2017-01-01T12:00:00Z'
DATE_END = '2018-01-01T12:00:00Z'

In [10]:
import os
import sys
# Make the parent directory importable (presumably where the project modules
# imported in the next cell live — TODO confirm), adding it at most once.
module_path = os.path.abspath(os.pardir)
already_on_path = any(entry == module_path for entry in sys.path)
if not already_on_path:
    sys.path.append(module_path)

In [11]:
import calendar
import json
import pickle
import random
import time
from datetime import datetime as dt
from os.path import join
from pathlib import Path

from pyCMR.pyCMR import CMR
import util

In [12]:
# Create the CMR client used for every collection search below.
# NOTE(review): 'cmr.cfg' is presumably a config-file path resolved against the
# current working directory — confirm against pyCMR.
cmr = CMR('cmr.cfg')

In [13]:
# Load the set of "<ShortName> <LongName>" keys for collections that earlier
# runs already saved, so they are not written again.
# NOTE: pickle.load can execute arbitrary code — only load a log file that this
# notebook wrote itself.
try:
    with open(MINING_LOG, 'rb') as log_file:
        long_names = pickle.load(log_file)
except FileNotFoundError:
    # First run: no log has been written yet, so start from an empty set.
    long_names = set()


In [None]:
TIME_WINDOW_DAYS = 0.5
FORMAT = '%Y-%m-%dT%H:%M:%SZ'
SECONDS_PER_DAY = 86400

# The date strings end in 'Z' (UTC) and the window bounds below are formatted
# with time.gmtime, so parse them with its inverse, calendar.timegm.
# time.mktime would interpret the parsed struct as *local* time and shift every
# query window by the machine's UTC offset.
RANGE_START = calendar.timegm(time.strptime(DATE_START, FORMAT))
RANGE_END = calendar.timegm(time.strptime(DATE_END, FORMAT))
window_seconds = SECONDS_PER_DAY * TIME_WINDOW_DAYS

offset = 0
new_found = 0
while True:

    # Step through the range in consecutive, non-overlapping time windows
    start_time = RANGE_START + offset * window_seconds
    if start_time >= RANGE_END:
        break
    # Clamp the last window so that a range which is not a whole number of
    # windows still has its tail queried instead of being silently skipped
    end_time = min(start_time + window_seconds, RANGE_END)

    query_time1 = time.strftime(FORMAT, time.gmtime(start_time))
    query_time2 = time.strftime(FORMAT, time.gmtime(end_time))

    query_results = cmr.searchCollection(created_at=query_time1 + ',' + query_time2)

    print(len(query_results))
    # NOTE(review): if searchCollection caps its results at 100, this check can
    # never fire and should be `>=` — confirm against pyCMR.
    if len(query_results) > 100:
        print('WARNING: Pagination may have caused missed results.')

    # Save every collection whose "ShortName LongName" key is not in the log yet
    for collection in query_results:
        ln = collection['Collection']['ShortName'] + ' ' + collection['Collection']['LongName']
        ln_fname = util.make_fname_safe(ln)
        if ln not in long_names:

            # Write collection to file
            fname = join(DATA_DIR, ln_fname + '.json')
            if Path(fname).is_file():
                # The file exists although the log did not list this name
                print('WARNING: Mining log (%s) does not reflect actual files downloaded in data directory' % MINING_LOG)
            # Context manager guarantees the JSON is flushed and the handle closed
            with open(fname, 'w') as out_file:
                json.dump(collection, out_file)

            # Record name in 'log'
            long_names.add(ln)
            new_found += 1

    # Don't overload the API: pause for a random 0-0.25 s between queries
    time.sleep(random.random() * 0.25)

    offset += 1

print('Found %d new results' % new_found)

In [None]:
# Save our progress: persist the set of mined collection names so that the next
# run can skip collections that were already written to disk. The context
# manager guarantees the pickle is flushed and the file handle closed.
with open(MINING_LOG, 'wb') as log_file:
    pickle.dump(long_names, log_file)

In [15]:
# Inspect the number of collection names recorded in the mining log.
# This can differ from the number of files actually present in the data
# directory (the mining loop prints a warning when it detects such a mismatch).
len(long_names)

503