In this notebook we will attempt to sample randomly from collections or granules. This will give us a starting database to work with, but also provide us with details about complications of working with different datatypes.

In [1]:
from pyCMR.pyCMR import CMR
# Shared CMR client used by the query cells in this notebook. "cmr.cfg" is
# presumably the path to the client's configuration file (TODO confirm
# against pyCMR).
cmr = CMR("cmr.cfg")

Idea: sample randomly using a uniform index into the data. What is the maximum index?

In [None]:
# Get all CMR results - how high can we boost the page number?
# Request a deliberately huge page number and check how many results come back.
results = cmr.searchCollection(page_num=10000000)
len(results)

In [None]:
import json
# Serialize the first result to a JSON string so it can be inspected by eye.
js1 = json.dumps(results[0])
# print js1  (uncomment to dump the serialized record)

This method is not working – it's all from Bowen Island with slightly different revision dates?

New strategy: randomly sample from around the globe.

In [None]:
import random

TRIALS = 10
BB_SIZE = 5  # side length, in degrees, of each random bounding box

# One entry per trial: the list of results returned for that trial's box.
sampled_results_place = []

for trial in range(TRIALS):

    # Pick the lower corner so that the whole box stays inside the valid
    # latitude / longitude ranges.
    lowerLat = random.uniform(-90.0, 90.0 - BB_SIZE)
    lowerLng = random.uniform(-180.0, 180.0 - BB_SIZE)
    upperLat = lowerLat + BB_SIZE
    upperLng = lowerLng + BB_SIZE

    # Format as floats: "%d" would truncate each coordinate toward zero, so
    # the box sent to the API would differ from the one generated above.
    bounding_box = "%.4f,%.4f,%.4f,%.4f" % (lowerLng, lowerLat, upperLng, upperLat)

    # Query collections intersecting the box and keep the raw result list.
    query_results = cmr.searchCollection(bounding_box=bounding_box)
    sampled_results_place.append(query_results)

# Show how many results each random box returned. (This used to display
# len(results), which is the stale variable from the page-number experiment
# and says nothing about this sampling run.)
[len(place_results) for place_results in sampled_results_place]


The same 10 are always returned. This is not an effective way to sample. The creation dates of the datasets may be a better way.

In [None]:
"""
WARNING: ERASES
sampled_results = []
short_names = set([])
"""

# Pick up from where we left off: metadata.p holds a (names, results) tuple.
# NOTE: pickle.load can execute arbitrary code, so only load a metadata.p that
# this notebook wrote itself.
import pickle
# Use a context manager so the file handle is closed as soon as loading ends.
with open('metadata.p', 'rb') as metadata_file:
    short_names, sampled_results = pickle.load(metadata_file)


In [None]:
import calendar
import random
import sys
import time
from datetime import datetime as dt  # unused in this cell; kept in case other cells rely on `dt`

TRIALS = 50
TIME_WINDOW_DAYS = 365
FORMAT = '%Y-%m-%dT%H:%M:%SZ'
SECONDS_PER_DAY = 86400

# Bounds of the period to sample from, as UTC epoch seconds. The timestamps
# carry a 'Z' (UTC) suffix and are formatted back with time.gmtime below, so
# they must be converted with calendar.timegm; time.mktime would interpret
# them as local time and shift every window by the machine's UTC offset.
# Computed once here because they do not change between trials.
range_start = calendar.timegm(time.strptime('1995-01-01T12:00:00Z', FORMAT))
range_end = calendar.timegm(time.strptime('2017-01-01T12:00:00Z', FORMAT))

new_found = 0
for trial in range(TRIALS):

    # Progress marker: one star per query, all on the same line.
    sys.stdout.write('* ')
    sys.stdout.flush()

    # Generate a random time window: uniform start inside the range, fixed
    # length. The window may extend past range_end.
    start_time = range_start + random.random() * (range_end - range_start)
    end_time = start_time + SECONDS_PER_DAY * TIME_WINDOW_DAYS

    query_time1 = time.strftime(FORMAT, time.gmtime(start_time))
    query_time2 = time.strftime(FORMAT, time.gmtime(end_time))
    # print('[%s TO\n%s]' % (query_time1, query_time2))

    # created_at is passed as "start,end".
    query_results = cmr.searchCollection(created_at=query_time1 + ',' + query_time2)

    # Add if they have a unique short name
    for collection in query_results:
        sn = collection['Collection']['ShortName']
        if sn not in short_names:
            sampled_results.append(collection)
            short_names.add(sn)
            new_found += 1

    # Don't overload the API: pause for a random 0-2 seconds between queries.
    time.sleep(random.random() * 2.0)

# Finish the progress line before printing the summary.
sys.stdout.write('\n')
print('Found %d new results' % new_found)

    

In [None]:
# For debugging: a single fixed one-month created_at query. It is wrapped in a
# string literal so it does not run; remove the triple quotes to execute it.
"""range2 = '1997-11-17T22:25:35Z,1997-12-17T22:25:35Z'
results = cmr.searchCollection(created_at=range2)
len(results)"""

In [None]:
# Display the set of short names gathered so far.
short_names

Now that we have an adequate sampling method, let's save some of this data to a "database" (right now just a file).

In [None]:
# Number of collection records accumulated in sampled_results.
len(sampled_results)

There aren't quite as many collections as I thought - let's just comb through them chronologically and collect them.

In [21]:
# WARNING: uncommenting the two lines below ERASES everything collected so far.
# sampled_results = []
# long_names = set([])

# Pick up from where we left off: metadata.p holds a (names, results) tuple.
# NOTE: pickle.load can execute arbitrary code, so only load a metadata.p that
# this notebook wrote itself.
import pickle
# Use a context manager so the file handle is closed as soon as loading ends.
with open('metadata.p', 'rb') as metadata_file:
    long_names, sampled_results = pickle.load(metadata_file)


In [24]:
import calendar
import time
from datetime import datetime as dt  # unused in this cell; kept in case other cells rely on `dt`
import random

TIME_WINDOW_DAYS = 0.5
FORMAT = '%Y-%m-%dT%H:%M:%SZ'
SECONDS_PER_DAY = 86400

# Bounds of the period to sweep, as UTC epoch seconds. The timestamps carry a
# 'Z' (UTC) suffix and are formatted back with time.gmtime below, so they must
# be converted with calendar.timegm; time.mktime would interpret them as local
# time and shift every window by the machine's UTC offset.
RANGE_START = calendar.timegm(time.strptime('2017-01-01T12:00:00Z', FORMAT))
RANGE_END = calendar.timegm(time.strptime('2018-01-01T12:00:00Z', FORMAT))

window_seconds = SECONDS_PER_DAY * TIME_WINDOW_DAYS

offset = 0
new_found = 0
while True:

    # Walk the range in consecutive fixed-size windows; `offset` is the index
    # of the current window.
    start_time = RANGE_START + offset * window_seconds
    end_time = start_time + window_seconds

    # Stop once a window would run past the end of the range (a trailing
    # partial window is therefore not queried).
    if end_time > RANGE_END:
        break

    query_time1 = time.strftime(FORMAT, time.gmtime(start_time))
    query_time2 = time.strftime(FORMAT, time.gmtime(end_time))
    # print('[%s TO\n%s]' % (query_time1, query_time2))

    # created_at is passed as "start,end".
    query_results = cmr.searchCollection(created_at=query_time1 + ',' + query_time2)

    # Per-window result count, to monitor progress.
    # NOTE(review): the recorded run shows some windows returning exactly 100
    # results, which looks like a page-size cap - confirm whether
    # searchCollection paginates; if not, busy windows are being truncated and
    # TIME_WINDOW_DAYS should be reduced.
    print(len(query_results))

    # Add if they have a unique long name
    for collection in query_results:
        ln = collection['Collection']['LongName']
        if ln not in long_names:
            sampled_results.append(collection)
            long_names.add(ln)
            new_found += 1

    # Don't overload the API: pause for a random 0-0.25 seconds between queries.
    time.sleep(random.random() * 0.25)

    offset += 1

print('Found %d new results' % new_found)

    

0
0
0
1
0
0
0
0
0
1
0
0
0
0
0
0
3
1
9
6
2
2
6
3
2
0
0
0
0
0
0
0
4
0
0
0
0
1
0
1
0
0
0
7
0
0
1
6
25
0
12
1
12
0
0
0
0
0
3
4
1
0
50
0
1
1
2
0
0
0
0
0
1
1
0
9
45
4
2
0
1
0
0
0
0
3
16
2
0
1
1
0
0
0
0
0
0
0
0
0
0
3
0
0
1
0
0
2
0
0
0
0
0
4
0
0
0
0
3
3
0
4
1
0
0
1
0
2
2
2
0
0
6
4
1
1
0
1
0
4
0
0
4
0
0
0
1
12
0
4
3
0
0
2
0
0
7
0
0
0
0
0
0
1
11
0
0
3
0
0
2
2
0
0
0
0
0
4
0
0
0
1
0
1
0
0
0
2
0
0
0
4
0
0
0
0
0
0
0
1
3
10
4
1
0
0
27
0
0
0
0
0
1
8
29
0
3
9
23
0
0
0
0
0
0
0
10
65
3
2
0
2
0
0
0
0
0
1
0
0
2
1
4
0
2
1
12
1
0
0
0
0
0
0
69
1
0
4
0
3
13
4
2
0
0
0
0
1
100
1
3
2
27
0
1
11
1
0
0
0
0
2
5
15
0
2
23
5
0
2
0
0
0
2
0
0
1
6
0
1
3
3
4
5
0
0
0
7
0
16
2
1
6
4
5
9
22
32
10
14
0
0
0
2
0
0
7
0
2
1
4
1
0
0
0
0
0
1
0
4
3
0
6
2
1
1
0
0
0
0
0
0
0
0
0
0
4
0
4
3
1
0
0
0
0
0
3
0
0
2
0
0
1
2
2
0
0
0
0
0
7
0
6
6
5
0
0
8
0
0
0
0
0
0
1
0
0
4
0
8
35
1
0
0
0
0
0
1
0
1
3
1
24
3
100
0
0
0
0
0
0
0
0
0
1
1
0
1
1
0
8
0
0
0
0
4
3
0
1
0
1
0
0
7
0
0
0
0
0
4
3
0
3
1
26
0
0
5
0
0
0
0
0
0
0
0
2
0
1
0
5
2
1
0
0
0
0
1
0
0
0
0
0
0

In [27]:
# Total number of collection records held after the chronological sweep.
len(sampled_results)

2516

In [28]:
# Save our progress: persist the (names, results) tuple that the "pick up from
# where we left off" cell reads back.

import pickle
# Use a context manager so the file is flushed and closed before the cell
# returns; a bare open() leaves that to garbage collection, which risks a
# truncated metadata.p.
with open('metadata.p', 'wb') as metadata_file:
    pickle.dump((long_names, sampled_results), metadata_file)