# Search Machine

Start with just a provider and short name and perform the full search.
1. Download STAC records from CMR.
1. Convert the records to a GeoParquet file, complete with extra info.
1. Search the GeoParquet file for granules that intersect the user's search area.

In [173]:
# Collection to search: both values are substituted into the CMR STAC URL
# and into the local data/<provider>/<short_name>/ directory path.
provider, short_name = 'NSIDC_ECS', 'ABLVIS1B_1'

# Search area as a WKT polygon; the ring is closed (first vertex == last).
pg = '''POLYGON((
    -110.60867891721024 53.37487808881224,
    -110.60867891721024 53.29764645852637,
    -109.73806661064765 53.29764645852637,
    -109.73806661064765 53.37487808881224,
    -110.60867891721024 53.37487808881224
))'''
# `user_search` is the name the query cells interpolate; `pg` is an alias.
user_search = pg

In [3]:
import requests
import os

def stac_to_files(provider, short_name):
    """Download every STAC item of a CMR collection to local JSON files.

    Pages through ``/stac/{provider}/collections/{short_name}/items``, fetches
    each listed item individually and writes the response body to
    ``data/{provider}/{short_name}/{item id}.json``. Paging stops when a page
    has no new ``next`` link or when a page request does not return 200.

    Args:
        provider: provider segment of the CMR STAC URL, e.g. ``'NSIDC_ECS'``.
        short_name: collection segment of the URL, e.g. ``'ABLVIS1B_1'``.

    Returns:
        None. Progress and errors are reported with ``print``.
    """
    base = f"https://cmr.earthdata.nasa.gov/stac/{provider}/collections/{short_name}/items"
    base_path = f"data/{provider}/{short_name}"
    # exist_ok so the download can be re-run after a partial or failed attempt.
    os.makedirs(base_path, exist_ok=True)

    current = base
    while current:
        resp = requests.get(current)

        # check for end of data
        if resp.status_code != 200:
            # Error bodies are not guaranteed to be JSON with an 'errors'
            # list; fall back to the raw text rather than crash while
            # reporting the failure.
            try:
                message = resp.json()['errors'][0]
            except (ValueError, KeyError, IndexError, TypeError):
                message = resp.text
            print(f"\033[31mError: {resp.status_code} - {resp.reason}: {message}\n{current}\033[0m")
            break  # the cursor is broken, get out

        data = resp.json()

        # download data
        for feature in data.get('features', []):
            item_id = feature['id']
            gran_resp = requests.get(f"{base}/{item_id}")
            if gran_resp.status_code == 200:
                with open(f"{base_path}/{item_id}.json", 'w', encoding='utf-8') as f:
                    print(f"Save {item_id}")
                    f.write(gran_resp.text)
            else:
                # Report the miss instead of dropping the item silently.
                print(f"Skip {item_id}: {gran_resp.status_code} - {gran_resp.reason}")

        # look for next page
        # Default to '' so a page without a 'next' link ends the loop instead
        # of re-requesting the same URL forever.
        next_url = ''
        for link in data.get('links', []):
            if link.get('rel') == 'next':
                next_url = link['href']
        # A 'next' link that points back at the current page also means done.
        current = next_url if next_url != current else ''

# Download the configured collection: one <item id>.json per STAC item is
# written under data/<provider>/<short_name>/.
stac_to_files(provider, short_name)

Save SC:ABLVIS1B.001:129486296
Save SC:ABLVIS1B.001:129487317
Save SC:ABLVIS1B.001:129487360
Save SC:ABLVIS1B.001:129487358
Save SC:ABLVIS1B.001:129487523
Save SC:ABLVIS1B.001:129487578
Save SC:ABLVIS1B.001:129487634
Save SC:ABLVIS1B.001:129487554
Save SC:ABLVIS1B.001:129487513
Save SC:ABLVIS1B.001:129487591
Save SC:ABLVIS1B.001:129487500
Save SC:ABLVIS1B.001:129487826
Save SC:ABLVIS1B.001:129487776
Save SC:ABLVIS1B.001:129487850
Save SC:ABLVIS1B.001:129487810
Save SC:ABLVIS1B.001:129487807
Save SC:ABLVIS1B.001:129487793
Save SC:ABLVIS1B.001:129487829
Save SC:ABLVIS1B.001:129487773
Save SC:ABLVIS1B.001:129488002
[31mError: 500 - Internal Server Error: Oops! Something has gone wrong. We have been alerted and are working to resolve the problem. Please try your request again later.
https://cmr.earthdata.nasa.gov/stac/NSIDC_ECS/collections/ABLVIS1B_1/items?cursor=eyJqc29uIjoiW1wibnNpZGNfZWNzXCIsMTQ5ODc1NjU5MzUxNSwxNTEzMTE4NzQ0XSIsInVtbSI6IltcIm5zaWRjX2Vjc1wiLDE0OTg3NTY1OTM1MTUsMTUxMzExODc

In [59]:
import duckdb

# Skip this, a better way was found below
# First attempt, kept for reference only: the `if False:` guard means the body
# never runs. It copied just the `geometry` field of every downloaded item
# JSON into a single all.parquet, then shelled out to `gpq convert` to write
# fixed.parquet (presumably to produce valid GeoParquet metadata - TODO confirm).
if False:
    duckdb.sql(f'''
    COPY (select geometry from 'data/{provider}/{short_name}/*.json')
    TO 'data/{provider}/{short_name}/all.parquet'
    ''')

    # NOTE: `!` is IPython shell-escape syntax, so this cell only parses inside a notebook.
    !gpq convert data/{provider}/{short_name}/all.parquet data/{provider}/{short_name}/fixed.parquet

In [57]:
%%bash
# Run `gpq convert` on every downloaded item JSON of one collection, writing
# <item id>.parquet next to each <item id>.json, then list the directory.
#   $1 - provider directory name under data/
#   $2 - collection short-name directory under data/$1/
convert()
{
    provider=$1
    short_name=$2
    echo "$provider $short_name"

    # Quote every expansion so names containing spaces or glob characters are
    # passed through intact; only the trailing *.json is left to glob.
    for i in "data/$provider/$short_name"/*.json
    do
        # With no matches the pattern stays literal; skip it rather than
        # hand a non-existent "*.json" path to gpq.
        [ -e "$i" ] || continue
        base_name="$(basename "$i" .json)"
        gpq convert "$i" "data/$provider/$short_name/$base_name.parquet"
    done
    ls -l "data/$provider/$short_name"
}

# Convert the NSIDC_ECS / ABLVIS1B_1 download; the values are hard-coded here
# and must match `provider` and `short_name` from the Python cells.
convert 'NSIDC_ECS' 'ABLVIS1B_1'

NSIDC_ECS ABLVIS1B_1
total 352
-rw-r--r--  1 tacherr1  staff  1853 Sep 25 14:35 SC:ABLVIS1B.001:129486296.json
-rw-r--r--  1 tacherr1  staff  2551 Sep 25 15:07 SC:ABLVIS1B.001:129486296.parquet
-rw-r--r--  1 tacherr1  staff  2046 Sep 25 14:35 SC:ABLVIS1B.001:129487317.json
-rw-r--r--  1 tacherr1  staff  3264 Sep 25 15:07 SC:ABLVIS1B.001:129487317.parquet
-rw-r--r--  1 tacherr1  staff  2203 Sep 25 14:35 SC:ABLVIS1B.001:129487358.json
-rw-r--r--  1 tacherr1  staff  3750 Sep 25 15:07 SC:ABLVIS1B.001:129487358.parquet
-rw-r--r--  1 tacherr1  staff  1937 Sep 25 14:35 SC:ABLVIS1B.001:129487360.json
-rw-r--r--  1 tacherr1  staff  3011 Sep 25 15:07 SC:ABLVIS1B.001:129487360.parquet
-rw-r--r--  1 tacherr1  staff  2093 Sep 25 14:35 SC:ABLVIS1B.001:129487500.json
-rw-r--r--  1 tacherr1  staff  3322 Sep 25 15:07 SC:ABLVIS1B.001:129487500.parquet
-rw-r--r--  1 tacherr1  staff  1936 Sep 25 14:35 SC:ABLVIS1B.001:129487513.json
-rw-r--r--  1 tacherr1  staff  2802 Sep 25 15:07 SC:ABLVIS1B.001:129487513

Found this on GitHub:
https://github.com/duckdb/duckdb/pull/4097/files

In [104]:
# The result of this first query is discarded (it is not the cell's last
# expression); presumably left over from inspecting the per-granule files.
duckdb.sql(f"select * from 'data/{provider}/{short_name}/SC*.parquet'")

# Merge the per-granule parquet files into all.parquet, adding a `granule`
# column recovered from each row's source file name (FILENAME=1 exposes it as
# the `filename` column).
prefix = f"data/{provider}/{short_name}/"
# DuckDB string slices use [begin:end] with 1-based, inclusive bounds: start
# one past the directory prefix and end at -9, the character just before the
# 8-character ".parquet" suffix.
duckdb.sql(f'''
    COPY (select filename[{len(prefix) + 1}:-9] as granule, * from parquet_scan('{prefix}SC*.parquet', FILENAME=1))
    TO '{prefix}all.parquet'
    ''')

In [163]:
prefix = f"data/{provider}/{short_name}/"
print(prefix)

# Candidate probe coordinates noted while exploring:
# -107.21 52.73
# -109.75 53.34
# POINT(-107.5 53.3)

# For every row of all.parquet, report whether its geometry contains the
# fixed probe point; the filter is left commented out so all rows are shown.
point_query = f'''
SELECT st_contains(geometry::geometry, 'POINT(-107.5 53.3)'::GEOMETRY) as found,
    granule,
    geometry
FROM parquet_scan('data/{provider}/{short_name}/all.parquet')
--WHERE found == true
'''
# Keep the call as the cell's last expression so the notebook renders it.
duckdb.sql(point_query)


data/NSIDC_ECS/ABLVIS1B_1/


┌─────────┬───────────────────────────┬───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┐
│  found  │          granule          │                                                                                                                                                                                                                                                                     

In [171]:

# Return only the rows of all.parquet whose geometry intersects the search
# polygon. NOTE(review): `user_search` is spliced into the SQL text as-is, so
# it must be trusted WKT.
intersect_query = f'''
SELECT st_intersects(geometry::geometry, '{user_search}'::GEOMETRY) AS found,
    granule,
    geometry
FROM parquet_scan('data/{provider}/{short_name}/all.parquet')
WHERE found = true
'''
# Keep the call as the cell's last expression so the notebook renders it.
duckdb.sql(intersect_query)

┌─────────┬───────────────────────────┬──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┐
│  found  │          granule          │                                                                                                                                                                                                                                                                                 geometry                                                                                     