# Search Machine

Start with just a provider and short name and perform the full search.
1. Download STAC records from CMR.
1. Convert each record to a GeoParquet file.
1. Use DuckDB to combine all the GeoParquet files into one big file with extra info.
1. Do a basic search and try to find a record that contains a point.
1. Do a more realistic search and try to find multiple records.

In [None]:
# CMR provider and collection short name identifying the records to fetch.
provider = "NSIDC_ECS"
short_name = "ABLVIS1B_1"

# Search area as a WKT polygon (closed ring: first and last vertex match).
# `pg` and `user_search` are two names for the same string.
pg = """POLYGON((
    -110.60867891721024 53.37487808881224,
    -110.60867891721024 53.29764645852637,
    -109.73806661064765 53.29764645852637,
    -109.73806661064765 53.37487808881224,
    -110.60867891721024 53.37487808881224
))"""
user_search = pg

# Cap on the number of records the download step will request.
limit = 20

In [None]:
import requests
import os

def stac_to_files(provider, short_name):
    """Download STAC item records for one CMR collection and save them as JSON.

    Pages through the CMR STAC ``items`` endpoint of the collection, requests
    every listed item individually and writes each successful response body
    to ``data/{provider}/{short_name}/{item id}.json``.

    The module-level ``limit`` caps the number of item requests that are
    attempted (failed requests count too). A ``limit`` of zero or less
    downloads nothing.

    Args:
        provider: CMR provider id used in the STAC URL and the output path.
        short_name: Collection id used in the STAC URL and the output path.

    Returns:
        None. Progress and errors are reported with ``print``.
    """
    base = f"https://cmr.earthdata.nasa.gov/stac/{provider}/collections/{short_name}/items"
    base_path = f"data/{provider}/{short_name}"
    os.makedirs(base_path, exist_ok=True)

    current = base
    file_count = 0
    while current and (0 < limit and file_count < limit):
        resp = requests.get(current)

        # A non-200 page means paging cannot continue: report it and stop.
        if resp.status_code != 200:
            try:
                message = resp.json()['errors'][0]
            except (ValueError, KeyError, IndexError, TypeError):
                # Error body was not the expected JSON shape; show it raw.
                message = resp.text
            print(f"\033[31mError: {resp.status_code} - {resp.reason}: {message}\n{current}\033[0m")
            break  # the cursor is broken, get out

        data = resp.json()

        # Download each item listed on this page.
        for feature in data['features']:
            if file_count >= limit:
                # Honour the cap even when it falls in the middle of a page.
                break
            granule_url = f"{base}/{feature['id']}"
            gran_resp = requests.get(granule_url)
            file_count += 1
            if gran_resp.status_code == 200:
                with open(f"{base_path}/{feature['id']}.json", 'w', encoding='utf-8') as f:
                    print(f"Save {feature['id']}")
                    f.write(gran_resp.text)

        # Follow the first 'next' link; stop when there is none or when it
        # points back at the page just fetched (which would loop forever).
        next_url = next(
            (link['href'] for link in data['links'] if link['rel'] == 'next'),
            '',
        )
        current = next_url if next_url != current else ''

# Fetch the STAC records for the collection configured at the top.
stac_to_files(provider=provider, short_name=short_name)

Now convert each downloaded JSON file to GeoParquet.

In [None]:
%%bash
# Convert every downloaded STAC JSON file of one collection to GeoParquet.
# Arguments:
#   $1 - provider (directory under data/)
#   $2 - short name (directory under data/<provider>/)
# Output: one <name>.parquet next to each <name>.json, then a directory listing.
convert()
{
    local provider=$1
    local short_name=$2
    local dir="data/$provider/$short_name"
    local json_file base_name
    echo "$provider" "$short_name"

    for json_file in "$dir"/*.json
    do
        # When nothing matches, the glob is left as a literal pattern; skip it
        # instead of handing a non-existent path to gpq.
        [ -e "$json_file" ] || continue
        base_name="$(basename "$json_file" .json)"
        gpq convert "$json_file" "$dir/$base_name.parquet"
    done
    ls -l "$dir"
}

convert 'NSIDC_ECS' 'ABLVIS1B_1'

Combine all the smaller files into one bigger file, and add the granule id (which is part of the file name) to each row.

The `FILENAME` option used below was found on GitHub: https://github.com/duckdb/duckdb/pull/4097/files

In [None]:
import duckdb

# Directory holding the per-record GeoParquet files, and the glob matching them.
prefix = f"data/{provider}/{short_name}/"
granule_glob = f"{prefix}SC*.parquet"

# Build a relation over the per-record files; its result is not used further.
duckdb.sql(f"select * from '{granule_glob}'")

# The slice on `filename` is meant to drop the directory prefix and the
# trailing `.parquet`, leaving the granule id taken from the file name.
prefix_size = len(prefix) + 1
duckdb.sql(f'''
    COPY (select filename[{prefix_size}:-9] as granule, * from parquet_scan('{granule_glob}', FILENAME=1))
    TO '{prefix}all.parquet'
    ''')

Perform a basic search. Not many points will match, as most of these polygons snake across the map with only a small amount of overlap.

In [None]:
# Directory that holds the combined GeoParquet file.
prefix = f"data/{provider}/{short_name}/"
print(prefix)

# Candidate test points noted while exploring the data:
#   -107.21 52.73
#   -109.75 53.34
#   POINT(-107.5 53.3)
# Every row is returned with a `found` flag; the WHERE filter is left
# commented out inside the SQL.
point_query = f'''
SELECT *, st_contains(geometry::geometry, 'POINT(-107.5 53.3)'::GEOMETRY) as found,
    granule,
    geometry
FROM parquet_scan('{prefix}all.parquet')
--WHERE found == true
'''
duckdb.sql(point_query)


Finally, try an intersects search using the `user_search` polygon defined at the top.

In [None]:

# Keep only the rows whose geometry intersects the user's search polygon.
combined_file = f"data/{provider}/{short_name}/all.parquet"
intersects_query = f'''
SELECT st_intersects(geometry::geometry, '{user_search}'::GEOMETRY) AS found,
    granule,
    geometry
FROM parquet_scan('{combined_file}')
WHERE found = true
'''
duckdb.sql(intersects_query)