# Loading and using duckdb in python

## Dependencies

You only need to run the following cell once per environment.

In [None]:
# Install duckdb, pyarrow, shapely and requests with the %pip line magic.
%pip install duckdb pyarrow shapely requests

## Get a bunch of files
Download an entire collection's worth of files and convert them to GeoParquet

In [None]:
import os

import requests

# STAC "items" endpoint of the collection; also the prefix of every granule URL.
base = 'https://cmr.earthdata.nasa.gov/stac/GES_DISC/collections/LPRM_WINDSAT_NT_SOILM3_001/items'

# Seconds to wait on a single HTTP request before failing instead of hanging.
REQUEST_TIMEOUT = 60

# Granule documents are written under data/; create it so this cell also works
# on a fresh checkout where the directory does not exist yet.
os.makedirs('data', exist_ok=True)

# Walk the paginated item listing, saving one JSON document per granule.
current = base
while len(current) > 0:
    resp = requests.get(current, timeout=REQUEST_TIMEOUT)

    # check for end of data
    if resp.status_code != 200:
        # The error body is not guaranteed to be JSON with an 'errors' list, so
        # fall back to the raw text rather than raising while reporting.
        try:
            message = resp.json()['errors'][0]
        except (ValueError, LookupError, TypeError):
            message = resp.text
        print(f"\033[31mError: {resp.status_code} - {resp.reason}: {message}\n{current}\033[0m")
        break # the cursor is broken, get out

    data = resp.json()

    # download data
    for feature in data['features']:
        granule_url = f"{base}/{feature['id']}"
        gran_resp = requests.get(granule_url, timeout=REQUEST_TIMEOUT)
        # Granules that do not come back with 200 are skipped (best effort).
        if gran_resp.status_code == 200:
            with open(f"data/{feature['id']}.json", 'w', encoding='utf-8') as f:
                print(f"Save {feature['id']}")
                f.write(gran_resp.text)

    # look for next page
    next_url = ''
    for link in data['links']:
        if link['rel'] == 'next':
            next_url = link['href']
    # Stop when the page has no 'next' link or the link points back at the page
    # just fetched; either case would otherwise repeat the same page forever.
    current = next_url if next_url != current else ''

Now convert them using the gpq command

In [None]:
%%bash
# Convert every downloaded granule document (data/*.nc.json) to GeoParquet with
# gpq, writing data/<name>.nc.parquet next to it, then list the directory.
for i in data/*.nc.json
do
    # An unmatched glob is passed through literally; skip it rather than
    # handing gpq a file that does not exist.
    [ -e "$i" ] || continue
    base_name="$(basename "$i" .json)"
    # Quote the paths so unusual characters in a granule id are not word-split.
    gpq convert "$i" "data/$base_name.parquet"
done
ls -l data


In [None]:
import duckdb

# Merge the geometry column of every converted granule into one Parquet file.
# The input glob is limited to the per-granule '*.nc.parquet' files: the merged
# output name below also starts with 'LPRM_WINDSAT_NT_SOILM3.001' and ends in
# '.parquet', so a broader glob would read the previous output on a re-run.
duckdb.sql('''
COPY (select geometry from 'data/LPRM_WINDSAT_NT_SOILM3.001*.nc.parquet')
TO 'data/LPRM_WINDSAT_NT_SOILM3.001_LPRM-WINDSAT_L3_NT_SOILM3_V001.parquet'
''')

## Get data as a collection

Use curl to fetch the collection's items from the STAC API, then convert the output to a GeoParquet file with the `gpq` tool, which is assumed to be installed locally (for example with Homebrew).

In [None]:
%%bash
# Download the collection's item listing as STAC JSON, convert it to GeoParquet
# with gpq, then describe the result and list the data directory.
# Stop at the first failing command so a failed download is never fed to gpq
# and the failure is not hidden by the trailing `ls`.
set -euo pipefail
mkdir -p data
cd data
# --fail makes curl exit non-zero on an HTTP error instead of saving the
# server's error body as if it were the collection.
curl -s --fail \
    'https://cmr.earthdata.nasa.gov/stac/GES_DISC/collections/LPRM_WINDSAT_NT_SOILM3_001/items' \
    > LPRM_WINDSAT_NT_SOILM3_001.stac.json
gpq convert LPRM_WINDSAT_NT_SOILM3_001.stac.json LPRM_WINDSAT_NT_SOILM3_001.geo.parquet
gpq describe LPRM_WINDSAT_NT_SOILM3_001.geo.parquet
ls -l

Now load the data into a database

In [None]:
import duckdb

Describe the geo parquet file

In [None]:
# Print the result of DESCRIBE over the GeoParquet file.
print(
    duckdb.sql(
        "describe select * from 'data/LPRM_WINDSAT_NT_SOILM3_001.geo.parquet'"
    )
)

Look inside it for data

In [None]:
# Print the rows DuckDB reads directly from the GeoParquet file.
print(
    duckdb.sql(
        "select * from 'data/LPRM_WINDSAT_NT_SOILM3_001.geo.parquet'"
    )
)

In [None]:
# Load the GeoParquet file into the table `lprm`. OR REPLACE keeps this cell
# re-runnable: a plain CREATE TABLE fails once the table already exists.
duckdb.sql("create or replace table lprm as select * from 'data/LPRM_WINDSAT_NT_SOILM3_001.geo.parquet'")

In [None]:
# Print the rows of `lprm` whose datetime lies strictly between the two dates.
print(
    duckdb.sql(
        "select * from lprm where '2003-02-09' < datetime AND datetime < '2003-02-12' "
    )
)

Try to prove that GeoParquet is supported by checking whether DuckDB's `spatial` extension is installed.

In [None]:
# Ask DuckDB for its extension list and keep only the row for 'spatial' when it
# is marked as installed; an empty result means that condition is not met.
result = duckdb.sql('''
SELECT extension_name, description
FROM duckdb_extensions()
WHERE installed == true
	AND extension_name == 'spatial';
''')
print(result)

In [None]:
# Open the GeoParquet file with duckdb.read_parquet; as the cell's last
# expression, the returned object is what the notebook displays.
duckdb.read_parquet('data/LPRM_WINDSAT_NT_SOILM3_001.geo.parquet')

In [None]:
# ST_Contains and ST_Normalize are provided by DuckDB's spatial extension, which
# has to be loaded into this session before the query below can run.
# INSTALL fetches the extension only if it is not already present.
duckdb.sql('INSTALL spatial')
duckdb.sql('LOAD spatial')

# For every geometry in the GeoParquet files, test whether it contains the
# point (45 89) and show its normalized form.
duckdb.sql('''
SELECT ST_Contains(GEOMETRY, 'POINT(45 89)'::GEOMETRY) AS contains,
    st_normalize(geometry)
FROM 'data/*.geo.parquet';
''')

## Cleanup here
Shell commands to clean up the data directory; run as needed. Note that this deletes everything under `data/`, including the downloaded and converted files.

In [None]:
%%bash
# Show the working directory and its contents so it is clear what is removed.
pwd
ls
# Recursively delete the data directory (relative to the directory shown above).
rm -rf data
# List again to confirm that data/ is gone.
ls