# Loading and using DuckDB in Python

## Dependencies

You only need to run the following cell once per environment.

In [None]:
%pip install duckdb pyarrow shapely requests

## Get a bunch of files
Download an entire collection's worth of files and convert them to GeoParquet

In [94]:
from pathlib import Path

import requests

# Items endpoint of the collection; also used as the prefix for per-item URLs.
base = 'https://cmr.earthdata.nasa.gov/stac/GES_DISC/collections/LPRM_WINDSAT_NT_SOILM3_001/items'

# Seconds to wait on any single HTTP request before giving up instead of hanging.
REQUEST_TIMEOUT_S = 60

# Make sure the output directory exists so a fresh kernel / fresh checkout works.
data_dir = Path('data')
data_dir.mkdir(parents=True, exist_ok=True)

# Walk the paginated item listing, saving each item as data/<id>.json.
current = base
while len(current) > 0:
    resp = requests.get(current, timeout=REQUEST_TIMEOUT_S)

    # check for end of data
    if resp.status_code != 200:
        # The error body is not guaranteed to be JSON with an 'errors' list.
        try:
            message = resp.json()['errors'][0]
        except (ValueError, KeyError, IndexError, TypeError):
            message = resp.text
        print(f"\033[31mError: {resp.status_code} - {resp.reason}: {message}\n{current}\033[0m")
        break  # the cursor is broken, get out

    data = resp.json()

    # download data
    for feature in data['features']:
        granule_url = f"{base}/{feature['id']}"
        gran_resp = requests.get(granule_url, timeout=REQUEST_TIMEOUT_S)
        if gran_resp.status_code == 200:
            with open(data_dir / f"{feature['id']}.json", 'w', encoding='utf-8') as f:
                print(f"Save {feature['id']}")
                f.write(gran_resp.text)
        else:
            # Report the skipped item instead of dropping it silently.
            print(f"Skip {feature['id']}: {gran_resp.status_code} - {gran_resp.reason}")

    # look for next page; an empty string ends the loop.
    # Without a 'next' link (or when it points back at the current page) we
    # must stop, otherwise the same page would be requested forever.
    next_url = ''
    for link in data.get('links', []):
        if link['rel'] == 'next':
            next_url = link['href']
            break
    current = next_url if next_url != current else ''

GET LPRM_WINDSAT_NT_SOILM3.001:LPRM-WINDSAT_L3_NT_SOILM3_V001_20030201012753.nc
GET LPRM_WINDSAT_NT_SOILM3.001:LPRM-WINDSAT_L3_NT_SOILM3_V001_20030202011051.nc
GET LPRM_WINDSAT_NT_SOILM3.001:LPRM-WINDSAT_L3_NT_SOILM3_V001_20030203010110.nc
GET LPRM_WINDSAT_NT_SOILM3.001:LPRM-WINDSAT_L3_NT_SOILM3_V001_20030204003656.nc
GET LPRM_WINDSAT_NT_SOILM3.001:LPRM-WINDSAT_L3_NT_SOILM3_V001_20030205002359.nc
GET LPRM_WINDSAT_NT_SOILM3.001:LPRM-WINDSAT_L3_NT_SOILM3_V001_20030206000707.nc
GET LPRM_WINDSAT_NT_SOILM3.001:LPRM-WINDSAT_L3_NT_SOILM3_V001_20030207012800.nc
GET LPRM_WINDSAT_NT_SOILM3.001:LPRM-WINDSAT_L3_NT_SOILM3_V001_20030208011455.nc
GET LPRM_WINDSAT_NT_SOILM3.001:LPRM-WINDSAT_L3_NT_SOILM3_V001_20030209005039.nc
GET LPRM_WINDSAT_NT_SOILM3.001:LPRM-WINDSAT_L3_NT_SOILM3_V001_20030210003343.nc
GET LPRM_WINDSAT_NT_SOILM3.001:LPRM-WINDSAT_L3_NT_SOILM3_V001_20030211002044.nc
GET LPRM_WINDSAT_NT_SOILM3.001:LPRM-WINDSAT_L3_NT_SOILM3_V001_20030212000355.nc
GET LPRM_WINDSAT_NT_SOILM3.001:LPRM-WIND

Now convert them using the gpq command

In [75]:
%%bash
# Convert every downloaded STAC item (data/*.nc.json) to a GeoParquet file
# next to it, then list the directory.
for i in data/*.nc.json
do
    # When the glob matches nothing, bash passes the literal pattern; skip it.
    [ -e "$i" ] || continue
    base_name="$(basename "$i" .json)"
    # Quote the paths so names containing spaces or glob characters stay intact.
    gpq convert "$i" "data/$base_name.parquet"
done
ls -l data


total 376
-rw-r--r--  1 tacherr1  staff   1829 Sep 24 16:58 LPRM_WINDSAT_NT_SOILM3.001:LPRM-WINDSAT_L3_NT_SOILM3_V001_20030201012753.nc.json
-rw-r--r--  1 tacherr1  staff   1707 Sep 24 17:08 LPRM_WINDSAT_NT_SOILM3.001:LPRM-WINDSAT_L3_NT_SOILM3_V001_20030201012753.nc.parquet
-rw-r--r--  1 tacherr1  staff   1829 Sep 24 16:58 LPRM_WINDSAT_NT_SOILM3.001:LPRM-WINDSAT_L3_NT_SOILM3_V001_20030202011051.nc.json
-rw-r--r--  1 tacherr1  staff   1707 Sep 24 17:08 LPRM_WINDSAT_NT_SOILM3.001:LPRM-WINDSAT_L3_NT_SOILM3_V001_20030202011051.nc.parquet
-rw-r--r--  1 tacherr1  staff   1829 Sep 24 16:58 LPRM_WINDSAT_NT_SOILM3.001:LPRM-WINDSAT_L3_NT_SOILM3_V001_20030203010110.nc.json
-rw-r--r--  1 tacherr1  staff   1707 Sep 24 17:08 LPRM_WINDSAT_NT_SOILM3.001:LPRM-WINDSAT_L3_NT_SOILM3_V001_20030203010110.nc.parquet
-rw-r--r--  1 tacherr1  staff   1829 Sep 24 16:59 LPRM_WINDSAT_NT_SOILM3.001:LPRM-WINDSAT_L3_NT_SOILM3_V001_20030204003656.nc.json
-rw-r--r--  1 tacherr1  staff   1707 Sep 24 17:08 LPRM_WINDSAT_N

## Get data as a collection

Use curl to fetch the collection's items from the STAC API, then convert the output to a GeoParquet file with the `gpq` tool, which is assumed to be installed locally (e.g. with Homebrew).

In [11]:
%%bash
# Fetch the collection's item listing, convert it to GeoParquet with gpq,
# describe the result and list the directory.
# Stop at the first failing command so a bad download is not converted.
set -euo pipefail
mkdir -p data
cd data
# --fail makes curl exit non-zero on HTTP errors instead of saving the error
# body as if it were the STAC response; -S still prints the error under -s.
curl -sS --fail \
    'https://cmr.earthdata.nasa.gov/stac/GES_DISC/collections/LPRM_WINDSAT_NT_SOILM3_001/items' \
    > LPRM_WINDSAT_NT_SOILM3_001.stac.json
gpq convert LPRM_WINDSAT_NT_SOILM3_001.stac.json LPRM_WINDSAT_NT_SOILM3_001.geo.parquet
gpq describe LPRM_WINDSAT_NT_SOILM3_001.geo.parquet
ls -l

╭────────────────────┬────────┬────────────┬────────────┬─────────────┬──────────┬────────────────┬──────────────────────┬────────╮
│ COLUMN             │ TYPE   │ ANNOTATION │ REPETITION │ COMPRESSION │ ENCODING │ GEOMETRY TYPES │ BOUNDS               │ DETAIL │
├────────────────────┼────────┼────────────┼────────────┼─────────────┼──────────┼────────────────┼──────────────────────┼────────┤
│ datetime           │ binary │ string     │ 0..1       │ zstd        │          │                │                      │        │
│ end_datetime       │ binary │ string     │ 0..1       │ zstd        │          │                │                      │        │
│ [1mgeometry[0m           │ binary │            │ 0..1       │ zstd        │ WKB      │ Polygon        │ [-180, -90, 180, 90] │        │
│ start_datetime     │ binary │ string     │ 0..1       │ zstd        │          │                │                      │        │
├────────────────────┼────────┴────────────┴────────────┴───────────

Now load the data into a database

In [4]:
import duckdb

Describe the GeoParquet file

In [6]:
# Ask DuckDB to describe the columns of a select over the GeoParquet file.
describe_sql = "describe select * from 'data/LPRM_WINDSAT_NT_SOILM3_001.geo.parquet'"
schema = duckdb.sql(describe_sql)
print(schema)

┌────────────────┬─────────────┬─────────┬─────────┬─────────┬─────────┐
│  column_name   │ column_type │  null   │   key   │ default │  extra  │
│    varchar     │   varchar   │ varchar │ varchar │ varchar │ varchar │
├────────────────┼─────────────┼─────────┼─────────┼─────────┼─────────┤
│ datetime       │ VARCHAR     │ YES     │ NULL    │ NULL    │ NULL    │
│ end_datetime   │ VARCHAR     │ YES     │ NULL    │ NULL    │ NULL    │
│ geometry       │ GEOMETRY    │ YES     │ NULL    │ NULL    │ NULL    │
│ start_datetime │ VARCHAR     │ YES     │ NULL    │ NULL    │ NULL    │
└────────────────┴─────────────┴─────────┴─────────┴─────────┴─────────┘



Look inside it for data

In [5]:
# Query the GeoParquet file directly by path; no table is created here.
preview_sql = "select * from 'data/LPRM_WINDSAT_NT_SOILM3_001.geo.parquet'"
preview = duckdb.sql(preview_sql)
print(preview)

┌──────────────────────────┬──────────────────────────┬──────────────────────────────────────────────────────────┬──────────────────────────┐
│         datetime         │       end_datetime       │                         geometry                         │      start_datetime      │
│         varchar          │         varchar          │                         geometry                         │         varchar          │
├──────────────────────────┼──────────────────────────┼──────────────────────────────────────────────────────────┼──────────────────────────┤
│ 2003-02-01T01:27:53.000Z │ 2003-02-02T01:12:06.000Z │ POLYGON ((-180 -90, 180 -90, 180 90, -180 90, -180 -90)) │ 2003-02-01T01:27:53.000Z │
│ 2003-02-02T01:10:51.000Z │ 2003-02-03T01:01:10.000Z │ POLYGON ((-180 -90, 180 -90, 180 90, -180 90, -180 -90)) │ 2003-02-02T01:10:51.000Z │
│ 2003-02-03T01:01:10.000Z │ 2003-02-04T00:38:11.000Z │ POLYGON ((-180 -90, 180 -90, 180 90, -180 90, -180 -90)) │ 2003-02-03T01:01:10.000Z │
│ 2003

In [8]:
# Materialise the GeoParquet file as the in-database table `lprm`.
# "create or replace" keeps the cell re-runnable: a plain "create table"
# raises once `lprm` already exists in the current DuckDB connection.
duckdb.sql("create or replace table lprm as select * from 'data/LPRM_WINDSAT_NT_SOILM3_001.geo.parquet'")

In [9]:
# Select the rows of `lprm` whose datetime falls strictly between the two bounds.
# NOTE(review): datetime is reported as VARCHAR by the describe cell, so this
# looks like a string comparison that relies on the ISO-style text ordering.
window_sql = "select * from lprm where '2003-02-09' < datetime AND datetime < '2003-02-12' "
window_rows = duckdb.sql(window_sql)
print(window_rows)

┌──────────────────────────┬──────────────────────────┬──────────────────────────────────────────────────────────┬──────────────────────────┐
│         datetime         │       end_datetime       │                         geometry                         │      start_datetime      │
│         varchar          │         varchar          │                         geometry                         │         varchar          │
├──────────────────────────┼──────────────────────────┼──────────────────────────────────────────────────────────┼──────────────────────────┤
│ 2003-02-09T00:50:39.000Z │ 2003-02-10T00:34:58.000Z │ POLYGON ((-180 -90, 180 -90, 180 90, -180 90, -180 -90)) │ 2003-02-09T00:50:39.000Z │
│ 2003-02-10T00:33:43.000Z │ 2003-02-11T00:20:44.000Z │ POLYGON ((-180 -90, 180 -90, 180 90, -180 90, -180 -90)) │ 2003-02-10T00:33:43.000Z │
│ 2003-02-11T00:20:44.000Z │ 2003-02-12T00:03:55.000Z │ POLYGON ((-180 -90, 180 -90, 180 90, -180 90, -180 -90)) │ 2003-02-11T00:20:44.000Z │
└─────

Check that DuckDB's `spatial` extension, which provides the geometry functions used below, is installed

In [10]:
# Look up the 'spatial' extension in DuckDB's extension listing; a row is
# returned only when the listing marks it as installed.
spatial_sql = '''
SELECT extension_name, description
FROM duckdb_extensions()
WHERE installed == true
	AND extension_name == 'spatial';
'''
result = duckdb.sql(spatial_sql)
print(result)

┌────────────────┬────────────────────────────────────────────────────────────────────────────────────┐
│ extension_name │                                    description                                     │
│    varchar     │                                      varchar                                       │
├────────────────┼────────────────────────────────────────────────────────────────────────────────────┤
│ spatial        │ Geospatial extension that adds support for working with spatial data and functions │
└────────────────┴────────────────────────────────────────────────────────────────────────────────────┘



In [12]:
# Open the GeoParquet file through DuckDB's Python API; leaving the relation as
# the last expression makes the notebook display it.
geo_parquet_path = 'data/LPRM_WINDSAT_NT_SOILM3_001.geo.parquet'
duckdb.read_parquet(geo_parquet_path)

┌──────────────────────────┬──────────────────────────┬──────────────────────────────────────────────────────────┬──────────────────────────┐
│         datetime         │       end_datetime       │                         geometry                         │      start_datetime      │
│         varchar          │         varchar          │                         geometry                         │         varchar          │
├──────────────────────────┼──────────────────────────┼──────────────────────────────────────────────────────────┼──────────────────────────┤
│ 2003-02-01T01:27:53.000Z │ 2003-02-02T01:12:06.000Z │ POLYGON ((-180 -90, 180 -90, 180 90, -180 90, -180 -90)) │ 2003-02-01T01:27:53.000Z │
│ 2003-02-02T01:10:51.000Z │ 2003-02-03T01:01:10.000Z │ POLYGON ((-180 -90, 180 -90, 180 90, -180 90, -180 -90)) │ 2003-02-02T01:10:51.000Z │
│ 2003-02-03T01:01:10.000Z │ 2003-02-04T00:38:11.000Z │ POLYGON ((-180 -90, 180 -90, 180 90, -180 90, -180 -90)) │ 2003-02-03T01:01:10.000Z │
│ 2003

In [83]:
# For every row of every data/*.geo.parquet file, test whether the geometry
# contains POINT(45 89) and show the normalized geometry alongside.
contains_sql = '''
SELECT ST_Contains(GEOMETRY, 'POINT(45 89)'::GEOMETRY) AS contains,
    st_normalize(geometry)
FROM 'data/*.geo.parquet';
'''
duckdb.sql(contains_sql)

┌──────────┬──────────────────────────────────────────────────────────┐
│ contains │                  st_normalize(geometry)                  │
│ boolean  │                         geometry                         │
├──────────┼──────────────────────────────────────────────────────────┤
│ true     │ POLYGON ((-180 -90, -180 90, 180 90, 180 -90, -180 -90)) │
│ true     │ POLYGON ((-180 -90, -180 90, 180 90, 180 -90, -180 -90)) │
│ true     │ POLYGON ((-180 -90, -180 90, 180 90, 180 -90, -180 -90)) │
│ true     │ POLYGON ((-180 -90, -180 90, 180 90, 180 -90, -180 -90)) │
│ true     │ POLYGON ((-180 -90, -180 90, 180 90, 180 -90, -180 -90)) │
│ true     │ POLYGON ((-180 -90, -180 90, 180 90, 180 -90, -180 -90)) │
│ true     │ POLYGON ((-180 -90, -180 90, 180 90, 180 -90, -180 -90)) │
│ true     │ POLYGON ((-180 -90, -180 90, 180 90, 180 -90, -180 -90)) │
│ true     │ POLYGON ((-180 -90, -180 90, 180 90, 180 -90, -180 -90)) │
│ true     │ POLYGON ((-180 -90, -180 90, 180 90, 180 -90, -180 

## Cleanup here
Shell commands to clean up the data directory; run as needed. Note that this deletes the `data` directory and everything in it.

In [None]:
%%bash
# Show the working directory and its contents before deleting anything.
pwd
ls
# Recursively remove the data directory, resolved relative to the directory
# printed above; -f suppresses errors if it does not exist.
rm -rf data
# List again to confirm what is left.
ls