# Create a geoparquet file 

In [1]:
import pystac_client
import json
from pathlib import Path
from stac_geoparquet.arrow import parse_stac_ndjson_to_arrow, to_parquet
from pyarrow.parquet import read_table

## Query a STAC API endpoint

In [2]:
# Root of the Planetary Computer STAC API.
stac_url = "https://planetarycomputer.microsoft.com/api/stac/v1"

# Temporal filter: a closed "start/end" interval covering 2023.
start_date, end_date = "2023-01-01", "2023-12-31"
date_range = "/".join((start_date, end_date))

# Spatial filter: an axis-aligned rectangle described by its four edges and
# expressed as a closed GeoJSON polygon ring (first vertex repeated last).
west, east = -9.775114442649567, 1.302203384529662
south, north = 35.48272079491679, 44.141939637656606
geometry = {
    "type": "Polygon",
    "coordinates": [
        [
            [west, north],
            [west, south],
            [east, south],
            [east, north],
            [west, north],
        ]
    ],
}

# Open the catalog, then describe a Sentinel-2 L2A item search restricted to
# the area and period above and capped at 1000 items.
catalog = pystac_client.Client.open(stac_url)
search = catalog.search(
    datetime=date_range,
    intersects=geometry,
    collections=["sentinel-2-l2a"],
    max_items=1000,
)

## Create a JSONl file

The JSON Lines text format, also called newline-delimited JSON, is a convenient format for storing structured data that may be processed one record at a time. 

Create a JSONL file with one feature (STAC Item) per line.

In [3]:
# Cache the search results locally as newline-delimited JSON: one compact
# STAC Item per line. The download is skipped when the cache file already exists.
items_iter = search.items()

max_items = 1000
s2_json_path = Path("sentinel-2-l2a.jsonl")
if not s2_json_path.exists():
    # Write to a sibling ".part" file first. If the download is interrupted,
    # the final path is never created, so the next run retries instead of
    # silently reusing a truncated cache.
    partial_path = s2_json_path.with_name(s2_json_path.name + ".part")
    with open(partial_path, "w") as f:
        count = 0

        for count, item in enumerate(items_iter, start=1):
            json.dump(item.to_dict(), f, separators=(",", ":"))
            f.write("\n")

            # Stop once the cap is reached, even if the iterator has more.
            if count >= max_items:
                break

    # Only a fully written file is moved into place.
    partial_path.replace(s2_json_path)

## Create a record batch reader

Use `stac_geoparquet.arrow.parse_stac_ndjson_to_arrow` to create an Apache Arrow record batch reader

See:

* `stac_geoparquet.arrow.parse_stac_ndjson_to_arrow` [https://stac-utils.github.io/stac-geoparquet/latest/api/arrow/#stac_geoparquet.arrow.parse_stac_ndjson_to_arrow](https://stac-utils.github.io/stac-geoparquet/latest/api/arrow/#stac_geoparquet.arrow.parse_stac_ndjson_to_arrow)
* `RecordBatchReader` [https://arrow.apache.org/docs/python/generated/pyarrow.RecordBatchReader.html#pyarrow.RecordBatchReader](https://arrow.apache.org/docs/python/generated/pyarrow.RecordBatchReader.html#pyarrow.RecordBatchReader)

In [4]:
# Parse the newline-delimited JSON file into Arrow record batches.
# NOTE(review): the returned reader is presumably single-pass — confirm before reusing it.
record_batch_reader = parse_stac_ndjson_to_arrow(s2_json_path)

## Create a `pyarrow.lib.Table`

See  [https://arrow.apache.org/docs/python/generated/pyarrow.Table.html#pyarrow-table](https://arrow.apache.org/docs/python/generated/pyarrow.Table.html#pyarrow-table)

In [5]:
# Read every batch from the reader into a single in-memory table.
# NOTE(review): this presumably exhausts the reader, so re-running this cell
# on its own may require re-running the previous cell first — confirm.
table = record_batch_reader.read_all()

# Last expression of the cell: display the table's schema.
table.schema

assets: struct<AOT: struct<gsd: double, href: string, proj:bbox: list<item: double>, proj:shape: list<item: int64>, proj:transform: list<item: double>, roles: list<item: string>, title: string, type: string>, B01: struct<eo:bands: list<item: struct<center_wavelength: double, common_name: string, description: string, full_width_half_max: double, name: string>>, gsd: double, href: string, proj:bbox: list<item: double>, proj:shape: list<item: int64>, proj:transform: list<item: double>, roles: list<item: string>, title: string, type: string>, B02: struct<eo:bands: list<item: struct<center_wavelength: double, common_name: string, description: string, full_width_half_max: double, name: string>>, gsd: double, href: string, proj:bbox: list<item: double>, proj:shape: list<item: int64>, proj:transform: list<item: double>, roles: list<item: string>, title: string, type: string>, B03: struct<eo:bands: list<item: struct<center_wavelength: double, common_name: string, description: string, full_width

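Inspect the first column. Integer indexing on a `pyarrow.Table` (`table[0]`) selects a column, not a row, so this displays the `assets` column as a `ChunkedArray`.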

In [6]:
# Integer indexing on a pyarrow Table selects a column, so this displays the
# first column as a ChunkedArray.
table.column(0)

<pyarrow.lib.ChunkedArray object at 0x767a8f237b20>
[
  -- is_valid: all not null
  -- child 0 type: struct<gsd: double, href: string, proj:bbox: list<item: double>, proj:shape: list<item: int64>, proj:transform: list<item: double>, roles: list<item: string>, title: string, type: string>
    -- is_valid: all not null
    -- child 0 type: double
      [
        10,
        10,
        10,
        10,
        10,
        ...
        10,
        10,
        10,
        10,
        10
      ]
    -- child 1 type: string
      [
        "https://sentinel2l2a01.blob.core.windows.net/sentinel2-l2/29/T/MJ/2023/12/31/S2B_MSIL2A_20231231T114409_N0510_R123_T29TMJ_20231231T150806.SAFE/GRANULE/L2A_T29TMJ_A035613_20231231T114858/IMG_DATA/R10m/T29TMJ_20231231T114409_AOT_10m.tif",
        "https://sentinel2l2a01.blob.core.windows.net/sentinel2-l2/31/T/CK/2023/12/31/S2A_MSIL2A_20231231T105441_N0510_R051_T31TCK_20231231T162343.SAFE/GRANULE/L2A_T31TCK_A044521_20231231T105442/IMG_DATA/R10m/T31TCK_20231231

## Serialize as a geoparquet

Use `stac_geoparquet.arrow.to_parquet` to serialize as geoparquet.

See [https://stac-utils.github.io/stac-geoparquet/latest/api/arrow/#stac_geoparquet.arrow.to_parquet](https://stac-utils.github.io/stac-geoparquet/latest/api/arrow/#stac_geoparquet.arrow.to_parquet)

In [7]:
# Output file name for the serialized table (relative to the working directory).
s2_parquet_path = "s2-stac-api.parquet"
# Write the Arrow table to that path with stac_geoparquet.arrow.to_parquet.
to_parquet(table, s2_parquet_path)

## Verify serialized geoparquet

Use `pyarrow.parquet.read_table` to read the file back and check that it equals the in-memory table; see [https://arrow.apache.org/docs/python/generated/pyarrow.parquet.read_table.html#pyarrow-parquet-read-table](https://arrow.apache.org/docs/python/generated/pyarrow.parquet.read_table.html#pyarrow-parquet-read-table)


In [8]:
# Round-trip check: re-read the file just written and compare it with the
# in-memory table. `==` on a pyarrow Table delegates to `Table.equals`.
read_table(s2_parquet_path).equals(table)

True