<a href="https://colab.research.google.com/github/ua-datalab/Geospatial_Workshops/blob/main/notebooks/STAC_crawl.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## SpatioTemporal Asset Catalog (STAC)
This notebook demonstrates the use of pystac_client python library to crawl through and access geospatial assets from a STAC complient API.

In [1]:
# Install pystac_client. This library is used to crawl SpatioTemporal Asset Catalogs (STAC)
!pip install pystac_client --quiet
!pip install geopandas

Collecting geopandas
  Using cached geopandas-1.0.1-py3-none-any.whl.metadata (2.2 kB)
Collecting pyogrio>=0.7.2 (from geopandas)
  Using cached pyogrio-0.10.0-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (5.5 kB)
Collecting pandas>=1.4.0 (from geopandas)
  Using cached pandas-2.2.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (89 kB)
Collecting pyproj>=3.3.0 (from geopandas)
  Downloading pyproj-3.7.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (31 kB)
Collecting pytz>=2020.1 (from pandas>=1.4.0->geopandas)
  Downloading pytz-2025.1-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas>=1.4.0->geopandas)
  Downloading tzdata-2025.1-py2.py3-none-any.whl.metadata (1.4 kB)
Using cached geopandas-1.0.1-py3-none-any.whl (323 kB)
Using cached pandas-2.2.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.7 MB)
Using cached pyogrio-0.10.0-cp312-cp312-manylinux_2_28_x86_64.whl (24.0 MB)
Downloading pyproj-3.7

In [2]:
#Import the libraries into the current session

import pystac_client
import geopandas

In [3]:
catalog = pystac_client.Client.open(
    "https://stac.cyverse.org"
)

In [4]:
#Let's look at the collections within the root catalog
collections = list(catalog.get_collections())

# Print the number of collections
print(f"Number of collections in the base catalog: {len(collections)}")

# Print the names (or IDs) and descriptions of each collection
for collection in collections:
    print(f"ID: {collection.id}")

Number of collections in the base catalog: 1
ID: Open Forest Observatory


In [None]:
#Search the collection to find the number of items

search = catalog.search(collections=["Open Forest Observatory"])
items = search.item_collection()
len(items)

321

In [5]:
#Create a custom spatial and temporal filter to find assets of interest

time_range = "2023-01-01/2024-12-31"
bbox = [-123.621, 38.32, -119.67, 40.293] #SW corner longitude/latitude ; NE corner longitude/latitude

In [6]:
#Search the collection to find imagery assets within my time-range and bounding box.

search = catalog.search(collections=["Open Forest Observatory"], bbox=bbox, datetime=time_range)
items = search.item_collection()
len(items)



167

In [None]:
##Filter to find the item with the lowest cloud cover

selected_item = items, key=lambda item: item.properties["eo:cloud_cover"]
print(selected_item)

In [36]:
# Filter items by platform
desired_platform = "Phantom 4 RTK"
filtered_items = [item for item in items if item.properties("platform") == desired_platform]

print(f"Number of items with platform '{desired_platform}': {len(filtered_items)}")

TypeError: 'dict' object is not callable

In [None]:
# List all the assets for the selected item

import rich.table

table = rich.table.Table("Asset Key", "Description", "Asset Type" )
for asset_key, asset in selected_item.assets.items():
    table.add_row(asset_key, asset.title, asset.media_type)

table

In [None]:
#Convert the 'rendered preview' asset into a dictionary

selected_item.assets["rendered_preview"].to_dict()

In [None]:
#Display the 'rendered preview' asset of the item

from IPython.display import Image

Image(url=selected_item.assets["rendered_preview"].href, width=500)

In [None]:
#Get the API endpoint (url) of the 'blue' band asset.

selected_item.assets["blue"].href

In [None]:
##Get some info from the asset without downloading it
## Get response code, file type, file size
## We are looking for HTTP status code of 200

import requests

# Send a HEAD request to get the headers of the file
response = requests.head(selected_item.assets["blue"].href)

# Retrieve the status code
status_code = response.status_code

# Initialize variables for file type and size
file_type = None
file_size_mb = None

# Check if the Content-Type header exists
if 'Content-Type' in response.headers:
    file_type = response.headers['Content-Type']

# Check if the Content-Length header exists and convert it to megabytes
if 'Content-Length' in response.headers:
    file_size_bytes = int(response.headers['Content-Length'])
    file_size_mb = file_size_bytes / (1024 * 1024)  # Convert bytes to megabytes

print(f"Status Code: {status_code}")
print(f"File Type: {file_type}")
print(f"File Size: {file_size_mb:.2f} MB")


In [None]:
##Pull the selected asset (cloud optimized geotiff) into my notebook

#install and import library for display
!pip install rioxarray --quiet

import rioxarray


#Display the selected asset with coarser resolution.
#The asset is a COG so it has overviews embedded

ds = rioxarray.open_rasterio(
    selected_item.assets["blue"].href, overview_level=2
).squeeze()
img = ds.plot(cmap="viridis", add_colorbar=False)
img.axes.set_axis_off();