- [x] Create product subdir tree 
- [x] Download all files to subdirs
    - [x] List all files
    - [x] Parse location from the SAFE dir
    - [x] Move all files to their local dir
- [x] Identify bursts overlapping with the AOI
    - [ ] ~~Use burst API? Get IDs and products~~
    - [x] Or use burst info in product
- [x] Compute burst offsets
- [ ] Open datasets and add a burst_start and burst_end attribute
- [ ] Add min and max burst metadata in each dataset
- [ ] Download the burst range in each IW to zarr

In [3]:
from pystac_client.client import Client
import geopandas as gpd
import rioxarray as riox
import xarray as xr
import json
from dask.diagnostics import ProgressBar
from pathlib import Path
from urllib.parse import urlparse
import os

from rasterio.session import AWSSession
import rasterio
import boto3

## Open AWS sessions

In [4]:
# Local directory that receives the partially downloaded product.
out_dir = "/data/res/test_partial_product/"
# exist_ok avoids the check-then-create race and missing parents are created too
os.makedirs(out_dir, exist_ok=True)

# use the creds created on CDSE website
# (the JSON file must provide the "username" and "password" keys read below)
with open("/data/creds_s3.json") as f:
    cred = json.load(f)

un = cred["username"]
pw = cred["password"]

# rasterio session, used when opening the remote GeoTIFFs
# NOTE(review): the endpoint is given as a bare host name here (no scheme),
# unlike the boto3 endpoint below -- presumably what rasterio/GDAL expects; confirm.
rio_session = AWSSession(
    aws_access_key_id=un,
    aws_secret_access_key=pw,
    region_name="default",
    endpoint_url="eodata.dataspace.copernicus.eu")

# needed for other (non-tiff) files
# the resource is built from the explicit session instead of boto3's implicit default one
session = boto3.session.Session()
s3 = session.resource(
    's3',
    endpoint_url='https://eodata.dataspace.copernicus.eu',
    aws_access_key_id=un,
    aws_secret_access_key=pw,
    region_name='default'
)

## Search products with STAC API

In [5]:
# Search using STAC api
catalog = Client.open(
    "https://stac.dataspace.copernicus.eu/v1/"
)
aoi_file = "../data/Morocco_AOI.geojson"
# the AOI is the first geometry stored in the file
shp = gpd.read_file(aoi_file).geometry[0]
search = catalog.search(collections=["sentinel-1-slc"], intersects=shp)

# work only with the first result for now
# (a default is passed to next() so that an empty search gives a readable
# error instead of a bare StopIteration)
it = next(search.items(), None)
if it is None:
    raise ValueError(f"No sentinel-1-slc product intersects the AOI in {aoi_file}")

## Extract path info

In [6]:
# product will be saved in this subdir
product_root_dir = f"{it.id}.SAFE"

# use manifest file to get S3 bucket and prefix
manifest_url = it.assets["safe_manifest"].href

# Parse the url
parsed = urlparse(manifest_url)
if parsed.scheme != "s3":
    raise ValueError("Product url does not start with s3://")

# Bucket is the "netloc"
bucket_name = parsed.netloc

# Look for subdir prefix: the folder that contains the manifest
manifest_path = Path(parsed.path.lstrip("/"))
# S3 keys always use "/" as separator; as_posix() keeps the prefix valid
# even when the notebook runs on a platform whose native separator differs
prefix = manifest_path.parent.as_posix()

## List all files

In [None]:
# Collect the key of every object stored under the product prefix
bucket = s3.Bucket(bucket_name)
files = [obj.key for obj in bucket.objects.filter(Prefix=prefix)]

# Show what was found, one key per line
for key in files:
    print(key)



## Create local tree

In [None]:
# Sub-folders pre-created inside the local copy of the product
subdirs = ("annotation", "measurement", "preview", "support", "annotation/rfi", "annotation/calibration", "preview/icons")
for subdir in subdirs:
    subpath = Path(out_dir) / product_root_dir / subdir
    # creates missing parents and is a no-op when the folder already exists
    subpath.mkdir(parents=True, exist_ok=True)

## Download and copy files

In [None]:
# Copy every non-raster object of the product to the local tree,
# keeping the layout found below the .SAFE folder.
for f in files:
    remote_file = f

    # skip raster files, and keys ending with "/" (folder placeholders that
    # cannot be written as a local file)
    if remote_file.endswith("/") or Path(remote_file).suffix == ".tiff":
        continue

    # remove all that is before the SAFE dir
    parts = Path(f).parts
    idx = parts.index(product_root_dir)
    local_file = Path(out_dir) / Path(*parts[idx:])
    # make sure the destination folder exists, including sub-folders that
    # were not created beforehand
    local_file.parent.mkdir(parents=True, exist_ok=True)
    local_path = str(local_file)
    bucket.download_file(remote_file, local_path)

## Get burst geometry

In [None]:
from eo_tools.auxils import get_burst_geometry
from eo_tools.S1.core import read_metadata

# retrieve burst geometries
gdf_burst = get_burst_geometry(
    str(Path(out_dir) / product_root_dir),
    target_subswaths=["IW1", "IW2", "IW3"],
    polarization="VV",
)

# find what subswaths and bursts intersect AOI
gdf_burst = gdf_burst[gdf_burst.intersects(shp)]


# TODO: loop on polarizations and iw
iw = "3"
pol = "vv"

# use metadata to find where to crop
str_xml = f"**/annotation/*iw{iw}*{pol}*.xml"
xml_candidates = list((Path(out_dir) / product_root_dir).glob(str_xml))
if not xml_candidates:
    # clearer than the IndexError raised by indexing an empty list
    raise FileNotFoundError(f"No annotation file matches {str_xml}")
pth_xml = xml_candidates[0]
meta = read_metadata(pth_xml=pth_xml)
burst_info = meta["product"]["swathTiming"]
lines_per_burst = int(burst_info["linesPerBurst"])
burst_indices = gdf_burst[gdf_burst.subswath == f"IW{iw}"].burst
if burst_indices.empty:
    raise ValueError(f"No burst of IW{iw} intersects the AOI")
# plain Python ints, so the values can be stored as simple attributes
min_burst = int(burst_indices.min())
max_burst = int(burst_indices.max())
# line_start uses (min_burst - 1), i.e. burst numbers are treated as 1-based:
# burst b spans lines [(b - 1) * lines_per_burst, b * lines_per_burst).
# The end of the range is exclusive, so it must be the END of the last burst;
# using (max_burst - 1) here dropped the last burst (and gave an empty range
# when a single burst was selected).
line_start = lines_per_burst * (min_burst - 1)
line_end = lines_per_burst * max_burst

# open raster
url = it.assets[f"iw{iw}-{pol}"].href
with rasterio.Env(session=rio_session, AWS_VIRTUAL_HOSTING=False):
    ds = riox.open_rasterio(url, chunks="auto")

ds.attrs["min_burst"] = min_burst
ds.attrs["max_burst"] = max_burst
# TODO: replace path
# download cropped array
# Kept disabled; the `with` line is commented out as well because a `with`
# statement whose body is only a comment is a syntax error that prevents the
# whole cell from running.
# assumes ds is laid out as (band, y, x), so the line range applies to the
# second axis -- TODO confirm
# with ProgressBar():
#     ds[0, line_start:line_end].to_zarr("/data/res/test_s3_S1.zarr", mode="w")

In [None]:
# Scratch cell (disabled): rebuilds the local path of the remote key `f`,
# keeping everything from the .SAFE folder downwards.
# parts = Path(f).parts
# idx = parts.index(product_root_dir)
# keep only the subdir (?)
# local_path = str(Path(out_dir) / Path(*parts[idx:]))

NameError: name 'local_paths' is not defined

## Read burst ranges in raster

In [None]:
# this may not be needed
# raster and annotation now refer to the same swath AND polarization
# (the annotation key previously pointed to "vh" while the raster is "vv")
url = it.assets["iw1-vv"].href
annotation_url = it.assets["schema-product-iw1-vv"].href

In [2]:
# Open the remote raster lazily (chunks="auto"), using the credentials
# carried by the rasterio session
s3_env = rasterio.Env(session=rio_session, AWS_VIRTUAL_HOSTING=False)
with s3_env:
    ds = riox.open_rasterio(url, chunks="auto")

# download cropped array
# with ProgressBar():
    # ds[0, :1500].to_zarr("/data/res/test_s3_S1.zarr", mode="w")

NameError: name 'rasterio' is not defined

This fails:
```bash
./sentinel1_burst_extractor_spatiotemporal.sh -p vv -s 2025-09-25 -e 2025-09-30 -x 30.19358 -y -7.2473372
```
The example from the docs also fails. The problem seems to be with string parsing.

In [1]:
# Display the dataset; `ds` must have been created by an earlier cell run in this kernel
ds

NameError: name 'ds' is not defined