#### Import necessary Python modules

In [None]:
# HTTP requests
import requests


# utility libraries
from datetime import date
import numpy as np
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
import folium
from dask import delayed

import getpass

# JSON parser
import json

# XML parser
import xml.etree.ElementTree as ET

# system modules
import os
import re
import sys
import random

# data manipulation
import pandas as pd
import numpy as np

# geo data manipulation
import geopandas as gpd
import shapely
from shapely.geometry import shape

# EO data manipulation
import rasterio
import xarray as xr
import rioxarray as rio

# visualization product
import hvplot.pandas
import matplotlib.pyplot as plt
import matplotlib.image
from rasterio.windows import Window

# file manipulation
from pathlib import Path
import fsspec

In [None]:
# Load the catchment outline that defines the area of interest (AOI).
# The catalogue query further down is sent with SRID=4326, so the geometry is
# explicitly reprojected to EPSG:4326 here. The original passed
# crs="EPGS:4326" (a misspelt authority) to read_file, which is not a
# reprojection option of that function.
aoi = gpd.read_file('data/catchment_outline.geojson').to_crs("EPSG:4326")

# Interactive map of the AOI outline on an OpenStreetMap basemap.
# The plot object is only stored in `plot`; it is not displayed by this cell.
plot = aoi.hvplot(
    geo=True,
    tiles='OSM',  # alternative basemap: tiles='EsriImagery'
    frame_width=800,
    frame_height=600,
    alpha=0.3,
    line_width=4,
)
                                

In [None]:
# Bounding box of the whole AOI, unpacked as (minx, miny, maxx, maxy).
# These four values are used below to build the rectangular search polygon
# for the catalogue query.
minx, miny, maxx, maxy = aoi.total_bounds

In [None]:
# Root URL of the OData product catalogue queried below.
catalogue_odata_url = "https://catalogue.dataspace.copernicus.eu/odata/v1"

# Which products to look for: collection name and product type.
collection_name, product_type = "SENTINEL-2", "S2MSI2A"

# Upper bound on cloud cover; only applied by the later cloud-cover query cell.
max_cloud_cover = 100.0

# Closed rectangular ring around the AOI bounding box, serialised to text for
# the OData `Intersects` filter.
# NOTE: this rebinds `aoi` from the GeoDataFrame loaded earlier to a string.
aoi = str(
    shapely.geometry.Polygon(
        [
            (minx, miny),
            (minx, maxy),
            (maxx, maxy),
            (maxx, miny),
            (minx, miny),
        ]
    )
)

# Sensing-time window of the search (ISO 8601 timestamps with a Z suffix).
search_period_start, search_period_end = (
    "2018-02-01T00:00:00.000Z",
    "2018-02-08T00:00:00.000Z",
)

In [None]:
# Assemble the OData `$filter` expression clause by clause. Every clause except
# the last ends with a space so that the following "and ..." is separated.
search_query = f"{catalogue_odata_url}/Products?$filter="
# 1. restrict to the requested collection
search_query += f"Collection/Name eq '{collection_name}' "
# 2. restrict to the requested product type (a string attribute of the product)
search_query += f"and Attributes/OData.CSC.StringAttribute/any(att:att/Name eq 'productType' and att/OData.CSC.StringAttribute/Value eq '{product_type}') "
# 3. keep only products intersecting the AOI rectangle
search_query += f"and OData.CSC.Intersects(area=geography'SRID=4326;{aoi}') "
# 4. keep only products whose content start time lies inside the search window
search_query += f"and ContentDate/Start gt {search_period_start} "
search_query += f"and ContentDate/Start lt {search_period_end}"

In [None]:
# Run the catalogue search. A timeout stops the cell from hanging forever on a
# stalled connection, and raise_for_status() turns an HTTP error reply into a
# clear exception instead of a KeyError on the missing "value" field below.
http_response = requests.get(search_query, timeout=60)
http_response.raise_for_status()

# Parsed JSON body; the list under "value" holds one record per matching product.
response = http_response.json()
result = pd.DataFrame.from_dict(response["value"])

In [None]:
result

In [None]:
def process_row(row):
    """Convert one catalogue record's footprint and start time to rich types.

    The row is modified in place and also returned, so the function can be
    used with ``DataFrame.apply(..., axis=1)``.

    Parameters
    ----------
    row : mapping-like (e.g. a pandas Series)
        Must provide ``"GeoFootprint"`` (a geometry mapping accepted by
        ``shapely.geometry.shape``) and ``"ContentDate"`` (an object with a
        ``get`` method, read for its ``"Start"`` entry).

    Returns
    -------
    The same ``row``, with ``"GeoFootprint"`` replaced by a shapely geometry
    and a new ``"Start"`` entry holding the parsed start timestamp.
    """
    raw_footprint = row["GeoFootprint"]
    row["GeoFootprint"] = shape(raw_footprint)

    content_date = row["ContentDate"]
    row["Start"] = pd.to_datetime(content_date.get("Start"))
    return row

In [None]:
# Parse every record, then order the products by start time and renumber
# the rows from zero.
result = (
    result.apply(process_row, axis=1)
    .sort_values(by=['Start'])
    .reset_index(drop=True)
)

In [None]:
# Wrap the search result in a GeoDataFrame. The geometry is taken from the
# "GeoFootprint" column (converted to shapely geometries by process_row) and
# is declared to be in EPSG:4326.
gdf = gpd.GeoDataFrame(result, geometry=result["GeoFootprint"], crs="EPSG:4326")

In [None]:
# Map of the product footprints over OSM tiles, coloured by product "Name",
# semi-transparent so that overlapping footprints stay visible.
gdf.hvplot.polygons(c='Name', geo=True, tiles='OSM', frame_height=600, frame_width=800, alpha=0.2, legend=True)

In [None]:
# Read the S3 credentials from a local file holding "<access key>:<secret key>".
# The file is opened in a `with` block so the handle is closed straight away
# (the original left it open until garbage collection).
with open('/home/pier/.s3_CDSE_passwd') as passwd_file:
    credential = str.split(passwd_file.read().replace('\n', ''), ':')

# Authenticated S3 filesystem pointing at the EO data object store;
# credential[0] is the access key, credential[1] the secret.
fs = fsspec.filesystem(
    's3',
    key=credential[0],
    secret=credential[1],
    endpoint_url='https://eodata.dataspace.copernicus.eu',
    anon=False,
)

In [None]:
# For every product, list the 20 m JPEG 2000 files below its GRANULE folder
# and spread each list over the columns of a table (one row per product).
bands = result['S3Path'].apply(
    lambda s3_path: fs.glob(f"s3:/{s3_path}/GRANULE/**/*_20m.jp2")
).apply(pd.Series)

# Label the columns with the second-to-last "_"-separated token of each file
# name in the first row.
# NOTE(review): this presumes every product lists its files in the same order
# as the first one - confirm.
bands.columns = [
    file_path.split('/')[-1].split('_')[-2] for file_path in bands.iloc[0]
]

In [None]:
# Keep only the columns of interest: bands B03 and B11 plus the CLDPRB and
# SNWPRB layers (presumably cloud and snow probability - confirm).
sel_bands = bands.filter(['B03', 'B11', 'CLDPRB', 'SNWPRB'])

In [None]:
sel_bands

In [None]:
def get_band(path):
    """Open one band file from the object store as a time-stamped DataArray.

    The band name and the timestamp are parsed from the file name, which is
    split on ``_``: the third-to-last token is parsed as the timestamp and
    the second-to-last token becomes the array name.

    Parameters
    ----------
    path : str
        Path of the raster file, readable through the module-level ``fs``.

    Returns
    -------
    xarray.DataArray or None
        The raster with the ``band`` coordinate dropped, length-1 dimensions
        squeezed out, and a leading ``time`` dimension of length one.
        ``None`` if the file could not be opened or processed; the error is
        printed, not raised, so one bad file does not abort a whole batch.

    Raises
    ------
    Whatever ``pd.to_datetime`` raises when the timestamp token cannot be
    parsed; that step runs outside the ``try`` block.
    """
    name_parts = path.split('/')[-1].split('_')
    date_time = pd.to_datetime(name_parts[-3])
    try:
        with fs.open(path, 'rb') as f:
            with rio.open_rasterio(f) as da:
                print(path)  # progress trace
                da.name = name_parts[-2]
                # drop_vars replaces the deprecated DataArray.drop.
                da = da.drop_vars('band').squeeze()
                da = da.assign_coords(time=date_time)
                da = da.expand_dims('time')
                # NOTE(review): the array is returned after both context
                # managers have closed; if its values are loaded lazily they
                # may no longer be readable - confirm, or load before returning.
                return da
    except Exception as e:
        # Deliberate best-effort: report the problem and signal it with None.
        print(e)
        return None

In [None]:
# Open every file listed in the 'B11' column; entries are None where get_band
# failed to open a file.
# NOTE(review): the variable is called fB03 but the paths come from the 'B11'
# column - confirm which band is intended.
fB03 = [get_band(path) for path in bands['B11'].to_list()]

In [None]:
# Discard the first entry of the list.
# NOTE(review): the reason is not visible here - presumably the first file
# failed to open; confirm. Re-running this cell drops one more entry each time.
fB03 = fB03[1:]

In [None]:
fB03

In [None]:
@delayed
def get_band(band):
    """Read the first raster band of a file as an array, lazily.

    Because of ``@delayed`` a call only builds a dask task; the file is read
    when that task is computed. This definition replaces the ``get_band``
    defined in an earlier cell.

    Parameters
    ----------
    band : str
        Path of the raster file, opened through the module-level ``fs``.

    Returns
    -------
    The result of ``src.read(1)`` for the opened raster (wrapped by dask
    until computed).
    """
    open_kwargs = dict(
        mode='rb',
        anon=True,
        default_fill_cache=False,
        default_cache_type='first',
    )
    with fs.open(band, **open_kwargs) as infile, rasterio.open(infile) as src:
        first_band = src.read(1)
    return first_band

In [None]:
# Display the value of `columns`.
# NOTE(review): `columns` is not defined in any cell visible here; running the
# notebook top to bottom would raise NameError at this point - confirm.
columns

In [None]:
# Flat list of all file paths to process: the B03 and B11 band files followed
# by the 'cloud' and 'snow' entries of `msk`.
# NOTE(review): `msk` is not defined in any cell visible here - confirm where
# it comes from.
flist = bands['B03'].to_list() + bands['B11'].to_list() + msk['cloud'].to_list() + msk['snow'].to_list()

In [None]:
flist

In [None]:
# Second filesystem, used by gen_json to write its JSON output.
# NOTE(review): an empty protocol string presumably selects the local
# filesystem - confirm against the fsspec documentation.
fs2 = fsspec.filesystem('')

In [None]:
# SingleHdf5ToZarr (used by gen_json) lives in kerchunk.hdf; the previous
# import line named a different module and a truncated class name.
from kerchunk.hdf import SingleHdf5ToZarr
from pathlib import Path
import os
import ujson

# Keyword arguments forwarded to fs.open() by gen_json.
so = dict(mode='rb', anon=True, default_fill_cache=False, default_cache_type='first')
# default_fill_cache=False avoids caching data between file chunks, which lowers memory usage.

def gen_json(file_url):
    """Write a kerchunk reference file (JSON) for one remote file.

    The input is opened through the module-level ``fs`` with the options in
    ``so``. The output is written through ``fs2`` to
    ``<month>_<variable>.json``, where ``variable`` is the file name up to its
    first dot and ``month`` is the third ``/``-separated component of
    ``file_url`` (the name suggests that component is a month - confirm it
    fits the paths actually passed in).

    Parameters
    ----------
    file_url : str
        URL or path of the input file; needs at least three ``/``-separated
        components.

    Returns
    -------
    None
    """
    with fs.open(file_url, **so) as infile:
        # Binary blocks smaller than inline_threshold are embedded directly in
        # the output; a higher threshold can give a larger JSON file but
        # faster loading.
        h5chunks = SingleHdf5ToZarr(infile, file_url, inline_threshold=300)

        url_parts = file_url.split('/')
        variable = url_parts[-1].split('.')[0]
        month = url_parts[2]
        outf = f'{month}_{variable}.json'  # name of the JSON file to write

        with fs2.open(outf, 'wb') as f:
            references = h5chunks.translate()
            f.write(ujson.dumps(references).encode())

In [None]:
# Narrow the previous search to products whose 'cloudCover' attribute is at
# most `max_cloud_cover`.
max_cloud_cover = 100.00

# NOTE: this extends `search_query` in place, so running the cell again appends
# the same clause a second time. Re-run the cell that builds the base query
# first if this cell has to be repeated.
search_query = (f"{search_query} "
                f"and Attributes/OData.CSC.DoubleAttribute/any(att:att/Name eq 'cloudCover' and att/OData.CSC.DoubleAttribute/Value le {max_cloud_cover})")

# print(f"""\n{search_query.replace(' ', "%20")}\n""")

# Same request handling as the first search: bounded wait, and an explicit
# error on a non-success HTTP status instead of a KeyError on "value".
http_response = requests.get(search_query, timeout=60)
http_response.raise_for_status()

response = http_response.json()
result = pd.DataFrame.from_dict(response["value"])