In [None]:
import xarray as xr
import s3fs
import datetime as dt
import rioxarray as rio
import geopandas as gpd
from rasterio.enums import Resampling
from pathlib import Path 

In [None]:

# Medallion-style folder layout: untouched downloads go under "raw",
# processed outputs under "cleaned".
raw_dir = Path('./data/raw')
cleaned_dir = Path('./data/cleaned')

# Timestamp of this run, taken from the local clock (naive datetime);
# every dated path and file name below is derived from it.
lag0days_dt = dt.datetime.today()

# YYYY/MM/DD sub-path shared by the raw and the cleaned tree.
day_subpath = lag0days_dt.strftime('%Y/%m/%d')
raw_day_dir = raw_dir / day_subpath
cleaned_day_dir = cleaned_dir / day_subpath

# Create both per-day folders (raw first, then cleaned); already existing
# folders are left as they are.
for day_dir in (raw_day_dir, cleaned_day_dir):
    day_dir.mkdir(parents=True, exist_ok=True)

# Variable names selected from the hourly files further down.
hourly_variables = [
    "PP", "HR2", "T2", "dirViento10", "magViento10", "PSFC",
    "ACLWDNB", "ACLWUPB", "ACSWDNB", "TSLB", "SMOIS",
]
# Variable names of the daily product.
daily_variables = ["Tmax", "Tmin"]

In [None]:
# Build the S3 URL of each of the 24 hourly files (forecast hours 001-024)
# of the 00 run for the date held in lag0days_dt.
day_path = lag0days_dt.strftime('%Y/%m/%d')
day_stamp = lag0days_dt.strftime('%Y%m%d')
hourly_wrf_urls = [
    f"s3://smn-ar-wrf/DATA/WRF/DET/{day_path}/00/WRFDETAR_01H_{day_stamp}_00_{fhr:03d}.nc"
    for fhr in range(1, 25)
]

# Anonymous (unsigned) access to the bucket.
fs = s3fs.S3FileSystem(anon=True, requester_pays=False)

# Download only the files that are not yet present in the raw day folder.
# The previous guard skipped the whole download as soon as a single .nc file
# existed, so an interrupted run was never completed and later steps would
# silently aggregate fewer than 24 hours.
# NOTE(review): presence is checked by file name only; a truncated file left
# behind by an aborted transfer is not detected here.
missing_urls = []
missing_targets = []
for url in hourly_wrf_urls:
    target = raw_day_dir / url.rsplit('/', 1)[-1]
    if not target.exists():
        missing_urls.append(url)
        missing_targets.append(str(target))

if missing_urls:
    # Explicit one-to-one remote/local lists, so each file lands at its own
    # path regardless of how many files are still missing.
    fs.get(missing_urls, missing_targets)

In [None]:
# Gather the NetCDF files of the day and open them lazily as one combined
# dataset, then keep only the hourly variables of interest.
hourly_wrf_files = list(raw_day_dir.glob("*.nc"))
daily_data = xr.open_mfdataset(
    hourly_wrf_files,
    chunks='auto',
    decode_coords=False,
)
daily_data = daily_data[hourly_variables]

# Collapse the time dimension: precipitation ("PP") is summed, every other
# variable is reduced with the median. Both results are merged back into a
# single dataset.
daily_sum_pp = daily_data["PP"].sum(dim="time")
without_pp = daily_data.drop_vars("PP")
daily_median_others = without_pp.median(dim="time")
daily_data = xr.combine_by_coords([daily_sum_pp, daily_median_others])

# Read the CRS from the "PP" layer (first time step) of the first file
# returned by the glob and attach it to the aggregated dataset.
img_for_crs = rio.open_rasterio(hourly_wrf_files[0], decode_times=False)
pp_first_step = img_for_crs.isel(time=0)['PP']
img_crs = pp_first_step.rio.crs
daily_data = daily_data.rio.write_crs(img_crs)

In [None]:
# Load the province polygons and dissolve them into a single outline that is
# used as the clipping mask. The path is derived from cleaned_dir so it
# follows the folder layout configured at the top of the notebook
# (it resolves to data/cleaned/reference/province.parquet).
argentina_pqt = cleaned_dir / 'reference' / 'province.parquet'
gdf_argentina = gpd.read_parquet(argentina_pqt).dissolve()

# Target CRS of the output raster: the CRS of the reference polygons.
# to_epsg() returns None when the CRS has no matching EPSG code, which would
# produce the invalid string "epsg:None"; fall back to the WKT definition.
argentina_crs = gdf_argentina.crs.to_epsg()
if argentina_crs is not None:
    dst_crs = f"epsg:{argentina_crs}"
else:
    dst_crs = gdf_argentina.crs.to_wkt()

# Express the outline in the raster's CRS and extract its GeoJSON-like
# geometry through the public __geo_interface__ protocol (instead of the
# private GeoDataFrame._to_geo()).
outline_in_raster_crs = gdf_argentina.to_crs(daily_data.rio.crs)
clip_geojson = [outline_in_raster_crs.__geo_interface__['features'][0]['geometry']]

# Clip to the outline (pixels touched by the boundary are kept), then
# reproject into the target CRS with bilinear resampling.
daily_data = daily_data.rio.clip(clip_geojson, from_disk=True, all_touched=True)
# NOTE(review): resolution=4000 is expressed in units of the target CRS; this
# presumably assumes a projected CRS in metres - confirm against the parquet.
daily_data = daily_data.rio.reproject(
    dst_crs=dst_crs,
    resolution=4000,
    resampling=Resampling.bilinear,
)

# Write the result as a Cloud Optimized GeoTIFF into the cleaned day folder.
output_cog = cleaned_day_dir / f"WRFDETAR_{lag0days_dt.strftime('%Y%m%d')}.tif"
daily_data.rio.to_raster(output_cog, driver='COG')
