In [35]:
from maap.maap import MAAP
# maap = MAAP(maap_host='api.maap-project.org')
# Create the MAAP client with no explicit host (the explicit-host form is kept above for reference).
maap = MAAP()
# Display the API host the client ended up configured with.
maap._MAAP_HOST

'api.maap-project.org'

# Run `build_stack_list` locally for a list of dicts

In [36]:
# For some reason this is needed to get s3fs to work in ExtractUtils
# this upgrades to 0.3.4 even though we already specify this version in requirements_main...
#!pip install s3fs --upgrade

In [37]:
import rasterstats
import os
import geopandas as gpd
import pandas as pd
import glob
import datetime
!pip install xmltodict
import xmltodict
import sys
sys.path.append('/projects/code/icesat2_boreal/lib')
import ExtractUtils
import build_stack
import random

from multiprocessing import Pool
from functools import partial

import contextily as ctx

[0m

### To run build_stack.py across a tiled raster dataset you need a bunch of args that we'll gather into a dictionary

To read a tiled dataset from s3 you need to have a vector footprint of that dataset

#### Dictionary preparation makes this script very flexible and transferable to another s3 dataset
The dictionaries below are specific to the terrapulse `tcc_slope` and `tcc_pvalue` datasets.  
To run `build_stack.py` across another dataset, just prepare another dictionary here and everything below should be exactly the same.  

In [38]:
# Placeholder basin id used to seed VECTOR_DICT['TILE_NUM']; run_build_stack supplies the real HYBAS_ID for each call.
TILE_NUM = 8080279620

In [39]:
# Vector index of basins: one polygon per HYBAS_ID.
INDEX_FN = 'https://maap-ops-workspace.s3.amazonaws.com/shared/montesano/databank/boreal_height_cmip6/hydrobasins_L08_patterns_tte_boreal_tundra_3995.gpkg'
INDEX_LYR = 'hydrobasins_L08_patterns_tte_boreal_tundra_3995'

# Output locations: per-basin raster clips and per-basin zonal statistics tables.
OUTDIR = '/projects/my-public-bucket/databank/boreal_height_cmip6/output/build_stack_basin_clips'
ZONAL_DIR = '/projects/my-public-bucket/databank/boreal_height_cmip6/output/zonal_stats'

# Describes the vector layer whose features define the clip extents.
VECTOR_DICT = {
    'INDEX_FN': INDEX_FN,
    'ID_COL_NAME': 'HYBAS_ID',
    'TILE_NUM': TILE_NUM,
    'INDEX_LYR': INDEX_LYR,
}

# One dictionary per covariate raster. The entries differ only in the raster
# name and in the footprint file; data is accessed via its footprint, with a
# 's3_path' col identifying the s3 locations of each tile.
BUILD_STACK_DICT_LIST = [
    {
        'RASTER_NAME': raster_name,
        'COVAR_TILE_FN': footprint_fn,
        'IN_COVAR_S3_COL': 's3_path',
        'OUTDIR': OUTDIR,
        'NODATA_VAL': 255,
        'OUTPUT_CLIP_COG_FN': '',
        'CREDENTIALS_FN': None,
    }
    for raster_name, footprint_fn in (
        # tcc slope
        (
            'terrapulse_tcc_slope',
            'https://maap-ops-workspace.s3.amazonaws.com/shared/montesano/databank/footprints/footprints_terrapulse-pub-data_tcc_slope-s3.gpkg',
        ),
        # tcc pvalue
        (
            'terrapulse_tcc_pvalue',
            'https://maap-ops-workspace.s3.amazonaws.com/shared/montesano/databank/footprints/footprints_terrapulse-pub-data_tcc_pvalue-s3.gpkg',
        ),
    )
]

In [40]:
# Read the basin index and collect every HYBAS_ID it contains.
HYBAS_ID_LIST = gpd.read_file(INDEX_FN)['HYBAS_ID'].tolist()
# Cell output: number of basins in the index.
len(HYBAS_ID_LIST)

17052

In [41]:
import importlib
# Re-import build_stack so that edits made to the module on disk are picked up by this session.
importlib.reload(build_stack)

def run_build_stack(HYBAS_ID):
    """Build the clipped covariate stack for a single basin.

    Calls ``build_stack.build_stack_list`` with every covariate listed in
    ``BUILD_STACK_DICT_LIST``, clipping to the basin identified by
    ``HYBAS_ID`` (30 m resolution, no buffer, no dataframe output).

    Parameters
    ----------
    HYBAS_ID
        Basin identifier, passed on as the ``TILE_NUM`` entry of the vector
        dictionary.

    Notes
    -----
    The module-level ``VECTOR_DICT`` is used as a template only: a per-call
    copy carries the basin id, so the shared dictionary is never mutated.
    Output goes to the module-level ``OUTDIR``, the same directory the
    covariate dictionaries reference.
    """
    vector_dict = {**VECTOR_DICT, 'TILE_NUM': HYBAS_ID}
    build_stack.build_stack_list(
        covar_dict_list=BUILD_STACK_DICT_LIST,
        vector_dict=vector_dict,
        tile_buffer_m=0,
        res=30, clip=True,
        output_dir=OUTDIR,
        height=None, width=None,
        MAKE_DF=False
    )
    
def rename_columns(GDF, bandname, stats_list):
    """Prefix statistic columns with ``val_<bandname>_``.

    Each column whose name appears in ``stats_list`` is renamed to
    ``'val_' + bandname + '_' + <stat>``; all other columns are left alone.
    When ``stats_list`` is ``None`` the input frame is returned unchanged.
    """
    if stats_list is None:
        return GDF

    column_map = {stat: 'val_' + bandname + '_' + stat for stat in stats_list}
    return GDF.rename(columns=column_map)

def extract_zonal_gdf_poly(HYBAS_ID, bandnames: list, GDF_fn, 
                           DATA_DIR = '/projects/my-public-bucket/databank/boreal_height_cmip6/output/build_stack_basin_clips',
                           OUTDIR = '/projects/my-public-bucket/databank/boreal_height_cmip6/output/zonal_stats',
                           ndval_list=[255,-9999], 
                           stats_list = ['max','min','median','mean','percentile_02','percentile_25','percentile_75','percentile_98','count']):
    """Compute per-band zonal statistics of one basin's raster stack.

    Finds the first ``*.tif`` in ``DATA_DIR`` whose name contains ``HYBAS_ID``,
    computes ``stats_list`` for every band over the matching basin feature(s),
    writes the result as ``<raster basename>_zonalstats.gpkg`` in ``OUTDIR``
    and returns it.

    Parameters
    ----------
    HYBAS_ID
        Basin identifier; used both in the raster file-name glob and to filter
        the ``HYBAS_ID`` column of the vector file.
    bandnames : list
        One name per raster band, in band order (``bandnames[0]`` labels band 1).
        Output columns are named ``val_<bandname>_<stat>``.
    GDF_fn
        Path or URL of the vector file read with ``geopandas.read_file``.
    DATA_DIR
        Directory searched for the basin raster.
    OUTDIR
        Directory receiving the GeoPackage; created if it does not exist.
    ndval_list
        Pixel values replaced by NaN before statistics are computed.
    stats_list
        Statistic names passed to ``rasterstats.zonal_stats``.

    Returns
    -------
    The filtered basin GeoDataFrame (index reset) joined with the statistics
    columns of every band.

    Raises
    ------
    IndexError
        If no raster matching ``HYBAS_ID`` exists in ``DATA_DIR``.
    """
    # Imported here rather than at module level, as in the original cell.
    from rasterstats import zonal_stats
    import numpy as np
    import rasterio

    matches = glob.glob(f'{DATA_DIR}/*{HYBAS_ID}*.tif')
    if not matches:
        # Same exception type as the bare [0] lookup, but with a usable message.
        raise IndexError(f'No raster matching *{HYBAS_ID}*.tif found in {DATA_DIR}')
    r_fn = matches[0]

    GDF = gpd.read_file(GDF_fn)
    GDF = GDF[GDF.HYBAS_ID == HYBAS_ID]

    with rasterio.open(r_fn) as r_src:
        print("\tExtracting raster values from: ", r_fn)

        # Reproject the basin geometry once; it is the same for every band.
        zones = GDF.to_crs(r_src.crs)

        df_list = []
        for bnum, bandname in enumerate(bandnames, start=1):
            # Work on a float copy so nodata values can be replaced by NaN.
            array = r_src.read(bnum).astype('float64')
            for ndval in ndval_list:
                array[array == ndval] = np.nan

            df = pd.DataFrame(
                zonal_stats(
                    vectors=zones,
                    raster=array,
                    affine=r_src.transform,
                    stats=stats_list,
                    nodata=np.nan
                )
            )

            # Rename cols so each band's statistics stay distinguishable
            df_list.append(rename_columns(df, bandname, stats_list))

    # Side-by-side stats for all bands, aligned positionally with the basin rows.
    df_final = pd.concat(df_list, axis=1)
    hybas_id_gdf = GDF.reset_index().join(df_final.reset_index(drop=True), how='left')

    # Write the table of zonal stats on each band for current HYBAS_ID
    os.makedirs(OUTDIR, exist_ok=True)
    out_fn = os.path.join(OUTDIR, os.path.basename(r_fn).split('.tif')[0] + '_zonalstats.gpkg')
    hybas_id_gdf.to_file(out_fn, driver = 'GPKG')

    return hybas_id_gdf


### Plot basins

In [8]:
# #zonal_smry_gdf = pd.concat(returned_stuff)
# with Pool(processes=25) as pool:
#     gdf_list = pool.map(partial(gpd.read_file), glob.glob(ZONAL_DIR + '/*.gpkg'))
# #zonal_smry_gdf = pd.concat([gpd.read_file(f) for f in glob.glob(OUTDIR + '/*.gpkg')])
# zonal_smry_gdf = pd.concat(gdf_list)                       
# ax = zonal_smry_gdf.to_crs(4326).plot(column='val_terrapulse_tcc_slope_median', cmap='BrBG', legend=True, vmin=-0.5, vmax=0.5)
# ax = ctx.add_basemap(ax=ax, crs=4326, source = ctx.providers.Esri.WorldGrayCanvas)

In [9]:
# HYBAS_IDs already present in the part-1 trends GeoPackage; they are counted as finished when the missing lists are built.
LIST_GPKG_DONE = gpd.read_file('/projects/my-public-bucket/databank/boreal_height_cmip6/output/hydrobasins_L08_patterns_tte_boreal_tundra_3995_tcc_trends_part1.gpkg').HYBAS_ID.to_list()

In [44]:
# Get missing
# Basin ids are parsed from the third '_'-separated token of each output file name.
# Membership tests below go through sets: scanning a list for each id is
# quadratic with ~17k basins, while the resulting lists (and their order) are unchanged.

LIST_HYBAS_ID_COG_FINISHED = [int(os.path.basename(f).split('_')[2]) for f in glob.glob(OUTDIR + '/*cog.tif')]
print(f'{len(LIST_HYBAS_ID_COG_FINISHED)} basin clip cogs that have been processed.')

_cog_finished_ids = set(LIST_HYBAS_ID_COG_FINISHED)
LIST_HYBAS_ID_COG_MISSING = [hybas_id for hybas_id in HYBAS_ID_LIST if hybas_id not in _cog_finished_ids]
print(f'{len(LIST_HYBAS_ID_COG_MISSING)} basin clip cogs still missing')

# Finished zonal tables = per-basin GPKGs on disk plus the ids already in the part-1 file.
LIST_HYBAS_ID_GPKG_FINISHED = [int(os.path.basename(f).split('_')[2]) for f in glob.glob(ZONAL_DIR + '/*cog_zonalstats.gpkg')] + LIST_GPKG_DONE
print(f'{len(LIST_HYBAS_ID_GPKG_FINISHED)} zonal GPKGs that have been processed.')

# Only basins that already have a clipped COG can be processed for zonal stats.
_gpkg_finished_ids = set(LIST_HYBAS_ID_GPKG_FINISHED)
LIST_HYBAS_ID_GPKG_MISSING = [hybas_id for hybas_id in LIST_HYBAS_ID_COG_FINISHED if hybas_id not in _gpkg_finished_ids]
print(f'{len(LIST_HYBAS_ID_GPKG_MISSING)} zonal GPKGs available to process ')

15068 basin clip cogs that have been processed.
1984 basin clip cogs still missing
9105 zonal GPKGs that have been processed.
5963 zonal GPKGs available to process 


### Step [1]: build stack of tcc_slope and tcc_pvalue for basins

In [None]:
#%%capture captured_output_text

# Fan the basins that still lack a clipped COG out to 25 worker processes.
# The return values of run_build_stack are not collected.
with Pool(processes=25) as pool:
    pool.map(run_build_stack, LIST_HYBAS_ID_COG_MISSING)

### Step [2]: get zonal stats of stack for basin

In [43]:
%%capture captured_output_text

# Band names follow the order of the covariate dictionaries.
band_names = [covar['RASTER_NAME'] for covar in BUILD_STACK_DICT_LIST]

# Fix every argument except the basin id, which pool.map supplies per task.
zonal_task = partial(extract_zonal_gdf_poly, bandnames=band_names, GDF_fn=INDEX_FN)

with Pool(processes=25) as pool:
    returned_stuff = pool.map(zonal_task, LIST_HYBAS_ID_GPKG_MISSING)

KeyboardInterrupt: 

In [35]:
# Send SIGKILL to every PID in the hard-coded range 19093-19118.
# NOTE(review): presumably these were Pool worker processes left behind by the interrupted run — confirm the PIDs before re-running this cell.
for j in list(range(19093, 19119)):
    !kill -9 $j

/bin/bash: line 1: kill: (19113) - No such process
