In [11]:
import geopandas as gpd
import shapely
import pandas as pd
import datetime
import xarray as xr
import numpy as np
import os
import tqdm
import gc

In [12]:
import sys

# Make the parent folder importable so the project-local `rsutils` package
# resolves. The membership guard keeps the cell idempotent: re-running it does
# not pile up duplicate '..' entries in sys.path.
if '..' not in sys.path:
    sys.path.append('..')

In [13]:
import rsutils.modify_images
import rsutils.utils

In [14]:
# Root folder that holds every input and output of this notebook. Changing
# this single constant relocates all the paths derived from it below.
DATA_FOLDERPATH = '/gpfs/data1/cmongp2/sasirajann/nh_crop_calendar/crop_calendar/data'

# Shapefile with the polygons of the region of interest.
REGION_SHAPEFILEPATH = os.path.join(
    DATA_FOLDERPATH,
    'shapefiles', 'AfSP012Qry_ISRIC', 'GIS_Shape', 'AfSP012Qry_SubSaharanAfrica.shp',
)
# CSV catalog of weather rasters; the notebook reads its 'date', 'attribute'
# and 'tif_filepath' columns.
WEATHER_CATALOG_FILEPATH = os.path.join(DATA_FOLDERPATH, 'weather_catalog_1994-2025.csv')
# Raster passed as `ref_filepath` when resampling every loaded image.
REFERENCE_TIF_FILEPATH = os.path.join(
    DATA_FOLDERPATH, 'ref_mod09.ndvi.global_0.05_degree.2019.001.c6.v1.tif',
)

# Value forwarded as `njobs` to the rsutils image loading / modification helpers.
NJOBS = 120

# Region label; used for the output sub-folder and the exported GeoJSON name.
REGION = 'sub-saharan-africa'

OUTPUT_FOLDERPATH = os.path.join(DATA_FOLDERPATH, 'outputs', REGION)

os.makedirs(OUTPUT_FOLDERPATH, exist_ok=True)

In [15]:
def create_attribute_datacube(
    catalog_df:pd.DataFrame,
    region_gdf:gpd.GeoDataFrame,
    export_filepath:str,
):
    """Stack the rasters listed in a catalog into one NetCDF datacube.

    The catalog is sorted by date, every raster in its 'tif_filepath' column
    is loaded for the region, resampled against REFERENCE_TIF_FILEPATH and
    cropped to the region, and the results are concatenated along axis 0 into
    a DataArray with dims ('timestamps', 'height', 'width').

    Parameters
    ----------
    catalog_df : pd.DataFrame
        Must contain a datetime-like 'date' column and a 'tif_filepath'
        column. The caller's frame is not modified.
    region_gdf : gpd.GeoDataFrame
        Shapes passed to the rsutils loading and cropping helpers.
    export_filepath : str
        Destination of the NetCDF file.

    Raises
    ------
    ValueError
        If the 'date' column is missing or not datetime-like, if no image
        could be loaded, or if the stacked array does not have exactly one
        layer per remaining catalog row.
    """
    if 'date' not in catalog_df.columns:
        raise ValueError("'date' column is missing from catalog_df")

    # Only the failure of the `.dt` accessor is translated; anything else
    # (e.g. KeyboardInterrupt) propagates instead of being masked.
    try:
        catalog_df['date'].dt
    except AttributeError as exc:
        raise ValueError("'date' column in catalog_df is not of datetype") from exc

    # sort_values returns a new frame, so the caller's catalog stays untouched.
    catalog_df = catalog_df.sort_values(by='date')

    data_profile_list = rsutils.modify_images.load_images(
        src_filepaths = catalog_df['tif_filepath'],
        shapes_gdf = region_gdf,
        njobs = NJOBS,
        raise_error = False,
    )

    # Entries whose data is None are dropped together with their catalog rows
    # (presumably these are images that failed to load, since
    # raise_error=False -- TODO confirm against rsutils).
    loaded_mask = np.array(
        [data is not None for data, _ in data_profile_list],
        dtype = bool,
    )
    catalog_df = catalog_df.loc[loaded_mask]

    data_profile_list = [
        (data, profile)
        for data, profile in data_profile_list
        if data is not None
    ]

    if not data_profile_list:
        raise ValueError('None of the images listed in catalog_df could be loaded')

    data_profile_list = rsutils.modify_images.modify_images_inplace(
        data_profile_list = data_profile_list,
        njobs = NJOBS,
        sequence = [
            (rsutils.modify_images.resample_by_ref, dict(ref_filepath=REFERENCE_TIF_FILEPATH)),
            (rsutils.modify_images.crop, dict(shapes_gdf=region_gdf, all_touched=True)),
        ],
        raise_error = True,
    )

    nparray = np.concatenate([data for data, _ in data_profile_list], axis=0)

    # uint8 data is widened to int16 before export
    # (presumably for NetCDF dtype compatibility -- TODO confirm).
    if nparray.dtype == np.uint8:
        nparray = nparray.astype(np.int16)

    # The 'timestamps' coordinate needs exactly one layer per catalog row.
    if nparray.shape[0] != len(catalog_df):
        raise ValueError(
            f'Stacked array has {nparray.shape[0]} layers but the catalog has '
            f'{len(catalog_df)} rows; expected one layer per image'
        )

    dataarray = xr.DataArray(
        data = nparray,
        dims = ('timestamps', 'height', 'width'),
        coords = {
            'timestamps': catalog_df['date'].to_list(),
        }
    )

    # Write to a sibling temp file and rename it into place, so an interrupted
    # export never leaves a partial file at export_filepath that a later
    # existence check would mistake for a finished datacube.
    temp_filepath = f'{export_filepath}.tmp'
    try:
        dataarray.to_netcdf(temp_filepath)
        os.replace(temp_filepath, export_filepath)
    finally:
        # Only still present when the write or the rename failed.
        if os.path.exists(temp_filepath):
            os.remove(temp_filepath)

In [16]:
# Region polygons; a GeoJSON copy is stored in the output folder.
region_gdf = gpd.read_file(REGION_SHAPEFILEPATH)

# The driver is named explicitly instead of relying on it being inferred
# from the '.geojson' extension.
region_gdf.to_file(
    os.path.join(OUTPUT_FOLDERPATH, f'{REGION}.geojson'),
    driver = 'GeoJSON',
)

weather_catalog_df = pd.read_csv(WEATHER_CATALOG_FILEPATH)

# Vectorized parse of the 'YYYY-MM-DD' strings into datetime64 values.
# Missing entries become NaT rather than raising a TypeError.
weather_catalog_df['date'] = pd.to_datetime(
    weather_catalog_df['date'],
    format = '%Y-%m-%d',
)

In [None]:
# Merge every region geometry into a single shape. Being the last expression
# of the cell, the result is what the notebook displays.
region_geometries = region_gdf['geometry']
shapely.unary_union(region_geometries)

In [None]:
# Number of catalog rows per value of the 'attribute' column.
attribute_counts = weather_catalog_df['attribute'].value_counts()
attribute_counts

In [None]:
# Export one NetCDF datacube per distinct attribute in the weather catalog.
attributes = weather_catalog_df['attribute'].unique().tolist()

for attribute in tqdm.tqdm(attributes):
    # To leave out specific attributes (e.g. 'esi-4wk'), add a membership
    # check here that prints a message and continues to the next one.

    print(f'Creating datacube for attribute = {attribute}')
    export_filepath = os.path.join(OUTPUT_FOLDERPATH, f'{attribute}.nc')

    if not os.path.exists(export_filepath):
        # Catalog rows that belong to the attribute being processed.
        is_current_attribute = weather_catalog_df['attribute'] == attribute

        create_attribute_datacube(
            catalog_df = weather_catalog_df[is_current_attribute],
            region_gdf = region_gdf,
            export_filepath = export_filepath,
        )

        # Collect garbage after each export, before the next attribute starts.
        gc.collect()

        print('Created !')
    else:
        # An export from an earlier run is kept as-is.
        print('Already exists.')