In [None]:
# Populate the process environment from a local .env file before any cell
# reads settings through os.getenv (e.g. GOOGLE_CLOUD_PROJECT further down).
from dotenv import load_dotenv
load_dotenv()

In [None]:
import ee, eemont
from forestry_carbon_arr.core import ForestryCarbonARR
from forestry_carbon_arr.utils.zarr_utils import save_dataset_efficient_zarr, load_dataset_zarr

import gcsfs
import os

# Credentials file for GCS access. The path can be overridden with the
# GCS_TOKEN_PATH environment variable so the notebook is not tied to one
# container layout; the default keeps the original location.
gcs_token_path = os.getenv("GCS_TOKEN_PATH", '/usr/src/app/user_id.json')
fs = gcsfs.GCSFileSystem(project=os.getenv("GOOGLE_CLOUD_PROJECT"), token=gcs_token_path)

# Location of the training time-series dataset stored as Zarr on GCS.
gcs_path_ds_train = 'gs://remote_sensing_saas/01-korindo/timeseries_zarr/ds_train.zarr'

# Load the training dataset through the project helper.
ds_train = load_dataset_zarr(gcs_path_ds_train)

# Build the project object from its JSON config and initialise Earth Engine
# before any later cell uses GEE-backed utilities.
forestry = ForestryCarbonARR(config_path='./00_input/korindo.json')
forestry.initialize_gee()

In [None]:
# aoi
from forestry_carbon_arr.core.utils import DataUtils
import geopandas as gpd
import geemap

# Load the area of interest from the configured path. The helper returns a
# GeoDataFrame plus a second object (presumably the Earth Engine counterpart
# of the same AOI -- confirm against DataUtils.load_geodataframe_gee).
data_utils = DataUtils(forestry.config, use_gee=True)
aoi_gpd, aoi_ee = data_utils.load_geodataframe_gee(forestry.config["AOI_path"])

# Reproject to EPSG:32749 (WGS 84 / UTM zone 49S), a metric CRS, so that
# geometry areas come out in square metres.
aoi_gpd_utm = aoi_gpd.to_crs(epsg=32749)

# Area is converted from m^2 to hectares (1 ha = 10,000 m^2).
print(f"✅ AOI loaded: {len(aoi_gpd_utm)} features")
print(f"   Area: {aoi_gpd_utm.geometry.area.sum()/10000:.2f} hectares")

In [None]:
from tsfresh import extract_features
from tsfresh.utilities.dataframe_functions import impute

def get_selected_ts_features_multi(
    X_array,
    selected_features,
    channel_names=("EVI", "NDVI"),
    n_jobs=40,
    chunksize=2_000,
):
    """
    Extract a fixed set of tsfresh features from a multi-channel time-series array.

    Each channel of ``X_array`` is reshaped to tsfresh's long format
    (``id``, ``time``, ``value``, ``kind``) and the same feature configuration
    is applied to every channel.

    Parameters
    ----------
    X_array : np.ndarray
        Array with shape (n_samples, seq_length, n_channels).
    selected_features : dict
        Dict with a ``'value'`` key holding the tsfresh feature-calculator
        settings (e.g. ``selected_features_small``).
    channel_names : sequence of str
        One name per channel; its length must equal ``X_array.shape[-1]``.
    n_jobs : int
        Number of worker processes passed to ``extract_features``.
        Defaults to the previously hard-coded 40.
    chunksize : int
        Chunk size passed to ``extract_features``.
        Defaults to the previously hard-coded 2000.

    Returns
    -------
    pandas.DataFrame
        One row per sample id, sorted by id, holding the extracted features
        for all channels, with NaN/inf values replaced by ``impute``.

    Raises
    ------
    ValueError
        If ``X_array`` is not 3-dimensional.
    AssertionError
        If ``channel_names`` does not match the number of channels.
    """
    # Imported locally so the function does not depend on a notebook cell
    # further down having imported pandas first.
    import pandas as pd

    if X_array.ndim != 3:
        raise ValueError(
            f"X_array must have shape (n_samples, seq_length, n_channels); got {X_array.shape}"
        )
    n_channels = X_array.shape[-1]
    assert n_channels == len(channel_names), "channel_names must match last dim of X_array"

    long_frames = []
    for ch_idx, ch_name in enumerate(channel_names):
        # Wide frame for one channel: rows are samples, columns are timesteps.
        wide = pd.DataFrame(X_array[:, :, ch_idx])
        wide["id"] = wide.index
        # Long format: one row per (sample id, timestep) pair.
        long_frame = wide.melt(id_vars="id", var_name="time", value_name="value")
        long_frame["kind"] = ch_name
        long_frames.append(long_frame)

    df_long_all = pd.concat(long_frames, ignore_index=True)

    # Use the same feature config for each kind (EVI, NDVI)
    fc_params_per_kind = {ch: selected_features["value"] for ch in channel_names}

    extracted = extract_features(
        df_long_all,
        column_id="id",
        column_sort="time",
        column_kind="kind",
        column_value="value",
        chunksize=chunksize,
        n_jobs=n_jobs,
        kind_to_fc_parameters=fc_params_per_kind,
    )

    # Callers attach the result positionally (via .values), so make sure row i
    # corresponds to sample i regardless of the order tsfresh returns ids in.
    extracted = extracted.sort_index()

    # impute() replaces NaN/inf in place.
    impute(extracted)
    return extracted

In [None]:
## FEATURE EXTRACTION, feature
# tsfresh feature-calculator settings for the compact ("small") feature set.
# Defined in its own cell ahead of the extraction call so the configuration
# already exists when features are computed. Entry order is kept as authored.
selected_features_small = {
    'value': {
        'minimum': None,
        'quantile': [{'q': q} for q in (0.1, 0.2, 0.3)],
        'variation_coefficient': None,
        'ar_coefficient': [{'coeff': coeff, 'k': 10} for coeff in (1, 5, 4)],
        'cwt_coefficients': [
            {'coeff': coeff, 'w': 2, 'widths': (2, 5, 10, 20)}
            for coeff in (1, 6, 7, 5, 3)
        ],
        'energy_ratio_by_chunks': [
            {'num_segments': 10, 'segment_focus': focus}
            for focus in (3, 5, 9, 7, 6)
        ],
        'fft_coefficient': [
            {'attr': attr, 'coeff': coeff}
            for attr, coeff in (
                ('real', 2),
                ('imag', 36),
                ('abs', 2),
                ('real', 5),
                ('angle', 13),
                ('real', 10),
                ('angle', 5),
                ('abs', 0),
                ('abs', 3),
                ('angle', 4),
                ('imag', 35),
                ('imag', 5),
                ('imag', 4),
                ('angle', 23),
                ('imag', 2),
            )
        ],
        'change_quantiles': [
            {'f_agg': f_agg, 'isabs': isabs, 'qh': qh, 'ql': ql}
            for f_agg, isabs, qh, ql in (
                ('mean', True, 0.6, 0.0),
                ('var', False, 0.8, 0.0),
                ('mean', True, 0.8, 0.0),
                ('var', False, 0.6, 0.0),
                ('mean', True, 1.0, 0.0),
                ('var', False, 0.8, 0.4),
                ('var', False, 1.0, 0.0),
            )
        ],
        'benford_correlation': None,
        'spkt_welch_density': [{'coeff': 5}],
        'mean': None,
        'partial_autocorrelation': [{'lag': 3}],
        'number_cwt_peaks': [{'n': 1}],
        'first_location_of_maximum': None,
        'cid_ce': [{'normalize': False}],
        'agg_linear_trend': [
            {'attr': attr, 'chunk_len': chunk_len, 'f_agg': f_agg}
            for attr, chunk_len, f_agg in (
                ('intercept', 5, 'min'),
                ('slope', 50, 'max'),
                ('intercept', 50, 'mean'),
            )
        ],
    }
}

feature_type_count = len(selected_features_small['value'])
print("Feature configuration loaded successfully")
print(f"Number of feature types: {feature_type_count}")

In [None]:
# Show system memory usage in human-readable units before the feature
# extraction cell below runs.
!free -h

In [None]:
import pandas as pd
import numpy as np

# Extract the compact tsfresh feature set for both vegetation-index channels.
X_features_small = get_selected_ts_features_multi(
    ds_train.X.values, selected_features_small, channel_names=("EVI", "NDVI")
)

# The feature matrix is attached positionally below, so it must contain
# exactly one row per sample of ds_train.X.
assert X_features_small.shape[0] == ds_train.X.shape[0], (
    "feature rows must match the number of samples in ds_train.X"
)

# Store the matrix on its own feature dimension. The previous label
# "features_big" did not match this variable and would clash with any existing
# "features_big" dimension of a different length.
ds_train["X_features_small"] = (("sample", "features_small"), X_features_small.values)