# Spatial ETL Notebook UI
Use this notebook to authenticate, configure, and run the ETL pipeline without editing YAML files.

1. Run the first cell to ensure project paths are set (and optionally trigger authentication).
2. Run the second cell to launch the widget UI, choose AOI/variables/year/season/CRS/storage, and execute the job.

In [1]:
# Cell 1: environment/authentication setup
import sys
import subprocess
from pathlib import Path

# Use the working directory as project root when it contains `src`;
# otherwise fall back to its parent directory.
_cwd = Path.cwd().resolve()
PROJECT_ROOT = _cwd if (_cwd / 'src').exists() else _cwd.parent
SRC_PATH = PROJECT_ROOT.joinpath('src')

# Put `src` at the front of the import path once, and only if it exists.
_src_entry = str(SRC_PATH)
if SRC_PATH.exists() and _src_entry not in sys.path:
    sys.path.insert(0, _src_entry)

print('Project root:', PROJECT_ROOT)
print('Ensure `earthengine authenticate` has been run (either here or in a terminal).')

# Flip to True to launch the Earth Engine CLI auth flow from this cell.
RUN_GEE_AUTH = False
if not RUN_GEE_AUTH:
    print('Set RUN_GEE_AUTH=True above to run auth here, or run `earthengine authenticate` in a terminal.')
else:
    print('Starting Earth Engine auth flow...')
    # check=True raises CalledProcessError when the CLI exits non-zero.
    subprocess.run(['earthengine', 'authenticate'], check=True)


Project root: C:\Users\usuario\Documents\GitHub\spatial_data_mining
Ensure `earthengine authenticate` has been run (either here or in a terminal).
Set RUN_GEE_AUTH=True above to run auth here, or run `earthengine authenticate` in a terminal.


In [None]:
# Cell 2: interactive UI to configure and run the pipeline
import sys
import yaml
import ipywidgets as widgets
from IPython.display import display
from pathlib import Path
import importlib
from datetime import datetime

# ipyfilechooser is optional: FileChooser stays None when it cannot be imported,
# and HAS_FILE_CHOOSER records whether the import succeeded.
try:
    from ipyfilechooser import FileChooser
except Exception:
    FileChooser = None
HAS_FILE_CHOOSER = FileChooser is not None

# Use the working directory as project root when it contains `src`;
# otherwise fall back to its parent directory.
_cwd = Path.cwd().resolve()
PROJECT_ROOT = _cwd if (_cwd / 'src').exists() else _cwd.parent
SRC_PATH = PROJECT_ROOT.joinpath('src')

# Put `src` at the front of the import path once, and only if it exists.
_src_entry = str(SRC_PATH)
if SRC_PATH.exists() and _src_entry not in sys.path:
    sys.path.insert(0, _src_entry)

import spatial_data_mining.orchestrator as orchestrator
importlib.reload(orchestrator)
from spatial_data_mining.orchestrator import run_pipeline_from_dict
from spatial_data_mining.variables.metadata import VARIABLE_METADATA, get_variable_metadata

# Filesystem locations used by the UI, all anchored at the project root.
BASE_CONFIG_PATH = PROJECT_ROOT.joinpath('config', 'base.yaml')
AOI_DIR = PROJECT_ROOT.joinpath('data', 'aoi')
DEFAULT_OUTPUT_DIR = PROJECT_ROOT.joinpath('data', 'outputs')

# Variable names offered in the UI, in the order the metadata mapping yields them.
VARIABLE_OPTIONS = [*VARIABLE_METADATA.keys()]
# Season choices used when a variable's metadata does not provide its own.
DEFAULT_SEASONS = ['winter', 'spring', 'summer', 'autumn', 'annual']


def load_defaults():
    """Read UI defaults from ``BASE_CONFIG_PATH`` (``config/base.yaml``).

    Missing files, missing keys, and keys that are present but empty all fall
    back to built-in defaults.

    Returns:
        tuple: ``(allowed_crs, resolution, storage)`` where ``allowed_crs`` is
        the list of selectable CRS strings, ``resolution`` is the default
        resolution in metres, and ``storage`` is a dict whose ``output_dir``
        entry is an absolute path string.
    """
    data = {}
    if BASE_CONFIG_PATH.exists():
        with BASE_CONFIG_PATH.open('r', encoding='utf-8') as f:
            data = yaml.safe_load(f) or {}
    # A YAML document whose top level is a list/scalar has no `.get`; treat it as empty.
    if not isinstance(data, dict):
        data = {}

    # A key written with no value (e.g. `defaults:`) loads as None, so a
    # `.get(key, default)` would return None; use `or` to apply the fallback.
    defaults = data.get('defaults') or {}
    allowed_crs = defaults.get('allowed_crs') or ['EPSG:4326']
    resolution = defaults.get('resolution_m')
    if resolution is None:
        resolution = 20

    # Copy so the normalised output_dir does not mutate the parsed config.
    storage = dict(defaults.get('storage') or {'kind': 'local_cog'})
    out_dir = Path(storage.get('output_dir') or DEFAULT_OUTPUT_DIR)
    # Relative output directories are interpreted relative to the project root.
    if not out_dir.is_absolute():
        out_dir = PROJECT_ROOT / out_dir
    storage['output_dir'] = str(out_dir)
    return allowed_crs, resolution, storage


def list_aois():
    """Return the files directly inside ``AOI_DIR`` as a sorted list of paths.

    Sub-directories are skipped; an empty list is returned when ``AOI_DIR``
    does not exist.
    """
    if AOI_DIR.exists():
        files = [entry for entry in AOI_DIR.glob('*') if entry.is_file()]
        files.sort()
        return files
    return []


def year_range_for_variable(var_name: str):
    """Return ``(start_year, end_year)`` from the variable's ``temporal_coverage`` metadata.

    ``start_year`` defaults to 2000 and ``end_year`` to ``'present'``, which is
    replaced by the current calendar year. The end year is returned as ``int``.
    """
    coverage = get_variable_metadata(var_name).get('temporal_coverage', {})
    first_year = coverage.get('start_year', 2000)
    last_year = coverage.get('end_year', 'present')
    # 'present' is a sentinel for "up to the current year".
    resolved_end = datetime.now().year if last_year == 'present' else last_year
    return first_year, int(resolved_end)


def season_options_for_variable(var_name: str):
    """Return the variable's ``season_options`` metadata, or ``DEFAULT_SEASONS`` when absent or empty."""
    return get_variable_metadata(var_name).get('season_options') or DEFAULT_SEASONS


def intersect_years(vars_selected):
    """Return the years covered by every selected variable, in ascending order.

    With no variables selected the result spans 2000 through the current year.
    An empty list is returned when the variables' year ranges do not overlap.
    """
    if not vars_selected:
        return list(range(2000, datetime.now().year + 1))
    spans = [year_range_for_variable(name) for name in vars_selected]
    latest_start = max(first for first, _ in spans)
    earliest_end = min(last for _, last in spans)
    # range() is empty whenever latest_start > earliest_end, i.e. no overlap.
    return list(range(latest_start, earliest_end + 1))


def intersect_seasons(vars_selected):
    """Return the seasons supported by every selected variable, in a stable order.

    The result keeps the ordering of the first selected variable's season
    list, so the dropdown order (and the caller's ``seasons[0]`` fallback) is
    the same on every run instead of depending on set iteration order.

    Args:
        vars_selected: sequence of variable names; may be empty.

    Returns:
        list: a new list of season names. ``DEFAULT_SEASONS`` (copied) when no
        variable is selected, ``['annual']`` when the variables share no season.
    """
    if not vars_selected:
        # Return a copy so callers cannot mutate the module-level default.
        return list(DEFAULT_SEASONS)
    per_variable = [list(season_options_for_variable(v)) for v in vars_selected]
    reference = per_variable[0]
    shared = set(reference).intersection(*per_variable[1:])
    # dict.fromkeys drops duplicates while preserving the reference order.
    ordered = [name for name in dict.fromkeys(reference) if name in shared]
    return ordered or ['annual']


# Load configured defaults, then index the available AOI files by file name.
allowed_crs, default_resolution, storage_defaults = load_defaults()
aoi_paths = list_aois()
aoi_map = {path.name: path for path in aoi_paths}
aoi_options = [path.name for path in aoi_paths]

# --- Job identity and spatial settings ---
job_name = widgets.Text(description='Job name', value='notebook_job')
aoi_dropdown = widgets.Dropdown(description='AOI file', options=aoi_options)
target_crs = widgets.Dropdown(description='Target CRS', options=allowed_crs)
use_native_res = widgets.Checkbox(description='Use native resolution', value=False)
resolution = widgets.FloatText(description='Resolution (m)', value=default_resolution)

# --- Temporal settings and variable selection ---
# The year options start empty; update_time_controls() fills them in.
year = widgets.Dropdown(description='Year', options=[])
season = widgets.Dropdown(description='Season', options=DEFAULT_SEASONS, value='summer')
variables = widgets.SelectMultiple(
    description='Variables',
    options=VARIABLE_OPTIONS,
    value=('ndvi',),
)

# --- Storage settings ---
storage_kind = widgets.ToggleButtons(
    description='Storage',
    options=[('Local COG', 'local_cog'), ('GCS COG', 'gcs_cog')],
    value='local_cog',
)
output_dir = widgets.Text(
    description='Output dir',
    value=storage_defaults.get('output_dir', str(DEFAULT_OUTPUT_DIR)),
)
choose_output_btn = widgets.Button(
    description='Browse output dir',
    icon='folder-open',
    layout=widgets.Layout(width='200px'),
    style=widgets.ButtonStyle(button_color='#e8e8e8'),
)
gcs_bucket = widgets.Text(
    description='GCS bucket',
    value=storage_defaults.get('bucket', 'your-bucket'),
)
gcs_prefix = widgets.Text(
    description='GCS prefix',
    value=storage_defaults.get('prefix', 'spatial/outputs'),
)

# --- Execution controls ---
run_button = widgets.Button(description='Run pipeline', button_style='primary')
log_output = widgets.Output()

# Containers: chooser_box hosts the optional directory chooser, storage_box
# hosts whichever storage fields match the selected storage kind.
chooser_box = widgets.VBox()
storage_box = widgets.VBox()

if HAS_FILE_CHOOSER:
    def on_dir_select(picker):
        """Copy the directory selected in the chooser into the output-dir text box."""
        picked = picker.selected_path
        if picked:
            output_dir.value = picked

    dir_chooser = FileChooser(
        str(Path(output_dir.value).expanduser()),
        select_default=True,
        show_only_dirs=True,
    )
    dir_chooser.title = 'Select output directory'
    dir_chooser.use_dir_icons = True
    dir_chooser.register_callback(on_dir_select)
    chooser_box.children = (dir_chooser,)
else:
    # No chooser widget available; choose_output_dir() uses its fallback path.
    dir_chooser = None


def refresh_storage_fields(change=None):
    """Show the input fields that belong to the selected storage kind.

    ``change`` is accepted so the function can be used as a widget observer;
    its content is not read.
    """
    if storage_kind.value != 'local_cog':
        storage_box.children = (gcs_bucket, gcs_prefix)
        return
    path_row = widgets.HBox([output_dir, choose_output_btn])
    if HAS_FILE_CHOOSER and dir_chooser:
        # Stack the embedded directory chooser under the path row.
        storage_box.children = (widgets.VBox([path_row, chooser_box]),)
    else:
        storage_box.children = (path_row,)


def on_native_change(change):
    """Disable the resolution input while 'Use native resolution' is ticked.

    ``change`` is a mapping whose ``'new'`` entry holds the checkbox state.
    """
    use_native = change['new']
    resolution.disabled = use_native


def choose_output_dir(_):
    """Handle a click on the 'Browse output dir' button.

    When the embedded file chooser is available, toggle its visibility and,
    on showing it, reset it to the directory currently typed in ``output_dir``.
    Otherwise open a tkinter directory dialog; if that is unavailable, log a
    hint to ``log_output`` asking the user to type the path.

    Args:
        _: the clicked button (unused).
    """
    if HAS_FILE_CHOOSER and dir_chooser:
        # layout.display of None means "visible"; 'none' hides the box.
        chooser_box.layout.display = None if chooser_box.layout.display == 'none' else 'none'
        if chooser_box.layout.display is None:
            dir_chooser.reset(path=str(Path(output_dir.value).expanduser()))
        return
    try:
        import tkinter as tk
        from tkinter import filedialog
        root = tk.Tk()
        try:
            root.withdraw()
            chosen = filedialog.askdirectory(initialdir=output_dir.value or str(PROJECT_ROOT))
        finally:
            # Always tear the hidden root window down, even if the dialog fails,
            # so a failed attempt does not leave a Tk instance behind.
            root.destroy()
    except Exception as exc:
        with log_output:
            print('Directory picker unavailable in this environment. Please type the path manually.', exc)
        return
    # An empty result means the dialog was cancelled; keep the current value.
    if chosen:
        output_dir.value = chosen


def update_time_controls(_=None):
    """Refresh the year and season dropdowns for the currently selected variables.

    The year dropdown is repopulated with the years common to all selected
    variables (falling back to the current year when there is no overlap) and
    set to the latest one. The season dropdown is repopulated with the common
    seasons; the previously selected season is kept when it is still offered.

    Args:
        _: observer change payload (unused), so this can be registered with
           ``observe`` or called directly.
    """
    vars_selected = list(variables.value)

    years_sorted = sorted(intersect_years(vars_selected)) or [datetime.now().year]
    year.options = years_sorted
    year.value = years_sorted[-1]

    seasons = list(intersect_seasons(vars_selected))
    # Capture the selection BEFORE replacing the options: ipywidgets re-selects
    # the first entry when `options` is assigned, so checking `season.value`
    # afterwards would never see the user's previous choice.
    previous_season = season.value
    season.options = seasons
    season.value = previous_season if previous_season in seasons else seasons[0]


def get_aoi_path():
    """Return the resolved path (as a string) of the AOI chosen in the dropdown.

    Returns ``None`` when nothing is selected or the selected name is not in
    ``aoi_map``.
    """
    chosen_name = aoi_dropdown.value
    aoi_file = aoi_map.get(chosen_name) if chosen_name else None
    return str(aoi_file.resolve()) if aoi_file else None


def progress(message: str) -> None:
    """Print ``message`` into the ``log_output`` widget (passed to the pipeline as its progress callback)."""
    with log_output:
        print(message)


def on_run_clicked(_):
    """Handle a click on 'Run pipeline'.

    Clears the log, checks that an AOI and at least one variable are selected,
    builds the job dict from the widget values, runs the pipeline, and prints
    either the failure or the list of produced outputs into ``log_output``.

    Args:
        _: the clicked button (unused).
    """
    log_output.clear_output()

    # --- Validate the selections ---
    aoi_path = get_aoi_path()
    if not aoi_path:
        with log_output:
            print('Select an AOI file from data/aoi/.')
        return
    selected_vars = list(variables.value)
    if not selected_vars:
        with log_output:
            print('Select at least one variable before running.')
        return

    # --- Assemble the storage section for the chosen backend ---
    if storage_kind.value == 'local_cog':
        storage_cfg = {
            'kind': storage_kind.value,
            'output_dir': output_dir.value,
        }
    else:
        storage_cfg = {
            'kind': storage_kind.value,
            'bucket': gcs_bucket.value,
            'prefix': gcs_prefix.value,
            # The GCS branch still passes the configured default local directory.
            'output_dir': storage_defaults.get('output_dir', str(DEFAULT_OUTPUT_DIR)),
        }

    # --- Assemble the job description ---
    job_section = {
        'name': job_name.value,
        'aoi_path': aoi_path,
        'target_crs': target_crs.value,
        # None is sent as the resolution when native resolution is requested.
        'resolution_m': float(resolution.value) if not use_native_res.value else None,
        'year': int(year.value),
        'season': season.value,
        'variables': selected_vars,
        'storage': storage_cfg,
    }

    # --- Run and report ---
    with log_output:
        print('Running pipeline...')
    try:
        results = run_pipeline_from_dict(job_section, progress_cb=progress)
    except Exception as exc:
        with log_output:
            print('Pipeline failed:', exc)
    else:
        with log_output:
            print('Pipeline completed. Outputs:')
            for item in results:
                print(f"- {item['variable']}: local={item['local_path']} gcs={item['gcs_uri']}")

# ensure only one handler is attached
# NOTE(review): `_click_handlers` is a private ipywidgets attribute; clearing
# its callbacks drops any handler registered on this button before on_click().
run_button._click_handlers.callbacks = []
run_button.on_click(on_run_clicked)
# Re-derive year/season choices whenever the variable selection changes.
variables.observe(update_time_controls, names='value')
# Enable/disable the resolution input with the native-resolution checkbox.
use_native_res.observe(on_native_change, names='value')
# Swap the storage fields when the storage kind is toggled.
storage_kind.observe(refresh_storage_fields, names='value')
choose_output_btn.on_click(choose_output_dir)

# initial setup
# Call each handler once so the widgets reflect their starting values before display.
on_native_change({'new': use_native_res.value})
refresh_storage_fields()
update_time_controls()

# Controls that share a row.
_crs_row = widgets.HBox([target_crs, resolution, use_native_res])
_time_row = widgets.HBox([year, season])

# Stack all controls top to bottom, ending with the run button and the log area.
ui = widgets.VBox(children=[
    job_name,
    aoi_dropdown,
    _crs_row,
    _time_row,
    variables,
    storage_kind,
    storage_box,
    run_button,
    log_output,
])

display(ui)


VBox(children=(Text(value='notebook_job', description='Job name'), Dropdown(description='AOI file', options=('…