Tahap ini dapat digunakan untuk: <br>
1. Download dataset Google Satellite Embeddings [sumber](https://developers.google.com/earth-engine/datasets/catalog/GOOGLE_SATELLITE_EMBEDDING_V1_ANNUAL#description).
2. Masking dataset Google Satellite Embeddings dengan band `water` dari dataset Dynamic World [sumber](https://developers.google.com/earth-engine/datasets/catalog/GOOGLE_SATELLITE_EMBEDDING_V1_ANNUAL#description). Approach ini digunakan sebagai rule of thumb mengingat Satellite Embeddings secara eksplisit tidak memiliki band untuk pembuatan indeks spektral NDWI dan/atau MNDWI.
3. Ekstraksi pixel value dari dataset Satellite Embeddings dan menyimpannya sebagai dataset train/test.

In [44]:
import ee
import geemap
import os
import math
import numpy as np
import pandas as pd
import geopandas as gpd
from sklearn.model_selection import train_test_split


In [None]:
# Configuration
# Input Paths
sounding_path = r'data\sounding\sounding.geojson'
aoi_path = r'data\aoi_gili_ketapang.shp'

# Parameters
depth_column = 'z1'
target_year = 2018

# Output Paths
dataset_output_dir = r'train-test dataset\embeddings'
folder = 'Google Earth Engine'
filename = 'GiliKetapang_Embeddings'

os.makedirs(dataset_output_dir, exist_ok=True)

# Initialize GEE
try:
    ee.Initialize(project='mwahyur')
except Exception as e:
    ee.Authenticate(force=True)
    ee.Initialize(project='mwahyur')

print("GEE Initialized.")


GEE Initialized.


2. Helper Functions

In [46]:
# Convert gee to pandas dataframe
def fc_to_pandas(features):
    """Converts GEE dictionary list to Pandas DataFrame."""
    if not features:
        return pd.DataFrame()
    return pd.DataFrame([f['properties'] for f in features])

# Clean gdf, remove invalid rows (if any)
def clean_gdf_for_gee(gdf, label_col):
    """
    Sanitizes GeoDataFrame: Force 2D geometry, WGS84, and remove invalid rows.
    """
    # Force 2D (Drop Z dimension)
    if gdf.geometry.has_z.any():
        print("   [Info] Dropping Z-coordinates for GEE compatibility...")
        gdf['geometry'] = gdf.geometry.apply(lambda geom: pd.NA if geom is None else 
                                             (type(geom)(geom.x, geom.y) if geom.has_z else geom))
    
    # Ensure WGS84
    if gdf.crs != "EPSG:4326":
        gdf = gdf.to_crs("EPSG:4326")

    # Keep only necessary columns
    return gdf[[label_col, 'geometry']].copy()

# Extract the raster pixel value in batch
def extract_in_chunks(gdf, image, label_col, chunk_size=200):
    """
    Extracts data in chunks (batch). Retries with 30m buffer if the exact pixel is masked.
    """
    gdf = clean_gdf_for_gee(gdf, label_col)
    results = []
    num_chunks = math.ceil(len(gdf) / chunk_size)
    
    print(f"Processing {len(gdf)} points in {num_chunks} batches...")
    
    for i in range(num_chunks):
        chunk = gdf.iloc[i*chunk_size : (i+1)*chunk_size]
        try:
            ee_chunk = geemap.geopandas_to_ee(chunk)
            
            # Attempt 1: Exact Extraction
            samples = image.sampleRegions(
                collection=ee_chunk, properties=[label_col], scale=10, geometries=False
            )
            features = samples.getInfo()['features']
            
            # Attempt 2: Buffer if failed
            if not features:
                buffered_chunk = ee_chunk.map(lambda f: f.buffer(30))
                samples_buffered = image.reduceRegions(
                    collection=buffered_chunk, reducer=ee.Reducer.mean(), scale=10
                )
                features = samples_buffered.filter(ee.Filter.notNull(['A01'])).getInfo()['features']

            if features:
                df_chunk = fc_to_pandas(features)
                if 'A01' in df_chunk.columns:
                    df_chunk = df_chunk.dropna(subset=['A01'])
                    results.append(df_chunk)
                    print(f"Batch {i+1}: Extracted {len(df_chunk)} samples")
            else:
                print(f"Batch {i+1}: No valid pixels found")
                
        except Exception as e:
            print(f"Batch {i+1} Error: {str(e)}")
            continue

    if not results:
        return pd.DataFrame()
    return pd.concat(results, ignore_index=True)


4. Export Dataset to Google Drive

In [47]:
# Prepare Dataset
# Load AOI
gdf_aoi = gpd.read_file(aoi_path)
if gdf_aoi.crs != "EPSG:4326":
    gdf_aoi = gdf_aoi.to_crs("EPSG:4326")
aoi_geometry = geemap.geopandas_to_ee(gdf_aoi).geometry()

# 1. Load Embeddings
embeddings = ee.ImageCollection('GOOGLE/SATELLITE_EMBEDDING/V1/ANNUAL') \
    .filterDate(f'{target_year}-01-01', f'{target_year+1}-01-01') \
    .filterBounds(aoi_geometry) \
    .mosaic()

# 2. Load Water Mask (Dynamic World: Class 0 is Water)
dw_water_mask = ee.ImageCollection("GOOGLE/DYNAMICWORLD/V1") \
    .filterDate(f'{target_year}-01-01', f'{target_year+1}-01-01') \
    .filterBounds(aoi_geometry) \
    .mosaic() \
    .select('label').eq(0) # 1 if Water, 0 if Land

# Clip Dataset with AOI
image_to_export = embeddings.clip(aoi_geometry).updateMask(dw_water_mask)

# Create Export Task in GEE
task = ee.batch.Export.image.toDrive(
    image=image_to_export,
    description=f"{filename}_{target_year}",
    fileNamePrefix=f"{filename}_{target_year}",
    folder=folder,
    region=aoi_geometry,
    scale=10,
    crs='EPSG:4326',
    maxPixels=1e13,
    fileFormat='GeoTIFF'
)
task.start()
print(f"Starting Export to Google Drive ({drive_folder})...")


Starting Export to Google Drive (SDB_Project_2024)...


5. Extract Raster Pixel Value Using Samples from Field Surveys. <br>
This steps also split the dataset into train/test dataset, and save as .npy.

In [48]:
# Load Points
gdf_points = gpd.read_file(sounding_path)
gdf_points[depth_column] = pd.to_numeric(gdf_points[depth_column], errors='coerce')
gdf_points = gdf_points.dropna(subset=[depth_column])

# Split
# 70% Training, 30% Testing (can be adjusted as needed)
print("Splitting Train/Test...")
gdf_train, gdf_test = train_test_split(gdf_points, test_size=0.3, random_state=42)

# Extract
# NOTE: We use 'embeddings' directly here (defined in Step 3)
print("Extracting Training Set...")
df_train = extract_in_chunks(gdf_train, embeddings, depth_column)

print("Extracting Test Set...")
df_test = extract_in_chunks(gdf_test, embeddings, depth_column)

if df_train.empty:
    raise ValueError("No training data extracted.")

# Save
print("Saving .npy files...")
def save_npy(df, prefix):
    # Select A01-A64 only
    cols = sorted([c for c in df.columns if c.startswith('A') and c != 'system:index'])
    X = df[cols].values
    y = df[depth_column].values
    
    # Remove NaNs
    mask = ~np.isnan(X).any(axis=1) & ~np.isnan(y)
    
    np.save(os.path.join(dataset_output_dir, f'X_{prefix}.npy'), X[mask])
    np.save(os.path.join(dataset_output_dir, f'y_{prefix}.npy'), y[mask])
    
    if prefix == 'train':
        np.save(os.path.join(dataset_output_dir, 'feature_names.npy'), np.array(cols))

save_npy(df_train, 'train')
save_npy(df_test, 'test')

print("-" * 30)
print("Starting Data Extraction...")
print(f"Success! Map export started and Datasets saved to: {dataset_output_dir}")

Splitting Train/Test...
Extracting Training Set...
   [Info] Dropping Z-coordinates for GEE compatibility...
Processing 738 points in 4 batches...
Batch 1: Extracted 200 samples
Batch 2: Extracted 200 samples
Batch 3: Extracted 200 samples
Batch 4: Extracted 138 samples
Extracting Test Set...
   [Info] Dropping Z-coordinates for GEE compatibility...
Processing 317 points in 2 batches...
Batch 1: Extracted 200 samples
Batch 2: Extracted 117 samples
Saving .npy files...


FileNotFoundError: [Errno 2] No such file or directory: 'train-test dataset\\embeddings\\X_train.npy'