# Pre-processing 
Tahap ini digunakan untuk mempersiapkan dataset yang digunakan dalam pembuatan model GeoAI untuk Satellite-Derived Bathymetry, langkah-langkahnya antara lain adalah: <br>
1. Koreksi sunglint (deglint) untuk meminimalkan efek bayangan atau pantulan cahaya matahari di perairan yang terdapat pada citra remote sensing (dalam hal ini yang digunakan adalah Sentinel-2) berdasarkan metode Hedley et al. (2005).
2. Menghitung Modified Normalized Difference Water Index (MNDWI) untuk masking daratan dan optically shallow water. Sehingga output yang dihasilkan adalah citra yang siap digunakan untuk training model.
3. Ekstraksi nilai piksel dengan titik-titik berisi nilai kedalaman perairan dangkal yang didapatkan dari sounding menggunakan echosounder.
4. Split train/test dataset untuk digunakan dalam tahapan model development.

In [1]:
import os
import glob
import numpy as np
import rasterio
import pandas as pd
import geopandas as gpd
import rasterio as rio
from tqdm import tqdm

from preprocessing.preprocessing import sunglint_correction, calculate_mndwi, extract_raster_value
from sklearn.model_selection import train_test_split


## 1. Deglint and shallow water masking

In [2]:
# Configurations
# Paths are assembled with os.path.join so the notebook works on any OS
# (a literal backslash is only a path separator on Windows).
input_path = 'data'
output_path = os.path.join(input_path, 'corrected')  # Output folder name
plot = os.path.join(output_path, 'plot')  # For QC plots

# Parameters
mndwi_threshold = 0.0 # Threshold for SWIR-based water detection

# Create directories
os.makedirs(output_path, exist_ok=True)
os.makedirs(plot, exist_ok=True)

# Non-recursive glob: files already written to the 'corrected' sub-folder are
# not picked up again. Sorted so the processing order is reproducible.
tif_files = sorted(glob.glob(os.path.join(input_path, "*.tif")))
print(f"Found {len(tif_files)} images.")

# Sentinel-2 Band Names for Metadata
# The list order defines the band order expected in every input raster.
s2_band = [
    'B1', 'B2', 'B3', 'B4', 'B5', 'B6',
    'B7', 'B8', 'B8A', 'B9', 'B11', 'B12'
]


Found 8 images.


In [3]:
# Batch Processing for Sunglint Correction + MNDWI Masking
#
# For every input GeoTIFF:
#   1. read the first len(s2_band) bands as float32,
#   2. deglint the blue/green/red bands with sunglint_correction (NIR as reference),
#   3. build a water mask from MNDWI (corrected green + raw SWIR),
#   4. zero out every band outside the mask and write the result with nodata=0.
# A failure on one file is reported and does not stop the batch.

# Band positions (0-based) are derived once from the band-name list instead of
# being hard-coded inside the loop.
n_bands = len(s2_band)
band_index = {name: position for position, name in enumerate(s2_band)}
idx_blue, idx_green, idx_red = band_index['B2'], band_index['B3'], band_index['B4']
idx_nir = band_index['B8']    # Used for Hedley glint calculation
idx_swir = band_index['B11']  # Used for MNDWI masking

n_written = n_skipped = n_failed = 0

for filepath in tqdm(tif_files, desc="Hedley + MNDWI Processing"):
    filename = os.path.basename(filepath)

    # Output goes into the dedicated output folder, never next to the inputs
    current_output_path = os.path.join(output_path, f"corrected_{filename}")

    try:
        with rasterio.open(filepath) as src:
            # Check the band count BEFORE reading so unsuitable files cost nothing
            if src.count < n_bands:
                print(f"[SKIP] {filename}: Not enough bands ({src.count}). Need {n_bands}.")
                n_skipped += 1
                continue

            # 1. Read Data
            # Only the first n_bands bands are read, so a raster with extra
            # bands still matches the count written to the output profile.
            stack = src.read(indexes=list(range(1, n_bands + 1))).astype('float32')
            profile = src.profile.copy()

        # 2. Hedley Sunglint Corrections (RGB Only)
        # This calculates the slope automatically and removes glint
        corrected_bands = sunglint_correction(
            visible_bands=[stack[idx_blue], stack[idx_green], stack[idx_red]],
            nir_band=stack[idx_nir],
            output_dir=plot,
            image_id=filename,
            plot_graphs=True # Check the plots to see the regression slope!
        )
        clean_blue, clean_green, clean_red = (
            corrected_bands[0], corrected_bands[1], corrected_bands[2]
        )

        # 3. MNDWI Water Masking
        # Use Corrected Green + Raw SWIR
        # SWIR is much better at ignoring glint than NIR
        mndwi = calculate_mndwi(clean_green, stack[idx_swir])
        water_mask = mndwi > mndwi_threshold

        # 4. Reconstruct Final Stack
        # Overwrite RGB with the deglinted versions (the raw values are no
        # longer needed once the mask has been computed).
        stack[idx_blue] = clean_blue
        stack[idx_green] = clean_green
        stack[idx_red] = clean_red

        # Apply the 2-D mask to all bands at once; it broadcasts over the band axis
        final_stack = np.where(water_mask, stack, 0).astype('float32', copy=False)

        # 5. Save
        profile.update(
            dtype='float32',
            nodata=0,
            count=n_bands
        )

        with rasterio.open(current_output_path, 'w', **profile) as dst:
            dst.descriptions = tuple(s2_band)
            dst.write(final_stack)
        n_written += 1

    except Exception as e:
        # Best-effort batch: report the failing file and carry on with the rest
        n_failed += 1
        print(f"[ERROR] {filename}: {e}")

print(f"Written: {n_written}, skipped: {n_skipped}, failed: {n_failed}")
print("Processing complete. Check 'Plots' folder for Hedley regression graphs.")


Hedley + MNDWI Processing: 100%|██████████| 8/8 [00:08<00:00,  1.06s/it]

Processing complete. Check 'Plots' folder for Hedley regression graphs.





## 2. Train/Test Dataset

In [2]:
import os
import numpy as np
import geopandas as gpd
import rasterio
from sklearn.model_selection import train_test_split


In [3]:
# --- 1. CONFIGURATION ---
# Paths are assembled with os.path.join so they resolve on any OS
# (a literal backslash is only a path separator on Windows).
sounding_path = os.path.join('data', 'sounding', 'sounding.shp') # Sample points from field survey with echosounder
raster_path = os.path.join('data', 'corrected', 'corrected_s2_giliketapang_2018-05-31.tif')
output_path = 'train-test dataset'
depth_column = 'z1' # Column name in the shapefile containing depth data

os.makedirs(output_path, exist_ok=True)

# --- 2. LOAD DATA ---
print("Loading sounding data...")
# Read the shapefile
sample_points = gpd.read_file(sounding_path)

# Fail early with a clear message instead of deep inside the extraction step
if depth_column not in sample_points.columns:
    raise KeyError(
        f"Depth column '{depth_column}' not found in {sounding_path}; "
        f"available columns: {list(sample_points.columns)}"
    )

print(sample_points.head())
print(f"Total Sounding Points found in file: {len(sample_points)}")

Loading sounding data...
           x        y   z1                           geometry
0  113.24407 -7.67952 -1.4  POINT Z (113.24407 -7.67952 -1.4)
1  113.24416 -7.67946 -1.4  POINT Z (113.24416 -7.67946 -1.4)
2  113.24416 -7.67946 -1.6  POINT Z (113.24416 -7.67946 -1.6)
3  113.24422 -7.67935 -1.6  POINT Z (113.24422 -7.67935 -1.6)
4  113.24424 -7.67926 -1.4  POINT Z (113.24424 -7.67926 -1.4)
Total Sounding Points found in file: 1055


In [None]:
# --- 3. EXECUTE EXTRACTION ---
# Sample the raster at every sounding point: X_features holds the per-band pixel
# values (one row per point), y_labels the depth read from `depth_column`.
print(f"Extracting pixel values from: {os.path.basename(raster_path)}")
X_features, y_labels = extract_raster_value(sample_points, raster_path, depth_column)

# --- 4. DATA CLEANING (Handle "Holes") ---
print("Cleaning data (removing NaNs and NoData)...")

# A. Remove NaNs (standard invalid data)
no_nan_pixels = ~np.isnan(X_features).any(axis=1)
no_nan_labels = ~np.isnan(y_labels)

# B. Remove Zeros (common GEE mask/hole value)
# A sample is treated as a masked "hole" only when it is 0.0 in ALL bands;
# a sample with a zero in just some bands is kept. (The previous np.all(... != 0)
# test discarded any sample containing a single zero band.)
not_pure_black = np.any(X_features != 0, axis=1)

# Combine filters
valid_mask = no_nan_pixels & no_nan_labels & not_pure_black

X_clean = X_features[valid_mask]
y_clean = y_labels[valid_mask]

print(f"Original Samples: {len(X_features)}")
print(f"Valid Samples (After cleaning): {len(X_clean)}")

if len(X_clean) == 0:
    raise ValueError(
        "No valid samples left after cleaning; check that the sounding points "
        "overlap the unmasked (water) area of the raster."
    )

# --- 5. SPLIT DATASET ---
# 70% Training, 30% Testing (can be adjusted as needed)
# random_state is fixed so the split is reproducible between runs
print("Splitting dataset...")
X_train, X_test, y_train, y_test = train_test_split(
    X_clean, y_clean, test_size=0.3, random_state=42
)


Extracting pixel values from: corrected_s2_giliketapang_2018-05-31.tif
Processing: corrected_s2_giliketapang_2018-05-31.tif
  - Reprojecting points from EPSG:4326 to EPSG:32749...
  - Extracting pixel values for 1055 points...
Cleaning data (removing NaNs and NoData)...
Original Samples: 1055
Valid Samples (After cleaning): 1054
Splitting dataset...


In [7]:
# --- 6. SAVE RESULTS ---
# Persist the four split arrays as .npy files inside the output folder,
# then report where they went and the shape of each split.
print("Saving .npy files...")
split_arrays = {
    'X_train.npy': X_train,
    'X_test.npy': X_test,
    'y_train.npy': y_train,
    'y_test.npy': y_test,
}
for npy_name, split_array in split_arrays.items():
    np.save(os.path.join(output_path, npy_name), split_array)

separator = "-" * 30
print(separator)
print(f"Success! Data saved in '{output_path}'")
print(f"Train Shape: X={X_train.shape}, y={y_train.shape}")
print(f"Test Shape:  X={X_test.shape}, y={y_test.shape}")

Saving .npy files...
------------------------------
Success! Data saved in 'train-test dataset'
Train Shape: X=(737, 12), y=(737,)
Test Shape:  X=(317, 12), y=(317,)
